In [None]:
# Transformers installation
! pip install -U transformers datasets fsspec kaggle evaluate optuna

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


# Fine-tune a pretrained model

## Prepare a dataset

In [None]:
from google.colab import files

# Ask the user to upload a file through Colab; the following cells expect the
# Kaggle API token to be uploaded under the name `kaggle.json`.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the Jigsaw toxic-comment competition archive with the Kaggle CLI
# (saved to /content in this Colab session).
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
  0% 0.00/52.6M [00:00<?, ?B/s]
100% 52.6M/52.6M [00:00<00:00, 705MB/s]


In [None]:
!unzip -o jigsaw-toxic-comment-classification-challenge.zip -d jigsaw-toxic-comment-classification
!cd jigsaw-toxic-comment-classification
!unzip -o jigsaw-toxic-comment-classification/train.csv.zip -d jigsaw-toxic-comment-classification
!unzip -o jigsaw-toxic-comment-classification/test.csv.zip -d jigsaw-toxic-comment-classification
!unzip -o jigsaw-toxic-comment-classification/test_labels.csv.zip -d jigsaw-toxic-comment-classification

Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: jigsaw-toxic-comment-classification/sample_submission.csv.zip  
  inflating: jigsaw-toxic-comment-classification/test.csv.zip  
  inflating: jigsaw-toxic-comment-classification/test_labels.csv.zip  
  inflating: jigsaw-toxic-comment-classification/train.csv.zip  
Archive:  jigsaw-toxic-comment-classification/train.csv.zip
  inflating: jigsaw-toxic-comment-classification/train.csv  
Archive:  jigsaw-toxic-comment-classification/test.csv.zip
  inflating: jigsaw-toxic-comment-classification/test.csv  
Archive:  jigsaw-toxic-comment-classification/test_labels.csv.zip
  inflating: jigsaw-toxic-comment-classification/test_labels.csv  


In [None]:
from datasets import load_dataset

# The dataset is produced by a loading script hosted on the Hub, so `datasets`
# asks for confirmation before executing it. `trust_remote_code=True` answers that
# prompt up front so the notebook can run unattended; inspect the script at
# https://hf.co/datasets/google/jigsaw_toxicity_pred before trusting it.
# `data_dir` points at the folder holding the CSV files extracted from Kaggle.
# NOTE(review): script-based loading depends on the installed `datasets` release —
# confirm the version in use still supports it.
dataset = load_dataset(
    "google/jigsaw_toxicity_pred",
    data_dir="jigsaw-toxic-comment-classification",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.37k [00:00<?, ?B/s]

jigsaw_toxicity_pred.py:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

The repository for google/jigsaw_toxicity_pred contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/google/jigsaw_toxicity_pred.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Generating train split:   0%|          | 0/159571 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/63978 [00:00<?, ? examples/s]

As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use the 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset. Here, the preprocessing function also collapses the six toxicity labels into a single binary label (1 if any of them is set, 0 otherwise):

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the `distilbert-base-uncased` checkpoint, the same checkpoint the
# classifier is initialized from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def convert_to_binary_classifier(comments):
    """Tokenize a batch of comments and attach one toxic / non-toxic label each.

    Args:
        comments: A batch mapping column names to lists of values. It must
            provide ``comment_text`` and the six toxicity label columns.

    Returns:
        The tokenizer output for the batch (``max_length`` padding, truncation
        enabled) with an added ``labels`` list: 1 when at least one label
        column equals 1 for that comment, 0 otherwise.
    """
    toxicity_columns = (
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    )
    batch_size = len(comments["comment_text"])

    # A comment counts as toxic as soon as any single category is flagged.
    binary_labels = []
    for row in range(batch_size):
        flagged = False
        for column in toxicity_columns:
            if comments[column][row] == 1:
                flagged = True
                break
        binary_labels.append(1 if flagged else 0)

    encoded = tokenizer(comments["comment_text"], padding="max_length", truncation=True)
    encoded["labels"] = binary_labels
    return encoded

# Apply the preprocessing to every split. With `batched=True` the function receives
# batches (column name -> list of values) rather than one row at a time.
tokenized_datasets = dataset.map(convert_to_binary_classifier, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

Map:   0%|          | 0/63978 [00:00<?, ? examples/s]

## Train

### Evaluate

In [None]:
import numpy as np
import evaluate

# Load the accuracy and F1 metrics from the `evaluate` library.
# NOTE(review): `f1` is loaded here but `compute_metrics` only reports accuracy.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            index of the largest logit along the last axis.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return result

### Trainer

In [None]:
from transformers import TrainingArguments

# Arguments for the search runs: outputs are written to `test_trainer`
# and each run trains for a single epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,
)

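Create a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) object with your model (supplied here through a `model_init` function, so that a fresh model can be built for each hyperparameter-search trial), training arguments, training and test datasets, and evaluation function: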

In [None]:
from transformers import AutoModelForSequenceClassification
def model_init(trial):
    """Build a new DistilBERT sequence classifier with two output labels.

    Args:
        trial: Accepted for the caller's sake; not used when building the model.

    Returns:
        The model loaded from the ``distilbert-base-uncased`` checkpoint.
    """
    checkpoint = "distilbert-base-uncased"
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return classifier

In [None]:
from transformers import Trainer
# Wire the model factory, training arguments, data splits and metric function together.
# NOTE(review): the test split is used as the evaluation set here.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


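Before the final fine-tuning run, search for good hyperparameters with [hyperparameter_search()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.hyperparameter_search) (using the Optuna backend). The best configuration found is then used to fine-tune your model by calling [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train):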

In [None]:
def optuna_hp_space(trial):
    """Describe the hyperparameters sampled for one search trial.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        dict with ``learning_rate`` (sampled on a log scale between 1e-6 and
        1e-4) and ``per_device_train_batch_size`` (single choice: 8).
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8])
    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
    }

# Run the Optuna-backed search over the space defined by `optuna_hp_space`.
# Because the trainer was given `compute_metrics`, the default objective is the sum
# of the reported metrics (here: accuracy) rather than the evaluation loss, so
# higher is better and the search direction must be "maximize".
best_trials = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
)
best_trials

[I 2025-06-09 21:14:46,020] A new study created in memory with name: no-name-f7f9465d-761a-4318-980e-26f77b1949cd
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,0.02507
train/global_step,500.0
train/grad_norm,0.33071
train/learning_rate,8e-05
train/loss,0.2245


Step,Training Loss
500,0.3597
1000,0.205
1500,0.156
2000,0.1428


[W 2025-06-09 21:30:27,295] Trial 0 failed with parameters: {'learning_rate': 1.1092475568752265e-06, 'per_device_train_batch_size': 8} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/integrations/integration_utils.py", line 255, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2240, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2560, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
[W 2025

KeyboardInterrupt: 

In [None]:
from transformers import TrainingArguments, Trainer

# Arguments for the final run: the hyperparameters selected by the search, with an
# evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    **best_trials.hyperparameters,
)

# Rebuild the trainer so that it actually uses the arguments defined above; the
# previously created `trainer` still holds the arguments it was constructed with.
trainer = Trainer(
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    model_init=model_init,
)

In [None]:
# Fine-tune the model with the current `trainer`.
trainer.train()

<a id='pytorch_native'></a>