##### Import Libraries

In [13]:
import pandas as pd
from transformers import pipeline, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from datasets import load_dataset
from sklearn.metrics import classification_report

##### Load the Dataset

In [14]:
# Read the raw CSV through the generic "csv" loader.
# The file has no predefined train/test partition, so every row is placed
# under a single "train" split; the actual splits are created further down.
dataset = load_dataset('csv', data_files={'train': './datasets/youtoxic_english_1000.csv'})

# Define a function to do some preprocessing - rename target and map values
# I'll rename the target column "IsToxic" to "labels".
def preprocess_data(data):
    """Rename the raw columns of one example and encode the target as an int.

    The example is modified in place and also returned:
    'IsToxic' is removed and stored under 'labels' (1 when the value equals
    True, otherwise 0) and 'Text' is removed and stored under 'text'.
    """
    toxic_flag = data.pop('IsToxic')
    # Map true/false to 1/0
    if toxic_flag == True:
        data['labels'] = 1
    else:
        data['labels'] = 0
    # Lower-case name for the input column
    data['text'] = data.pop('Text')
    return data

# Apply the row-level preprocessing to the loaded data
dict_dataset = dataset.map(preprocess_data)

# First cut: hold out 20% of all rows as the final test set
dict_train_test_split = dict_dataset["train"].train_test_split(test_size=0.2, seed=823)
dict_test = dict_train_test_split["test"]

# Second cut: take 30% of the remaining rows as the validation set,
# the rest is what the model is actually trained on
dict_train_val_split = dict_train_test_split["train"].train_test_split(test_size=0.3, seed=823)
dict_train = dict_train_val_split["train"]
dict_val = dict_train_val_split["test"]

# Define the tokenization function
# Here is where you specify the input column
def tokenize_function(input_string, tokenizer):
    """Tokenize the 'text' entry of an example (or batch) with the given tokenizer.

    Args:
        input_string: Mapping that holds the raw text under the 'text' key;
            a batch of examples when called through ``Dataset.map(batched=True)``.
        tokenizer: Callable tokenizer; it receives the text plus the
            ``padding`` and ``truncation`` keyword arguments.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    # truncation=True already selects the 'longest_first' strategy, so the
    # legacy `truncation_strategy` keyword is redundant and no longer forwarded.
    return tokenizer(input_string['text'], padding='max_length', truncation=True)

##### (1) Fine-Tuning a Pre-trained Model (originally trained for a different task) using the custom data

##### Initialize the Tokenizer and the Model

In [15]:
# Base checkpoint that gets fine-tuned in this section
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelForSequenceClassification puts a classification head on top of the
# pre-trained BERT/DistilBERT encoder, which makes it usable for sequence classification:
# the transformer (DistilBERT here) acts as the feature extractor and the head predicts the class label.
# num_labels=2 spells out the library default (binary target).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Tokenize the Dataset

In [16]:
# Tokenize every split in batches; the tokenizer is handed to
# tokenize_function through fn_kwargs instead of a wrapping lambda
dict_train_tokenized = dict_train.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
dict_test_tokenized = dict_test.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
dict_val_tokenized = dict_val.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map: 100%|██████████| 200/200 [00:00<00:00, 2161.55 examples/s]


##### Set Up Hyperparameters & Training Args

In [17]:
# Tunable values for this run, kept together so they are easy to change
num_train_epochs = 5
per_device_train_batch_size = 8
per_device_eval_batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
# Hard-coded; originally calculated using
# total_train_steps = 0.1 * len(dict_train_tokenized) // per_device_train_batch_size*num_train_epochs
# (recompute when the split sizes, batch size or epoch count change)
warmup_steps = 20
logging_steps = 10

# Bundle everything into the TrainingArguments consumed by the Trainer
training_args = TrainingArguments(
    # Where results and logs are written
    output_dir='./results/run1',
    logging_dir='./logs',
    logging_steps=logging_steps,
    # Optimisation schedule
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the lowest eval loss
    # NOTE(review): newer transformers releases name this argument `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # When save_total_limit=1 and load_best_model_at_end=True,
    # it is possible that two checkpoints are saved: the last one and the best one (if they are different).
    save_total_limit=1,
)



##### Initialize the Trainer

In [18]:
# Wire the model, the run configuration and the data splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dict_train_tokenized,  # split used for the weight updates
    eval_dataset=dict_val_tokenized,     # split evaluated during training
)

##### Start training

In [19]:
# Run the fine-tuning loop with the Trainer built above
# (training_args asks for evaluation and a checkpoint at the end of every epoch)
trainer.train()

ClearML Task: created new task id=97d9333c192a4f2081e63ef4707a8eeb
2024-08-25 16:34:41,999 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/a280739eaca04ea6b7be4b98665e026b/experiments/97d9333c192a4f2081e63ef4707a8eeb/output/log


  2%|▎         | 10/400 [00:13<07:52,  1.21s/it]

{'loss': 0.6935, 'grad_norm': 0.9224388599395752, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.25}


  5%|▌         | 20/400 [00:25<07:25,  1.17s/it]

{'loss': 0.6929, 'grad_norm': 2.55173921585083, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.5}


  8%|▊         | 30/400 [00:37<07:11,  1.17s/it]

{'loss': 0.6818, 'grad_norm': 1.7722618579864502, 'learning_rate': 1.2e-05, 'epoch': 0.75}


 10%|▉         | 39/400 [00:47<07:01,  1.17s/it]

##### (2) Using a Pre-trained Model directly

In [8]:
# Hub checkpoint that is used as-is for toxicity classification in this section.
# The id is kept in `model_name` (as in the first section) so that `model`
# only ever refers to the loaded model object, never to a string.
model_name = "martin-ha/toxic-comment-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint's own classification head is loaded unchanged (no num_labels override is passed)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Use pipeline(task="text-classification") for simplicity and ease of use, especially for standard tasks where you don't need much customization.
# Use TextClassificationPipeline when you need more control over the pipeline components or when you're working on a more complex or customized task.

In [21]:
# Raw comments and gold labels of the held-out test split, materialised as
# plain Python lists so the pipeline and sklearn receive ordinary sequences
inputs_test = list(dict_test['text'])
actual_labels = list(dict_test["labels"])
# Make predictions
# truncation=True is forwarded to the tokenizer so that a comment longer than
# the model's maximum sequence length is cut instead of raising an error
predictions = classifier(inputs_test, truncation=True)
# Predicted labels are "non-toxic" and "toxic", so we still need to map that
label_mapping = {"non-toxic": 0, "toxic": 1}
predicted_labels = [label_mapping[pred['label']] for pred in predictions]

In [23]:
# Print the per-class precision / recall / F1 of the pipeline predictions
# against the gold labels of the test split
print(classification_report(actual_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.71      0.85      0.78       118
           1       0.70      0.51      0.59        82

    accuracy                           0.71       200
   macro avg       0.71      0.68      0.68       200
weighted avg       0.71      0.71      0.70       200




##### (3) Fine-Tuning the Pre-trained Model from (2) on the Custom Data

In [34]:
# Tokenize the dataset with the tokenizer currently bound to `tokenizer`.
# The two-argument tokenize_function(input_string, tokenizer) from the data-loading
# cell is reused here: redefining it with a different signature would shadow that
# version and break re-running the earlier tokenization cell.
# The input column ('text') is selected inside tokenize_function.
dict_train_tokenized = dict_train.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
dict_test_tokenized = dict_test.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
dict_val_tokenized = dict_val.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

# Format dataset for PyTorch
# Here is where you specify the label/target column
dict_train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dict_test_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dict_val_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [37]:
# Tunable values for this run, kept together so they are easy to change
num_train_epochs = 10
per_device_train_batch_size = 16
per_device_eval_batch_size = 32
learning_rate = 2e-5
weight_decay = 0.01
# Hard-coded; originally calculated using
# total_train_steps = 0.1 * len(dict_train_tokenized) // per_device_train_batch_size*num_train_epochs
# (recompute when the split sizes, batch size or epoch count change)
warmup_steps = 50
logging_steps = 10

# Bundle everything into the TrainingArguments consumed by the Trainer
training_args = TrainingArguments(
    # Where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=logging_steps,
    # Optimisation schedule
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the lowest eval loss
    # NOTE(review): newer transformers releases name this argument `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # When save_total_limit=1 and load_best_model_at_end=True,
    # it is possible that two checkpoints are saved: the last one and the best one (if they are different).
    save_total_limit=1,
)

In [38]:
# Build the Trainer from the current model, the run configuration and the data splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dict_train_tokenized,  # split used for the weight updates
    eval_dataset=dict_val_tokenized,     # split evaluated during training
)

In [39]:
# Start training
train_output = trainer.train()

# The Trainer only writes checkpoint-* subfolders during training. Save the model it
# holds after training (the best checkpoint, because load_best_model_at_end=True)
# together with the tokenizer directly into output_dir, so the run can later be
# reloaded with from_pretrained(<output_dir>).
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# Keep the training summary as the cell output
train_output

ClearML Task: created new task id=f2bd721c76c848919ebcf0b58ad36b89
2024-08-20 13:22:03,766 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/a280739eaca04ea6b7be4b98665e026b/experiments/f2bd721c76c848919ebcf0b58ad36b89/output/log


  2%|▎         | 10/400 [00:36<24:17,  3.74s/it]

{'loss': 0.7439, 'grad_norm': 12.64841365814209, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.25}


  5%|▌         | 20/400 [01:15<24:36,  3.89s/it]

{'loss': 0.8521, 'grad_norm': 7.488814353942871, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.5}


  8%|▊         | 30/400 [01:52<22:10,  3.60s/it]

{'loss': 0.8086, 'grad_norm': 299.7953186035156, 'learning_rate': 1.2e-05, 'epoch': 0.75}


 10%|█         | 40/400 [02:28<21:11,  3.53s/it]

{'loss': 0.7156, 'grad_norm': 3.7490789890289307, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


                                                
 10%|█         | 40/400 [02:47<21:11,  3.53s/it]

{'eval_loss': 0.6793511509895325, 'eval_runtime': 19.0435, 'eval_samples_per_second': 8.402, 'eval_steps_per_second': 0.263, 'epoch': 1.0}
2024-08-20 13:25:27,485 - clearml.storage - INFO - Starting upload: C:\Users\djbac\AppData\Local\Temp\model_package.vz4lml_p.zip => https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-40.zip


 12%|█▎        | 50/400 [03:41<23:08,  3.97s/it]  

{'loss': 0.6041, 'grad_norm': 12.563817977905273, 'learning_rate': 2e-05, 'epoch': 1.25}


 15%|█▌        | 60/400 [04:17<19:09,  3.38s/it]

{'loss': 0.6205, 'grad_norm': 3.959160089492798, 'learning_rate': 1.942857142857143e-05, 'epoch': 1.5}


 18%|█▊        | 70/400 [04:53<18:46,  3.41s/it]

{'loss': 0.6236, 'grad_norm': 5.251299858093262, 'learning_rate': 1.885714285714286e-05, 'epoch': 1.75}


 20%|██        | 80/400 [05:31<20:07,  3.77s/it]

{'loss': 0.6487, 'grad_norm': 3.435767650604248, 'learning_rate': 1.8285714285714288e-05, 'epoch': 2.0}


                                                
 20%|██        | 80/400 [05:51<20:07,  3.77s/it]

{'eval_loss': 0.6117069125175476, 'eval_runtime': 20.5631, 'eval_samples_per_second': 7.781, 'eval_steps_per_second': 0.243, 'epoch': 2.0}


Could not remove checkpoint `checkpoint-40` after going over the `save_total_limit`. Error is: Could not remove model id=7451857fbf1e495eabedff7dad3743e9 weights file 'https://files.clear.ml/uploading_file': Could not remove model id=7451857fbf1e495eabedff7dad3743e9 weights file: https://files.clear.ml/uploading_file
 22%|██▎       | 90/400 [06:49<22:01,  4.26s/it]  

{'loss': 0.7548, 'grad_norm': 2.6698801517486572, 'learning_rate': 1.7714285714285717e-05, 'epoch': 2.25}


 25%|██▌       | 100/400 [07:25<18:29,  3.70s/it]

{'loss': 0.5406, 'grad_norm': 6.709475040435791, 'learning_rate': 1.7142857142857142e-05, 'epoch': 2.5}


 28%|██▊       | 110/400 [08:04<18:13,  3.77s/it]

{'loss': 0.6495, 'grad_norm': 2.365192174911499, 'learning_rate': 1.6571428571428574e-05, 'epoch': 2.75}


 30%|███       | 120/400 [08:39<16:12,  3.47s/it]

{'loss': 0.613, 'grad_norm': 4.1874799728393555, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.0}


                                                 
 30%|███       | 120/400 [09:00<16:12,  3.47s/it]

{'eval_loss': 0.6756014823913574, 'eval_runtime': 21.6122, 'eval_samples_per_second': 7.403, 'eval_steps_per_second': 0.231, 'epoch': 3.0}
2024-08-20 13:31:40,381 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-120.zip)
2024-08-20 13:32:10,394 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-120.zip)
2024-08-20 13:32:40,396 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-120.zip)
2024-08-20 13:33:10,411 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-120.zip)
2024-08-2

Could not remove checkpoint `checkpoint-40` after going over the `save_total_limit`. Error is: Could not remove model id=7451857fbf1e495eabedff7dad3743e9: <400/201: models.delete/v1.0 (Invalid model id: company=ba9b1e32450c4261bb4aab543465389a, id=7451857fbf1e495eabedff7dad3743e9)>
 32%|███▎      | 130/400 [48:26<2:24:38, 32.14s/it]  

{'loss': 0.6894, 'grad_norm': 5.3642706871032715, 'learning_rate': 1.542857142857143e-05, 'epoch': 3.25}


 35%|███▌      | 140/400 [49:03<19:28,  4.49s/it]  

{'loss': 0.581, 'grad_norm': 42.47323989868164, 'learning_rate': 1.4857142857142858e-05, 'epoch': 3.5}


 38%|███▊      | 150/400 [49:40<15:30,  3.72s/it]

{'loss': 0.7223, 'grad_norm': 17.145112991333008, 'learning_rate': 1.4285714285714287e-05, 'epoch': 3.75}


 40%|████      | 160/400 [50:17<14:46,  3.69s/it]

{'loss': 0.6418, 'grad_norm': 3.012005567550659, 'learning_rate': 1.3714285714285716e-05, 'epoch': 4.0}


                                                 
 40%|████      | 160/400 [50:35<14:46,  3.69s/it]

{'eval_loss': 0.7141662240028381, 'eval_runtime': 17.8451, 'eval_samples_per_second': 8.966, 'eval_steps_per_second': 0.28, 'epoch': 4.0}
2024-08-20 14:13:15,303 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-160.zip)
2024-08-20 14:13:45,308 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-160.zip)
2024-08-20 14:14:15,320 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-160.zip)
2024-08-20 14:14:45,329 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.f2bd721c76c848919ebcf0b58ad36b89/models/checkpoint-160.zip)


In [None]:
# Optional: reload the fine-tuned model and tokenizer, e.g. after losing the kernel connection.
# NOTE(review): from_pretrained expects the model/tokenizer files directly inside model_path.
# The Trainer's per-epoch checkpoints live in checkpoint-* subfolders of its output_dir, so this
# presumably requires an explicit save into './results' (or pointing model_path at one of the
# checkpoint folders) — confirm before relying on it.
# Also note that an already constructed `trainer` keeps the model object it was built with.
model_path = './results'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [10]:
# Evaluate the model held by the trainer on the tokenized test split
# and print the metrics dictionary it returns
eval_results = trainer.evaluate(eval_dataset=dict_test_tokenized)
print(f"Evaluation results: {eval_results}")

100%|██████████| 7/7 [00:38<00:00,  5.54s/it]

Evaluation results: {'eval_loss': 0.6198015809059143, 'eval_runtime': 47.3297, 'eval_samples_per_second': 4.226, 'eval_steps_per_second': 0.148, 'epoch': 10.0}





In [11]:
# Make predictions on the test set
test_output = trainer.predict(test_dataset=dict_test_tokenized)
# Read the named fields instead of unpacking into `_`: assigning to `_` in a
# notebook rebinds the name IPython uses for the last cell output
predictions = test_output.predictions
labels = test_output.label_ids
# Convert predictions to labels by taking the index of the highest score per example
predicted_labels = predictions.argmax(axis=-1)

100%|██████████| 7/7 [00:38<00:00,  5.48s/it]


In [12]:
# NOTE(review): classification_report is already imported in the first cell; the import is
# presumably repeated so this cell also runs when that cell was skipped — confirm and
# consider keeping all imports at the top.
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the trainer's predictions against the test labels
print(classification_report(labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.84      0.66      0.74       118
           1       0.63      0.82      0.71        82

    accuracy                           0.72       200
   macro avg       0.73      0.74      0.72       200
weighted avg       0.75      0.72      0.73       200



In [197]:
# Display the predicted class index of every test example
predicted_labels

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0], dtype=int64)