In [1]:
# install required libraries
!pip install -q transformers
!pip install -q datasets
!pip install -q accelerate -U

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pybids 0.15.1 requires sqlalchemy<1.4.0.dev0, but you have sqlalchemy 2.0.21 which is incompatible.[0m[31m
[0m

In [2]:
# change working directory to root
import os 

os.chdir('../')
os.getcwd()

'/usr4/ugrad/cjxu/Documents/trusty-ai'

In [3]:
# load jigsaw dataset downloaded from https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data
from datasets import load_dataset

data_dir = 'jigsaw-toxic-comment-classification-challenge'

dataset = load_dataset('jigsaw_toxicity_pred', data_dir=data_dir)
dataset['train'][0]

Downloading builder script:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.37k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/159571 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/63978 [00:00<?, ? examples/s]

{'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [5]:
# get labels
labels = [label for label in dataset['train'].features.keys() if label not in ['comment_text']]
labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [6]:
# load tokenizer
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [7]:
import numpy as np

def preprocess_data(examples):
    text = examples['comment_text']
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()
    
    return encoding

In [8]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

Map:   0%|          | 0/63978 [00:00<?, ? examples/s]

In [9]:
# check for gpu availibilty
import torch

device = torch.device("cuda" if torch.cuda.is_available else "cpu")

In [10]:
# load model onto gpu
from transformers import AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=len(labels)
                                                          )
                                            
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [12]:
# define hyperparameters
BATCH_SIZE = 32
EPOCHS = 5
METRIC_NAME = 'f1'
LR = 2e-5

In [13]:
# define training arguments
from transformers import TrainingArguments, Trainer

model_dir = f"models/{model_name}-finetuned-jigasaw"
args = TrainingArguments(
    output_dir = model_dir, 
    save_strategy = "epoch", 
    evaluation_strategy = "epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    metric_for_best_model=METRIC_NAME,
    #push_to_hub=True,
)


In [14]:
# define metrics
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [15]:
# finetune model 
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /usr4/ugrad/cjxu/.netrc
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0376,0.074759,0.664137,0.903216,0.86164
2,0.0336,0.065008,0.680043,0.892165,0.875801
3,0.0284,0.067434,0.685323,0.888251,0.878364
4,0.0207,0.079763,0.671853,0.895988,0.871503
5,0.0154,0.08782,0.668641,0.892834,0.867705


TrainOutput(global_step=24935, training_loss=0.02945957342560506, metrics={'train_runtime': 6210.1748, 'train_samples_per_second': 128.475, 'train_steps_per_second': 4.015, 'total_flos': 2.642432891579136e+16, 'train_loss': 0.02945957342560506, 'epoch': 5.0})

In [17]:
# make prediction
output = trainer.predict(encoded_dataset["test"])
output.metrics

{'test_loss': 0.08782042562961578,
 'test_f1': 0.6686412439837097,
 'test_roc_auc': 0.8928338920247979,
 'test_accuracy': 0.867704523429929,
 'test_runtime': 139.9905,
 'test_samples_per_second': 457.017,
 'test_steps_per_second': 14.287}

In [36]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)
     

In [37]:
# calculate hamming score
from sklearn import metrics

y_true = np.array(encoded_dataset['test']['labels'])

probs = sigmoid(torch.Tensor(output.predictions))
y_pred = np.zeros(probs.shape)
y_pred[np.where(probs >= threshold)] = 1

val_hamming_loss = metrics.hamming_loss(y_true, y_pred)
val_hamming_score = hamming_score(np.array(y_true), np.array(y_pred))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.8989756895599529
Hamming Loss = 0.028064334615023914
