# DistilBERT model
This notebook trains the DistilBERT model and exports a set of predictions for a test dataset.

**Trains on:** Waseem and Hovy (2016)

First we need to install the required packages.

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install "ray[tune]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
!pip install wordsegment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizer
import numpy as np
from datasets import load_metric, load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pickle
import torch
from google.colab import drive
import preprocessor as p
import html
from torch.utils.data import DataLoader
import torch.nn.functional as nn
from ray.tune.schedulers import PopulationBasedTraining

# Mount drive for loading the datasets
drive.mount('/content/drive')
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/')

from reader import Reader

FILENAME = "drive/MyDrive/Colab Notebooks/data/twitter_data.pkl"
NUM_LABELS = 2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Split and tokenize the datasets

In [8]:
class HateDataset(torch.utils.data.Dataset):
    """Dataset class used for combining the data encodings and labels."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
X, y = Reader.load(FILENAME)
X = Reader.preprocess(X)

mapping = {'racism': 1,'sexism': 1, 'none': 0}
y = [mapping[b] for b in y]

# Split dataset into train, test, and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=y, test_size=0.10)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=10, test_size=.2)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize all datasets
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# Combine the encodings with the labels to Torch datasets
train_dataset = HateDataset(train_encodings, y_train)
val_dataset = HateDataset(val_encodings, y_val)
test_dataset = HateDataset(test_encodings, y_test)

## Load accuracy metric for the model's evaluation

In [10]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Setup DistilBERT model

In [13]:
def model_init():
    return DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=NUM_LABELS)
    
training_args = TrainingArguments(output_dir="train", evaluation_strategy="epoch")

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset ,
    compute_metrics=compute_metrics
)

scheduler = PopulationBasedTraining(
        metric='objective',
        mode='max',
        perturbation_interval=600.0,
        hyperparam_mutations={
            "per_device_train_batch_size": [16, 32],
            "learning_rate": [2e-5, 3e-5, 5e-5],
            "num_train_epochs": [2, 3, 4]
        })

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,

## Training

In [14]:
best_trial = trainer.hyperparameter_search(
    direction="maximize", 
    backend="ray", 
    n_trials=10,
    scheduler=scheduler
)

Output hidden; open in https://colab.research.google.com to view.

In [15]:
print(best_trial)

BestRun(run_id='73c1a_00005', objective=0.8619261304798067, hyperparameters={'learning_rate': 1.1207606211860595e-05, 'num_train_epochs': 4, 'seed': 1.8994345766152145, 'per_device_train_batch_size': 16})


In [21]:
learning_rate = best_trial.hyperparameters['learning_rate']
num_train_epochs = best_trial.hyperparameters['num_train_epochs']
per_device_train_batch_size = best_trial.hyperparameters['per_device_train_batch_size']
seed = best_trial.hyperparameters['seed']

In [26]:
setattr(trainer.args, 'learning_rate', learning_rate)
setattr(trainer.args, 'num_train_epochs', num_train_epochs)
setattr(trainer.args, 'per_device_train_batch_size', per_device_train_batch_size)
setattr(trainer.args, 'seed', 42)

trainer.train()

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.2",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3867,0.376223,0.846393
2,0.3327,0.371816,0.853642
3,0.2542,0.458454,0.856403
4,0.2062,0.524698,0.85951


Saving model checkpoint to train/checkpoint-500
Configuration saved in train/checkpoint-500/config.json
Model weights saved in train/checkpoint-500/pytorch_model.bin
Saving model checkpoint to train/checkpoint-1000
Configuration saved in train/checkpoint-1000/config.json
Model weights saved in train/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2897
  Batch size = 8
Saving model checkpoint to train/checkpoint-1500
Configuration saved in train/checkpoint-1500/config.json
Model weights saved in train/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to train/checkpoint-2000
Configuration saved in train/checkpoint-2000/config.json
Model weights saved in train/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to train/checkpoint-2500
Configuration saved in train/checkpoint-2500/config.json
Model weights saved in train/checkpoint-2500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2897
  Batch size = 8
Saving model check

TrainOutput(global_step=5792, training_loss=0.2972884586502834, metrics={'train_runtime': 346.3271, 'train_samples_per_second': 133.793, 'train_steps_per_second': 16.724, 'total_flos': 935087367112704.0, 'train_loss': 0.2972884586502834, 'epoch': 4.0})

In [30]:
path = F"drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth" 
trainer.save_model(path)

Saving model checkpoint to drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth
Configuration saved in drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth/config.json
Model weights saved in drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth/pytorch_model.bin


In [31]:
model = DistilBertForSequenceClassification.from_pretrained(path)

loading configuration file drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "vocab_size": 30522
}

loading weights file drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy.pth/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassificatio

## Model calibration
We use temperature scaling to calibrate the model on the validation set by finding the optimal T value.

In [32]:
import sys
sys.path.append("drive/MyDrive/Colab Notebooks")
from temperature_scaling import ModelWithTemperature


In [33]:
calibrated_model = ModelWithTemperature(model)
val_loader = DataLoader(val_dataset)

# Find optimal T value to calibrate the model
calibrated_model.set_temperature(val_loader)


Before temperature - NLL: 0.406, ECE: 0.036
Optimal temperature: 1.338
After temperature - NLL: 0.403, ECE: 0.042


ModelWithTemperature(
  (model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (

## Export model

In [34]:
path = F"drive/MyDrive/Colab Notebooks/output/distilbert-waseem-hovy-calibrated.pth" 
torch.save(calibrated_model, path)