In [None]:
## BERT and Friends - Project ##

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 15.0 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 52.9 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 76.3 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 64.6 MB/s 
Installing coll

This project has three main parts:

**Part 1:** We will fine-tune the BERT-base, distilRoBERTa, DistilBERT and BERT-tiny (student) models on the emotion dataset.

**Part 2:** We will perform task-specific Knowledge Distillation using the emotion dataset.

Student model: BERT-tiny (2 layers and 128 hidden dimension and 2 attention heads)

We use the models fine-tuned in Part 1 as teachers. Knowledge distillation is performed in three different settings:

1.   With the BERT model only
2.   With the distilBERT model only
3.   With both models combined (BERT and distilBERT)

**Part 3:** We will analyze the model size and the processing time.

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

raw_datasets = load_dataset('emotion') 
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
## Checking if GPU is available ##

# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(device)

cuda


In [None]:
## Logging into Huggingface hub ##

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
### Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ### - BERT Student model ###

# Using BERT-base model for Knowledge Distillation

In [None]:
# Name for the repository on the huggingface hub #

repo_name = "bert-tiny-emotion-KD-BERT"

In [None]:
## Teacher model: https://huggingface.co/gokuls/bert-base-emotion-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## using bert-tiny model
teacher_id = "gokuls/bert-base-emotion-intent" ## Our pre-trained BERT model is used as teacher

In [None]:
## Checking if the tokenizers of teacher and student model produces the same output ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check: show both encodings and fail loudly if they disagree #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# The distillation trainer feeds the same tokenized batch to student and teacher,
# so the fields both models consume must be identical.
for key in ("input_ids", "attention_mask"):
    assert teacher_encoding[key] == student_encoding[key], \
        f"Teacher and student tokenizers disagree on {key}"

Downloading:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## Both tokenizers produce identical output for the sample ##

In [None]:
# Tokenizer loaded from the teacher checkpoint; used below to tokenize the dataset,
# to pad batches in the data collator, and by the Trainer.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "text" field of a (batched) example with the notebook-level `tokenizer`.

    Truncation is enabled; no padding is requested here.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw text column, rename "label" to "labels", then switch the
# dataset output format to torch.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation hyperparameters.

    Args:
        alpha: Weight of the student's own (hard-label) loss; the distillation
            loss is weighted by ``1 - alpha``. Must lie in [0, 1].
        temperature: Value the logits are divided by before the softmax when the
            distillation loss is computed. Must be > 0.
        *args, **kwargs: Forwarded unchanged to ``TrainingArguments``.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``temperature`` is not positive.
    """
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Validate first so a bad value fails before the parent constructor runs.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha}")
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the student's own loss with a soft-target term from a teacher.

    The loss is ``alpha * student_loss + (1 - alpha) * T**2 * KL(teacher_T || student_T)``,
    where ``alpha`` and the temperature ``T`` are read from ``self.args`` and the
    ``_T`` distributions are softmaxes of the logits divided by ``T``.

    Args:
        teacher_model: Model whose logits serve as soft targets. Required.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``teacher_model`` is not provided.
    """
    def __init__(self, *args, teacher_model=None, **kwargs):
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        # The teacher is only used for inference.
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted sum of the student loss and the distillation loss.

        Args:
            model: The student model being trained.
            inputs: Batch of model inputs; the same batch is fed to student and teacher.
            return_outputs: If True, return ``(loss, student_outputs)`` instead of ``loss``.
            num_items_in_batch: Unused; accepted so that Trainer versions which pass
                this keyword can call the override.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output #
        # Only the teacher's logits are needed, so do not hand it the labels
        # (that would make it compute a loss that is never used).
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size()

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Model configs expect integer class ids: id2label is int -> name, label2id is name -> int.
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

# training args #
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the best validation accuracy once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters #
    alpha=0.5,
    temperature=3.0,
)

# data_collator #
# Collator that pads each batch with the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher and student use the same label space, so build the head settings once.
_classifier_kwargs = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)

# Teacher model #
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_id, **_classifier_kwargs)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(student_id, **_classifier_kwargs)

Downloading:   0%|          | 0.00/995 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Load the metric once: compute_metrics is called after every evaluation pass.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class is the argmax over the last axis of ``logits``. Returns
    whatever the ``evaluate`` accuracy metric's ``compute`` returns.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

# Distil the teacher into the student; training stops early via the callback
# configured with a patience of 3.
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    teacher_model=teacher_model,
)

Cloning https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

trainer.train() 

***** Running training *****
  Num examples = 16000
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,3.8247,2.516968,0.7745
2,1.9864,1.343567,0.874
3,1.1126,0.829896,0.894
4,0.6924,0.649953,0.9025
5,0.5272,0.609659,0.908
6,0.4298,0.591322,0.904
7,0.3936,0.516514,0.9135
8,0.3238,0.512023,0.9075
9,0.3018,0.498932,0.916
10,0.2605,0.480992,0.9175


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to bert-tiny-emotion-KD-BERT/checkpoint-1000
Configuration saved in bert-tiny-emotion-KD-BERT/checkpoint-1000/config.json
Model weights saved in bert-tiny-emotion-KD-BERT/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-BERT/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in bert-tiny-emotion-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-tiny-emotion-KD-BERT/checkpoint-2000
Configuration saved in bert-tiny-emotion-KD-BERT/checkpoint-2000/config.json
Model weights saved in bert-tiny-emotion-KD-BERT/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-BERT/checkpoint-2000/tokenizer_config.json
Special tokens file saved in

TrainOutput(global_step=13000, training_loss=0.8098189380352314, metrics={'train_runtime': 676.5944, 'train_samples_per_second': 1182.392, 'train_steps_per_second': 73.9, 'total_flos': 24236036181312.0, 'train_loss': 0.8098189380352314, 'epoch': 13.0})

In [None]:
## After the training the Best model will be used. Now evaluating the best model ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.480991929769516,
 'eval_accuracy': 0.9175,
 'eval_runtime': 4.5346,
 'eval_samples_per_second': 441.051,
 'eval_steps_per_second': 27.566,
 'epoch': 13.0}

In [None]:
## Evaluating the model on Test set ##

print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.4005022644996643, 'test_accuracy': 0.911, 'test_runtime': 5.2454, 'test_samples_per_second': 381.29, 'test_steps_per_second': 23.831}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import accuracy_score, classification_report

## Predicted and actual class ids ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = test_result.predictions.argmax(axis=1)
actual_values = test_result.label_ids
print("Predicted values : ", predicted_values)
print('Actual values : ', actual_values)

## Label names and their integer ids, in dictionary order ##

target_names = [name for name in label2id]
labels = [int(key) for key in id2label]

## Classification report from sklearn ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
print(classification_report(actual_values, predicted_values, labels=labels, target_names=target_names))

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 5]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.911
              precision    recall  f1-score   support

     sadness       0.95      0.94      0.95       581
         joy       0.94      0.94      0.94       695
        love       0.80      0.81      0.81       159
       anger       0.90      0.91      0.90       275
        fear       0.88      0.89      0.89       224
    surprise       0.73      0.73      0.73        66

    accuracy                           0.91      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.91      0.91      0.91      2000



In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #

# Generate the model card first, then push to the Hub.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name instead of hard-coding it in the URL.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-emotion-KD-BERT
Configuration saved in bert-tiny-emotion-KD-BERT/config.json
Model weights saved in bert-tiny-emotion-KD-BERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file logs/events.out.tfevents.1664046419.85a9cd62cad8.69.0:  33%|###2      | 3.34k/10.3k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1664047101.85a9cd62cad8.69.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT
   548ed19..eab72c2  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT
   548ed19..eab72c2  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT


# Using distilBERT model for Knowledge Distillation

In [None]:
# Name for the repository on the huggingface hub #

repo_name = "bert-tiny-emotion-KD-distilBERT"

In [None]:
## Teacher model: https://huggingface.co/gokuls/distilbert-emotion-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## using bert-tiny model
teacher_id = "gokuls/distilbert-emotion-intent" ## Our pre-trained distilBERT model is used as teacher

In [None]:
## Checking if the tokenizers of teacher and student model produces the same output ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check: show both encodings and fail loudly if they disagree #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# The distillation trainer feeds the same tokenized batch to student and teacher,
# so the fields both tokenizers emit must be identical.
for key in ("input_ids", "attention_mask"):
    assert teacher_encoding[key] == student_encoding[key], \
        f"Teacher and student tokenizers disagree on {key}"

Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## input_ids and attention_mask are identical; the distilBERT tokenizer simply does not emit token_type_ids ##

In [None]:
# Tokenizer loaded from the teacher checkpoint; used below to tokenize the dataset,
# to pad batches in the data collator, and by the Trainer.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "text" field of a (batched) example with the notebook-level `tokenizer`.

    Truncation is enabled; no padding is requested here.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw text column, rename "label" to "labels", then switch the
# dataset output format to torch.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation hyperparameters.

    Args:
        alpha: Weight of the student's own (hard-label) loss; the distillation
            loss is weighted by ``1 - alpha``. Must lie in [0, 1].
        temperature: Value the logits are divided by before the softmax when the
            distillation loss is computed. Must be > 0.
        *args, **kwargs: Forwarded unchanged to ``TrainingArguments``.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``temperature`` is not positive.
    """
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Validate first so a bad value fails before the parent constructor runs.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha}")
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the student's own loss with a soft-target term from a teacher.

    The loss is ``alpha * student_loss + (1 - alpha) * T**2 * KL(teacher_T || student_T)``,
    where ``alpha`` and the temperature ``T`` are read from ``self.args`` and the
    ``_T`` distributions are softmaxes of the logits divided by ``T``.

    Args:
        teacher_model: Model whose logits serve as soft targets. Required.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``teacher_model`` is not provided.
    """
    def __init__(self, *args, teacher_model=None, **kwargs):
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        # The teacher is only used for inference.
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted sum of the student loss and the distillation loss.

        Args:
            model: The student model being trained.
            inputs: Batch of model inputs; the same batch is fed to student and teacher.
            return_outputs: If True, return ``(loss, student_outputs)`` instead of ``loss``.
            num_items_in_batch: Unused; accepted so that Trainer versions which pass
                this keyword can call the override.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output #
        # Only the teacher's logits are needed, so do not hand it the labels
        # (that would make it compute a loss that is never used).
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size()

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Model configs expect integer class ids: id2label is int -> name, label2id is name -> int.
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

# training args #
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the best validation accuracy once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters #
    alpha=0.5,
    temperature=3.0,
)

# data_collator #
# Collator that pads each batch with the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher and student use the same label space, so build the head settings once.
_classifier_kwargs = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)

# Teacher model #
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_id, **_classifier_kwargs)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(student_id, **_classifier_kwargs)

Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Load the metric once: compute_metrics is called after every evaluation pass.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class is the argmax over the last axis of ``logits``. Returns
    whatever the ``evaluate`` accuracy metric's ``compute`` returns.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

# Distil the teacher into the student; training stops early via the callback
# configured with a patience of 3.
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    teacher_model=teacher_model,
)

Cloning https://huggingface.co/gokuls/bert-tiny-emotion-KD-distilBERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

trainer.train() 

***** Running training *****
  Num examples = 16000
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,4.2533,2.835832,0.7675
2,2.3274,1.5893,0.8675
3,1.3974,1.028552,0.891
4,0.9035,0.753356,0.8955
5,0.6619,0.63504,0.905
6,0.5482,0.61797,0.899
7,0.4937,0.544813,0.91
8,0.4013,0.549313,0.906
9,0.3839,0.548072,0.9095
10,0.3281,0.552757,0.9115


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-tiny-emotion-KD-distilBERT/checkpoint-1000
Configuration saved in bert-tiny-emotion-KD-distilBERT/checkpoint-1000/config.json
Model weights saved in bert-tiny-emotion-KD-distilBERT/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-distilBERT/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-distilBERT/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in bert-tiny-emotion-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-distilBERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-tiny-emotion-KD-distilBERT/checkpoint-2000
Configuration saved in bert-tiny-emotion-KD-distilBERT/checkpoint-2000/config.json
Model weights saved in bert-tiny-emotion-KD-distilBERT/checkpoint-2000/pytorch_mo

TrainOutput(global_step=16000, training_loss=0.8240785818099976, metrics={'train_runtime': 675.7872, 'train_samples_per_second': 1183.805, 'train_steps_per_second': 73.988, 'total_flos': 29831722744896.0, 'train_loss': 0.8240785818099976, 'epoch': 16.0})

In [None]:
### The BERT-tiny model's loss takes a very long time to converge ###

In [None]:
## After the training the Best model will be used. Now evaluating the best model ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.5444262623786926,
 'eval_accuracy': 0.913,
 'eval_runtime': 3.1885,
 'eval_samples_per_second': 627.257,
 'eval_steps_per_second': 39.204,
 'epoch': 16.0}

In [None]:
## Evaluating the model on Test set ##

print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.51967453956604, 'test_accuracy': 0.903, 'test_runtime': 3.8729, 'test_samples_per_second': 516.403, 'test_steps_per_second': 32.275}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import accuracy_score, classification_report

## Predicted and actual class ids ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = test_result.predictions.argmax(axis=1)
actual_values = test_result.label_ids
print("Predicted values : ", predicted_values)
print('Actual values : ', actual_values)

## Label names and their integer ids, in dictionary order ##

target_names = [name for name in label2id]
labels = [int(key) for key in id2label]

## Classification report from sklearn ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
print(classification_report(actual_values, predicted_values, labels=labels, target_names=target_names))

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 5]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.903
              precision    recall  f1-score   support

     sadness       0.94      0.95      0.95       581
         joy       0.94      0.91      0.92       695
        love       0.74      0.84      0.78       159
       anger       0.90      0.89      0.90       275
        fear       0.93      0.83      0.87       224
    surprise       0.66      0.82      0.73        66

    accuracy                           0.90      2000
   macro avg       0.85      0.87      0.86      2000
weighted avg       0.91      0.90      0.90      2000



In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #

# Generate the model card first, then push to the Hub.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name instead of hard-coding it in the URL.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-emotion-KD-distilBERT
Configuration saved in bert-tiny-emotion-KD-distilBERT/config.json
Model weights saved in bert-tiny-emotion-KD-distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-distilBERT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file logs/events.out.tfevents.1664047312.85a9cd62cad8.3184.0:  29%|##8       | 3.34k/11.7k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664047991.85a9cd62cad8.3184.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-distilBERT
   e4bdff9..91260f1  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-distilBERT
   e4bdff9..91260f1  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-emotion-KD-distilBERT


# Using BERT-base model and distilBERT model (multiple teacher model) for Knowledge Distillation

In [None]:
# Name for the repository on the huggingface hub #

repo_name = "bert-tiny-emotion-KD-BERT_and_distilBERT"

In [None]:
## Teacher model : https://huggingface.co/gokuls/bert-base-emotion-intent ##
## Teacher model 2: https://huggingface.co/gokuls/distilbert-emotion-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## using bert-tiny model
teacher_id_1 = "gokuls/bert-base-emotion-intent" ## Our pre-trained BERT model is used as teacher
teacher_id_2 = "gokuls/distilbert-emotion-intent" ## Our pre-trained distilBERT model is used as teacher

In [None]:
## Checking if the tokenizers of teacher and student model produces the same output ##

# tokenizer initialization #
teacher_tokenizer1 = AutoTokenizer.from_pretrained(teacher_id_1)
teacher_tokenizer2 = AutoTokenizer.from_pretrained(teacher_id_2)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check: show all encodings and fail loudly if they disagree #
teacher_encoding1 = teacher_tokenizer1(sample)
teacher_encoding2 = teacher_tokenizer2(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer 1: ', teacher_encoding1)
print('Teacher tokenizer 2: ', teacher_encoding2)
print('Student tokenizer: ', student_encoding)

# One tokenized batch is fed to the student and to both teachers, so the fields
# every tokenizer emits must be identical.
for key in ("input_ids", "attention_mask"):
    assert teacher_encoding1[key] == teacher_encoding2[key] == student_encoding[key], \
        f"Teacher and student tokenizers disagree on {key}"

## input_ids and attention_mask are identical for all three tokenizers; only the distilBERT tokenizer omits token_type_ids ##

Teacher tokenizer 1:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Teacher tokenizer 2:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Use the distilBERT tokenizer (teacher_id_2) for the shared inputs. Don't use the
# BERT tokenizer (teacher_id_1): it also produces token_type_ids, which distilBERT
# does not need.
tokenizer = AutoTokenizer.from_pretrained(teacher_id_2)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "text" field of a (batched) example with the shared tokenizer, with truncation enabled."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# batched=True hands tokenize_function a batch of rows per call instead of a single row #
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)



In [None]:
## Data Pre-processing ##

tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])           # the raw text is no longer needed once it is tokenized
    .rename_column("label", "labels")   # batches must carry the targets under the name `labels`
)
tokenized_datasets.set_format("torch")  # in place: columns are returned as torch tensors
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """`TrainingArguments` extended with the two knowledge-distillation hyper-parameters.

    Args:
        alpha: Weight given to the student's own loss when the final loss is
            mixed; the distillation term receives ``1 - alpha``.
        temperature: Divisor applied to the logits before the softmax when the
            distillation loss is computed.

    All other positional and keyword arguments are forwarded unchanged to
    `TrainingArguments`.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)

        # Plain attributes; the trainer reads them back through `self.args`.
        self.temperature = temperature
        self.alpha = alpha
        
class DistillationTrainer(Trainer):
    """`Trainer` that distils two frozen teacher classifiers into one student.

    The student is the regular ``model`` of the trainer. For every batch the loss is

        alpha * student_loss + (1 - alpha) * (KD(teacher 1) + KD(teacher 2))

    where each KD term is the temperature-softened KL divergence between the
    student and one teacher, scaled by ``temperature ** 2``. The two teacher
    terms are summed (not averaged). ``alpha`` and ``temperature`` are read from
    ``self.args``.

    Args:
        teacher_model_1: First teacher model (required).
        teacher_model_2: Second teacher model (required).
        *args, **kwargs: Forwarded unchanged to `Trainer`.

    Raises:
        ValueError: If either teacher is missing.
    """

    def __init__(self, *args, teacher_model_1=None, teacher_model_2=None, **kwargs):
        # Fail fast, before the parent constructor runs, instead of crashing later
        # with an opaque AttributeError on `None`.
        if teacher_model_1 is None or teacher_model_2 is None:
            raise ValueError(
                "DistillationTrainer requires both `teacher_model_1` and `teacher_model_2`."
            )
        super().__init__(*args, **kwargs)
        self.teacher1 = teacher_model_1
        self.teacher2 = teacher_model_2
        for teacher in (self.teacher1, self.teacher2):
            # student and teacher on same device #
            self._move_model_to_device(teacher, self.model.device)
            # teachers are only used for inference #
            teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the mixed gold-label / distillation loss for one batch.

        Args:
            model: The student model.
            inputs: Batch mapping passed to the student as ``model(**inputs)``.
                The teachers receive the same batch without ``labels``.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted and ignored. It exists so the method can
                also be called by `Trainer` versions that pass this keyword.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if ``return_outputs``.
        """
        temperature = self.args.temperature
        alpha = self.args.alpha

        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss  # loss against the gold labels

        # compute teacher outputs #
        # Only the teacher logits are used, so drop the labels and spare the
        # teachers an unused loss computation.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            teacher_logits = [
                teacher(**teacher_inputs).logits
                for teacher in (self.teacher1, self.teacher2)
            ]

        # assert size #
        for logits in teacher_logits:
            assert outputs_student.logits.size() == logits.size()

        # Soften probabilities and compute distillation loss #
        loss_function = nn.KLDivLoss(reduction="batchmean")
        # The student's softened log-probabilities are shared by both teacher terms.
        student_log_probs = F.log_softmax(outputs_student.logits / temperature, dim=-1)
        loss_logits = sum(
            loss_function(student_log_probs, F.softmax(logits / temperature, dim=-1))
            * (temperature ** 2)
            for logits in teacher_logits
        )

        # Return weighted student loss #
        loss = alpha * student_loss + (1. - alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Class ids are kept as integers (not strings) so that the maps stored on the
# model config can be indexed directly with a predicted class index.
id2label = dict(enumerate(labels))
label2id = {label: class_id for class_id, label in id2label.items()}

# training args #
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    seed=33,
    # optimisation #
    learning_rate=5e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # distillation parameters #
    temperature=3.0,
    alpha=0.5,
    # evaluate, log and checkpoint once per epoch; keep the best checkpoint by accuracy #
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    logging_dir=f"{repo_name}/logs",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_model_id=repo_name,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
)

# data_collator: built from the shared tokenizer #
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher models and student model #
# All three classifiers are loaded with the same label configuration, in the
# order teacher 1, teacher 2, student.
teacher_model1, teacher_model2, student_model = (
    AutoModelForSequenceClassification.from_pretrained(
        checkpoint_id,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    for checkpoint_id in (teacher_id_1, teacher_id_2, student_id)
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Loaded lazily on the first evaluation and then reused. #
_accuracy_metric = None


def compute_metrics(eval_preds):
    """Compute the accuracy of one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair.

    Returns:
        The result of the `evaluate` "accuracy" metric for the arg-max
        predictions against ``labels``.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        # Load the metric once instead of on every evaluation pass.
        _accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
# Student, both teachers, data and callbacks wired into one distillation trainer. #
# Early stopping uses a patience of 3 evaluations. #
early_stopping = EarlyStoppingCallback(early_stopping_patience = 3)

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    teacher_model_1=teacher_model1,
    teacher_model_2=teacher_model2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT_and_distilBERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Runs for at most `num_train_epochs` epochs, evaluating on the validation split every epoch. #
trainer.train() 

***** Running training *****
  Num examples = 16000
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,7.1848,4.740427,0.774
2,3.856,2.731707,0.8685
3,2.3973,1.83286,0.8895
4,1.5273,1.293771,0.898
5,1.113,1.129755,0.8985
6,0.9099,1.074632,0.907
7,0.831,1.007138,0.907
8,0.6813,0.955601,0.9115
9,0.6432,0.974569,0.913
10,0.5745,0.878009,0.918


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-1000
Configuration saved in bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-1000/config.json
Model weights saved in bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-2000
Configuration saved in bert-tiny-emotion-KD-BERT_and_distilBERT/checkpoint-2000/config.json


TrainOutput(global_step=13000, training_loss=1.6293554241473858, metrics={'train_runtime': 786.9304, 'train_samples_per_second': 1016.608, 'train_steps_per_second': 63.538, 'total_flos': 24236036181312.0, 'train_loss': 1.6293554241473858, 'epoch': 13.0})

In [None]:
## `load_best_model_at_end=True` is set, so after training the trainer holds the best checkpoint ##
## (selected by validation accuracy). Now evaluating that best model on the validation split. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.8780092597007751,
 'eval_accuracy': 0.918,
 'eval_runtime': 5.5475,
 'eval_samples_per_second': 360.524,
 'eval_steps_per_second': 22.533,
 'epoch': 13.0}

In [None]:
## Evaluating the model on the test set ##

# Run the student on the held-out test split and keep only the metrics. #
test_set_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_set_metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.7950055599212646, 'test_accuracy': 0.9135, 'test_runtime': 6.3504, 'test_samples_per_second': 314.94, 'test_steps_per_second': 19.684}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Predicted values: arg-max over the class logits of every test example ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label ids (as integers) and label names, both in the order of the label maps ##

labels = [int(class_id) for class_id in id2label]
target_names = list(label2id)

## Getting the classification report using sklearn ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
report = classification_report(
    actual_values,
    predicted_values,
    labels= labels,
    target_names= target_names,
)
print(report)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 5]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.9135
              precision    recall  f1-score   support

     sadness       0.95      0.95      0.95       581
         joy       0.96      0.92      0.94       695
        love       0.76      0.89      0.82       159
       anger       0.92      0.90      0.91       275
        fear       0.88      0.89      0.89       224
    surprise       0.72      0.74      0.73        66

    accuracy                           0.91      2000
   macro avg       0.86      0.88      0.87      2000
weighted avg       0.92      0.91      0.91      2000



In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #

# The model card is created first so that the push that follows can include it. #
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The account name is looked up from the hub API rather than hard-coded. #
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-emotion-KD-BERT_and_distilBERT
Configuration saved in bert-tiny-emotion-KD-BERT_and_distilBERT/config.json
Model weights saved in bert-tiny-emotion-KD-BERT_and_distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-emotion-KD-BERT_and_distilBERT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file logs/events.out.tfevents.1664048191.85a9cd62cad8.6887.0:  32%|###2      | 3.34k/10.3k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664048984.85a9cd62cad8.6887.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT_and_distilBERT
   06e41b4..37689f1  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT_and_distilBERT
   06e41b4..37689f1  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-emotion-KD-BERT_and_distilBERT
