In [None]:
## BERT and Friends - Project ##

In [None]:
# Install the Hugging Face libraries used below. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel; `-q` keeps the log short.
%pip install -q datasets transformers evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 6.9 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 24.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 72.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 50.5 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 70.9 MB/s 
Installing collected p

This project has three main parts:

**Part 1:** We will fine-tune the BERT-base, DistilRoBERTa, DistilBERT, and BERT-tiny (student) models on the Stanford Sentiment Treebank (SST-2) dataset.

**Part 2:** We will perform task-specific knowledge distillation using the SST-2 dataset.

Student model: BERT-tiny (2 layers, hidden size 128, 2 attention heads)

We use the models fine-tuned in Part 1 as teachers. Knowledge distillation is performed in three different settings:

1.   With the BERT model only
2.   With the DistilBERT model only
3.   With both models combined (BERT and DistilBERT)

**Part 3:** We will analyze the model size and the processing time.

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load the SST-2 configuration of the GLUE benchmark; the bare expression on
# the last line displays the resulting DatasetDict and its splits.
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
## Pick the compute device: the GPU when CUDA is available, otherwise the CPU ##

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
## Log in to the Hugging Face Hub ##
# The training arguments further down read the token back with
# HfFolder.get_token() so the distilled student can be pushed to the Hub.

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
### Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ### - BERT Student model ###

# Using BERT-base model for Knowledge Distillation

In [None]:
# Name of the Hugging Face Hub repository for this run (BERT-base teacher).
# The same value is also used as the local output directory of the trainer.

repo_name = "bert-tiny-sst2-KD-BERT"

In [None]:
## Teacher model: https://huggingface.co/gokuls/bert-base-sst2 ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2"  # BERT-tiny checkpoint used as the student
teacher_id = "gokuls/bert-base-sst2"  # our BERT-base model fine-tuned on SST-2, used as the teacher

In [None]:
## Check that the teacher and student tokenizers produce the same token ids ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# Distillation feeds one tokenized batch to both models, so the token ids must
# agree; fail loudly here instead of relying on a visual comparison.
assert teacher_encoding["input_ids"] == student_encoding["input_ids"], \
    "Teacher and student tokenizers disagree on input_ids"

Downloading:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## The two tokenizers produce identical outputs for the sample ##

In [None]:
# Tokenizer used to build the training data, loaded from the teacher checkpoint.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "sentence" field of a batch, with truncation enabled."""
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# batched=True hands the function whole batches of rows instead of single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the columns the models do not consume, rename the target column to
# "labels", and switch the datasets to the "torch" output format.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['sentence','idx'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation settings.

    Args:
        alpha: Weight of the student's own loss on the labels; the
            distillation loss is weighted by ``1 - alpha``.
        temperature: Value the student and teacher logits are divided by
            before the softmax. Must be strictly positive.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # The logits are divided by the temperature: zero would turn the loss
        # into inf/NaN and a negative value would flip the distributions, so
        # reject both before any other set-up happens.
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer that distils a fixed teacher model into the student being trained.

    The training loss is ``alpha * student_loss + (1 - alpha) * kd_loss``:
    ``student_loss`` is the loss the student reports for the batch and
    ``kd_loss`` is the KL divergence between the temperature-softened student
    and teacher distributions, scaled by ``temperature ** 2``. ``alpha`` and
    ``temperature`` are read from ``self.args``.

    Args:
        teacher_model: Model whose logits the student should imitate. Required.

    Raises:
        ValueError: If ``teacher_model`` is not provided.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail with a clear message instead of an AttributeError on None below.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        # The teacher is never trained, so keep it in evaluation mode.
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted sum of the student loss and the distillation loss.

        Args:
            model: The student model.
            inputs: Batch passed unchanged to both the student and the teacher.
            return_outputs: When true, also return the student's outputs.
            num_items_in_batch: Unused. Accepted because newer transformers
                releases pass this argument to ``compute_loss``.

        Returns:
            ``loss``, or ``(loss, outputs_student)`` when ``return_outputs`` is true.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output (no gradients are needed for the teacher) #
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), \
            "student and teacher logits must have the same shape"

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
# Label ids are kept as ints (the type model configs use for label ids) rather
# than strings, so the saved config maps each label to 0, 1, ... and not "0", "1".
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

# training args #
# num_train_epochs is only an upper bound: the trainer built further down adds
# an early-stopping callback.
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch, #
    # keep at most two checkpoints, and reload the checkpoint with the best   #
    # accuracy when training ends                                            #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters: push on every save, using the token stored at login #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters (consumed by DistillationTrainer.compute_loss) #
    alpha=0.5,
    temperature=3.0
    )

# data_collator: built from the same tokenizer that tokenized the datasets #
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher model #
# Teacher and student are both loaded as sequence classifiers with the same
# number of labels and the same label mappings.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id,
    num_labels=num_labels, 
    id2label=id2label,
    label2id=label2id,
)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id,
    num_labels=num_labels, 
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/851 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Load the metric once, instead of again on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute classification accuracy for the trainer's evaluation passes.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` for these predictions.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

# The student is the model being trained; the teacher is passed separately and
# only used inside DistillationTrainer.compute_loss. Evaluation runs on the
# validation split, and the early-stopping callback is configured with a
# patience of 3.
trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)],
)

Cloning https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Runs the distillation: at most 50 epochs (see training_args), with the
# early-stopping callback registered on the trainer.
trainer.train() 

***** Running training *****
  Num examples = 67349
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 210500
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7521,0.734532,0.823394
2,0.4301,0.774758,0.830275
3,0.3335,0.825743,0.834862
4,0.2831,0.914461,0.818807
5,0.2419,0.909604,0.817661
6,0.2149,0.840979,0.823394


***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to bert-tiny-sst2-KD-BERT/checkpoint-4210
Configuration saved in bert-tiny-sst2-KD-BERT/checkpoint-4210/config.json
Model weights saved in bert-tiny-sst2-KD-BERT/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-BERT/checkpoint-4210/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in bert-tiny-sst2-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-tiny-sst2-KD-BERT/checkpoint-8420
Configuration saved in bert-tiny-sst2-KD-BERT/checkpoint-8420/config.json
Model weights saved in bert-tiny-sst2-KD-BERT/checkpoint-8420/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-BERT/checkpoint-8420/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT/checkpoint

TrainOutput(global_step=25260, training_loss=0.37592134573869257, metrics={'train_runtime': 957.132, 'train_samples_per_second': 3518.271, 'train_steps_per_second': 219.928, 'total_flos': 35262370276620.0, 'train_loss': 0.37592134573869257, 'epoch': 6.0})

In [None]:
## training_args sets load_best_model_at_end=True, so after training the trainer holds the best checkpoint; evaluate it on the validation split ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.8257425427436829,
 'eval_accuracy': 0.8348623853211009,
 'eval_runtime': 2.0341,
 'eval_samples_per_second': 428.7,
 'eval_steps_per_second': 27.04,
 'epoch': 6.0}

In [None]:
## Saving the model on the Hugging Face Hub ##

# Create the model card, then push the model to the Hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name to build the URL of the pushed repository.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-sst2-KD-BERT
Configuration saved in bert-tiny-sst2-KD-BERT/config.json
Model weights saved in bert-tiny-sst2-KD-BERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT/special_tokens_map.json


Upload file logs/events.out.tfevents.1664047572.007f0899e8e4.69.0:  49%|####8     | 3.34k/6.86k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1664048590.007f0899e8e4.69.2: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT
   ecd863a..654f25d  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT
   ecd863a..654f25d  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT


# Using distilBERT model for Knowledge Distillation

In [None]:
# Name of the Hugging Face Hub repository for this run (DistilBERT teacher).
# The same value is also used as the local output directory of the trainer.

repo_name = "bert-tiny-sst2-KD-distilBERT"

In [None]:
## Teacher model: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2"  # BERT-tiny checkpoint used as the student
teacher_id = "distilbert-base-uncased-finetuned-sst-2-english"  # DistilBERT checkpoint fine-tuned on SST-2, used as the teacher

In [None]:
## Check that the teacher and student tokenizers produce the same token ids ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# Distillation feeds one tokenized batch to both models, so the token ids must
# agree; fail loudly here instead of relying on a visual comparison. Only
# input_ids are compared: the two tokenizers need not return the same set of
# fields.
assert teacher_encoding["input_ids"] == student_encoding["input_ids"], \
    "Teacher and student tokenizers disagree on input_ids"

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## Both tokenizers produce the same input_ids and attention_mask; the DistilBERT tokenizer simply does not return token_type_ids ##

In [None]:
# Tokenizer used to build the training data, loaded from the teacher checkpoint.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "sentence" field of a batch, with truncation enabled."""
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# batched=True hands the function whole batches of rows instead of single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop every column the models do not consume. 'idx' is removed as well (as in
# the other distillation runs in this notebook) so the Trainer no longer warns
# about an ignored 'idx' column on every training and evaluation pass.
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation settings.

    Args:
        alpha: Weight of the student's own loss on the labels; the
            distillation loss is weighted by ``1 - alpha``.
        temperature: Value the student and teacher logits are divided by
            before the softmax. Must be strictly positive.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # The logits are divided by the temperature: zero would turn the loss
        # into inf/NaN and a negative value would flip the distributions, so
        # reject both before any other set-up happens.
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer that distils a fixed teacher model into the student being trained.

    The training loss is ``alpha * student_loss + (1 - alpha) * kd_loss``:
    ``student_loss`` is the loss the student reports for the batch and
    ``kd_loss`` is the KL divergence between the temperature-softened student
    and teacher distributions, scaled by ``temperature ** 2``. ``alpha`` and
    ``temperature`` are read from ``self.args``.

    Args:
        teacher_model: Model whose logits the student should imitate. Required.

    Raises:
        ValueError: If ``teacher_model`` is not provided.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail with a clear message instead of an AttributeError on None below.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        # The teacher is never trained, so keep it in evaluation mode.
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted sum of the student loss and the distillation loss.

        Args:
            model: The student model.
            inputs: Batch passed unchanged to both the student and the teacher.
            return_outputs: When true, also return the student's outputs.
            num_items_in_batch: Unused. Accepted because newer transformers
                releases pass this argument to ``compute_loss``.

        Returns:
            ``loss``, or ``(loss, outputs_student)`` when ``return_outputs`` is true.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output (no gradients are needed for the teacher) #
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), \
            "student and teacher logits must have the same shape"

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
# Label ids are kept as ints (the type model configs use for label ids) rather
# than strings, so the saved config maps each label to 0, 1, ... and not "0", "1".
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

# training args #
# num_train_epochs is only an upper bound: the trainer built further down adds
# an early-stopping callback.
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch, #
    # keep at most two checkpoints, and reload the checkpoint with the best   #
    # accuracy when training ends                                            #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters: push on every save, using the token stored at login #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters (consumed by DistillationTrainer.compute_loss) #
    alpha=0.5,
    temperature=3.0
    )

# data_collator: built from the same tokenizer that tokenized the datasets #
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher model #
# Teacher and student are both loaded as sequence classifiers with the same
# number of labels and the same label mappings.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id,
    num_labels=num_labels, 
    id2label=id2label,
    label2id=label2id,
)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id,
    num_labels=num_labels, 
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Load the metric once, instead of again on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute classification accuracy for the trainer's evaluation passes.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` for these predictions.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

# The student is the model being trained; the teacher is passed separately and
# only used inside DistillationTrainer.compute_loss. Evaluation runs on the
# validation split, and the early-stopping callback is configured with a
# patience of 3.
trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)],
)

Cloning https://huggingface.co/gokuls/bert-tiny-sst2-KD-distilBERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Runs the distillation: at most 50 epochs (see training_args), with the
# early-stopping callback registered on the trainer.
trainer.train() 

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx. If idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 210500
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2008,1.131899,0.817661
2,0.6821,1.103546,0.832569
3,0.5315,1.227063,0.824541
4,0.4486,1.442598,0.817661
5,0.3857,1.430889,0.830275


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx. If idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-tiny-sst2-KD-distilBERT/checkpoint-4210
Configuration saved in bert-tiny-sst2-KD-distilBERT/checkpoint-4210/config.json
Model weights saved in bert-tiny-sst2-KD-distilBERT/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-distilBERT/checkpoint-4210/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-distilBERT/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in bert-tiny-sst2-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-distilBERT/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argum

TrainOutput(global_step=21050, training_loss=0.6497366296763658, metrics={'train_runtime': 662.903, 'train_samples_per_second': 5079.854, 'train_steps_per_second': 317.543, 'total_flos': 29373330335940.0, 'train_loss': 0.6497366296763658, 'epoch': 5.0})

In [None]:
## training_args sets load_best_model_at_end=True, so after training the trainer holds the best checkpoint; evaluate it on the validation split ##

## Evaluate ##

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx. If idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 1.1035457849502563,
 'eval_accuracy': 0.8325688073394495,
 'eval_runtime': 1.7765,
 'eval_samples_per_second': 490.844,
 'eval_steps_per_second': 30.959,
 'epoch': 5.0}

In [None]:
## Saving the model on the Hugging Face Hub ##

# Create the model card, then push the model to the Hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name to build the URL of the pushed repository.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-sst2-KD-distilBERT
Configuration saved in bert-tiny-sst2-KD-distilBERT/config.json
Model weights saved in bert-tiny-sst2-KD-distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-distilBERT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/16.7M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664049370.007f0899e8e4.1684.2: 100%|##########| 369/369 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664048706.007f0899e8e4.1684.0:  52%|#####2    | 3.34k/6.41k [00:00<?, ?B…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-distilBERT
   f7de303..ae1ce5b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-distilBERT
   f7de303..ae1ce5b  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-sst2-KD-distilBERT


# Using BERT-base model and distilBERT model (multiple teacher model) for Knowledge Distillation

In [None]:
# Name of the Hugging Face Hub repository for the run that distils from both
# teachers (BERT-base and DistilBERT).

repo_name = "bert-tiny-sst2-KD-BERT_and_distilBERT"

In [None]:
## Teacher model : https://huggingface.co/gokuls/bert-base-sst2 ##
## Teacher model 2: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## using bert-tiny model
teacher_id_1 = "gokuls/bert-base-sst2"  # first teacher: our BERT-base model fine-tuned on SST-2
teacher_id_2 = "distilbert-base-uncased-finetuned-sst-2-english"  # second teacher: DistilBERT checkpoint fine-tuned on SST-2

In [None]:
## Check that both teacher tokenizers and the student tokenizer produce the same token ids ##

# tokenizer initialization #
teacher_tokenizer1 = AutoTokenizer.from_pretrained(teacher_id_1)
teacher_tokenizer2 = AutoTokenizer.from_pretrained(teacher_id_2)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding1 = teacher_tokenizer1(sample)
teacher_encoding2 = teacher_tokenizer2(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer 1: ', teacher_encoding1)
print('Teacher tokenizer 2: ', teacher_encoding2)
print('Student tokenizer: ', student_encoding)

# Distillation feeds one tokenized batch to all three models, so the token ids
# must agree; fail loudly here instead of relying on a visual comparison. Only
# input_ids are compared: the tokenizers need not return the same set of fields.
assert teacher_encoding1["input_ids"] == student_encoding["input_ids"], \
    "Teacher 1 and student tokenizers disagree on input_ids"
assert teacher_encoding2["input_ids"] == student_encoding["input_ids"], \
    "Teacher 2 and student tokenizers disagree on input_ids"

Teacher tokenizer 1:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Teacher tokenizer 2:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Use the DistilBERT (teacher_id_2) tokenizer for the shared inputs. Do not use
# the tokenizer from teacher_id_1 (BERT): it also produces token_type_ids,
# which the DistilBERT teacher does not need.
tokenizer = AutoTokenizer.from_pretrained(teacher_id_2)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "sentence" field of a batch, with truncation enabled."""
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# batched=True hands the function whole batches of rows instead of single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]



In [None]:
## Data Pre-processing ##

# Drop the columns the models do not consume, rename the target column to
# "labels", and switch the datasets to the "torch" output format.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['sentence','idx'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation settings.

    Args:
        alpha: Weight of the student's own loss on the labels; the
            distillation loss is weighted by ``1 - alpha``.
        temperature: Value the student and teacher logits are divided by
            before the softmax. Must be strictly positive.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # The logits are divided by the temperature: zero would turn the loss
        # into inf/NaN and a negative value would flip the distributions, so
        # reject both before any other set-up happens.
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer that distils one or two frozen teacher models into the student.

    The training loss is ``alpha * student_loss + (1 - alpha) * distillation_loss``,
    where ``distillation_loss`` is the temperature-scaled KL divergence between the
    student's and each teacher's softened class distribution. When two teachers are
    given, their KL terms are summed (not averaged).

    Args:
        *args: Positional arguments forwarded to ``Trainer``.
        teacher_model_1: First teacher model, or ``None`` to distil from
            ``teacher_model_2`` only.
        teacher_model_2: Second teacher model, or ``None`` to distil from
            ``teacher_model_1`` only.
        **kwargs: Keyword arguments forwarded to ``Trainer``.

    Raises:
        ValueError: If neither teacher is provided.
    """

    def __init__(self, *args, teacher_model_1=None, teacher_model_2=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher1 = teacher_model_1
        self.teacher2 = teacher_model_2
        # Only the teachers that were actually supplied take part in the loss, so the
        # same class serves the single-teacher and the two-teacher settings.
        self._teachers = [
            teacher for teacher in (self.teacher1, self.teacher2) if teacher is not None
        ]
        if not self._teachers:
            raise ValueError(
                "DistillationTrainer needs at least one of teacher_model_1 / teacher_model_2."
            )
        for teacher in self._teachers:
            # student and teacher on same device #
            self._move_model_to_device(teacher, self.model.device)
            # Teachers are only used for inference.
            teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Combine the student's label loss with the distillation loss.

        Args:
            model: The student model being trained.
            inputs: Batch of model inputs; it is passed to the student as-is and,
                without the ``labels`` entry, to every teacher.
            return_outputs: When true, also return the student's outputs.
            num_items_in_batch: Accepted so the method can be called by ``Trainer``
                versions that pass this argument; it is not used here.

        Returns:
            The weighted loss, or ``(loss, student_outputs)`` if ``return_outputs``.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss  # loss against the gold labels

        # Only the teachers' logits are used below, so the labels are not passed on.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}

        temperature = self.args.temperature
        loss_function = nn.KLDivLoss(reduction="batchmean")
        # Soften the student distribution once; it is shared by all teacher terms.
        student_log_probs = F.log_softmax(outputs_student.logits / temperature, dim=-1)

        loss_logits = 0.0
        for teacher in self._teachers:
            # compute teacher output (no gradients flow into the teacher) #
            with torch.no_grad():
                teacher_logits = teacher(**teacher_inputs).logits

            # assert size #
            assert outputs_student.logits.size() == teacher_logits.size(), (
                "student and teacher logits must have the same shape"
            )

            # Soften probabilities and compute distillation loss; the T**2 factor
            # rescales the KL term after dividing the logits by the temperature.
            teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
            loss_logits = loss_logits + (
                loss_function(student_log_probs, teacher_probs) * (temperature ** 2)
            )

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
# Class names are read from the "labels" feature of the training split. Ids are
# kept as integers: label2id maps a class name to its integer index and id2label
# is the inverse mapping (string ids such as "0" would be stored verbatim in
# label2id and saved that way in the model config).
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# training args #
# The settings are grouped by concern and merged into one constructor call.

# core optimisation settings #
optimisation_options = dict(
    output_dir=repo_name,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
)

# logging & evaluation strategies: log, evaluate and checkpoint once per epoch #
logging_eval_options = dict(
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
)

# push to hub parameters #
hub_options = dict(
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
)

# distillation parameters #
distillation_options = dict(
    alpha=0.5,
    temperature=3.0,
)

training_args = DistillationTrainingArguments(
    **optimisation_options,
    **logging_eval_options,
    **hub_options,
    **distillation_options,
)

# data_collator #
# Built around the shared tokenizer used to tokenize the dataset.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Teacher models #
# Every checkpoint is loaded with the same classification-head settings, so they
# are collected once and reused for the two teachers and the student.
classifier_config = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

teacher_model1 = AutoModelForSequenceClassification.from_pretrained(
    teacher_id_1, **classifier_config
)

teacher_model2 = AutoModelForSequenceClassification.from_pretrained(
    teacher_id_2, **classifier_config
)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id, **classifier_config
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute the accuracy of the arg-max predictions for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class is the index
            of the largest logit along the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for these
        predictions and reference labels.
    """
    # Load the metric only on the first call and reuse it afterwards; this function
    # runs after every evaluation pass and the metric object does not change.
    metric_acc = getattr(compute_metrics, "_metric_acc", None)
    if metric_acc is None:
        metric_acc = evaluate.load("accuracy")
        compute_metrics._metric_acc = metric_acc

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [None]:
# Stop training once the monitored metric has not improved for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the student, both teachers, the data and the callback into the trainer.
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    teacher_model_1=teacher_model1,
    teacher_model_2=teacher_model2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT_and_distilBERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
# Runs the distillation training configured in training_args. Although
# num_train_epochs is 50, the early-stopping callback (patience 3) can end the
# run earlier; the logged run below stopped after epoch 5.

trainer.train()

***** Running training *****
  Num examples = 67349
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 210500
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.7317,1.588668,0.822248
2,1.0068,1.553031,0.832569
3,0.7961,1.707177,0.824541
4,0.6852,1.879401,0.817661
5,0.6039,1.869101,0.81422


***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-4210
Configuration saved in bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-4210/config.json
Model weights saved in bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-4210/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-8420
Configuration saved in bert-tiny-sst2-KD-BERT_and_distilBERT/checkpoint-8420/config.json
Model weights saved in bert-t

TrainOutput(global_step=21050, training_loss=0.9647214013323931, metrics={'train_runtime': 944.4831, 'train_samples_per_second': 3565.389, 'train_steps_per_second': 222.873, 'total_flos': 29373330335940.0, 'train_loss': 0.9647214013323931, 'epoch': 5.0})

In [None]:
## Because load_best_model_at_end=True, the best checkpoint (by validation accuracy) ##
## is the one loaded after training, so this evaluates the best model. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 1.5530306100845337,
 'eval_accuracy': 0.8325688073394495,
 'eval_runtime': 2.7084,
 'eval_samples_per_second': 321.963,
 'eval_steps_per_second': 20.307,
 'epoch': 5.0}

In [None]:
## Saving the model on the hugging face hub ##

# Create the model card for the best model, then upload everything to the hub.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The account name of the logged-in user forms the first part of the model URL.
whoami = HfApi().whoami()
username = whoami["name"]
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to bert-tiny-sst2-KD-BERT_and_distilBERT
Configuration saved in bert-tiny-sst2-KD-BERT_and_distilBERT/config.json
Model weights saved in bert-tiny-sst2-KD-BERT_and_distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-sst2-KD-BERT_and_distilBERT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/16.7M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664050573.007f0899e8e4.3013.2: 100%|##########| 369/369 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664049625.007f0899e8e4.3013.0:  52%|#####1    | 3.34k/6.44k [00:00<?, ?B…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT_and_distilBERT
   ef86e3b..2c90aba  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT_and_distilBERT
   ef86e3b..2c90aba  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-sst2-KD-BERT_and_distilBERT
