In [None]:
## BERT and Friends Project - Part 1 ##

In [None]:
## Installing Dependencies ##

!pip install datasets
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 5.0 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.1 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 64.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 75.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 69.4 MB/s 
Installing collected p

This project has three main sections:

**Part 1:** We will fine-tune the BERT-base, DistilRoBERTa, DistilBERT, and BERT-tiny (student) models on the Stanford Sentiment Treebank (SST-2) dataset.

**Part 2:** We will perform task-specific Knowledge Distillation using the sst-2 dataset.

Student model: BERT-tiny (2 layers and 128 hidden dimension and 2 attention heads)

We use our fine-tuned models in part-1 as teachers. The Knowledge distillation is performed in three different settings:

1.   Only with BERT model
2.   Only with distilBERT model
3.   With the combination of two models - BERT and distilBERT model 

**Part 3:** We will analyze the model size and the processing time

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



# Load the SST-2 task of the GLUE benchmark; the bare name on the last line
# displays the available splits and their sizes.
raw_datasets = load_dataset("glue","sst2")
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
Counter(raw_datasets['train']['label']) ## Number of training samples per class id ##

Counter({0: 29780, 1: 37569})

In [None]:
raw_datasets['train'].features['label'].names ## class names in the dataset - Total: 2 ##

['negative', 'positive']

In [None]:
## Checking if GPU is available ##
# Pick "cuda" when PyTorch can see a GPU, otherwise fall back to "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
## Logging into Huggingface hub ##

# Interactive login; the stored token is read back later (HfFolder.get_token())
# so the fine-tuned model can be pushed to the Hub.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


## BERT-base Model finetuning

In [None]:
# Name for the repository on the huggingface hub #
# (also used below as the Trainer's local output directory) #

repo_name = "bert-base-sst2"

In [None]:
checkpoint = "bert-base-uncased" ## Model used for fine-tuning ##
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``sentence`` field of a dataset example or batch.

    Args:
        example: Mapping with a ``"sentence"`` entry (a list of strings when
            called through ``map(..., batched=True)``).

    Returns:
        The output of the module-level ``tokenizer`` for those sentences,
        called with ``truncation=True``.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
# Tokenize every split in batches; no padding is applied here.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is left to the collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw-text and index columns, then expose the targets as "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # return torch tensors when indexed
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
## create label2id, id2label dicts - to store id and label values ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Ids are kept as ints (not strings): the model config expects
# id2label as {int: str} and label2id as {str: int}.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
label2id

{'negative': '0', 'positive': '1'}

In [None]:
id2label

{'0': 'negative', '1': 'positive'}

In [None]:
### Training the Model ###

In [None]:
# Build a TrainingArguments object with only its first argument set (the
# checkpoint name) in order to display the library defaults.
# NOTE(review): this object is replaced by the explicit configuration in the next cell.
training_args = TrainingArguments(checkpoint)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_nam

In [None]:
### Training Arguments ###

training_args = TrainingArguments(
    output_dir=repo_name,
    # optimisation #
    learning_rate=5e-5,
    num_train_epochs=15,  ## Epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    seed=33,
    # log, evaluate and checkpoint once per epoch #
    logging_strategy="epoch",
    logging_dir=f"{repo_name}/logs",
    report_to="tensorboard",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy at the end #
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # push to hub parameters #
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    hub_strategy="every_save",
    push_to_hub=True,
)

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute accuracy from the ``(logits, labels)`` pair given at evaluation.

    Args:
        eval_preds: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` for the arg-max
        class predictions against ``labels``.
    """
    accuracy_metric = evaluate.load("accuracy")
    raw_scores, references = eval_preds
    # Predicted class = index of the largest score along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

In [None]:
#### Model ####

# num_labels comes from the dataset's label list (computed above) rather than
# a hard-coded 2, so it always agrees with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
## Trainer ##

# Train on the "train" split and evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    ## For early stopping (patience = 3) ##
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/bert-base-sst2 into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# (step count, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 67349
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63150
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.243,0.373493,0.90367
2,0.1557,0.390712,0.892202
3,0.1248,0.368996,0.894495
4,0.1017,0.546599,0.883028


***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-base-sst2/checkpoint-4210
Configuration saved in bert-base-sst2/checkpoint-4210/config.json
Model weights saved in bert-base-sst2/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in bert-base-sst2/checkpoint-4210/tokenizer_config.json
Special tokens file saved in bert-base-sst2/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in bert-base-sst2/tokenizer_config.json
Special tokens file saved in bert-base-sst2/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to bert-base-sst2/checkpoint-8420
Configuration saved in bert-base-sst2/checkpoint-8420/config.json
Model weights saved in bert-base-sst2/checkpoint-8420/pytorch_model.bin
tokenizer config file saved in bert-base-sst2/checkpoint-8420/tokenizer_config.json
Special tokens file saved in bert-base-sst2/checkpoint-8420/special_tokens_map.j

TrainOutput(global_step=16840, training_loss=0.15630688520055486, metrics={'train_runtime': 1721.1866, 'train_samples_per_second': 586.941, 'train_steps_per_second': 36.69, 'total_flos': 4865624357902320.0, 'train_loss': 0.15630688520055486, 'epoch': 4.0})

In [None]:
## With load_best_model_at_end=True the best checkpoint is reloaded after training. ##
## Here that best model is evaluated on the validation split. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.37349286675453186,
 'eval_accuracy': 0.9036697247706422,
 'eval_runtime': 1.539,
 'eval_samples_per_second': 566.604,
 'eval_steps_per_second': 35.738,
 'epoch': 4.0}

In [None]:
## Saving the model on the hugging face hub ##

import os  # NOTE(review): not used in this cell - confirm it is needed elsewhere

# save best model, metrics and create model card #
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name to build the repository URL.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-base-sst2
Configuration saved in bert-base-sst2/config.json
Model weights saved in bert-base-sst2/pytorch_model.bin
tokenizer config file saved in bert-base-sst2/tokenizer_config.json
Special tokens file saved in bert-base-sst2/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file logs/events.out.tfevents.1664040586.88f748532117.3660.0:  56%|#####6    | 3.34k/5.95k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664042309.88f748532117.3660.2: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-sst2
   c64f1ec..1ae815b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-sst2
   c64f1ec..1ae815b  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-base-sst2


In [None]:
### (Note: Restart the runtime and run the following to avoid printing all the logs) ###

## DistilRoBERTa model finetuning

In [None]:
## Loading libraries and dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch


# Load the SST-2 task of the GLUE benchmark; the bare name on the last line
# displays the available splits and their sizes.
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# (also used below as the Trainer's local output directory) #

repo_name = "distilroberta-sst2"

In [None]:
checkpoint = "distilroberta-base" ## Model used for fine-tuning ## Ref: https://huggingface.co/distilroberta-base ##
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Run the module-level ``tokenizer`` over the ``sentence`` field.

    Args:
        example: Mapping with a ``"sentence"`` entry (a list of strings when
            called through ``map(..., batched=True)``).

    Returns:
        The tokenizer output for those sentences, produced with
        ``truncation=True``.
    """
    text = example["sentence"]
    tokenized = tokenizer(text, truncation=True)
    return tokenized

In [None]:
# Tokenize every split in batches; no padding is applied here.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is left to the collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw-text and index columns, then expose the targets as "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # return torch tensors when indexed
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
## create label2id, id2label dicts - to store id and label values ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Ids are kept as ints (not strings): the model config expects
# id2label as {int: str} and label2id as {str: int}.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
### Training the Model ###

# The throw-away `TrainingArguments(checkpoint)` that used to precede this call
# was overwritten immediately and has been removed.
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15, ## Epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies (all once per epoch) #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy at the end #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Score evaluation predictions with the ``accuracy`` metric.

    Args:
        eval_preds: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        Whatever the metric's ``compute`` returns for the arg-max class
        predictions compared against ``labels``.
    """
    scorer = evaluate.load("accuracy")
    scores, gold = eval_preds
    # The predicted class is the position of the highest score on the last axis.
    guessed = np.argmax(scores, axis=-1)
    return scorer.compute(predictions=guessed, references=gold)

In [None]:
#### Model ####

# num_labels comes from the dataset's label list (computed above) rather than
# a hard-coded 2, so it always agrees with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

In [None]:
## Trainer ##

# Train on the "train" split and evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    ## For early stopping (patience = 3) ##
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/distilroberta-sst2 into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# (step count, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 67349
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63150
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2928,0.349864,0.887615
2,0.1908,0.34507,0.908257
3,0.1489,0.343996,0.904817
4,0.119,0.496349,0.891055
5,0.0974,0.464518,0.888761


***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to distilroberta-sst2/checkpoint-4210
Configuration saved in distilroberta-sst2/checkpoint-4210/config.json
Model weights saved in distilroberta-sst2/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in distilroberta-sst2/checkpoint-4210/tokenizer_config.json
Special tokens file saved in distilroberta-sst2/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in distilroberta-sst2/tokenizer_config.json
Special tokens file saved in distilroberta-sst2/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to distilroberta-sst2/checkpoint-8420
Configuration saved in distilroberta-sst2/checkpoint-8420/config.json
Model weights saved in distilroberta-sst2/checkpoint-8420/pytorch_model.bin
tokenizer config file saved in distilroberta-sst2/checkpoint-8420/tokenizer_config.json
Special tokens file saved in distilr

TrainOutput(global_step=21050, training_loss=0.16976502189726841, metrics={'train_runtime': 1425.419, 'train_samples_per_second': 708.728, 'train_steps_per_second': 44.303, 'total_flos': 3124087844618184.0, 'train_loss': 0.16976502189726841, 'epoch': 5.0})

In [None]:
## With load_best_model_at_end=True the best checkpoint is reloaded after training. ##
## Here that best model is evaluated on the validation split. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.34507009387016296,
 'eval_accuracy': 0.908256880733945,
 'eval_runtime': 1.1609,
 'eval_samples_per_second': 751.143,
 'eval_steps_per_second': 47.377,
 'epoch': 5.0}

In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name to build the repository URL.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to distilroberta-sst2
Configuration saved in distilroberta-sst2/config.json
Model weights saved in distilroberta-sst2/pytorch_model.bin
tokenizer config file saved in distilroberta-sst2/tokenizer_config.json
Special tokens file saved in distilroberta-sst2/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/313M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664038605.88f748532117.2842.0:  52%|#####1    | 3.34k/6.45k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664040032.88f748532117.2842.2: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-sst2
   10d6100..fc2907c  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-sst2
   10d6100..fc2907c  main -> main



Model webpage link: https://huggingface.co/gokuls/distilroberta-sst2


### Sanity Check

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilbert model have the same tokenization output ###

In [None]:
## Bert and distilbert ##

from transformers import AutoTokenizer

## Models ##

model_1 = "distilbert-base-uncased"
model_2 = "bert-base-uncased"

# tokenizer initialization #
model_1_tokenizer = AutoTokenizer.from_pretrained(model_1)
model_2_tokenizer = AutoTokenizer.from_pretrained(model_2)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

model_1_encoding = model_1_tokenizer(sample)
model_2_encoding = model_2_tokenizer(sample)

print(model_1_encoding)
print(model_2_encoding)

# Make the sanity check explicit instead of relying on reading the printed
# dicts: both tokenizers must yield the same token ids for the sample.
assert model_1_encoding["input_ids"] == model_2_encoding["input_ids"], (
    f"{model_1} and {model_2} tokenizers produced different input_ids"
)

{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## They produce same result except the token_type_ids. The token_type_ids can be none and BERT functions without any problem ##

Since our student model is a BERT-based model, it uses the same tokenizer as BERT. The DistilBERT tokenizer also produces the same `input_ids` as BERT for this sample, so DistilBERT can be used for transferring knowledge to our student model.

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilRoBERTa model have the same tokenization output ###

In [None]:
## Bert and distilRoBERTa ##

from transformers import AutoTokenizer

## Models ##

model_1 = "distilroberta-base"
model_2 = "bert-base-uncased"

# tokenizer initialization #
model_1_tokenizer = AutoTokenizer.from_pretrained(model_1)
model_2_tokenizer = AutoTokenizer.from_pretrained(model_2)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

model_1_encoding = model_1_tokenizer(sample)
model_2_encoding = model_2_tokenizer(sample)

print(model_1_encoding)
print(model_2_encoding)

# State the comparison result explicitly rather than leaving it to a visual
# diff of the two printed dicts.
print("Identical input_ids:", model_1_encoding["input_ids"] == model_2_encoding["input_ids"])

## Produces different outputs ##

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'input_ids': [0, 47446, 19233, 6315, 4, 152, 16, 163, 18854, 8, 7837, 695, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## The tokenizers produce different outputs. This would be problematic when performing knowledge distillation to our (BERT-based) student model ##

## BERT-tiny model finetuning

In [None]:
### Student model - Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ###

Here, instead of performing knowledge distillation, we fine-tune the student model directly. This way, we are able to compare the performance of the directly fine-tuned student model with that of the student model trained by knowledge distillation with the help of a teacher model.

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch


# Load the SST-2 task of the GLUE benchmark; the bare name on the last line
# displays the available splits and their sizes.
raw_datasets = load_dataset("glue","sst2")
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# (also used below as the Trainer's local output directory) #

repo_name = "BERT-tiny-sst2"

In [None]:
checkpoint = "google/bert_uncased_L-2_H-128_A-2" ## Model used for fine-tuning ## Ref: google/bert_uncased_L-2_H-128_A-2 ##
# Load the tokenizer that belongs to the same checkpoint as the model.
# NOTE(review): the warning printed during tokenization says this tokenizer has
# no predefined maximum length.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
## Tokenization ##

def tokenize_function(example, max_length=512):
    """Tokenize the ``sentence`` field, truncating to ``max_length`` tokens.

    The tokenizer of this checkpoint has no predefined maximum length, so
    ``truncation=True`` alone falls back to no truncation (see the warning the
    original call emitted). Passing ``max_length`` explicitly makes the
    truncation effective.

    Args:
        example: Mapping with a ``"sentence"`` entry (a list of strings when
            called through ``map(..., batched=True)``).
        max_length: Maximum number of tokens kept per sentence. The default of
            512 is assumed to match the model's position-embedding limit;
            confirm against ``model.config.max_position_embeddings``.

    Returns:
        The output of the module-level ``tokenizer`` for those sentences.
    """
    return tokenizer(example["sentence"], truncation=True, max_length=max_length)

In [None]:
# Tokenize every split in batches; no padding is applied here.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is left to the collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
## Data Pre-processing ##

# Drop the raw-text and index columns, then expose the targets as "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # return torch tensors when indexed
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
## create label2id, id2label dicts - to store id and label values ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Ids are kept as ints (not strings): the model config expects
# id2label as {int: str} and label2id as {str: int}.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
### Training the Model ###

# The throw-away `TrainingArguments(checkpoint)` that used to precede this call
# was overwritten immediately and has been removed.
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50, ## Epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies (all once per epoch) #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy at the end #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Return the ``accuracy`` metric for a batch of evaluation predictions.

    Args:
        eval_preds: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        The output of the metric's ``compute`` for the arg-max class
        predictions checked against ``labels``.
    """
    acc = evaluate.load("accuracy")
    model_logits, true_labels = eval_preds
    # Highest-scoring index on the last axis is taken as the predicted class.
    predicted = np.argmax(model_logits, axis=-1)
    return acc.compute(predictions=predicted, references=true_labels)

In [None]:
#### Model ####

# num_labels comes from the dataset's label list (computed above) rather than
# a hard-coded 2, so it always agrees with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Trainer ##

# Train on the "train" split and evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    ## For early stopping (patience = 3) ##
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

/content/BERT-tiny-sst2 is already a clone of https://huggingface.co/gokuls/BERT-tiny-sst2. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# (step count, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 67349
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 210500
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3914,0.438287,0.821101
2,0.2577,0.442219,0.837156
3,0.212,0.545979,0.808486
4,0.1862,0.588533,0.824541
5,0.1671,0.715943,0.809633


***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to BERT-tiny-sst2/checkpoint-4210
Configuration saved in BERT-tiny-sst2/checkpoint-4210/config.json
Model weights saved in BERT-tiny-sst2/checkpoint-4210/pytorch_model.bin
tokenizer config file saved in BERT-tiny-sst2/checkpoint-4210/tokenizer_config.json
Special tokens file saved in BERT-tiny-sst2/checkpoint-4210/special_tokens_map.json
tokenizer config file saved in BERT-tiny-sst2/tokenizer_config.json
Special tokens file saved in BERT-tiny-sst2/special_tokens_map.json
Deleting older checkpoint [BERT-tiny-sst2/checkpoint-8420] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16
Saving model checkpoint to BERT-tiny-sst2/checkpoint-8420
Configuration saved in BERT-tiny-sst2/checkpoint-8420/config.json
Model weights saved in BERT-tiny-sst2/checkpoint-8420/pytorch_model.bin
tokenizer config file saved in BERT-tiny-sst2/checkpoint-8420/tokenizer_conf

TrainOutput(global_step=21050, training_loss=0.24288795453069328, metrics={'train_runtime': 466.2425, 'train_samples_per_second': 7222.53, 'train_steps_per_second': 451.482, 'total_flos': 29373330335940.0, 'train_loss': 0.24288795453069328, 'epoch': 5.0})

In [None]:
## With load_best_model_at_end=True the best checkpoint is reloaded after training. ##
## Here that best model is evaluated on the validation split. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.44221875071525574,
 'eval_accuracy': 0.8371559633027523,
 'eval_runtime': 1.1715,
 'eval_samples_per_second': 744.346,
 'eval_steps_per_second': 46.948,
 'epoch': 5.0}

In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the logged-in account name to build the repository URL.
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to BERT-tiny-sst2
Configuration saved in BERT-tiny-sst2/config.json
Model weights saved in BERT-tiny-sst2/pytorch_model.bin
tokenizer config file saved in BERT-tiny-sst2/tokenizer_config.json
Special tokens file saved in BERT-tiny-sst2/special_tokens_map.json


Upload file logs/events.out.tfevents.1664038010.88f748532117.1345.0:  53%|#####2    | 3.34k/6.35k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664038478.88f748532117.1345.2: 100%|##########| 698/698 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-sst2
   e6b21a3..33bed21  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-sst2
   e6b21a3..33bed21  main -> main



Model webpage link: https://huggingface.co/gokuls/BERT-tiny-sst2
