In [None]:
## BERT and Friends Project - Part 1 ##

In [None]:
## Installing Dependencies ##

!pip install datasets
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 6.5 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 53.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 52.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 48.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 68.2 MB/s 
Installing collected package

In this project, there are three important sections:

**Part 1:** We will fine-tune the BERT-base, DistilRoBERTa, DistilBERT, and BERT-tiny (student) models on the Emotion dataset.

**Part 2:** We will perform task-specific Knowledge Distillation using the Emotion dataset.

Student model: BERT-tiny (2 layers and 128 hidden dimension and 2 attention heads)

We use our fine-tuned models in part-1 as teachers. The Knowledge distillation is performed in three different settings:

1.   Only with BERT model
2.   Only with distilBERT model
3.   With the combination of two models - BERT and distilBERT model 

**Part 3:** We will analyze the model size and the processing time

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



# Load the 'emotion' dataset; the displayed summary shows train/validation/test splits
# with 'text' and 'label' features.
raw_datasets = load_dataset('emotion')
raw_datasets  # bare last expression so the notebook renders the split summary

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]



Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Class balance of the training split, keyed by integer label id.
Counter(raw_datasets['train']['label']) ## Getting the number of samples in each class ##

Counter({0: 4666, 3: 2159, 2: 1304, 5: 572, 4: 1937, 1: 5362})

In [None]:
raw_datasets['train'].features['label'].names ## Emotion class names in label-id order - Total: 6 ##

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [None]:
## Select the compute device: "cuda" when a GPU is visible to torch, otherwise "cpu" ##
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
## Logging into Huggingface hub ##

# Interactive login; the stored token is read back later via HfFolder.get_token()
# when the training arguments are built.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


## BERT-base Model finetuning

In [None]:
# Name for the repository on the huggingface hub #
# The same value is reused below as the Trainer output directory and as hub_model_id.

repo_name = "bert-base-emotion-intent"

In [None]:
checkpoint = "bert-base-uncased" ## Model used for fine-tuning ##
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "text" field of `example` with the notebook-level `tokenizer`.

    Truncation is enabled; padding is not requested here.
    Returns whatever the tokenizer call returns.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply tokenize_function to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator built from the same tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw 'text' column and rename 'label' to 'labels' in one chained expression.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # switch the output format to torch
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## create label2id, id2label dicts - to store id and label values ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Both mappings store the class id as a string.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
label2id

{'sadness': '0',
 'joy': '1',
 'love': '2',
 'anger': '3',
 'fear': '4',
 'surprise': '5'}

In [None]:
id2label

{'0': 'sadness',
 '1': 'joy',
 '2': 'love',
 '3': 'anger',
 '4': 'fear',
 '5': 'surprise'}

In [None]:
### Training the Model ###

In [None]:
# Build TrainingArguments with only one positional argument (the checkpoint name) so the
# default values can be inspected; the next cell replaces this object with the real configuration.
training_args = TrainingArguments(checkpoint)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_nam

In [None]:
### Training Arguments ###

training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15,  # upper bound; the early-stopping callback on the Trainer may stop sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # fp16 mixed precision is only supported on CUDA devices; fall back to full
    # precision so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,  # best checkpoint is chosen by the metric below
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),  # token stored by notebook_login()
    )

In [None]:
## Evaluation metric ##

_accuracy_metric = None  # loaded on first use, then reused for every evaluation round


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: a (logits, labels) pair as unpacked below.

    Returns:
        The dictionary produced by the accuracy metric's ``compute`` call.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        # Load the metric once instead of on every evaluation round.
        _accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

# Size the classification head from the dataset instead of a hard-coded 6, and store
# the class names in the model config. The notebook's label maps hold ids as strings,
# so they are converted to integers here.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
## Trainer ##

# Stop training once the monitored metric has not improved for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/bert-base-emotion-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
# Runs for at most num_train_epochs; evaluation and checkpointing happen once per epoch.

trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 15000
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4058,0.242148,0.9265
2,0.1541,0.195159,0.9385
3,0.1279,0.180702,0.9345
4,0.1069,0.229181,0.9365
5,0.081,0.331544,0.936


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to bert-base-emotion-intent/checkpoint-1000
Configuration saved in bert-base-emotion-intent/checkpoint-1000/config.json
Model weights saved in bert-base-emotion-intent/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert-base-emotion-intent/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-base-emotion-intent/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in bert-base-emotion-intent/tokenizer_config.json
Special tokens file saved in bert-base-emotion-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to bert-base-emotion-intent/checkpoint-2000
Configuration saved in bert-base-emotion-intent/checkpoint-2000/config.json
Model weights saved in bert-base-emotion-intent/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in bert-base-emotion-intent/checkpoint-2000/tokenizer_config.json
Special tokens file saved in bert-base-

TrainOutput(global_step=5000, training_loss=0.17514371795654296, metrics={'train_runtime': 616.9184, 'train_samples_per_second': 389.03, 'train_steps_per_second': 24.314, 'total_flos': 1926313438390848.0, 'train_loss': 0.17514371795654296, 'epoch': 5.0})

In [None]:
## load_best_model_at_end=True is set, so the best checkpoint is the one evaluated here ##

## Evaluate on the validation split (the Trainer's eval_dataset) ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.19515934586524963,
 'eval_accuracy': 0.9385,
 'eval_runtime': 3.3969,
 'eval_samples_per_second': 588.773,
 'eval_steps_per_second': 36.798,
 'epoch': 5.0}

In [None]:
## Evaluating the model on Test set ##
# The displayed PredictionOutput carries the raw predictions, the label ids and the metrics.

trainer.predict(tokenized_datasets["test"])

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


PredictionOutput(predictions=array([[ 7.875 , -1.599 , -1.466 , -1.662 , -1.41  , -1.855 ],
       [ 8.016 , -1.824 , -1.751 , -1.725 , -0.8384, -1.745 ],
       [ 7.88  , -1.586 , -1.627 , -1.551 , -1.328 , -1.881 ],
       ...,
       [-2.05  ,  7.895 , -0.6504, -1.79  , -2.34  , -1.723 ],
       [-2.059 ,  7.746 , -0.887 , -1.791 , -1.997 , -1.674 ],
       [-1.341 , -2.852 , -2.268 , -0.59  ,  4.324 ,  3.133 ]],
      dtype=float16), label_ids=array([0, 0, 0, ..., 1, 1, 4]), metrics={'test_loss': 0.1819990873336792, 'test_accuracy': 0.9295, 'test_runtime': 3.9898, 'test_samples_per_second': 501.278, 'test_steps_per_second': 31.33})

In [None]:
# Runs prediction on the test split again and prints only the metrics dictionary.
print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.1819990873336792, 'test_accuracy': 0.9295, 'test_runtime': 3.4129, 'test_samples_per_second': 586.006, 'test_steps_per_second': 36.625}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

# Run inference on the test split and keep the full result object.
test_result = trainer.predict(tokenized_datasets["test"])

## Predicted values: index of the largest score in each row ##
predicted_values = test_result.predictions.argmax(axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##
actual_values = test_result.label_ids
print('Actual values : ', actual_values)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 4]
Actual values :  [0 0 0 ... 1 1 4]


In [None]:
## Getting label id and names ##

target_names = [name for name in label2id]
labels = [int(label_id) for label_id in id2label]  ## ids are stored as strings; convert them to integers ##

In [None]:
## Getting the classification report using sklearn ##

from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy first, then the per-class precision / recall / f1 table.
test_accuracy = accuracy_score(actual_values, predicted_values)
print('Accuracy : ', test_accuracy)

report = classification_report(
    actual_values,
    predicted_values,
    labels=labels,
    target_names=target_names,
)
print(report)

Accuracy :  0.9295
              precision    recall  f1-score   support

     sadness       0.99      0.95      0.97       581
         joy       0.93      0.97      0.95       695
        love       0.88      0.71      0.79       159
       anger       0.91      0.95      0.93       275
        fear       0.87      0.96      0.91       224
    surprise       0.79      0.70      0.74        66

    accuracy                           0.93      2000
   macro avg       0.90      0.87      0.88      2000
weighted avg       0.93      0.93      0.93      2000



In [None]:
## Saving the model on the hugging face hub ##

# Create the model card, then push everything to the hub repository.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()

## Link for the model webpage ##

# The account name comes from the hub's whoami response.
whoami = HfApi().whoami()
username = whoami['name']
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to bert-base-emotion-intent
Configuration saved in bert-base-emotion-intent/config.json
Model weights saved in bert-base-emotion-intent/pytorch_model.bin
tokenizer config file saved in bert-base-emotion-intent/tokenizer_config.json
Special tokens file saved in bert-base-emotion-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664024756.6f1754e52420.68.0:  51%|#####     | 3.34k/6.58k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1664025425.6f1754e52420.68.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-emotion-intent
   6cf2d41..8e3942b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-emotion-intent
   6cf2d41..8e3942b  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-base-emotion-intent


In [None]:
### (Note: Restart the runtime and run the following to avoid printing all the logs) ###

## DistilRoBERTa model finetuning

In [None]:
## Loading libraries and dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch


# Reload the 'emotion' dataset for the DistilRoBERTa run (this section starts from a restarted runtime).
raw_datasets = load_dataset('emotion')
raw_datasets  # bare last expression so the notebook renders the split summary



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# The same value is reused below as the Trainer output directory and as hub_model_id.

repo_name = "distilroberta-emotion-intent"

In [None]:
checkpoint = "distilroberta-base" ## Model used for fine-tuning ## Ref: https://huggingface.co/distilroberta-base ##
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "text" field of `example` with the notebook-level `tokenizer`.

    Truncation is enabled; padding is not requested here.
    Returns whatever the tokenizer call returns.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply tokenize_function to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator built from the same tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the raw 'text' column and rename 'label' to 'labels' in one chained expression.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # switch the output format to torch
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## create label2id, id2label dicts - to store id and label values ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Both mappings store the class id as a string.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
### Training the Model ###

# The throwaway TrainingArguments(checkpoint) instance was removed: it was overwritten
# immediately and only served to configure a stray output directory.
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15,  # upper bound; the early-stopping callback on the Trainer may stop sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # fp16 mixed precision is only supported on CUDA devices; fall back to full
    # precision so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,  # best checkpoint is chosen by the metric below
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

_accuracy_metric = None  # loaded on first use, then reused for every evaluation round


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: a (logits, labels) pair as unpacked below.

    Returns:
        The dictionary produced by the accuracy metric's ``compute`` call.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        # Load the metric once instead of on every evaluation round.
        _accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

# Size the classification head from the dataset instead of a hard-coded 6, and store
# the class names in the model config. The notebook's label maps hold ids as strings,
# so they are converted to integers here.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

In [None]:
## Trainer ##

# Stop training once the monitored metric has not improved for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/distilroberta-emotion-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
# Runs for at most num_train_epochs; evaluation and checkpointing happen once per epoch.

trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 15000
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4501,0.243203,0.924
2,0.1947,0.164562,0.934
3,0.1497,0.138201,0.9405
4,0.1316,0.149622,0.9435
5,0.1145,0.168445,0.9385
6,0.1,0.234161,0.943
7,0.0828,0.280748,0.939


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to distilroberta-emotion-intent/checkpoint-1000
Configuration saved in distilroberta-emotion-intent/checkpoint-1000/config.json
Model weights saved in distilroberta-emotion-intent/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in distilroberta-emotion-intent/checkpoint-1000/tokenizer_config.json
Special tokens file saved in distilroberta-emotion-intent/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in distilroberta-emotion-intent/tokenizer_config.json
Special tokens file saved in distilroberta-emotion-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to distilroberta-emotion-intent/checkpoint-2000
Configuration saved in distilroberta-emotion-intent/checkpoint-2000/config.json
Model weights saved in distilroberta-emotion-intent/checkpoint-2000/pytorch_model.bin
tokenizer config file 

TrainOutput(global_step=7000, training_loss=0.17476078033447265, metrics={'train_runtime': 548.2405, 'train_samples_per_second': 437.764, 'train_steps_per_second': 27.36, 'total_flos': 1347542079453504.0, 'train_loss': 0.17476078033447265, 'epoch': 7.0})

In [None]:
## load_best_model_at_end=True is set, so the best checkpoint is the one evaluated here ##

## Evaluate on the validation split (the Trainer's eval_dataset) ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.14962154626846313,
 'eval_accuracy': 0.9435,
 'eval_runtime': 2.3643,
 'eval_samples_per_second': 845.915,
 'eval_steps_per_second': 52.87,
 'epoch': 7.0}

In [None]:
## Evaluating the model on Test set ##
# Only the metrics dictionary of the prediction result is printed.

print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.16849346458911896, 'test_accuracy': 0.9315, 'test_runtime': 2.764, 'test_samples_per_second': 723.582, 'test_steps_per_second': 45.224}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import accuracy_score, classification_report

# Run inference on the test split and keep the full result object.
test_result = trainer.predict(tokenized_datasets["test"])

## Predicted values: index of the largest score in each row ##
predicted_values = test_result.predictions.argmax(axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##
actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label id and names ##
target_names = [name for name in label2id]
labels = [int(label_id) for label_id in id2label]  ## ids are stored as strings; convert them to integers ##

## Getting the classification report using sklearn ##
test_accuracy = accuracy_score(actual_values, predicted_values)
print('Accuracy : ', test_accuracy)

report = classification_report(
    actual_values,
    predicted_values,
    labels=labels,
    target_names=target_names,
)
print(report)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 4]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.9315
              precision    recall  f1-score   support

     sadness       0.97      0.97      0.97       581
         joy       0.96      0.95      0.95       695
        love       0.84      0.86      0.85       159
       anger       0.92      0.93      0.93       275
        fear       0.88      0.88      0.88       224
    surprise       0.76      0.71      0.73        66

    accuracy                           0.93      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.93      0.93      0.93      2000



In [None]:
## Saving the model on the hugging face hub ##

# Create the model card, then push everything to the hub repository.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()

## Link for the model webpage ##

# The account name comes from the hub's whoami response.
whoami = HfApi().whoami()
username = whoami['name']
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to distilroberta-emotion-intent
Configuration saved in distilroberta-emotion-intent/config.json
Model weights saved in distilroberta-emotion-intent/pytorch_model.bin
tokenizer config file saved in distilroberta-emotion-intent/tokenizer_config.json
Special tokens file saved in distilroberta-emotion-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/313M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664026553.6f1754e52420.830.2: 100%|##########| 363/363 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664026002.6f1754e52420.830.0:  44%|####4     | 3.34k/7.54k [00:00<?, ?B/…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-emotion-intent
   336ea35..a620e56  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-emotion-intent
   336ea35..a620e56  main -> main



Model webpage link: https://huggingface.co/gokuls/distilroberta-emotion-intent


### Sanity Check

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilbert model have the same tokenization output ###

In [None]:
## Bert and distilbert ##

from transformers import AutoTokenizer

## Models ##
model_1 = "distilbert-base-uncased"
model_2 = "bert-base-uncased"

# tokenizer initialization (same order as the model names above) #
model_1_tokenizer, model_2_tokenizer = (
    AutoTokenizer.from_pretrained(model_name) for model_name in (model_1, model_2)
)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

# Print each tokenizer's encoding of the same sentence for a side-by-side comparison.
for sample_tokenizer in (model_1_tokenizer, model_2_tokenizer):
    print(sample_tokenizer(sample))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## They produce the same result except for the token_type_ids. The token_type_ids can be None and BERT still functions without any problem ##

Since our student model is a BERT-based model, it uses the same tokenizer as BERT. The DistilBERT tokenizer also produces the same `input_ids` and `attention_mask` as BERT, so it is suitable for transferring knowledge to our student model.

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilRoBERTa model have the same tokenization output ###

In [None]:
## Bert and distilRoBERTa ##

from transformers import AutoTokenizer

## Models ##
model_1 = "distilroberta-base"
model_2 = "bert-base-uncased"

# tokenizer initialization (same order as the model names above) #
model_1_tokenizer, model_2_tokenizer = (
    AutoTokenizer.from_pretrained(model_name) for model_name in (model_1, model_2)
)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

# Print each tokenizer's encoding of the same sentence for a side-by-side comparison.
for sample_tokenizer in (model_1_tokenizer, model_2_tokenizer):
    print(sample_tokenizer(sample))

## Produces different outputs ##

{'input_ids': [0, 47446, 19233, 6315, 4, 152, 16, 163, 18854, 8, 7837, 695, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## The tokenizers have created different outputs. This would be problematic while performing the knowledge distillation to our (BERT-based) student model ##

## DistilBERT model finetuning

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch


## Load the 'emotion' dataset (train / validation / test splits with 'text' and 'label' columns) ##
raw_datasets = load_dataset('emotion')
## Display the splits and their sizes ##
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# (also used below as the local output directory and as the prefix of the logging directory) #

repo_name = "distilbert-emotion-intent"

In [None]:
## Pre-trained checkpoint that is fine-tuned in this section, and the tokenizer loaded from the same checkpoint ##
checkpoint = "distilbert-base-uncased" ## Model used for fine-tuning ##
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    Truncation is requested; no padding argument is passed here.

    Args:
        example: Mapping with a ``"text"`` entry (a single string or a batch of strings).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

In [None]:
## Collator built from the same tokenizer (DataCollatorWithPadding); it is handed to the Trainer later ##
data_collator = DataCollatorWithPadding(tokenizer)
## Apply the tokenization to every split, processing the examples in batches ##
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

## Drop the unwanted raw 'text' column and rename "label" to "labels" in one chain ##
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)

## Switch the output format of the datasets to "torch" ##
tokenized_datasets.set_format("torch")

## Show the resulting columns of every split ##
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## Build the label2id / id2label lookup tables from the class names of the training split ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)

## The ids are kept as strings in both directions ##
label2id = {class_name: str(idx) for idx, class_name in enumerate(labels)}
id2label = {str(idx): class_name for idx, class_name in enumerate(labels)}

In [None]:
### Training the Model ###

## Hyper-parameters, logging / checkpointing policy and hub settings for this run. ##
## The arguments are built exactly once; the checkpoint name is not an output directory. ##
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15, ## Upper bound on epochs; early stopping on the Trainer may end the run sooner ##
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True, ## mixed-precision training ##
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy when training ends #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute accuracy from the trainer's evaluation output.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        The result of the "accuracy" metric's ``compute`` call (a dict holding
        the accuracy score).
    """
    # This function runs at every evaluation, so load the metric only on the
    # first call and cache it on the function object instead of reloading it.
    metric_acc = getattr(compute_metrics, "_metric_acc", None)
    if metric_acc is None:
        metric_acc = evaluate.load("accuracy")
        compute_metrics._metric_acc = metric_acc

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

## Sequence-classification head on top of the pre-trained checkpoint. ##
## The number of classes and the label names come from the dataset (num_labels, id2label, label2id) ##
## instead of a hard-coded 6, so the saved model config carries readable class names. ##
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    ## ids are stored as strings in the lookup tables; the config expects integer ids ##
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [None]:
## Trainer ##

## Wire together the model, the training arguments, the train / validation splits, ##
## the padding collator, the tokenizer and the accuracy metric ##
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    ## Early stopping with a patience of 3 ##
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/distilbert-emotion-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
## Runs for at most `num_train_epochs` (15) epochs with evaluation and checkpointing after every epoch; ##
## the EarlyStoppingCallback (patience = 3) registered on the trainer may end the run earlier ##

trainer.train() 

***** Running training *****
  Num examples = 16000
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 15000
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3939,0.212276,0.9285
2,0.1539,0.163507,0.936
3,0.1213,0.181984,0.931
4,0.1016,0.198869,0.937
5,0.0713,0.268061,0.935
6,0.0462,0.303394,0.9365
7,0.027,0.353803,0.937


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to distilbert-emotion-intent/checkpoint-1000
Configuration saved in distilbert-emotion-intent/checkpoint-1000/config.json
Model weights saved in distilbert-emotion-intent/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in distilbert-emotion-intent/checkpoint-1000/tokenizer_config.json
Special tokens file saved in distilbert-emotion-intent/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in distilbert-emotion-intent/tokenizer_config.json
Special tokens file saved in distilbert-emotion-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to distilbert-emotion-intent/checkpoint-2000
Configuration saved in distilbert-emotion-intent/checkpoint-2000/config.json
Model weights saved in distilbert-emotion-intent/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in distilbert-emotion-in

TrainOutput(global_step=7000, training_loss=0.13075483894348144, metrics={'train_runtime': 487.9239, 'train_samples_per_second': 491.88, 'train_steps_per_second': 30.742, 'total_flos': 1359113104157184.0, 'train_loss': 0.13075483894348144, 'epoch': 7.0})

In [None]:
## After training, the best model is used: `load_best_model_at_end=True` with ##
## `metric_for_best_model="accuracy"` is set in the training arguments. ##
## Now evaluating that best model on the validation split (the trainer's `eval_dataset`) ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.19886881113052368,
 'eval_accuracy': 0.937,
 'eval_runtime': 2.3311,
 'eval_samples_per_second': 857.957,
 'eval_steps_per_second': 53.622,
 'epoch': 7.0}

In [None]:
## Evaluating the model on the test set ##
## `.metrics` of the prediction output holds the test loss, test accuracy and runtime statistics ##

print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.23882879316806793, 'test_accuracy': 0.9255, 'test_runtime': 2.7807, 'test_samples_per_second': 719.255, 'test_steps_per_second': 44.953}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Run the model on the test split and keep the full prediction output ##
test_result = trainer.predict(tokenized_datasets["test"])

## Predicted class = index of the largest score; the actual class ids are returned alongside ##
predicted_values = np.argmax(test_result.predictions, axis=1)
actual_values = test_result.label_ids

print("Predicted values : ", predicted_values)
print('Actual values : ', actual_values)

## Class names and their integer ids (the ids are stored as strings, so convert them) ##
target_names = [class_name for class_name in label2id]
labels = [int(class_id) for class_id in id2label]

## Overall accuracy followed by the per-class report from sklearn ##
print('Accuracy : ', accuracy_score(actual_values, predicted_values))
print(
    classification_report(
        actual_values,
        predicted_values,
        labels=labels,
        target_names=target_names,
    )
)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 5]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.9255
              precision    recall  f1-score   support

     sadness       0.96      0.97      0.96       581
         joy       0.96      0.93      0.94       695
        love       0.75      0.91      0.82       159
       anger       0.95      0.92      0.94       275
        fear       0.93      0.85      0.89       224
    surprise       0.71      0.85      0.77        66

    accuracy                           0.93      2000
   macro avg       0.88      0.90      0.89      2000
weighted avg       0.93      0.93      0.93      2000



In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #
# (the model card is named after the hub repository id from the training arguments) #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the account name reported by the hub API to build the model URL #
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to distilbert-emotion-intent
Configuration saved in distilbert-emotion-intent/config.json
Model weights saved in distilbert-emotion-intent/pytorch_model.bin
tokenizer config file saved in distilbert-emotion-intent/tokenizer_config.json
Special tokens file saved in distilbert-emotion-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/255M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664027569.6f1754e52420.1493.2: 100%|##########| 363/363 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1664027078.6f1754e52420.1493.0:  45%|####5     | 3.34k/7.41k [00:00<?, ?B…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilbert-emotion-intent
   b3905f7..358c0e5  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilbert-emotion-intent
   b3905f7..358c0e5  main -> main



Model webpage link: https://huggingface.co/gokuls/distilbert-emotion-intent


## BERT-tiny model finetuning

In [None]:
### Student model - Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ###

Here, instead of performing knowledge distillation, we fine-tune the student model directly. This way, we are able to compare the performance of the directly fine-tuned student model with that of the student model trained by knowledge distillation with the help of a teacher model.

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch


## Load the 'emotion' dataset (train / validation / test splits with 'text' and 'label' columns) ##
raw_datasets = load_dataset('emotion')
## Display the splits and their sizes ##
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# (also used below as the local output directory and as the prefix of the logging directory) #

repo_name = "BERT-tiny-emotion-intent"

In [None]:
## Pre-trained student checkpoint that is fine-tuned in this section, and the tokenizer loaded from the same checkpoint ##
checkpoint = "google/bert_uncased_L-2_H-128_A-2" ## Model used for fine-tuning ## Ref: google/bert_uncased_L-2_H-128_A-2 ##
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example, max_length=512):
    """Tokenize the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    The tokenizer of this checkpoint has no predefined maximum length, so
    ``truncation=True`` alone falls back to *no* truncation (with a warning).
    Passing ``max_length`` explicitly makes the truncation effective.

    Args:
        example: Mapping with a ``"text"`` entry (a single string or a batch of strings).
        max_length: Maximum number of tokens kept per text. The default of 512
            is assumed to match the model's maximum input length
            (NOTE(review): confirm against the checkpoint's config).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    return tokenizer(example["text"], truncation=True, max_length=max_length)

In [None]:
## Collator built from the same tokenizer (DataCollatorWithPadding); it is handed to the Trainer later ##
data_collator = DataCollatorWithPadding(tokenizer)
## Apply the tokenization to every split, processing the examples in batches ##
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

## Drop the unwanted raw 'text' column and rename "label" to "labels" in one chain ##
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text'])
    .rename_column("label", "labels")
)

## Switch the output format of the datasets to "torch" ##
tokenized_datasets.set_format("torch")

## Show the resulting columns of every split ##
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## Build the label2id / id2label lookup tables from the class names of the training split ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)

## The ids are kept as strings in both directions ##
label2id = {class_name: str(idx) for idx, class_name in enumerate(labels)}
id2label = {str(idx): class_name for idx, class_name in enumerate(labels)}

In [None]:
### Training the Model ###

## Hyper-parameters, logging / checkpointing policy and hub settings for this run. ##
## The arguments are built exactly once; the checkpoint name is not an output directory. ##
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50, ## Upper bound on epochs; early stopping on the Trainer may end the run sooner ##
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True, ## mixed-precision training ##
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy when training ends #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute accuracy from the trainer's evaluation output.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        The result of the "accuracy" metric's ``compute`` call (a dict holding
        the accuracy score).
    """
    # This function runs at every evaluation, so load the metric only on the
    # first call and cache it on the function object instead of reloading it.
    metric_acc = getattr(compute_metrics, "_metric_acc", None)
    if metric_acc is None:
        metric_acc = evaluate.load("accuracy")
        compute_metrics._metric_acc = metric_acc

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

## Sequence-classification head on top of the pre-trained checkpoint. ##
## The number of classes and the label names come from the dataset (num_labels, id2label, label2id) ##
## instead of a hard-coded 6, so the saved model config carries readable class names. ##
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    ## ids are stored as strings in the lookup tables; the config expects integer ids ##
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Trainer ##

## Wire together the model, the training arguments, the train / validation splits, ##
## the padding collator, the tokenizer and the accuracy metric ##
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    ## Early stopping with a patience of 3 ##
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/BERT-tiny-emotion-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
## Runs for at most `num_train_epochs` (50) epochs with evaluation and checkpointing after every epoch; ##
## the EarlyStoppingCallback (patience = 3) registered on the trainer may end the run earlier ##

trainer.train() 

***** Running training *****
  Num examples = 16000
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2603,0.77662,0.7815
2,0.5919,0.411706,0.884
3,0.367,0.318837,0.8995
4,0.2848,0.292827,0.8985
5,0.2395,0.290591,0.898
6,0.2094,0.288702,0.907
7,0.1884,0.283141,0.9065
8,0.1603,0.30443,0.9065
9,0.1519,0.312385,0.9095
10,0.1291,0.325623,0.9065


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to BERT-tiny-emotion-intent/checkpoint-1000
Configuration saved in BERT-tiny-emotion-intent/checkpoint-1000/config.json
Model weights saved in BERT-tiny-emotion-intent/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BERT-tiny-emotion-intent/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BERT-tiny-emotion-intent/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in BERT-tiny-emotion-intent/tokenizer_config.json
Special tokens file saved in BERT-tiny-emotion-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to BERT-tiny-emotion-intent/checkpoint-2000
Configuration saved in BERT-tiny-emotion-intent/checkpoint-2000/config.json
Model weights saved in BERT-tiny-emotion-intent/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in BERT-tiny-emotion-intent/checkp

TrainOutput(global_step=15000, training_loss=0.27242815958658856, metrics={'train_runtime': 529.9718, 'train_samples_per_second': 1509.514, 'train_steps_per_second': 94.345, 'total_flos': 27965314573440.0, 'train_loss': 0.27242815958658856, 'epoch': 15.0})

In [None]:
## After training, the best model is used: `load_best_model_at_end=True` with ##
## `metric_for_best_model="accuracy"` is set in the training arguments. ##
## Now evaluating that best model on the validation split (the trainer's `eval_dataset`) ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.3619643747806549,
 'eval_accuracy': 0.91,
 'eval_runtime': 2.009,
 'eval_samples_per_second': 995.544,
 'eval_steps_per_second': 62.222,
 'epoch': 15.0}

In [None]:
## Evaluating the model on the test set ##
## `.metrics` of the prediction output holds the test loss, test accuracy and runtime statistics ##

print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16



Test results : 

 {'test_loss': 0.37736842036247253, 'test_accuracy': 0.902, 'test_runtime': 2.6062, 'test_samples_per_second': 767.404, 'test_steps_per_second': 47.963}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Run the model on the test split and keep the full prediction output ##
test_result = trainer.predict(tokenized_datasets["test"])

## Predicted class = index of the largest score; the actual class ids are returned alongside ##
predicted_values = np.argmax(test_result.predictions, axis=1)
actual_values = test_result.label_ids

print("Predicted values : ", predicted_values)
print('Actual values : ', actual_values)

## Class names and their integer ids (the ids are stored as strings, so convert them) ##
target_names = [class_name for class_name in label2id]
labels = [int(class_id) for class_id in id2label]

## Overall accuracy followed by the per-class report from sklearn ##
print('Accuracy : ', accuracy_score(actual_values, predicted_values))
print(
    classification_report(
        actual_values,
        predicted_values,
        labels=labels,
        target_names=target_names,
    )
)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


Predicted values :  [0 0 0 ... 1 1 5]
Actual values :  [0 0 0 ... 1 1 4]
Accuracy :  0.902
              precision    recall  f1-score   support

     sadness       0.94      0.94      0.94       581
         joy       0.92      0.93      0.92       695
        love       0.75      0.79      0.77       159
       anger       0.90      0.92      0.91       275
        fear       0.94      0.82      0.88       224
    surprise       0.67      0.85      0.75        66

    accuracy                           0.90      2000
   macro avg       0.86      0.87      0.86      2000
weighted avg       0.91      0.90      0.90      2000



In [None]:
## Saving the model on the hugging face hub ##

# save best model, metrics and create model card #
# (the model card is named after the hub repository id from the training arguments) #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# Look up the account name reported by the hub API to build the model URL #
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to BERT-tiny-emotion-intent
Configuration saved in BERT-tiny-emotion-intent/config.json
Model weights saved in BERT-tiny-emotion-intent/pytorch_model.bin
tokenizer config file saved in BERT-tiny-emotion-intent/tokenizer_config.json
Special tokens file saved in BERT-tiny-emotion-intent/special_tokens_map.json


Upload file logs/events.out.tfevents.1664028105.6f1754e52420.2150.0:  30%|##9       | 3.34k/11.2k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1664028637.6f1754e52420.2150.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-emotion-intent
   4b89649..296c4fd  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-emotion-intent
   4b89649..296c4fd  main -> main



Model webpage link: https://huggingface.co/gokuls/BERT-tiny-emotion-intent
