In [None]:
## BERT and Friends Project - Part 1 ##

In [None]:
## Installing Dependencies ##

!pip install datasets
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 5.1 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 70.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 56.6 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 68.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 56.5 MB/s 
Installing collected p

This project has three main parts:

**Part 1:** We fine-tune BERT-base, DistilRoBERTa, DistilBERT, and BERT-tiny (the student model) on the Amazon MASSIVE dataset.

**Part 2:** We will perform task-specific Knowledge Distillation using the Amazon Massive dataset.

Student model: BERT-tiny (2 layers and 128 hidden dimension and 2 attention heads)

We use the models fine-tuned in Part 1 as teachers. Knowledge distillation is performed in three different settings:

1.   Only with BERT model
2.   Only with distilBERT model
3.   With a combination of both models (BERT and DistilBERT)

**Part 3:** We will analyze the model size and the processing time

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



# Load the 'en-US' configuration of the MASSIVE dataset from the Hugging Face Hub.
raw_datasets = load_dataset('AmazonScience/massive', 'en-US') ## English (en-US) data only ##
# Display the available splits, their columns and their sizes.
raw_datasets

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading and preparing dataset massive/en-US to /root/.cache/huggingface/datasets/AmazonScience___massive/en-US/1.0.0/c06e96faea378f5cbba9f6ba50b7ea33fb3f91d4256bfa26cd23a54421d154e5...


Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset massive downloaded and prepared to /root/.cache/huggingface/datasets/AmazonScience___massive/en-US/1.0.0/c06e96faea378f5cbba9f6ba50b7ea33fb3f91d4256bfa26cd23a54421d154e5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2974
    })
})

In [None]:
# Count the training examples per intent id to inspect the class balance.
Counter(raw_datasets['train']['intent']) ## Number of samples in each class ##

Counter({48: 182,
         46: 110,
         1: 125,
         40: 153,
         31: 76,
         34: 93,
         32: 566,
         45: 639,
         12: 555,
         5: 25,
         0: 350,
         38: 52,
         3: 122,
         52: 78,
         23: 130,
         22: 503,
         43: 113,
         57: 154,
         18: 76,
         16: 135,
         13: 573,
         28: 51,
         25: 72,
         7: 14,
         29: 18,
         56: 124,
         14: 110,
         24: 48,
         41: 22,
         8: 52,
         35: 52,
         4: 152,
         36: 283,
         19: 173,
         49: 544,
         50: 810,
         20: 150,
         58: 193,
         27: 108,
         42: 227,
         33: 354,
         55: 70,
         59: 198,
         51: 112,
         2: 127,
         6: 190,
         44: 418,
         11: 117,
         37: 4,
         26: 267,
         30: 312,
         53: 164,
         9: 207,
         17: 127,
         21: 177,
         54: 100,
         39: 78,
  

In [None]:
# List the intent class names declared by the dataset's 'intent' feature.
raw_datasets['train'].features['intent'].names ## Intents in the dataset - total: 60 ##

['datetime_query',
 'iot_hue_lightchange',
 'transport_ticket',
 'takeaway_query',
 'qa_stock',
 'general_greet',
 'recommendation_events',
 'music_dislikeness',
 'iot_wemo_off',
 'cooking_recipe',
 'qa_currency',
 'transport_traffic',
 'general_quirky',
 'weather_query',
 'audio_volume_up',
 'email_addcontact',
 'takeaway_order',
 'email_querycontact',
 'iot_hue_lightup',
 'recommendation_locations',
 'play_audiobook',
 'lists_createoradd',
 'news_query',
 'alarm_query',
 'iot_wemo_on',
 'general_joke',
 'qa_definition',
 'social_query',
 'music_settings',
 'audio_volume_other',
 'calendar_remove',
 'iot_hue_lightdim',
 'calendar_query',
 'email_sendemail',
 'iot_cleaning',
 'audio_volume_down',
 'play_radio',
 'cooking_query',
 'datetime_convert',
 'qa_maths',
 'iot_hue_lightoff',
 'iot_hue_lighton',
 'transport_query',
 'music_likeness',
 'email_query',
 'play_music',
 'audio_volume_mute',
 'social_post',
 'alarm_set',
 'qa_factoid',
 'calendar_set',
 'play_game',
 'alarm_remove',
 

In [None]:
## Report which device type is available (GPU if CUDA is usable, else CPU) ##
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
## Logging into Huggingface hub ##

# Interactive login; the training-arguments cells later read the stored token
# through HfFolder.get_token() in order to push the model to the Hub.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


## BERT-base Model finetuning

In [None]:
# Name for the repository on the huggingface hub #
# The same value is reused below as the local output directory, the prefix of
# the logging directory and the hub model id.

repo_name = "bert-base-Massive-intent"

In [None]:
checkpoint = "bert-base-uncased" ## Pretrained model used for fine-tuning ##
# Tokenizer loaded from the same checkpoint; it is used for tokenization, by
# the padding collator and is handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``"utt"`` field of ``example`` with the notebook-level tokenizer.

    Truncation is enabled; no padding argument is passed here (padding is
    left to the data collator created in the next cell).
    """
    utterances = example["utt"]
    encoded = tokenizer(utterances, truncation=True)
    return encoded

In [None]:
# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply the tokenizer to every split, processing the examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)



  0%|          | 0/3 [00:00<?, ?ba/s]



In [None]:
## Data pre-processing: keep only the model inputs and the target column ##

tokenized_datasets = (
    tokenized_datasets
    # Drop every column that is neither a tokenizer output nor the intent label.
    .remove_columns(
        [
            'id',
            'locale',
            'partition',
            'scenario',
            'annot_utt',
            'utt',
            'worker_id',
            'slot_method',
            'judgments',
        ]
    )
    # The target column is named "labels" from here on.
    .rename_column("intent", "labels")
)
# Return PyTorch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
## Check the range of the class ids in the training split ##

train_labels = tokenized_datasets["train"]['labels']
print('Lables (minimum): ', min(train_labels))
print('Lables (maximum): ', max(train_labels))

Lables (minimum):  tensor(0)
Lables (maximum):  tensor(59)


There are 60 classes (label 0 to label 59).

In [None]:
## Create the label2id / id2label lookup tables for the intent classes ##

# Intent names in class-id order, as declared by the dataset's "labels" feature.
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)

# Ids are kept as integers so that they line up with the integer class ids
# stored in the "labels" column and with the argmax of the model logits
# (string ids such as '0' would not match an integer lookup).
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Display the intent-name -> class-id mapping built in the previous cell.
label2id

{'datetime_query': '0',
 'iot_hue_lightchange': '1',
 'transport_ticket': '2',
 'takeaway_query': '3',
 'qa_stock': '4',
 'general_greet': '5',
 'recommendation_events': '6',
 'music_dislikeness': '7',
 'iot_wemo_off': '8',
 'cooking_recipe': '9',
 'qa_currency': '10',
 'transport_traffic': '11',
 'general_quirky': '12',
 'weather_query': '13',
 'audio_volume_up': '14',
 'email_addcontact': '15',
 'takeaway_order': '16',
 'email_querycontact': '17',
 'iot_hue_lightup': '18',
 'recommendation_locations': '19',
 'play_audiobook': '20',
 'lists_createoradd': '21',
 'news_query': '22',
 'alarm_query': '23',
 'iot_wemo_on': '24',
 'general_joke': '25',
 'qa_definition': '26',
 'social_query': '27',
 'music_settings': '28',
 'audio_volume_other': '29',
 'calendar_remove': '30',
 'iot_hue_lightdim': '31',
 'calendar_query': '32',
 'email_sendemail': '33',
 'iot_cleaning': '34',
 'audio_volume_down': '35',
 'play_radio': '36',
 'cooking_query': '37',
 'datetime_convert': '38',
 'qa_maths': '39

In [None]:
# Display the class-id -> intent-name mapping built above.
id2label

{'0': 'datetime_query',
 '1': 'iot_hue_lightchange',
 '2': 'transport_ticket',
 '3': 'takeaway_query',
 '4': 'qa_stock',
 '5': 'general_greet',
 '6': 'recommendation_events',
 '7': 'music_dislikeness',
 '8': 'iot_wemo_off',
 '9': 'cooking_recipe',
 '10': 'qa_currency',
 '11': 'transport_traffic',
 '12': 'general_quirky',
 '13': 'weather_query',
 '14': 'audio_volume_up',
 '15': 'email_addcontact',
 '16': 'takeaway_order',
 '17': 'email_querycontact',
 '18': 'iot_hue_lightup',
 '19': 'recommendation_locations',
 '20': 'play_audiobook',
 '21': 'lists_createoradd',
 '22': 'news_query',
 '23': 'alarm_query',
 '24': 'iot_wemo_on',
 '25': 'general_joke',
 '26': 'qa_definition',
 '27': 'social_query',
 '28': 'music_settings',
 '29': 'audio_volume_other',
 '30': 'calendar_remove',
 '31': 'iot_hue_lightdim',
 '32': 'calendar_query',
 '33': 'email_sendemail',
 '34': 'iot_cleaning',
 '35': 'audio_volume_down',
 '36': 'play_radio',
 '37': 'cooking_query',
 '38': 'datetime_convert',
 '39': 'qa_maths

In [None]:
### Training the Model ###

In [None]:
# Build a TrainingArguments object only to inspect its default settings.
# NOTE(review): `checkpoint` is passed as the first positional argument, which
# appears to be `output_dir` — confirm. This object is replaced by the fully
# configured TrainingArguments in the following cell.
training_args = TrainingArguments(checkpoint)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_nam

In [None]:
### Training Arguments ###

training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15,  # upper bound on the number of epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed-precision training needs a CUDA device; fall back to full
    # precision instead of failing when the runtime has no GPU.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Track validation accuracy and reload the best checkpoint after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
)

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. It is unpacked into exactly
            two values; ``logits`` is arg-maxed over its last axis to obtain
            the predicted class ids.

    Returns:
        The result of the ``evaluate`` "accuracy" metric's ``compute`` call
        for the predicted and reference class ids.
    """
    # Load the metric once and cache it on the function object so that
    # repeated evaluation passes do not call evaluate.load() again.
    metric_acc = getattr(compute_metrics, "_accuracy_metric", None)
    if metric_acc is None:
        metric_acc = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric_acc

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

# Size the classification head from the dataset's label list (60 intents)
# and attach the id <-> intent-name mappings to the model config, so the saved
# and pushed model reports intent names instead of generic label placeholders.
# Ids are normalised to integers in case the lookup tables store them as strings.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
## Trainer ##

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/bert-base-Massive-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Fine-tune for up to `num_train_epochs` epochs; the early-stopping callback
# registered on the Trainer may end the run sooner.
trainer.train()

***** Running training *****
  Num examples = 11514
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10800
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6844,0.718994,0.838662
2,0.4713,0.544889,0.872602
3,0.2459,0.589315,0.878997
4,0.1469,0.663071,0.879488
5,0.0874,0.67066,0.885883
6,0.0507,0.718863,0.884407
7,0.0344,0.747968,0.885391
8,0.0225,0.795615,0.884407


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to bert-base-Massive-intent/checkpoint-720
Configuration saved in bert-base-Massive-intent/checkpoint-720/config.json
Model weights saved in bert-base-Massive-intent/checkpoint-720/pytorch_model.bin
tokenizer config file saved in bert-base-Massive-intent/checkpoint-720/tokenizer_config.json
Special tokens file saved in bert-base-Massive-intent/checkpoint-720/special_tokens_map.json
tokenizer config file saved in bert-base-Massive-intent/tokenizer_config.json
Special tokens file saved in bert-base-Massive-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-base-Massive-intent/checkpoint-1440
Configuration saved in bert-base-Massive-intent/checkpoint-1440/config.json
Model weights saved in bert-base-Massive-intent/checkpoint-1440/pytorch_model.bin
tokenizer config file saved in bert-base-Massive-intent/checkpoint-1440/tokenizer_config.json
Special tokens file saved in bert-base-Massi

TrainOutput(global_step=5760, training_loss=0.3429386125670539, metrics={'train_runtime': 682.535, 'train_samples_per_second': 253.042, 'train_steps_per_second': 15.823, 'total_flos': 828982577323872.0, 'train_loss': 0.3429386125670539, 'epoch': 8.0})

In [None]:
## The training arguments set load_best_model_at_end=True, so the best model is evaluated here ##

## Evaluate ##

# Evaluates on the validation split that was passed to the Trainer as `eval_dataset`.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 0.6706598997116089,
 'eval_accuracy': 0.8858829316281358,
 'eval_runtime': 4.396,
 'eval_samples_per_second': 462.461,
 'eval_steps_per_second': 29.117,
 'epoch': 8.0}

In [None]:
## Evaluating the model on the test set ##

# The displayed result holds the raw logits, the reference label ids and the test metrics.
trainer.predict(tokenized_datasets["test"])

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


PredictionOutput(predictions=array([[ 2.143  , -1.276  ,  2.045  , ..., -1.513  , -0.8003 , -1.002  ],
       [-4.01   , -0.592  , -1.566  , ..., -0.2673 , -0.467  , -0.3457 ],
       [-2.73   ,  9.61   , -1.881  , ...,  1.773  , -0.9204 , -1.41   ],
       ...,
       [ 0.615  ,  0.3394 , -0.699  , ..., -1.3125 ,  0.537  , -0.817  ],
       [ 0.3452 , -0.4258 , -0.1271 , ..., -1.301  , -0.04776, -0.37   ],
       [ 0.6724 ,  0.382  , -0.8003 , ..., -1.275  ,  0.3098 , -0.842  ]],
      dtype=float16), label_ids=array([48, 46,  1, ..., 44, 44, 44]), metrics={'test_loss': 0.6849051117897034, 'test_accuracy': 0.8833221250840618, 'test_runtime': 9.2358, 'test_samples_per_second': 322.007, 'test_steps_per_second': 20.139})

In [None]:
# Run prediction on the test split and print only its metrics dictionary.
test_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 0.6849051117897034, 'test_accuracy': 0.8833221250840618, 'test_runtime': 6.2739, 'test_samples_per_second': 474.029, 'test_steps_per_second': 29.647}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and F1) on the test set ##

test_result = trainer.predict(tokenized_datasets["test"])

## Predicted values: index of the largest logit in each row ##
predicted_values = test_result.predictions.argmax(axis=1)
print("Predicted values : ", predicted_values)

## Actual values: the reference label ids returned with the predictions ##
actual_values = test_result.label_ids
print('Actual values : ', actual_values)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46  1 ... 44 33 44]
Actual values :  [48 46  1 ... 44 44 44]


In [None]:
## Intent names and integer class ids for the classification report ##

# Iterating a dict yields its keys, in insertion (class-id) order.
target_names = [intent_name for intent_name in label2id]
# `labels` is rebound here: from now on it holds the integer class ids.
labels = [int(label_id) for label_id in id2label]

In [None]:
## Accuracy and per-intent precision / recall / F1 on the test set (scikit-learn) ##

from sklearn.metrics import accuracy_score, classification_report

print('Accuracy : ', accuracy_score(y_true=actual_values, y_pred=predicted_values))

# One report row per intent, in the order given by `labels` / `target_names`.
report_text = classification_report(
    y_true=actual_values,
    y_pred=predicted_values,
    labels=labels,
    target_names=target_names,
)
print(report_text)

Accuracy :  0.8833221250840618
                          precision    recall  f1-score   support

          datetime_query       0.90      0.99      0.94        88
     iot_hue_lightchange       0.92      0.97      0.95        36
        transport_ticket       1.00      0.97      0.99        35
          takeaway_query       0.94      0.83      0.88        35
                qa_stock       0.86      0.96      0.91        26
           general_greet       0.00      0.00      0.00         1
   recommendation_events       0.79      0.77      0.78        43
       music_dislikeness       1.00      0.75      0.86         4
            iot_wemo_off       1.00      0.94      0.97        18
          cooking_recipe       0.96      0.89      0.92        72
             qa_currency       0.95      0.97      0.96        39
       transport_traffic       0.75      1.00      0.86        15
          general_quirky       0.71      0.56      0.62       169
           weather_query       0.95      0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Publishing the fine-tuned model to the Hugging Face Hub ##

# Create the model card, then push the model to the Hub repository.
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The account name of the logged-in user forms the first part of the model URL.
username = HfApi().whoami()['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-base-Massive-intent
Configuration saved in bert-base-Massive-intent/config.json
Model weights saved in bert-base-Massive-intent/pytorch_model.bin
tokenizer config file saved in bert-base-Massive-intent/tokenizer_config.json
Special tokens file saved in bert-base-Massive-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1663940314.2ea13cb2f63d.563.0:  33%|###2      | 3.34k/10.2k [00:00<?, ?B/…

Upload file logs/events.out.tfevents.1663941056.2ea13cb2f63d.563.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-Massive-intent
   633a693..d53c467  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-base-Massive-intent
   633a693..d53c467  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-base-Massive-intent


In [None]:
### (Note: Restart the runtime before running the following cells to avoid printing all the logs) ###

## DistilRoBERTa model finetuning

In [None]:
## Loading libraries and dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



# Load the 'en-US' configuration of the MASSIVE dataset again (the runtime was restarted).
raw_datasets = load_dataset('AmazonScience/massive', 'en-US') ## English (en-US) data only ##
# Display the available splits, their columns and their sizes.
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2974
    })
})

In [None]:
# Name for the repository on the huggingface hub #
# The same value is reused below as the local output directory, the prefix of
# the logging directory and the hub model id.

repo_name = "distilroberta-base-Massive-intent"

In [None]:
checkpoint = "distilroberta-base" ## Pretrained model used for fine-tuning ## Ref: https://huggingface.co/distilroberta-base ##
# Tokenizer loaded from the same checkpoint; it is used for tokenization, by
# the padding collator and is handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``"utt"`` field of ``example`` with the notebook-level tokenizer.

    Truncation is enabled; no padding argument is passed here (padding is
    left to the data collator created in the next cell).
    """
    utterances = example["utt"]
    encoded = tokenizer(utterances, truncation=True)
    return encoded

In [None]:
# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply the tokenizer to every split, processing the examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data pre-processing: keep only the model inputs and the target column ##

tokenized_datasets = (
    tokenized_datasets
    # Drop every column that is neither a tokenizer output nor the intent label.
    .remove_columns(
        [
            'id',
            'locale',
            'partition',
            'scenario',
            'annot_utt',
            'utt',
            'worker_id',
            'slot_method',
            'judgments',
        ]
    )
    # The target column is named "labels" from here on.
    .rename_column("intent", "labels")
)
# Return PyTorch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
## Create the label2id / id2label lookup tables for the intent classes ##

# Intent names in class-id order, as declared by the dataset's "labels" feature.
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)

# Ids are kept as integers so that they line up with the integer class ids
# stored in the "labels" column and with the argmax of the model logits
# (string ids such as '0' would not match an integer lookup).
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
### Training the Model ###

# A single, fully configured TrainingArguments object (the throw-away
# default-valued instance that used to be created first was never used).
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15,  # upper bound on the number of epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed-precision training needs a CUDA device; fall back to full
    # precision instead of failing when the runtime has no GPU.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Track validation accuracy and reload the best checkpoint after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
)

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. It is unpacked into exactly
            two values; ``logits`` is arg-maxed over its last axis to obtain
            the predicted class ids.

    Returns:
        The result of the ``evaluate`` "accuracy" metric's ``compute`` call
        for the predicted and reference class ids.
    """
    # Load the metric once and cache it on the function object so that
    # repeated evaluation passes do not call evaluate.load() again.
    metric_acc = getattr(compute_metrics, "_accuracy_metric", None)
    if metric_acc is None:
        metric_acc = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric_acc

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [None]:
#### Model ####

# Size the classification head from the dataset's label list (60 intents)
# and attach the id <-> intent-name mappings to the model config, so the saved
# and pushed model reports intent names instead of generic label placeholders.
# Ids are normalised to integers in case the lookup tables store them as strings.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

In [None]:
## Trainer ##

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/distilroberta-base-Massive-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Fine-tune for up to `num_train_epochs` epochs; the early-stopping callback
# registered on the Trainer may end the run sooner.
trainer.train()

***** Running training *****
  Num examples = 11514
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10800
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.41,0.67418,0.828824
2,0.4978,0.514963,0.875061
3,0.3009,0.570518,0.878997
4,0.1953,0.588734,0.879488
5,0.127,0.612347,0.880964
6,0.0914,0.65754,0.883424
7,0.0583,0.661793,0.893753
8,0.0355,0.759136,0.886375
9,0.0259,0.8087,0.878013
10,0.02,0.796405,0.888834


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to distilroberta-base-Massive-intent/checkpoint-720
Configuration saved in distilroberta-base-Massive-intent/checkpoint-720/config.json
Model weights saved in distilroberta-base-Massive-intent/checkpoint-720/pytorch_model.bin
tokenizer config file saved in distilroberta-base-Massive-intent/checkpoint-720/tokenizer_config.json
Special tokens file saved in distilroberta-base-Massive-intent/checkpoint-720/special_tokens_map.json
tokenizer config file saved in distilroberta-base-Massive-intent/tokenizer_config.json
Special tokens file saved in distilroberta-base-Massive-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to distilroberta-base-Massive-intent/checkpoint-1440
Configuration saved in distilroberta-base-Massive-intent/checkpoint-1440/config.json
Model weights saved in distilroberta-base-Massive-intent/checkpoint-

TrainOutput(global_step=7200, training_loss=0.2762030512756771, metrics={'train_runtime': 568.5023, 'train_samples_per_second': 303.798, 'train_steps_per_second': 18.997, 'total_flos': 533398155444000.0, 'train_loss': 0.2762030512756771, 'epoch': 10.0})

In [None]:
## The training arguments set load_best_model_at_end=True, so the best model is evaluated here ##

## Evaluate ##

# Evaluates on the validation split that was passed to the Trainer as `eval_dataset`.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 0.6617934107780457,
 'eval_accuracy': 0.8937530742744713,
 'eval_runtime': 3.579,
 'eval_samples_per_second': 568.037,
 'eval_steps_per_second': 35.764,
 'epoch': 10.0}

In [None]:
## Evaluating the model on the test set ##

# Run prediction on the test split and print only its metrics dictionary.
test_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 0.7005173563957214, 'test_accuracy': 0.8819771351714862, 'test_runtime': 5.799, 'test_samples_per_second': 512.85, 'test_steps_per_second': 32.075}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and F1) on the test set ##

from sklearn.metrics import accuracy_score, classification_report

test_result = trainer.predict(tokenized_datasets["test"])

## Predicted values: index of the largest logit in each row ##
predicted_values = test_result.predictions.argmax(axis=1)
print("Predicted values : ", predicted_values)

## Actual values: the reference label ids returned with the predictions ##
actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Intent names and integer class ids for the report ##

# Iterating a dict yields its keys, in insertion (class-id) order.
target_names = [intent_name for intent_name in label2id]
# `labels` is rebound here: from now on it holds the integer class ids.
labels = [int(label_id) for label_id in id2label]

## Accuracy and per-intent classification report (scikit-learn) ##

print('Accuracy : ', accuracy_score(y_true=actual_values, y_pred=predicted_values))

report_text = classification_report(
    y_true=actual_values,
    y_pred=predicted_values,
    labels=labels,
    target_names=target_names,
)
print(report_text)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46 45 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.8819771351714862
                          precision    recall  f1-score   support

          datetime_query       0.98      0.93      0.95        88
     iot_hue_lightchange       0.94      0.94      0.94        36
        transport_ticket       0.97      0.97      0.97        35
          takeaway_query       0.91      0.83      0.87        35
                qa_stock       0.81      0.96      0.88        26
           general_greet       0.25      1.00      0.40         1
   recommendation_events       0.82      0.77      0.80        43
       music_dislikeness       1.00      1.00      1.00         4
            iot_wemo_off       1.00      0.94      0.97        18
          cooking_recipe       0.91      0.86      0.89        72
             qa_currency       0.97      0.95      0.96        39
       transport_traffic       0.68      1.00      0.81        15
          general_quirky 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Publish the fine-tuned model on the hugging face hub ##

# write the model card, then push the trainer's output to the hub #
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Build the model page URL from the logged-in account name ##

whoami = HfApi().whoami()
username = whoami['name']
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to distilroberta-base-Massive-intent
Configuration saved in distilroberta-base-Massive-intent/config.json
Model weights saved in distilroberta-base-Massive-intent/pytorch_model.bin
tokenizer config file saved in distilroberta-base-Massive-intent/tokenizer_config.json
Special tokens file saved in distilroberta-base-Massive-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/313M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1663946619.2ea13cb2f63d.3335.0:  30%|##9       | 3.34k/11.2k [00:00<?, ?B…

Upload file logs/events.out.tfevents.1663947204.2ea13cb2f63d.3335.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-base-Massive-intent
   7fa970d..820706b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilroberta-base-Massive-intent
   7fa970d..820706b  main -> main



Model webpage link: https://huggingface.co/gokuls/distilroberta-base-Massive-intent


### Sanity Check

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilbert model have the same tokenization output ###

In [None]:
## Bert and distilbert ##

from transformers import AutoTokenizer

## Checkpoints whose tokenizers are compared ##

model_1, model_2 = "distilbert-base-uncased", "bert-base-uncased"

# one tokenizer per checkpoint #
model_1_tokenizer, model_2_tokenizer = (
    AutoTokenizer.from_pretrained(name) for name in (model_1, model_2)
)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

# encode the same sentence with each tokenizer so the two outputs can be compared line by line #
for sanity_tokenizer in (model_1_tokenizer, model_2_tokenizer):
    print(sanity_tokenizer(sample))

{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## They produce the same result except for token_type_ids. The token_type_ids can be None and BERT still works without any problem ##

Since our student model is a BERT-based model, it uses the same tokenizer as BERT. The DistilBERT tokenizer also produces output similar to BERT's. So, DistilBERT is suitable for transferring knowledge to our student model.

In [None]:
### Performing a Sanity check to confirm both BERT-base and distilRoBERTa model have the same tokenization output ###

In [None]:
## Bert and distilRoBERTa ##

from transformers import AutoTokenizer

## Checkpoints whose tokenizers are compared ##

model_1, model_2 = "distilroberta-base", "bert-base-uncased"

# one tokenizer per checkpoint #
model_1_tokenizer, model_2_tokenizer = (
    AutoTokenizer.from_pretrained(name) for name in (model_1, model_2)
)

# sample input #
sample = "Testing tokenizer. This is BERT and Friends project"

# encode the same sentence with each tokenizer so the two outputs can be compared line by line #
for sanity_tokenizer in (model_1_tokenizer, model_2_tokenizer):
    print(sanity_tokenizer(sample))

## Produces different outputs ##

{'input_ids': [0, 47446, 19233, 6315, 4, 152, 16, 163, 18854, 8, 7837, 695, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 5604, 19204, 17629, 1012, 2023, 2003, 14324, 1998, 2814, 2622, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
## The tokenizers produce different outputs. This would be problematic when performing knowledge distillation to our (BERT-based) student model ##

## DistilBERT model finetuning

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



## Load the 'en-US' configuration of the Amazon MASSIVE dataset and display its splits ##
raw_datasets = load_dataset('AmazonScience/massive', 'en-US') ## Considering only the English dataset ##
raw_datasets

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading and preparing dataset massive/en-US to /root/.cache/huggingface/datasets/AmazonScience___massive/en-US/1.0.0/c06e96faea378f5cbba9f6ba50b7ea33fb3f91d4256bfa26cd23a54421d154e5...


Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset massive downloaded and prepared to /root/.cache/huggingface/datasets/AmazonScience___massive/en-US/1.0.0/c06e96faea378f5cbba9f6ba50b7ea33fb3f91d4256bfa26cd23a54421d154e5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2974
    })
})

In [None]:
# Name for the repository on the huggingface hub; also used below as the training output directory #

repo_name = "distilbert-base-Massive-intent"

In [None]:
## Pre-trained checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint ##
checkpoint = "distilbert-base-uncased" ## Model used for fine-tuning ##
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``utt`` field of ``example`` with the notebook-level ``tokenizer``.

    Truncation is requested; no padding is applied here. The tokenizer's
    result is returned unchanged.
    """
    utterances = example["utt"]
    encoded = tokenizer(utterances, truncation=True)
    return encoded

In [None]:
## Tokenize every split; batched=True hands tokenize_function a batch of examples per call ##
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
## Collator built from the same tokenizer; it is passed to the Trainer below ##
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

## Columns that are not needed for training ##
unused_columns = ['id', 'locale', 'partition','scenario','annot_utt', 'utt', 'worker_id', 'slot_method', 'judgments']

## Drop them and expose the intent column under the name "labels" ##
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(unused_columns)
    .rename_column("intent", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
## Build the two lookup tables between intent names and their ids (ids are kept as strings) ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
### Training the Model ###

## Fine-tuning configuration. `repo_name` is used for the output directory, the log ##
## directory and the hub repository. The throw-away `TrainingArguments(checkpoint)` ##
## that used to precede this call was overwritten immediately and has been removed. ##
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=15, ## Upper bound on epochs; early stopping (see the Trainer) may end training sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy once training ends #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices


In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Return the accuracy of the predictions as ``{"accuracy": float}``.

    Args:
        eval_preds: a ``(logits, labels)`` pair; ``logits`` has one row of
            class scores per example and ``labels`` holds the true class ids.

    Returns:
        dict with the single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the label.

    The accuracy is computed directly with numpy rather than by calling
    ``evaluate.load("accuracy")`` on every evaluation, so the metric script is
    not reloaded each time the Trainer evaluates.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    hits = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(hits))}

In [None]:
#### Model ####

## Size the classification head from the dataset's label set (num_labels) instead of a ##
## hard-coded 60, and store the id <-> intent-name mappings in the model config. ##
## The mappings built above use string ids, so they are converted to integers here. ##
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

In [None]:
## Trainer ##

## For early stopping (patience = 3) ##
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

## Train on the train split and evaluate on the validation split ##
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/distilbert-base-Massive-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
## Runs for at most num_train_epochs epochs; the early-stopping callback on the trainer can end it sooner ##

trainer.train() 

***** Running training *****
  Num examples = 11514
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10800
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4555,0.598336,0.842597
2,0.407,0.470155,0.877521
3,0.2095,0.531867,0.883424
4,0.1172,0.590202,0.880964
5,0.0683,0.655515,0.880964
6,0.042,0.698926,0.88785
7,0.0253,0.696302,0.892769
8,0.0208,0.731344,0.890802
9,0.0119,0.768318,0.892277
10,0.0093,0.769313,0.894737


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to distilbert-base-Massive-intent/checkpoint-720
Configuration saved in distilbert-base-Massive-intent/checkpoint-720/config.json
Model weights saved in distilbert-base-Massive-intent/checkpoint-720/pytorch_model.bin
tokenizer config file saved in distilbert-base-Massive-intent/checkpoint-720/tokenizer_config.json
Special tokens file saved in distilbert-base-Massive-intent/checkpoint-720/special_tokens_map.json
tokenizer config file saved in distilbert-base-Massive-intent/tokenizer_config.json
Special tokens file saved in distilbert-base-Massive-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to distilbert-base-Massive-intent/checkpoint-1440
Configuration saved in distilbert-base-Massive-intent/checkpoint-1440/config.json
Model weights saved in distilbert-base-Massive-intent/checkpoint-1440/pytorch_model.bin
tokenizer config file saved in distilbert-base-Massive-intent/checkpoint-1440/

TrainOutput(global_step=9360, training_loss=0.18319953644377554, metrics={'train_runtime': 657.9819, 'train_samples_per_second': 262.484, 'train_steps_per_second': 16.414, 'total_flos': 678080451006000.0, 'train_loss': 0.18319953644377554, 'epoch': 13.0})

In [None]:
## load_best_model_at_end=True, so after training the trainer holds the best model. ##
## Evaluate that model on the trainer's eval_dataset (the validation split) and display the metrics. ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 0.7693130970001221,
 'eval_accuracy': 0.8947368421052632,
 'eval_runtime': 2.2921,
 'eval_samples_per_second': 886.944,
 'eval_steps_per_second': 55.843,
 'epoch': 13.0}

In [None]:
## Run prediction on the test split and print the metrics it reports ##

test_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 0.8257732391357422, 'test_accuracy': 0.8860121049092132, 'test_runtime': 3.7792, 'test_samples_per_second': 786.938, 'test_steps_per_second': 49.217}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Predicted values: the index of the largest logit of every test example ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label ids and names ##
## NOTE: `labels` is rebound here from the list of intent names to the list of integer ids ##

target_names = list(label2id.keys())
labels = list(map(int, list(id2label.keys()))) ## Converting list of strings to list of integers ##


## Getting the classification report using sklearn ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
## zero_division=0: a class with no predicted (or no true) samples is reported as 0, ##
## which is the value sklearn already falls back to, but without a warning per metric ##
print(classification_report(actual_values, predicted_values, labels= labels, target_names= target_names, zero_division=0))

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46  1 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.8860121049092132
                          precision    recall  f1-score   support

          datetime_query       0.93      0.94      0.94        88
     iot_hue_lightchange       0.97      0.97      0.97        36
        transport_ticket       0.97      0.97      0.97        35
          takeaway_query       0.84      0.89      0.86        35
                qa_stock       0.86      0.92      0.89        26
           general_greet       0.25      1.00      0.40         1
   recommendation_events       0.77      0.77      0.77        43
       music_dislikeness       1.00      1.00      1.00         4
            iot_wemo_off       0.94      0.94      0.94        18
          cooking_recipe       0.95      0.97      0.96        72
             qa_currency       0.97      0.95      0.96        39
       transport_traffic       0.79      1.00      0.88        15
          general_quirky 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Publish the fine-tuned model on the hugging face hub ##

# write the model card, then push the trainer's output to the hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Build the model page URL from the logged-in account name ##

whoami = HfApi().whoami()
username = whoami['name']
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to distilbert-base-Massive-intent
Configuration saved in distilbert-base-Massive-intent/config.json
Model weights saved in distilbert-base-Massive-intent/pytorch_model.bin
tokenizer config file saved in distilbert-base-Massive-intent/tokenizer_config.json
Special tokens file saved in distilbert-base-Massive-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/256M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1663959740.c54e05edd87b.67.2: 100%|##########| 363/363 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1663959079.c54e05edd87b.67.0:  27%|##6       | 3.34k/12.4k [00:00<?, ?B/s…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilbert-base-Massive-intent
   f1573c4..1282c50  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/distilbert-base-Massive-intent
   f1573c4..1282c50  main -> main



Model webpage link: https://huggingface.co/gokuls/distilbert-base-Massive-intent


## BERT-tiny model finetuning

In [None]:
### Student model - Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ###

Here, instead of performing knowledge distillation, we fine-tune the student model directly. This way, we are able to compare the performance of the directly fine-tuned student model with that of the student model trained by knowledge distillation with the help of a teacher model.

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch



## Load the 'en-US' configuration of the Amazon MASSIVE dataset and display its splits ##
raw_datasets = load_dataset('AmazonScience/massive', 'en-US') ## Considering only the English dataset ##
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2974
    })
})

In [None]:
# Name for the repository on the huggingface hub; also used below as the training output directory #

repo_name = "BERT-tiny-Massive-intent"

In [None]:
## Pre-trained student checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint ##
checkpoint = "google/bert_uncased_L-2_H-128_A-2" ## Model used for fine-tuning ## Ref: google/bert_uncased_L-2_H-128_A-2 ##
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Run the notebook-level ``tokenizer`` over the ``utt`` field of ``example``.

    Truncation is requested; padding is not applied at this stage. Whatever
    the tokenizer produces is returned as is.
    """
    text = example["utt"]
    tokenized = tokenizer(text, truncation=True)
    return tokenized

In [None]:
## Tokenize every split; batched=True hands tokenize_function a batch of examples per call ##
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
## Collator built from the same tokenizer; it is passed to the Trainer below ##
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/12 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

## Columns that are not needed for training ##
unused_columns = ['id', 'locale', 'partition','scenario','annot_utt', 'utt', 'worker_id', 'slot_method', 'judgments']

## Drop them and expose the intent column under the name "labels" ##
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(unused_columns)
    .rename_column("intent", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
## Build the two lookup tables between intent names and their ids (ids are kept as strings) ##

labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
### Training the Model ###

## Fine-tuning configuration. `repo_name` is used for the output directory, the log ##
## directory and the hub repository. The throw-away `TrainingArguments(checkpoint)` ##
## that used to precede this call was overwritten immediately and has been removed. ##
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=50, ## Upper bound on epochs; early stopping (see the Trainer) may end training sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    learning_rate=5e-5,
    seed=33,
    # logging & evaluation strategies: log, evaluate and save once per epoch #
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best validation accuracy once training ends #
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    )

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Return the accuracy of the predictions as ``{"accuracy": float}``.

    Args:
        eval_preds: a ``(logits, labels)`` pair; ``logits`` has one row of
            class scores per example and ``labels`` holds the true class ids.

    Returns:
        dict with the single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the label.

    The accuracy is computed directly with numpy rather than by calling
    ``evaluate.load("accuracy")`` on every evaluation, so the metric script is
    not reloaded each time the Trainer evaluates.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    hits = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(hits))}

In [None]:
#### Model ####

## Size the classification head from the dataset's label set (num_labels) instead of a ##
## hard-coded 60, and store the id <-> intent-name mappings in the model config. ##
## The mappings built above use string ids, so they are converted to integers here. ##
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label={int(idx): name for idx, name in id2label.items()},
    label2id={name: int(idx) for name, idx in label2id.items()},
)

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Trainer ##

## For early stopping (patience = 3) ##
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

## Train on the train split and evaluate on the validation split ##
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Cloning https://huggingface.co/gokuls/BERT-tiny-Massive-intent into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##
## Runs for at most num_train_epochs epochs; the early-stopping callback on the trainer can end it sooner ##

trainer.train() 

***** Running training *****
  Num examples = 11514
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36000
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,3.6104,3.091125,0.360059
2,2.8025,2.380024,0.516478
3,2.2292,1.913422,0.599115
4,1.818,1.581005,0.674373
5,1.5171,1.352241,0.710772
6,1.2876,1.168555,0.74422
7,1.1049,1.035536,0.768323
8,0.9623,0.946645,0.78849
9,0.8424,0.871772,0.787506
10,0.7473,0.81067,0.802755


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to BERT-tiny-Massive-intent/checkpoint-720
Configuration saved in BERT-tiny-Massive-intent/checkpoint-720/config.json
Model weights saved in BERT-tiny-Massive-intent/checkpoint-720/pytorch_model.bin
tokenizer config file saved in BERT-tiny-Massive-intent/checkpoint-720/tokenizer_config.json
Special tokens file saved in BERT-tiny-Massive-intent/checkpoint-720/special_tokens_map.json
tokenizer config file saved in BERT-tiny-Massive-intent/tokenizer_config.json
Special tokens file saved in BERT-tiny-Massive-intent/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to BERT-tiny-Massive-intent/checkpoint-1440
Configuration saved in BERT-tiny-Massive-intent/checkpoint-1440/config.json
Model weights saved in BERT-tiny-Massive-intent/checkpoint-1440/pytorch_model.bin
tokenizer config file saved in BERT-tiny-Massive-intent/checkpoint-1440/tokenizer_config.json
Special tokens file saved in BERT-tiny-Massi

TrainOutput(global_step=15840, training_loss=1.0077819583391903, metrics={'train_runtime': 619.2659, 'train_samples_per_second': 929.649, 'train_steps_per_second': 58.133, 'total_flos': 11197842571632.0, 'train_loss': 1.0077819583391903, 'epoch': 22.0})

In [None]:
## load_best_model_at_end=True, so after training the trainer holds the best model. ##
## Evaluate that model on the trainer's eval_dataset (the validation split) and display the metrics. ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 0.6740342974662781,
 'eval_accuracy': 0.8475159862272503,
 'eval_runtime': 1.8577,
 'eval_samples_per_second': 1094.344,
 'eval_steps_per_second': 68.901,
 'epoch': 22.0}

In [None]:
## Run prediction on the test split and print the metrics it reports ##

test_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 0.741625189781189, 'test_accuracy': 0.8238063214525891, 'test_runtime': 3.307, 'test_samples_per_second': 899.318, 'test_steps_per_second': 56.245}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Predicted values: the index of the largest logit of every test example ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label ids and names ##
## NOTE: `labels` is rebound here from the list of intent names to the list of integer ids ##

target_names = list(label2id.keys())
labels = list(map(int, list(id2label.keys()))) ## Converting list of strings to list of integers ##


## Getting the classification report using sklearn ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
## zero_division=0: a class with no predicted (or no true) samples is reported as 0, ##
## which is the value sklearn already falls back to, but without a warning per metric ##
print(classification_report(actual_values, predicted_values, labels= labels, target_names= target_names, zero_division=0))

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46 13 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.8238063214525891
                          precision    recall  f1-score   support

          datetime_query       0.86      0.90      0.88        88
     iot_hue_lightchange       0.91      0.83      0.87        36
        transport_ticket       1.00      0.89      0.94        35
          takeaway_query       0.87      0.74      0.80        35
                qa_stock       0.71      0.92      0.80        26
           general_greet       0.00      0.00      0.00         1
   recommendation_events       0.73      0.74      0.74        43
       music_dislikeness       0.00      0.00      0.00         4
            iot_wemo_off       0.93      0.72      0.81        18
          cooking_recipe       0.88      0.92      0.90        72
             qa_currency       0.95      0.95      0.95        39
       transport_traffic       0.79      1.00      0.88        15
          general_quirky 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Publish the fine-tuned model on the hugging face hub ##

# write the model card, then push the trainer's output to the hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Build the model page URL from the logged-in account name ##

whoami = HfApi().whoami()
username = whoami['name']
model_url = f"https://huggingface.co/{username}/{repo_name}"

print(f"Model webpage link: {model_url}")

Saving model checkpoint to BERT-tiny-Massive-intent
Configuration saved in BERT-tiny-Massive-intent/config.json
Model weights saved in BERT-tiny-Massive-intent/pytorch_model.bin
tokenizer config file saved in BERT-tiny-Massive-intent/tokenizer_config.json
Special tokens file saved in BERT-tiny-Massive-intent/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file logs/events.out.tfevents.1664028942.770a4ac86a1b.67.0:  20%|##        | 3.34k/16.7k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1664029563.770a4ac86a1b.67.2: 100%|##########| 363/363 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-Massive-intent
   356c3ba..f2a82dd  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/BERT-tiny-Massive-intent
   356c3ba..f2a82dd  main -> main



Model webpage link: https://huggingface.co/gokuls/BERT-tiny-Massive-intent
