In [None]:
## BERT and Friends - Project ##

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 9.3 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 56.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 55.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 18.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 54.3 MB/s 
Installing collected p

This project has three main parts:

**Part 1:** We will fine-tune the BERT-base, distilRoBERTa, DistilBERT and BERT-tiny (student) models on the Amazon Massive dataset.

**Part 2:** We will perform task-specific Knowledge Distillation using the Amazon Massive dataset.

Student model: BERT-tiny (2 layers and 128 hidden dimension and 2 attention heads)

We use the models fine-tuned in Part 1 as teachers. Knowledge distillation is performed in three different settings:

1.   Only with BERT model
2.   Only with distilBERT model
3.   With the combination of two models - BERT and distilBERT model 

**Part 3:** We will analyze the model size and the processing time

In [None]:
## Importing the Libraries and loading the dataset ##

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from huggingface_hub import notebook_login, HfFolder, HfApi
from collections import Counter
import evaluate
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load only the 'en-US' configuration of MASSIVE, i.e. the English data.
raw_datasets = load_dataset('AmazonScience/massive', 'en-US')
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 2974
    })
})

In [None]:
## Checking if GPU is available ##

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
## Logging into Huggingface hub ##

# Authenticate this session with the Hugging Face Hub. The stored token is read back
# later through HfFolder.get_token() when the training arguments are built.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
### Ref: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2 ### - BERT Student model ###

# Using BERT-base model for Knowledge Distillation

In [None]:
# Name for the repository on the huggingface hub #
# The same name is reused below as the local output directory and as the hub model id.

repo_name = "bert-tiny-Massive-intent-KD-BERT"

In [None]:
## Teacher model: https://huggingface.co/gokuls/bert-base-Massive-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## BERT-tiny student checkpoint (2 layers, hidden size 128, 2 attention heads)
teacher_id = "gokuls/bert-base-Massive-intent" ## our BERT-base model fine-tuned on the MASSIVE intent task, used as the teacher

In [None]:
## Checking if the tokenizers of teacher and student model produce the same output ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# The dataset is tokenized once with the teacher's tokenizer and the same batches are
# fed to the student, so both tokenizers must map text to the same token ids.
# Fail loudly here instead of relying on a visual comparison of the printout.
assert teacher_encoding["input_ids"] == student_encoding["input_ids"], (
    "Teacher and student tokenizers produce different input_ids"
)

Downloading:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## Tokenization outputs are identical, so a single tokenizer can be shared by teacher and student ##

In [None]:
# The teacher's tokenizer is the one used for tokenization, padding and the trainer below.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``utt`` field of ``example`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    Returns whatever the tokenizer returns for that input.
    """
    utterances = example["utt"]
    encoded = tokenizer(utterances, truncation=True)
    return encoded

# batched=True hands tokenize_function several rows per call, so example["utt"] is a list of utterances.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the columns that are not needed for training, then expose the intent id under
# the feature name "labels" (the label lookup further down reads features["labels"]).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['id', 'locale', 'partition','scenario','annot_utt', 'utt', 'worker_id', 'slot_method', 'judgments'])
    .rename_column("intent", "labels")
)
# Serve rows in the "torch" format from here on.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation hyper-parameters.

    Args:
        *args: Forwarded unchanged to ``TrainingArguments``.
        alpha: Weight of the student's own loss in the combined loss; the
            distillation term is weighted by ``1 - alpha``.
        temperature: Divisor applied to the student and teacher logits before the
            softmax of the distillation term.
        **kwargs: Forwarded unchanged to ``TrainingArguments``.

    Raises:
        ValueError: If ``alpha`` is outside ``[0, 1]`` or ``temperature`` is not positive.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Reject values that would otherwise only show up later as a NaN/inf loss
        # (logits are divided by the temperature) or as a negatively weighted term.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature!r}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the student's own loss with a teacher-matching term.

    The loss is ``alpha * student_loss + (1 - alpha) * T**2 * KL(teacher || student)``,
    where both distributions are softmaxes of the logits divided by the temperature
    ``T``. ``alpha`` and ``temperature`` are read from ``self.args``.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        teacher_model: Model that provides the soft targets. It is only run under
            ``torch.no_grad()`` and in eval mode.
        **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If no ``teacher_model`` is given.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail before the base class does its setup work rather than with an
        # opaque attribute error on ``None`` afterwards.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Mapping of keyword arguments for the model's forward pass; the
                student's loss is read from its output, so it is expected to
                include ``labels``.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted because newer ``transformers`` releases pass
                it to ``compute_loss``; not used here.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if ``return_outputs``.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output #
        # Only the teacher's logits are needed, so the labels are withheld to spare
        # it a loss computation.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), (
            "student and teacher logits differ in shape"
        )

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        # F.kl_div takes log-probabilities as input and probabilities as target.
        # The T**2 factor offsets the 1/T**2 shrinkage of the soft-target gradients.
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# The enumerate index of each name serves as its id; ids are stored as strings on both sides.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# training args #
training_args = DistillationTrainingArguments(
    # where checkpoints and TensorBoard logs are written #
    output_dir=repo_name,
    logging_dir=f"{repo_name}/logs",
    report_to="tensorboard",
    # optimisation #
    learning_rate=5e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    seed=33,
    # log, evaluate and checkpoint once per epoch; the best model is chosen by accuracy #
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters #
    alpha=0.5,
    temperature=3.0,
)

# Student model #
# Loaded from the BERT-tiny checkpoint with a classification head for the intent label set.
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Teacher model #
# Loaded with the same label space as the student so that the logits line up.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# data_collator #
# Batches are padded with the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [None]:
## Evaluation metric ##

# Filled on first use so the accuracy metric is loaded once, not on every evaluation round.
_accuracy_metric = None


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for these
        predictions and labels.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    teacher_model=teacher_model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # early stopping with a patience of 3 #
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.48k/16.8M [00:00<?, ?B/s]

Download file logs/1663960776.3205633/events.out.tfevents.1663960776.bf5406322ee0.68.1: 100%|##########| 5.43k…

Download file training_args.bin: 100%|##########| 3.36k/3.36k [00:00<?, ?B/s]

Clean file logs/1663960776.3205633/events.out.tfevents.1663960776.bf5406322ee0.68.1:  18%|#8        | 1.00k/5.…

Clean file training_args.bin:  30%|##9       | 1.00k/3.36k [00:00<?, ?B/s]

Download file logs/events.out.tfevents.1663960776.bf5406322ee0.68.0:  25%|##5       | 3.48k/13.9k [00:00<?, ?B…

Clean file logs/events.out.tfevents.1663960776.bf5406322ee0.68.0:   7%|7         | 1.00k/13.9k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/16.8M [00:00<?, ?B/s]

Using cuda_amp half precision backend


In [None]:
## Training ##

# Runs for at most num_train_epochs; the EarlyStoppingCallback registered on the trainer may end it sooner.
trainer.train() 

***** Running training *****
  Num examples = 11514
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36000
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,5.83,4.882554,0.304968
2,4.7602,3.990367,0.419085
3,4.0301,3.380615,0.503197
4,3.4797,2.906472,0.596655
5,3.0352,2.538918,0.659616
6,2.6787,2.234223,0.704378
7,2.3644,1.987251,0.735366
8,2.1145,1.792825,0.746188
9,1.896,1.629283,0.764388
10,1.7138,1.506156,0.775209


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT/checkpoint-720
Configuration saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-720/config.json
Model weights saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-720/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-720/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-720/special_tokens_map.json
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT/checkpoint-1440
Configuration saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-1440/config.json
Model weights saved in bert-tiny-Massive-intent-KD-BERT/checkpoint-1440/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD

TrainOutput(global_step=30960, training_loss=1.292167179590664, metrics={'train_runtime': 1716.6288, 'train_samples_per_second': 335.367, 'train_steps_per_second': 20.971, 'total_flos': 21871399613904.0, 'train_loss': 1.292167179590664, 'epoch': 43.0})

In [None]:
## After training, the best model is used (load_best_model_at_end=True). Now evaluating that model ##

## Evaluate ##

# With no arguments this scores the trainer's eval_dataset, i.e. the validation split.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 0.8379986882209778,
 'eval_accuracy': 0.853418593212002,
 'eval_runtime': 4.872,
 'eval_samples_per_second': 417.279,
 'eval_steps_per_second': 26.272,
 'epoch': 43.0}

In [None]:
## Evaluating the model on Test set ##

# predict() returns predictions, label ids and metrics; only the metrics are printed here.
print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 0.9081733822822571, 'test_accuracy': 0.8328850033624747, 'test_runtime': 7.7733, 'test_samples_per_second': 382.593, 'test_steps_per_second': 23.928}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

## Predicted values ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label id and names ##

# Both lists follow the insertion order of the label dicts, so ids and names stay aligned.
target_names = list(label2id.keys())
labels = list(map(int, list(id2label.keys()))) ## Converting list of strings to list of integers ##


## Getting the classification report using sklearn ##

from sklearn.metrics import classification_report, accuracy_score

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
# Some intents are never predicted, which leaves their precision undefined.
# zero_division=0 reports those as 0 (the value already shown) without emitting
# a warning for every affected average.
print(classification_report(actual_values, predicted_values, labels= labels, target_names= target_names, zero_division=0))

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46 13 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.8328850033624747
                          precision    recall  f1-score   support

          datetime_query       0.86      0.90      0.88        88
     iot_hue_lightchange       0.87      0.94      0.91        36
        transport_ticket       1.00      0.94      0.97        35
          takeaway_query       0.90      0.74      0.81        35
                qa_stock       0.83      0.92      0.87        26
           general_greet       0.00      0.00      0.00         1
   recommendation_events       0.76      0.65      0.70        43
       music_dislikeness       0.00      0.00      0.00         4
            iot_wemo_off       0.92      0.67      0.77        18
          cooking_recipe       0.85      0.88      0.86        72
             qa_currency       0.95      0.95      0.95        39
       transport_traffic       0.79      1.00      0.88        15
          general_quirky 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Saving the model on the hugging face hub ##

# Write the model card, then upload the trainer's current model and files to the hub #
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The repository is listed under the name of the logged-in account #
hub_api = HfApi()
whoami = hub_api.whoami()
username = whoami["name"]

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT
Configuration saved in bert-tiny-Massive-intent-KD-BERT/config.json
Model weights saved in bert-tiny-Massive-intent-KD-BERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT/special_tokens_map.json


Upload file logs/events.out.tfevents.1664042979.ce181c7f3327.67.0:  12%|#2        | 3.34k/27.6k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1664044700.ce181c7f3327.67.2: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT
   76a41e4..7fe0344  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT
   76a41e4..7fe0344  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT


# Using distilBERT model for Knowledge Distillation

In [None]:
# Name for the repository on the huggingface hub #
# The same name is reused below as the local output directory and as the hub model id.

repo_name = "bert-tiny-Massive-intent-KD-distilBERT"

In [None]:
## Teacher model: https://huggingface.co/gokuls/distilbert-base-Massive-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## BERT-tiny student checkpoint (2 layers, hidden size 128, 2 attention heads)
teacher_id = "gokuls/distilbert-base-Massive-intent" ## our distilBERT model fine-tuned on the MASSIVE intent task, used as the teacher

In [None]:
## Checking if the tokenizers of teacher and student model produce the same output ##

# tokenizer initialization #
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding = teacher_tokenizer(sample)
student_encoding = student_tokenizer(sample)
print('Teacher tokenizer: ', teacher_encoding)
print('Student tokenizer: ', student_encoding)

# The dataset is tokenized once with the teacher's tokenizer and the same batches are
# fed to the student, so both tokenizers must map text to the same token ids.
# Only input_ids are compared: the two tokenizers need not emit the same set of fields.
assert teacher_encoding["input_ids"] == student_encoding["input_ids"], (
    "Teacher and student tokenizers produce different input_ids"
)

Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer_config.json
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--b

Teacher tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## Tokenization outputs match on input_ids and attention_mask; the distilBERT tokenizer simply does not emit token_type_ids ##

In [None]:
# The teacher's tokenizer is the one used for tokenization, padding and the trainer below.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer_config.json


In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the ``utt`` field of ``example`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    Returns whatever the tokenizer returns for that input.
    """
    utterances = example["utt"]
    encoded = tokenizer(utterances, truncation=True)
    return encoded

# batched=True hands tokenize_function several rows per call, so example["utt"] is a list of utterances.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Drop the columns that are not needed for training, then expose the intent id under
# the feature name "labels" (the label lookup further down reads features["labels"]).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['id', 'locale', 'partition','scenario','annot_utt', 'utt', 'worker_id', 'slot_method', 'judgments'])
    .rename_column("intent", "labels")
)
# Serve rows in the "torch" format from here on.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two knowledge-distillation hyper-parameters.

    Args:
        *args: Forwarded unchanged to ``TrainingArguments``.
        alpha: Weight of the student's own loss in the combined loss; the
            distillation term is weighted by ``1 - alpha``.
        temperature: Divisor applied to the student and teacher logits before the
            softmax of the distillation term.
        **kwargs: Forwarded unchanged to ``TrainingArguments``.

    Raises:
        ValueError: If ``alpha`` is outside ``[0, 1]`` or ``temperature`` is not positive.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Reject values that would otherwise only show up later as a NaN/inf loss
        # (logits are divided by the temperature) or as a negatively weighted term.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature!r}")

        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the student's own loss with a teacher-matching term.

    The loss is ``alpha * student_loss + (1 - alpha) * T**2 * KL(teacher || student)``,
    where both distributions are softmaxes of the logits divided by the temperature
    ``T``. ``alpha`` and ``temperature`` are read from ``self.args``.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        teacher_model: Model that provides the soft targets. It is only run under
            ``torch.no_grad()`` and in eval mode.
        **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If no ``teacher_model`` is given.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail before the base class does its setup work rather than with an
        # opaque attribute error on ``None`` afterwards.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # student and teacher on same device #
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Mapping of keyword arguments for the model's forward pass; the
                student's loss is read from its output, so it is expected to
                include ``labels``.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted because newer ``transformers`` releases pass
                it to ``compute_loss``; not used here.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if ``return_outputs``.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output #
        # Only the teacher's logits are needed, so the labels are withheld to spare
        # it a loss computation.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)

        # assert size #
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), (
            "student and teacher logits differ in shape"
        )

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        # F.kl_div takes log-probabilities as input and probabilities as target.
        # The T**2 factor offsets the 1/T**2 shrinkage of the soft-target gradients.
        loss_logits = F.kl_div(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# create label2id, id2label dicts #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# The enumerate index of each name serves as its id; ids are stored as strings on both sides.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# training args #
training_args = DistillationTrainingArguments(
    # where checkpoints and TensorBoard logs are written #
    output_dir=repo_name,
    logging_dir=f"{repo_name}/logs",
    report_to="tensorboard",
    # optimisation #
    learning_rate=5e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    seed=33,
    # log, evaluate and checkpoint once per epoch; the best model is chosen by accuracy #
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push to hub parameters #
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distillation parameters #
    alpha=0.5,
    temperature=3.0,
)

# Student model #
# Loaded from the BERT-tiny checkpoint with a classification head for the intent label set.
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Teacher model #
# Loaded with the same label space as the student so that the logits line up.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# data_collator #
# Batches are padded with the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

PyTorch: setting up devices


Downloading:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/config.json
Model config DistilBertConfig {
  "_name_or_path": "gokuls/distilbert-base-Massive-intent",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "datetime_query",
    "1": "iot_hue_lightchange",
    "10": "qa_currency",
    "11": "transport_traffic",
    "12": "general_quirky",
    "13": "weather_query",
    "14": "audio_volume_up",
    "15": "email_addcontact",
    "16": "takeaway_order",
    "17": "email_querycontact",
    "18": "iot_hue_lightup",
    "19": "recommendation_locations",
    "2": "transport_ticket",
    "20": "play_audiobook",
    "21": "lists_createoradd",
    "22": "news_query",
    "23": "alarm_query",
    "24": "iot_wemo_on",
    "25"

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at gokuls/distilbert-base-Massive-intent.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--bert_uncased_L-2_H-128_A-2/snapshots/1ae49ff827beda5996998802695c4cac8e9932c6/config.json
Model config BertConfig {
  "_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0

In [None]:
## Evaluation metric ##

# Filled on first use so the accuracy metric is loaded once, not on every evaluation round.
_accuracy_metric = None


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for these
        predictions and labels.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
## Trainer ##

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    teacher_model=teacher_model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # early stopping with a patience of 3 #
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Cloning https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-distilBERT into local empty directory.
Using cuda_amp half precision backend


In [None]:
## Training ##

# Runs for at most num_train_epochs; the EarlyStoppingCallback registered on the trainer may end it sooner.
trainer.train() 

***** Running training *****
  Num examples = 11514
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36000
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,10.9795,9.323637,0.291687
2,9.4239,7.9792,0.409247
3,8.2632,6.98239,0.481062
4,7.3425,6.154485,0.551402
5,6.56,5.482856,0.606001
6,5.9032,4.899405,0.646335
7,5.3078,4.412856,0.691097
8,4.819,4.015185,0.707329
9,4.3866,3.673428,0.732415
10,3.9954,3.372925,0.751599


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-tiny-Massive-intent-KD-distilBERT/checkpoint-720
Configuration saved in bert-tiny-Massive-intent-KD-distilBERT/checkpoint-720/config.json
Model weights saved in bert-tiny-Massive-intent-KD-distilBERT/checkpoint-720/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-distilBERT/checkpoint-720/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-distilBERT/checkpoint-720/special_tokens_map.json
tokenizer config file saved in bert-tiny-Massive-intent-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-distilBERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-tiny-Massive-intent-KD-distilBERT/checkpoint-1440
Configuration saved in bert-tiny-Massive-intent-KD-distilBERT/checkpoint-1440/config.json
Model weights saved in 

TrainOutput(global_step=24480, training_loss=3.3348690207487617, metrics={'train_runtime': 1213.7559, 'train_samples_per_second': 474.313, 'train_steps_per_second': 29.66, 'total_flos': 17306045619840.0, 'train_loss': 3.3348690207487617, 'epoch': 34.0})

In [None]:
### The BERT-tiny student's loss converges very slowly: training ran for 34 epochs before it stopped ###

In [None]:
## After training, the best model is used (load_best_model_at_end=True). Now evaluating that model ##

## Evaluate ##

# With no arguments this scores the trainer's eval_dataset, i.e. the validation split.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 1.6612293720245361,
 'eval_accuracy': 0.8396458435809149,
 'eval_runtime': 3.9833,
 'eval_samples_per_second': 510.383,
 'eval_steps_per_second': 32.134,
 'epoch': 34.0}

In [None]:
## Evaluating the model on Test set ##

# predict() returns predictions, label ids and metrics; only the metrics are printed here.
print('\nTest results : \n\n', trainer.predict(tokenized_datasets["test"]).metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 1.8105820417404175, 'test_accuracy': 0.820107599193006, 'test_runtime': 5.5254, 'test_samples_per_second': 538.241, 'test_steps_per_second': 33.663}


In [None]:
## Computing all the other performance evaluation metrics (precision, recall and f1) on the test set ##

## Predicted values ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label id and names ##

# Both lists follow the insertion order of the label dicts, so ids and names stay aligned.
target_names = list(label2id.keys())
labels = list(map(int, list(id2label.keys()))) ## Converting list of strings to list of integers ##


## Getting the classification report using sklearn ##

from sklearn.metrics import classification_report, accuracy_score

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
# Some intents are never predicted, which leaves their precision undefined.
# zero_division=0 reports those as 0 (the value already shown) without emitting
# a warning for every affected average.
print(classification_report(actual_values, predicted_values, labels= labels, target_names= target_names, zero_division=0))

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46 45 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.820107599193006
                          precision    recall  f1-score   support

          datetime_query       0.88      0.91      0.89        88
     iot_hue_lightchange       0.78      0.89      0.83        36
        transport_ticket       1.00      0.89      0.94        35
          takeaway_query       0.83      0.71      0.77        35
                qa_stock       0.83      0.92      0.87        26
           general_greet       0.00      0.00      0.00         1
   recommendation_events       0.69      0.63      0.66        43
       music_dislikeness       0.00      0.00      0.00         4
            iot_wemo_off       0.68      0.72      0.70        18
          cooking_recipe       0.86      0.92      0.89        72
             qa_currency       0.86      0.95      0.90        39
       transport_traffic       0.79      1.00      0.88        15
          general_quirky  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Saving the model on the Hugging Face Hub ##

# Create the model card (titled with the Hub model id), then push the trainer's output to the Hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The account name of the logged-in user is needed to build the model URL #
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-Massive-intent-KD-distilBERT
Configuration saved in bert-tiny-Massive-intent-KD-distilBERT/config.json
Model weights saved in bert-tiny-Massive-intent-KD-distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-distilBERT/special_tokens_map.json


Upload file logs/events.out.tfevents.1663961788.bf5406322ee0.68.2:  14%|#4        | 3.34k/23.3k [00:00<?, ?B/s…

Upload file logs/events.out.tfevents.1663963138.bf5406322ee0.68.4: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-distilBERT
   f880de8..c9b60a1  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-distilBERT
   f880de8..c9b60a1  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-distilBERT


# Using the BERT-base and distilBERT models together (multiple teacher models) for Knowledge Distillation

In [None]:
# Name of the repository on the Hugging Face Hub; also used further down as the trainer's #
# output_dir, as the prefix of logging_dir and as hub_model_id #

repo_name = "bert-tiny-Massive-intent-KD-BERT_and_distilBERT"

In [None]:
## Teacher model : https://huggingface.co/gokuls/bert-base-Massive-intent ##
## Teacher model 2: https://huggingface.co/gokuls/distilbert-base-Massive-intent ##

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2" ## BERT-tiny student (2 layers, hidden size 128, 2 attention heads) ##
teacher_id_1 = "gokuls/bert-base-Massive-intent" ## Our BERT-base model fine-tuned in Part 1, used as the first teacher ##
teacher_id_2 = "gokuls/distilbert-base-Massive-intent" ## Our distilBERT model fine-tuned in Part 1, used as the second teacher ##

In [None]:
## Checking that the tokenizers of the teacher and student models produce the same output ##

# tokenizer initialization #
teacher_tokenizer1 = AutoTokenizer.from_pretrained(teacher_id_1)
teacher_tokenizer2 = AutoTokenizer.from_pretrained(teacher_id_2)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input #
sample = "Testing tokenizers."

# Sanity check #
teacher_encoding1 = teacher_tokenizer1(sample)
teacher_encoding2 = teacher_tokenizer2(sample)
student_encoding = student_tokenizer(sample)

print('Teacher tokenizer 1: ', teacher_encoding1)
print('Teacher tokenizer 2: ', teacher_encoding2)
print('Student tokenizer: ', student_encoding)

# Fail loudly instead of relying on eyeballing the printout. Only the fields shared by all #
# three tokenizers are compared: the distilBERT tokenizer does not emit token_type_ids. #
for key in ("input_ids", "attention_mask"):
    assert teacher_encoding1[key] == teacher_encoding2[key] == student_encoding[key], (
        f"Teacher and student tokenizers disagree on '{key}'; "
        "distillation needs all models to see identical token ids."
    )

## Tokenization outputs match on input_ids and attention_mask ##

Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Teacher tokenizer 1:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Teacher tokenizer 2:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Student tokenizer:  {'input_ids': [101, 5604, 19204, 17629, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
## Use the distilBERT teacher's tokenizer for all models. Don't use the tokenizer from ##
## teacher_id_1 (BERT): it also produces token_type_ids, which are not needed by distilBERT ##
tokenizer = AutoTokenizer.from_pretrained(teacher_id_2)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gokuls--distilbert-base-Massive-intent/snapshots/1282c5096ee4deac235f13f2a1f4a5571f70852c/tokenizer_config.json


In [None]:
## Tokenization ##

def tokenize_function(example):
    """Tokenize the "utt" field of `example` with the shared tokenizer, truncation enabled."""
    utterances = example["utt"]
    return tokenizer(utterances, truncation=True)

# batched=True hands tokenize_function a batch of rows per call #
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
## Data Pre-processing ##

# Columns that are not fed to the models #
unwanted_columns = ['id', 'locale', 'partition','scenario','annot_utt', 'utt', 'worker_id', 'slot_method', 'judgments']

# Drop them and rename the target column from "intent" to "labels" #
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(unwanted_columns)
    .rename_column("intent", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 11514
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2974
    })
})

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with two knowledge-distillation settings.

    alpha: weight of the student's own loss; the distillation loss gets (1 - alpha).
    temperature: divisor applied to the logits before the softmax.
    Both values are read from `self.args` in DistillationTrainer.compute_loss.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Everything except the two distillation settings is handled by TrainingArguments.
        super().__init__(*args, **kwargs)

        self.alpha, self.temperature = alpha, temperature
        
class DistillationTrainer(Trainer):
    """Trainer that distils two teacher models into one student model.

    The loss optimised for the student is

        alpha * student_loss + (1 - alpha) * (KD(student, teacher1) + KD(student, teacher2))

    where `student_loss` is the loss the student returns for the gold labels and each
    KD term is the KL divergence between the temperature-softened student and teacher
    distributions, scaled by temperature ** 2. `alpha` and `temperature` are read from
    `self.args`.

    NOTE(review): the two KD terms are summed, not averaged, so for a given alpha the
    distillation part carries more weight than it would with a single teacher.
    """

    def __init__(self, *args, teacher_model_1=None, teacher_model_2=None, **kwargs):
        """Set up the trainer and place both teachers next to the student.

        Args:
            *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
            teacher_model_1: first teacher model (required despite the None default).
            teacher_model_2: second teacher model (required despite the None default).

        Raises:
            ValueError: if either teacher is missing.
        """
        # Validate before Trainer.__init__ runs, so a missing teacher fails fast with
        # a clear message instead of an AttributeError on None afterwards.
        if teacher_model_1 is None or teacher_model_2 is None:
            raise ValueError(
                "DistillationTrainer requires both teacher_model_1 and teacher_model_2."
            )
        super().__init__(*args, **kwargs)
        self.teacher1 = teacher_model_1
        self.teacher2 = teacher_model_2
        for teacher in (self.teacher1, self.teacher2):
            # student and teacher on same device #
            self._move_model_to_device(teacher, self.model.device)
            # teachers are only used for inference
            teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted sum of the student loss and the two-teacher KD loss.

        Args:
            model: the student model being trained.
            inputs: mapping of model inputs for one batch; must contain the labels so
                that the student returns a loss.
            return_outputs: when True, also return the student's outputs.
            num_items_in_batch: accepted for compatibility with Trainer versions that
                pass this keyword to compute_loss; it is not used here.

        Returns:
            The loss tensor, or `(loss, outputs_student)` if `return_outputs` is True.
        """
        # compute student output #
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss  # loss w.r.t. the gold labels

        # compute teacher outputs #
        # Only the teachers' logits are needed, so the labels are not passed to them
        # (this avoids computing a teacher loss that is never used).
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            logits_teacher1 = self.teacher1(**teacher_inputs).logits
            logits_teacher2 = self.teacher2(**teacher_inputs).logits

        # assert size #
        assert outputs_student.logits.size() == logits_teacher1.size()
        assert outputs_student.logits.size() == logits_teacher2.size()

        # Soften probabilities and compute distillation loss #
        temperature = self.args.temperature
        # The student's log-probabilities are the same for both teachers: compute once.
        student_log_probs = F.log_softmax(outputs_student.logits / temperature, dim=-1)
        loss_logits = 0.0
        for teacher_logits in (logits_teacher1, logits_teacher2):
            teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
            # Scaled by temperature ** 2, as in the original formulation of this loss.
            loss_logits = loss_logits + F.kl_div(
                student_log_probs, teacher_probs, reduction="batchmean"
            ) * (temperature ** 2)

        # Return weighted student loss #
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [None]:
# Build the label2id / id2label dicts from the class names of the "labels" feature #
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
# Ids are kept as strings on both sides; later cells convert the id2label keys with int(). #
# NOTE(review): label2id values are strings here - confirm whether integer ids are expected. #
id2label = {str(index): name for index, name in enumerate(labels)}
label2id = {name: index for index, name in id2label.items()}

# training args #
training_args = DistillationTrainingArguments(
    # where results go: local output directory, logs and the Hub repository #
    output_dir=repo_name,
    logging_dir=f"{repo_name}/logs",
    report_to="tensorboard",
    push_to_hub=True,
    hub_model_id=repo_name,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
    # optimisation #
    num_train_epochs=50,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    seed=33,
    # log, evaluate and save once per epoch; reload the most accurate checkpoint at the end #
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # distillation parameters (read by DistillationTrainer.compute_loss) #
    temperature=3.0,
    alpha=0.5,
    )

# data_collator #
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label configuration shared by both teachers and the student #
label_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)

# Teacher models #
teacher_model1 = AutoModelForSequenceClassification.from_pretrained(teacher_id_1, **label_config)
teacher_model2 = AutoModelForSequenceClassification.from_pretrained(teacher_id_2, **label_config)

# Student model #
student_model = AutoModelForSequenceClassification.from_pretrained(student_id, **label_config)

PyTorch: setting up devices
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gokuls--bert-base-Massive-intent/snapshots/d53c46768bfbaae4c67389c3aac3e1856d6ad9e5/config.json
Model config BertConfig {
  "_name_or_path": "gokuls/bert-base-Massive-intent",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "datetime_query",
    "1": "iot_hue_lightchange",
    "10": "qa_currency",
    "11": "transport_traffic",
    "12": "general_quirky",
    "13": "weather_query",
    "14": "audio_volume_up",
    "15": "email_addcontact",
    "16": "takeaway_order",
    "17": "email_querycontact",
    "18": "iot_hue_lightup",
    "19": "recommendation_locations",
    "2": "transport_ticket",
    "20": "play_audiobook",
    "21": "lists_createoradd",
    

In [None]:
## Evaluation metric ##

def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: a (logits, labels) pair; the predicted class of each row is the
            argmax of the logits over the last axis.

    Returns:
        Whatever the accuracy metric's `compute` returns for those predictions.
    """
    # Load the metric on first use and keep it on the function object: this function runs
    # on every evaluation pass, and reloading the metric each time is wasted work.
    if not hasattr(compute_metrics, "_metric_acc"):
        compute_metrics._metric_acc = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return compute_metrics._metric_acc.compute(predictions=predictions, references=labels)

In [None]:
# Distillation trainer: BERT-tiny student guided by the two fine-tuned teachers #
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    # data #
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # teachers #
    teacher_model_1=teacher_model1,
    teacher_model_2=teacher_model2,
    # evaluation and early stopping #
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

/content/bert-tiny-Massive-intent-KD-BERT_and_distilBERT is already a clone of https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT_and_distilBERT. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [None]:
## Training ##
## Configured for 50 epochs, with an EarlyStoppingCallback (patience 3) attached to the trainer ##

trainer.train() 

***** Running training *****
  Num examples = 11514
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36000
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,15.1159,12.825702,0.225283
2,12.9949,10.98908,0.430398
3,11.3865,9.562173,0.503197
4,10.0553,8.369985,0.553861
5,8.9431,7.412732,0.610428
6,8.0135,6.618501,0.628628
7,7.1987,5.951724,0.681751
8,6.5168,5.387944,0.711756
9,5.9352,4.942646,0.727496
10,5.4299,4.563731,0.741269


***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-720
Configuration saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-720/config.json
Model weights saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-720/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-720/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-720/special_tokens_map.json
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16
Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT_and_distilBERT/checkpoint-1440
Configuration saved in bert-tiny-Massive-

TrainOutput(global_step=30960, training_loss=4.12970725756899, metrics={'train_runtime': 2099.2951, 'train_samples_per_second': 274.235, 'train_steps_per_second': 17.149, 'total_flos': 21871399613904.0, 'train_loss': 4.12970725756899, 'epoch': 43.0})

In [None]:
## Training is finished; with load_best_model_at_end=True the best model is used from here on. ##
## Evaluate it on the validation split; the returned metrics dict is the cell output. ##

## Evaluate ##

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2033
  Batch size = 16


{'eval_loss': 2.372915267944336,
 'eval_accuracy': 0.8470241023118544,
 'eval_runtime': 5.8714,
 'eval_samples_per_second': 346.255,
 'eval_steps_per_second': 21.801,
 'epoch': 43.0}

In [None]:
## Evaluating the model on the test set ##

# Run prediction over the test split and keep only the aggregated metrics #
test_metrics = trainer.predict(tokenized_datasets["test"]).metrics
print('\nTest results : \n\n', test_metrics)

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16



Test results : 

 {'test_loss': 2.537945032119751, 'test_accuracy': 0.8275050437121722, 'test_runtime': 9.1439, 'test_samples_per_second': 325.242, 'test_steps_per_second': 20.341}


In [None]:
## Computing the remaining evaluation metrics (precision, recall and f1) on the test set ##

from sklearn.metrics import classification_report, accuracy_score

## Predicted values ##

test_result = trainer.predict(tokenized_datasets["test"])
predicted_values = np.argmax(test_result.predictions, axis=1)  # class id with the highest logit per example
print("Predicted values : ", predicted_values)

## Actual values ##

actual_values = test_result.label_ids
print('Actual values : ', actual_values)

## Getting label ids and names (both lists follow the insertion order of the label dicts) ##

target_names = list(label2id.keys())
labels = list(map(int, list(id2label.keys()))) ## Converting list of strings to list of integers ##


## Getting the classification report using sklearn ##
## Some intents are never predicted, so their precision/f1 are undefined. zero_division=0 reports ##
## them as 0.0 (the same value as the default) without emitting one warning per averaged metric. ##

print('Accuracy : ', accuracy_score(actual_values, predicted_values))
print(classification_report(actual_values, predicted_values, labels= labels, target_names= target_names, zero_division=0))

***** Running Prediction *****
  Num examples = 2974
  Batch size = 16


Predicted values :  [48 46 49 ... 44 44 44]
Actual values :  [48 46  1 ... 44 44 44]
Accuracy :  0.8275050437121722
                          precision    recall  f1-score   support

          datetime_query       0.84      0.92      0.88        88
     iot_hue_lightchange       0.72      0.86      0.78        36
        transport_ticket       0.97      0.91      0.94        35
          takeaway_query       0.90      0.74      0.81        35
                qa_stock       0.81      0.96      0.88        26
           general_greet       0.00      0.00      0.00         1
   recommendation_events       0.69      0.67      0.68        43
       music_dislikeness       0.00      0.00      0.00         4
            iot_wemo_off       1.00      0.83      0.91        18
          cooking_recipe       0.88      0.92      0.90        72
             qa_currency       0.93      0.95      0.94        39
       transport_traffic       0.79      1.00      0.88        15
          general_quirky 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Saving the model on the Hugging Face Hub ##

# Create the model card (titled with the Hub model id), then push the trainer's output to the Hub #

trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()


## Link for the model webpage ##

# The account name of the logged-in user is needed to build the model URL #
whoami = HfApi().whoami()
username = whoami['name']

print(f"Model webpage link: https://huggingface.co/{username}/{repo_name}")

Saving model checkpoint to bert-tiny-Massive-intent-KD-BERT_and_distilBERT
Configuration saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/config.json
Model weights saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/pytorch_model.bin
tokenizer config file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/tokenizer_config.json
Special tokens file saved in bert-tiny-Massive-intent-KD-BERT_and_distilBERT/special_tokens_map.json


Upload file logs/events.out.tfevents.1664045154.ce181c7f3327.10095.2:  12%|#2        | 3.34k/27.6k [00:00<?, ?…

Upload file logs/events.out.tfevents.1664047261.ce181c7f3327.10095.4: 100%|##########| 369/369 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT_and_distilBERT
   9db37d9..d2b426f  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT_and_distilBERT
   9db37d9..d2b426f  main -> main



Model webpage link: https://huggingface.co/gokuls/bert-tiny-Massive-intent-KD-BERT_and_distilBERT
