# Knowledge Distillation - Train a student with a teacher model
- Runs in the conda environment (kernel): knowdist
- There was an issue with the `wandb` module, which is hopefully solved now. If not, reinstall `wandb` in the environment.

In [1]:
from transformers import TrainingArguments
import wandb

In [2]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  """``TrainingArguments`` extended with two distillation hyperparameters.

  Args:
    *args: Positional arguments forwarded unchanged to ``TrainingArguments``.
    alpha: Stored as ``self.alpha`` (default 0.5). The distillation trainer in
      this notebook uses it as the weight of the student's cross-entropy loss
      and ``1 - alpha`` as the weight of the distillation loss.
    temperature: Stored as ``self.temperature`` (default 2.0). The distillation
      trainer divides student and teacher logits by it before the softmax.
    **kwargs: Keyword arguments forwarded unchanged to ``TrainingArguments``.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # Let the parent class set up all of the regular training arguments first.
    super().__init__(*args, **kwargs)
    # Then attach the two extra hyperparameters to the instance.
    self.alpha, self.temperature = alpha, temperature

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

In [4]:
class KnowledgeDistillationTrainer(Trainer):
  """``Trainer`` whose loss mixes the student's own loss with a distillation loss.

  The loss is ``alpha * loss_ce + (1 - alpha) * loss_kd`` where ``loss_ce`` is
  the loss reported by the student model and ``loss_kd`` is the KL divergence
  between the temperature-softened student and teacher distributions, scaled
  by ``temperature ** 2``. ``alpha`` and ``temperature`` are read from
  ``self.args``, so ``args`` must provide both attributes.

  Args:
    *args: Positional arguments forwarded to ``Trainer``.
    teacher_model: Model whose logits serve as soft targets. It is switched to
      eval mode and is never back-propagated through.
    **kwargs: Keyword arguments forwarded to ``Trainer``.
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher only provides targets; keep dropout etc. disabled.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Return the weighted sum of the student loss and the distillation loss.

    Args:
      model: The student model being trained.
      inputs: Keyword inputs passed to both student and teacher.
      return_outputs: When true, return ``(loss, student_outputs)``.
      num_items_in_batch: Accepted for compatibility with ``Trainer`` versions
        that pass this keyword; it is not used here.

    Returns:
      The scalar loss, or ``(loss, outputs_student)`` if ``return_outputs``.
    """
    # Extract cross-entropy loss and logits from the student.
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # Extract logits from the teacher. No gradients are needed for the teacher,
    # so skip building its autograd graph to save memory and time.
    with torch.no_grad():
      outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

    # Distillation loss on temperature-softened distributions.
    # reduction="batchmean" averages the summed KL divergence over the batch.
    temperature = self.args.temperature
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    loss_kd = temperature ** 2 * loss_fct(
                F.log_softmax(logits_student / temperature, dim=-1),
                F.softmax(logits_teacher / temperature, dim=-1))

    # Return the weighted student loss.
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss

In [5]:
from datasets import load_dataset

In [7]:
# The "plus" configuration is the subset that contains the out-of-scope
# training examples.
clinc = load_dataset(path="clinc_oos", name="plus")

In [8]:
# Each example in the CLINC150 dataset consists of a query in the "text"
# column and its corresponding intent id in the "intent" column.
sample = clinc["train"][0]
print(sample)

{'text': 'what expression would i use to say i love you if i were an italian', 'intent': 61}


In [9]:
# Feature description of the "intent" column; reused later for num_classes.
intents = clinc["train"].features["intent"]
# Map the integer intent id of the sample back to its string name.
intent = intents.int2str(sample["intent"])
print(intent)

translate


In [10]:
from transformers import AutoTokenizer

In [11]:
# Checkpoint the student model and its tokenizer are loaded from.
student_checkpoint = "distilbert-base-uncased"
# Tokenizer used by tokenize_text and handed to the trainer later on.
student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)



In [12]:
def tokenize_text(batch):
  """Tokenize the ``"text"`` entries of ``batch`` with the student tokenizer.

  Args:
    batch: Mapping with a ``"text"`` key holding the queries to tokenize.

  Returns:
    Whatever ``student_tokenizer`` returns when called with truncation enabled.
  """
  queries = batch["text"]
  return student_tokenizer(queries, truncation=True)

In [13]:
# Tokenize every split in batches and drop the raw "text" column, which is no
# longer needed. Then rename "intent" to "labels" so the trainer can detect
# the label column automatically.
clinc_tokenized = (
    clinc
    .map(tokenize_text, batched=True, remove_columns=["text"])
    .rename_column("intent", "labels")
)


Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [14]:
import numpy as np
from datasets import load_metric
def compute_metrics(pred):
  """Compute classification accuracy for the trainer's evaluation loop.

  Accuracy is computed directly with numpy, so this function does not depend
  on the deprecated ``datasets.load_metric`` helper.

  Args:
    pred: A ``(predictions, labels)`` pair. ``predictions`` holds one row of
      class scores per example; ``labels`` holds the reference class ids.

  Returns:
    ``{"accuracy": value}`` where ``value`` is the fraction of examples whose
    highest-scoring class equals the label, as a Python float.
  """
  predictions, labels = pred
  predicted_ids = np.argmax(predictions, axis=1)
  matches = predicted_ids == np.asarray(labels)
  return {"accuracy": float(np.mean(matches))}

  accuracy_score = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [15]:
# Output directory name for the fine-tuned student checkpoints.
finetuned_student_ckpt = "distilbert-base-uncased-finetuned-clinc-student"
# Per-device batch size, used for both training and evaluation.
batch_size = 48

In [16]:
# NOTE: alpha=1 puts the full weight on the student's own cross-entropy loss,
# so the distillation term (weighted by 1 - alpha) contributes nothing to this
# run. Lower alpha to let the teacher's soft targets influence training.
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt,
    evaluation_strategy="epoch",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    alpha=1,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
from transformers import pipeline

# Fine-tuned BERT checkpoint whose label mappings are reused for the student.
bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline(task="text-classification", model=bert_ckpt)

# Copy both directions of the label mapping from the loaded model's config.
label2id = pipe.model.config.label2id
id2label = pipe.model.config.id2label

  return self.fget.__get__(instance, owner)()


In [18]:
from transformers import AutoConfig
# Number of intent classes, taken from the dataset's "intent" feature.
num_labels = intents.num_classes
# Student configuration carrying the class count and both label mappings.
student_config = AutoConfig.from_pretrained(
    student_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [19]:
import torch
from transformers import AutoModelForSequenceClassification
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())


def student_init():
  """Build a fresh student classifier from the checkpoint and move it to ``device``."""
  student = AutoModelForSequenceClassification.from_pretrained(
      student_checkpoint, config=student_config)
  return student.to(device)

True


In [20]:
# Checkpoint of the already fine-tuned BERT model that acts as the teacher.
teacher_checkpoint = "transformersbook/bert-base-uncased-finetuned-clinc"

In [21]:
# Load the teacher classifier and place it on the same device as the student.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_checkpoint, num_labels=num_labels)
teacher_model = teacher_model.to(device)

In [22]:
# Start the training: the student is created through model_init, evaluated on
# the validation split, and the teacher is passed in for the distillation loss.
distilbert_trainer = KnowledgeDistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_tokenized['train'],
    eval_dataset=clinc_tokenized['validation'],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)
distilbert_trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environ

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.190294,0.541613


TrainOutput(global_step=318, training_loss=4.5764831926837655, metrics={'train_runtime': 43.6649, 'train_samples_per_second': 349.251, 'train_steps_per_second': 7.283, 'total_flos': 83004337293780.0, 'train_loss': 4.5764831926837655, 'epoch': 1.0})

In [23]:
def save_teacher_model(output_dir="teacher_model"):
  """Save the teacher model with ``save_pretrained``.

  Args:
    output_dir: Target directory; defaults to ``"teacher_model"``.
  """
  teacher_model.save_pretrained(output_dir)
def save_student_model(output_dir='student_model'):
  """Save the trained student through the trainer's ``save_model``.

  Args:
    output_dir: Target directory; defaults to ``'student_model'``.
  """
  distilbert_trainer.save_model(output_dir)

In [24]:
save_teacher_model()
save_student_model()

In [25]:
from transformers import AutoConfig, AutoModelForSequenceClassification
import os

def compute_parameters(model_path):
  """Return the parameter count of the sequence classifier saved at ``model_path``.

  Args:
    model_path: Directory or checkpoint name accepted by ``from_pretrained``.

  Returns:
    The value reported by the loaded model's ``num_parameters()``.
  """
  return AutoModelForSequenceClassification.from_pretrained(model_path).num_parameters()

In [26]:
# Count the parameters of the teacher saved in the "teacher_model" directory.
teacher_model_parameters = compute_parameters("teacher_model")
print("Teacher Model: ", teacher_model_parameters)

Teacher Model:  109598359


In [28]:
# Count the parameters of the student saved in the "student_model" directory.
student_model_parameters = compute_parameters("student_model")
print("Student Model: ", student_model_parameters)

Student Model:  67069591


### Berechnet die Verminderung der Parameteranzahl im Student-Modell

In [29]:
# Relative change of the student's parameter count with respect to the teacher.
# The value is negative when the student is smaller. Note that the sign
# convention is the opposite of the one used for decrease_in_time further down.
decrease = (
    (student_model_parameters - teacher_model_parameters)
    / teacher_model_parameters
)
print(decrease * 100)

-38.804201438818986


In [31]:
# List the files saved for the student model, with sizes shown in MB blocks.
!ls student_model -al --block-size=MB

total 269MB
drwxr-xr-x 2 jovyan users   1MB May  2 12:14 .
drwxr-xr-x 9 jovyan users   1MB May  2 12:18 ..
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 config.json
-rw-r--r-- 1 jovyan users 269MB May  2 12:18 model.safetensors
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 special_tokens_map.json
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 tokenizer_config.json
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 tokenizer.json
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 training_args.bin
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 vocab.txt


In [32]:
# List the files saved for the teacher model, with sizes shown in MB blocks.
!ls teacher_model -al --block-size=MB

total 439MB
drwxr-xr-x 2 jovyan users   1MB May  2 12:14 .
drwxr-xr-x 9 jovyan users   1MB May  2 12:18 ..
-rw-r--r-- 1 jovyan users   1MB May  2 12:18 config.json
-rw-r--r-- 1 jovyan users 439MB May  2 12:18 model.safetensors


In [30]:
# Show the query text and the intent id of training example 101, which is the
# input used for the inference timing below.
for column in ('text', 'intent'):
  print(clinc['train'][column][101])


complete a transaction from savings to checking of $20000
133


# Inference Testing 

In [31]:
# Let's warm up first
from transformers import pipeline
import time

# Inference pipeline for the saved teacher model.
pipe = pipeline("text-classification", model="teacher_model", tokenizer='bert-base-uncased')

# Fetch the single row first instead of materializing the whole text column.
sample_input = clinc['train'][101]['text']

# Warm-up runs so one-time setup cost is not included in the measurement.
for _ in range(10):
  pipe(sample_input)

# perf_counter is a monotonic, high-resolution clock meant for benchmarking;
# time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  pipe(sample_input)
total_time_teacher_model = time.perf_counter() - start
print("Total time to process 100 requests for Teacher Model: ",total_time_teacher_model)



Total time to process 100 requests for Teacher Model:  17.193105459213257


In [32]:
# Inference pipeline for the saved student model.
pipe = pipeline("text-classification", model="student_model", tokenizer="distilbert-base-uncased")

# Fetch the single row first instead of materializing the whole text column.
sample_input = clinc['train'][101]['text']

# Warm-up runs so one-time setup cost is not included in the measurement.
for _ in range(10):
  pipe(sample_input)

# perf_counter is a monotonic, high-resolution clock meant for benchmarking;
# time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  pipe(sample_input)
total_time_student_model = time.perf_counter() - start

print("Total time to process 100 requests for Student Model: ",total_time_student_model)

Total time to process 100 requests for Student Model:  9.907214403152466


In [33]:
# Fraction of the teacher's total inference time saved by the student
# (positive when the student is faster), printed as a percentage.
decrease_in_time = (
    (total_time_teacher_model - total_time_student_model)
    / total_time_teacher_model
)
print(decrease_in_time * 100)

42.37681827372556
