<a href="https://colab.research.google.com/github/donnowhattodo/DistillationOFRoberta/blob/main/RobertaStdTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch

In [1]:
!pip install transformers datasets torch

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

In [4]:
# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Convert dataset to PyTorch DataLoader
dataloader = DataLoader(dataset["train"], batch_size=32, shuffle=True)

# Teacher Model (RoBERTa Base)
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-base")
teacher_model.eval()
teacher_model.to("cuda")  # Move teacher model to GPU

# Student Model (RoBERTa Small)
student_model = RobertaForSequenceClassification.from_pretrained("roberta-base")
student_model.train()
student_model.to("cuda")  # Move student model to GPU

# Loss and hyperparameters
criterion = nn.KLDivLoss()
temperature = 3.0
alpha = 0.5
learning_rate = 1e-4
num_epochs = 5

optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Training loop
for epoch in range(num_epochs):
    total_train_loss = 0.0  # Initialize training loss for the epoch
    for batch in dataloader:
        inputs = tokenizer(batch["sentence"], padding=True, truncation=True, return_tensors="pt")
        inputs = {key: value.to("cuda") for key, value in inputs.items()}  # Move inputs to GPU
        labels = batch["label"].to("cuda")  # Move labels to GPU

        optimizer.zero_grad()

        with torch.no_grad():
            teacher_logits = teacher_model(**inputs).logits

        student_logits = student_model(**inputs).logits

        distillation_loss = criterion(
            nn.functional.log_softmax(student_logits / temperature, dim=-1),
            nn.functional.softmax(teacher_logits / temperature, dim=-1),
        )

        standard_loss = nn.functional.cross_entropy(student_logits, labels)

        total_loss = distillation_loss + alpha * standard_loss

        total_loss.backward()
        optimizer.step()

        total_train_loss += total_loss.item()

    avg_train_loss = total_train_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} | Average Training Loss: {avg_train_loss:.4f}")

print("Training completed.")



Epoch 1/5 | Average Training Loss: 0.3449
Epoch 2/5 | Average Training Loss: 0.3444
Epoch 3/5 | Average Training Loss: 0.3441
Epoch 4/5 | Average Training Loss: 0.3440
Epoch 5/5 | Average Training Loss: 0.3438
Training completed.


In [13]:
print("""

Distillation Process Steps:
  _________________________________________________________
 |                                                         |
 |    Teacher Model                Student Model           |
 |       ________                     ________             |
 |      |        |                   |        |            |
 |      |  Input |------------------>|  Input |            |
 |      |________|                   |________|            |
 |           |                             |                |
 |     Teacher Logits              Student Logits         |
 |           |                             |                |
 |           v                             v                |
 |     Distillation Loss          Standard Loss            |
 |           |                             |                |
 |           v                             v                |
 |      Total Loss                 Total Loss               |
 |           |                             |                |
 |           v                             v                |
 |    Backpropagation            Backpropagation            |
 |           |                             |                |
 |           v                             v                |
 |      Update Student           Update Student             |
 |       Parameters                Parameters              |
 |                                                         |
  _________________________________________________________


""")



Distillation Process Steps:
  _________________________________________________________
 |                                                         |
 |    Teacher Model                Student Model           |
 |       ________                     ________             |
 |      |        |                   |        |            |
 |      |  Input |------------------>|  Input |            |
 |      |________|                   |________|            |
 |           |                             |                |
 |     Teacher Logits              Student Logits         |
 |           |                             |                |
 |           v                             v                |
 |     Distillation Loss          Standard Loss            |
 |           |                             |                |
 |           v                             v                |
 |      Total Loss                 Total Loss               |
 |           |                             |       