In [1]:
!pip install transformers datasets transformers[torch] accelerate>=0.20.11


[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

## –°–æ–∑–¥–∞–¥–∏–º –∫–∞—Å—Ç–æ–º–Ω—ã–π Trainer –¥–ª—è –¥–∏—Å—Ç–∏–ª–ª—è—Ü–∏–∏ –∑–Ω–∞–Ω–∏–π

---



1. –û–ø—Ä–µ–¥–µ–ª–∏–º –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã Œ± and T

Œ± - –Ω–∞ —Å–∫–æ–ª—å–∫–æ –±–æ–ª—å—à–µ –º—ã —Ö–æ—Ç–∏–º –æ—Ä–∏–µ–Ω—Ç–∏—Ä–æ–≤–∞—Ç—å—Å—è –Ω–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –º–æ–¥–µ–ª–∏-—É—á–∏—Ç–µ–ª—è, –∏ –º–µ–Ω—å—à–µ –Ω–∞ –º–æ–¥–µ–ª—å-—Å—Ç—É–¥–µ–Ω—Ç–∞  
T - –∫–∞–∫ —Å–∏–ª—å–Ω–æ –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å —Å–≥–ª–∞–∂–µ–Ω–æ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π –∫–ª–∞—Å—Å–æ–≤

2. –í –∫–∞—á–µ—Å—Ç–≤–µ –º–æ–¥–µ–ª–∏-—É—á–∏—Ç–µ–ª—è –±—É–¥–µ–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å BERT-base.

3. –ù–æ–≤–∞—è –ª–æ—Å—Å-—Ñ—É–Ω–∫—Ü–∏—è –±—É–¥–µ—Ç —Å–æ–≤–º–µ—â–∞—Ç—å –≤ —Å–µ–±–µ –∫—Ä–æ—Å—Å-—ç–Ω—Ç—Ä–æ–ø–∏—é –∏ –ª–æ—Å—Å –¥–∏—Å—Ç–∏–ª–ª—è—Ü–∏–∏

–ß—Ç–æ–±—ã –¥–æ–±–∞–≤–∏—Ç—å –Ω–∞—à–∏ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ —Å–æ–∑–¥–∞—Ç—å –∫–ª–∞—Å—Å TrainingArguments –∏ –≤–∫–ª—é—á–∏—Ç—å –∏—Ö –≤ –Ω–µ–≥–æ –∫–∞–∫ –∞—Ç—Ä–∏–±—É—Ç—ã

In [3]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    super().__init__(*args, **kwargs)

    self.alpha = alpha
    self.temperature = temperature

# –ù–∞–ø–∏—à–µ–º –ª–æ—Å—Å-—Ñ—É–Ω–∫—Ü–∏—é –¥–ª—è –¥–∏—Å—Ç–∏–ª–ª—è—Ü–∏–∏ –∑–Ω–∞–Ω–∏–π
–°–æ–∑–¥–∞–¥–∏–º –Ω–∞—Å–ª–µ–¥–Ω–∏–∫–∞ –∫–ª–∞—Å—Å–∞ Trainer –∏ –ø–µ—Ä–µ–ø–∏—à–µ–º compute_loss()



In [4]:
class KnowledgeDistillationTrainer(Trainer):
  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model

  def compute_loss(self, model, inputs, return_outputs=False):
    #Extract cross-entropy loss and logits from student
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits
    inputs.pop('token_type_ids')
    # Extract logits from teacher
    outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

    #Computing distillation loss by Softening probabilities
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    #The reduction=batchmean argument in nn.KLDivLoss() specifies that we average the losses over the batch dimension.
    loss_kd = self.args.temperature ** 2 * loss_fct(
                F.log_softmax(logits_student / self.args.temperature, dim=-1),
                F.softmax(logits_teacher / self.args.temperature, dim=-1))

    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss


## –í—ã–±–∏—Ä–∞–µ–º –º–æ–¥–µ–ª—å-—É—á–µ–Ω–∏–∫–∞

–ö–∞–∫ –≤—ã–±—Ä–∞—Ç—å –ø–æ–¥—Ö–æ–¥—è—â—É—é –º–æ–¥–µ–ª—å-—É—á–µ–Ω–∏–∫–∞?
1. –ú–µ–Ω—å—à–∞—è –º–æ–¥–µ–ª—å —á–µ–º —É—á–∏—Ç–µ–ª—å —á—Ç–æ–±—ã —É–º–µ–Ω—å—à–∏—Ç—å –æ–±—ä–µ–º –∑–∞–Ω–∏–º–∞–µ–º–æ–π –ø–∞–º—è—Ç–∏ –∏ —É–≤–µ–ª–∏—á–∏—Ç—å RPS

2. –î–∏—Å—Ç–∏–ª–ª—è—Ü–∏—è –∑–Ω–∞–Ω–∏–π —Ä–∞–±–æ—Ç–∞–µ—Ç –ª—É—á—à–µ, –∫–æ–≥–¥–∞ –º–æ–¥–µ–ª—å-—É—á–∏—Ç–µ–ª—å –∏ —É—á–µ–Ω–∏–∫ –æ–¥–Ω–æ–≥–æ —Ç–∏–ø–∞ (BERT –∏ RoBERTa –º–æ–≥—É—Ç –∏–º–µ—Ç—å —Ä–∞–∑–Ω—É—é –¥–ª–∏–Ω–Ω—É —ç–º–µ–±–¥–¥–∏–Ω–≥–æ–≤ –Ω–∞ –≤—ã—Ö–æ–¥–µ, —á—Ç–æ —Å–æ–∑–¥–∞–µ—Ç —Å–ª–æ–∂–Ω–æ—Å—Ç–∏ –¥–ª—è —É—á–µ–Ω–∏–∫–∞ –º–∏–º–∏–∫—Ä–∏—Ä–æ–≤–∞—Ç—å –ø–æ–¥ —É—á–∏—Ç–µ–ª—è)

–í –∫–∞—á–µ—Å—Ç–≤–µ –ø—Ä–∏–º–µ—Ä–∞ –Ω–∞ —Ä–æ–ª—å –º–æ–¥–µ–ª–∏-—É—á–µ–Ω–∏–∫–∞ –≤–æ–∑—å–º–µ–º DistilBERT.

### –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞

–ë—É–¥–µ–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å CLINC150 –¥–∞—Ç–∞—Å–µ—Ç




In [5]:
data = load_dataset("imdb").shuffle()

In [6]:
sample = data["train"][0]
print(sample)

{'text': "This is a great British film. A cleverly observed script with many quotable lines, which captures perfectly what magic mushrooms can do to a man over a weekend. As per usual Phil Daniels is excellent along with that most under rated of British actors Geoff Bell. Peter Bowles with a joint hanging out of his mouth is a casting masterstroke and Gary Stretch with his brooding looks brings something strangely atmospheric to the piece. Although it seems to be billed as a biker movie, i think it will find an audience outside of this, purely on the premise that a lot of people have been there done it and got the t-shirt. also A great original soundtrack with a blinding version of Freebird. This really could be a 21st century heir to the famous Ealing comedies. Like the weed in the Welsh fields: it's a grower!", 'label': 1}


–ù–∞–º–µ—Ä–µ–Ω–∏—è –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª—è—é—Ç—Å—è –≤ –≤–∏–¥–µ –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤, –Ω–æ –º—ã –º–æ–∂–µ–º –ª–µ–≥–∫–æ –ø–æ–ª—É—á–∏—Ç—å –µ–≥–æ –∑–Ω–∞—á–µ–Ω–∏–µ (–∏ –Ω–∞–æ–±–æ—Ä–æ—Ç), –≤—ã–∑–≤–∞–≤ —Ñ—É–Ω–∫—Ü–∏—é int2str:

# –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º –¥–∞—Ç–∞—Å–µ—Ç

In [7]:
student_checkpoint = "google/mobilebert-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

In [8]:
def tokenize_text(batch):
  return student_tokenizer(batch["text"], truncation=True, max_length=512)

In [9]:
imdb_tokenized = data.map(tokenize_text, batched=True, remove_columns=["text"])

Map:   4%|‚ñç         | 1000/25000 [00:00<00:02, 9331.73 examples/s]

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25000/25000 [00:02<00:00, 10447.54 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25000/25000 [00:02<00:00, 10395.86 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:05<00:00, 9469.71 examples/s] 


# –û–ø—Ä–µ–¥–µ–ª–∏–º –º–µ—Ç—Ä–∏–∫—É –∫–æ—Ç–æ—Ä–æ–π –±—É–¥–µ–º –∑–∞–º–µ—Ä—è—Ç—å —Ç–æ—á–Ω–æ—Å—Ç—å

In [10]:
accuracy_score = load_metric("accuracy")

def compute_metrics(pred):
  predictions, labels = pred
  predictions = np.argmax(predictions, axis=1)
  return accuracy_score.compute(predictions=predictions, references=labels)

  accuracy_score = load_metric("accuracy")


–í —ç—Ç–æ–π —Ñ—É–Ω–∫—Ü–∏–∏ –ø—Ä–æ–≥–Ω–æ–∑—ã –æ—Ç –≥–æ–ª–æ–≤—ã –º–æ–¥–µ–ª–∏ –ø–æ—Å—Ç—É–ø–∞—é—Ç –≤ —Ñ–æ—Ä–º–µ –ª–æ–≥–∏—Ç–æ–≤, –ø–æ—ç—Ç–æ–º—É –º—ã –∏—Å–ø–æ–ª—å–∑—É–µ–º —Ñ—É–Ω–∫—Ü–∏—é np.argmax(), —á—Ç–æ–±—ã –Ω–∞–π—Ç–∏ –Ω–∞–∏–±–æ–ª–µ–µ –¥–æ—Å—Ç–æ–≤–µ—Ä–Ω—ã–π –ø—Ä–æ–≥–Ω–æ–∑ –∫–ª–∞—Å—Å–∞ –∏ —Å—Ä–∞–≤–Ω–∏—Ç—å –µ–≥–æ —Å ground truth –º–µ—Ç–∫–æ–π.

# –û–ø—Ä–µ–¥–µ–ª–∏–º –∞—Ä–≥—É–º–µ–Ω—Ç—ã –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∏

In [11]:
batch_size = 16
finetuned_student_ckpt = "mobilebert-base-uncased-finetuned-imdb-student"

In [12]:
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt, evaluation_strategy = "epoch",
    num_train_epochs=1, learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, alpha=1, weight_decay=0.01)

## –î–∞–≤–∞–π—Ç–µ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å —É—á–µ–Ω–∏–∫–∞, –Ω–æ –ø–µ—Ä–µ–¥ —ç—Ç–∏–º –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–∏–º –µ–π —Å–ª–æ–≤–∞—Ä–∏ —Å –∫–∞–∂–¥—ã–º –Ω–∞–º–µ—Ä–µ–Ω–∏–µ–º –∏ –µ–≥–æ –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–º.

In [13]:
distilbert_ckpt = "lvwerra/distilbert-imdb"
pipe = pipeline("text-classification", model=distilbert_ckpt)

id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

In [14]:
num_labels = 2
student_config = (AutoConfig.from_pretrained(student_checkpoint,
                                             num_labels=num_labels,
                                             id2label=id2label,
                                             label2id=label2id))

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def student_init():
  return (AutoModelForSequenceClassification.from_pretrained(student_checkpoint, config=student_config).to(device))

## –ó–∞–≥—Ä—É–∑–∏–º –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω—ã–µ –≤–µ—Å–∞ –º–æ–¥–µ–ª–∏-—É—á–∏—Ç–µ–ª—è –∏ –Ω–∞—á–Ω–µ–º –¥–æ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏-—É—á–µ–Ω–∏–∫–∞

In [16]:
teacher_model = AutoModelForSequenceClassification.from_pretrained(distilbert_ckpt, num_labels=num_labels, ignore_mismatched_sizes=True).to(device)

In [17]:
# –ù–∞—á–∏–Ω–∞–µ–º fine-tuning —É—á–µ–Ω–∏–∫–∞
distilbert_trainer = KnowledgeDistillationTrainer(model_init=student_init,
        teacher_model=teacher_model, args=student_training_args,
        train_dataset=imdb_tokenized['train'], eval_dataset=imdb_tokenized['test'],
        compute_metrics=compute_metrics, tokenizer=student_tokenizer)

distilbert_trainer.train()

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a MobileBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,117327.312,0.190317,0.92712




TrainOutput(global_step=782, training_loss=75017.539643334, metrics={'train_runtime': 415.6524, 'train_samples_per_second': 60.146, 'train_steps_per_second': 1.881, 'total_flos': 1567070261660160.0, 'train_loss': 75017.539643334, 'epoch': 1.0})

## –°—Ä–∞–≤–Ω–∏–º –º–æ–¥–µ–ª–∏ —É—á–∏—Ç–µ–ª—è –∏ —É—á–µ–Ω–∏–∫–∞



–°–æ—Ö—Ä–∞–Ω–∏–º –º–æ–¥–µ–ª–∏ —É—á–∏—Ç–µ–ª—è –∏ —É—á–µ–Ω–∏–∫–∞, –∞ –∑–∞—Ç–µ–º –≤—ã—á–∏—Å–ª–∏–º —Ä–∞–∑–º–µ—Ä—ã –º–æ–¥–µ–ª–µ–π –≤ MB.

In [18]:
teacher_model.save_pretrained("teacher_model")
distilbert_trainer.save_model('student_model')

raw_student = AutoModelForSequenceClassification.from_pretrained(student_checkpoint, config=student_config)
raw_student.save_pretrained("raw_student_model")

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.dense.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

In [19]:
def compute_parameters(model_path):
  model = AutoModelForSequenceClassification.from_pretrained(model_path)
  parameters = model.num_parameters()
  return parameters

In [20]:
teacher_model_parameters = compute_parameters(model_path="./teacher_model")
print("Teacher Model: ", teacher_model_parameters)

Teacher Model:  66955010


In [21]:
student_model_parameters = compute_parameters(model_path="./student_model")
print("Student Model: ", student_model_parameters)

Student Model:  24582914


In [22]:
decrease = (teacher_model_parameters - student_model_parameters) / teacher_model_parameters
print(f'–ú–æ–¥–µ–ª—å —Å—Ç—É–¥–µ–Ω—Ç–∞ –∏–º–µ–µ—Ç –Ω–∞ {decrease*100:.2f} % –º–µ–Ω—å—à–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤, —á–µ–º –º–æ–¥–µ–ª—å —É—á–∏—Ç–µ–ª—è')

–ú–æ–¥–µ–ª—å —Å—Ç—É–¥–µ–Ω—Ç–∞ –∏–º–µ–µ—Ç –Ω–∞ 63.28 % –º–µ–Ω—å—à–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤, —á–µ–º –º–æ–¥–µ–ª—å —É—á–∏—Ç–µ–ª—è


In [23]:
!ls ./student_model -al --block-size=MB

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 100MB
drwxrwxr-x 2 grinkevich grinkevich  1MB Oct  8 11:29 .
drwxrwxr-x 6 grinkevich grinkevich  1MB Oct  8 11:29 ..
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 config.json
-rw-rw-r-- 1 grinkevich grinkevich 99MB Oct  8 17:17 pytorch_model.bin
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 special_tokens_map.json
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 tokenizer.json
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 tokenizer_config.json
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 training_args.bin
-rw-rw-r-- 1 grinkevich grinkevich  1MB Oct  8 17:17 vocab.txt


In [24]:
!ls ./teacher_model -al --block-size=MB

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 268MB
drwxrwxr-x 2 grinkevich grinkevich   1MB Oct  8 11:29 .
drwxrwxr-x 6 grinkevich grinkevich   1MB Oct  8 11:29 ..
-rw-rw-r-- 1 grinkevich grinkevich   1MB Oct  8 17:17 config.json
-rw-rw-r-- 1 grinkevich grinkevich 268MB Oct  8 17:17 pytorch_model.bin


–í—ã–ø–æ–ª–Ω–∏–º –∑–∞–º–µ—Ä —Å—Ä–µ–¥–Ω–µ–π —Å–∫–æ—Ä–æ—Å—Ç–∏ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ —É –æ–±–æ–∏—Ö –º–æ–¥–µ–ª–µ–π –Ω–∞ –æ–¥–∏–Ω–∞–∫–æ–≤—ã—Ö –≤—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö

In [25]:
sample_input = data['train']['text'][101]

print(data['train']['text'][101])
print(data['train']['label'][101])

I saw this little magnum opus for the first time very recently, on one of those dollar DVD's that seem to be everywhere nowadays, and was so moved by it that I cannot contain myself. For those who have never seen this mesmerizingly miserable Mexican import, and wish to view it without being prejudiced by anyone else's jaundiced commentary, there are undoubtedly substantial spoilers in what follows. So if you are one of those reckless individuals, stop reading at once and go and watch it for yourself. If you get drunk enough in advance, you might be fortunate enough to pass out before it's over.<br /><br />Begin with the premise that a man may become a werewolf after being bitten by a yeti. No one in the film ventures an explanation as to how this sort of cross-species implantation could occur, and the rest of the movie is even more hopelessly nonsensical. But pour yourself another glass of wine (or whatever you're drinking), and let us proceed.<br /><br />Paul Naschy (our werewolf) has

In [28]:
tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}
pipe = pipeline("text-classification", model="./teacher_model", tokenizer='lvwerra/distilbert-imdb')

#WARMUP
for _ in range(10):
  _ = pipe(sample_input, **tokenizer_kwargs)

#INFERENCE
start = time.time()
for _ in range(100):
  _ = pipe(sample_input, **tokenizer_kwargs)
total_time_teacher_model = time.time() - start
print("–û–±—â–µ–µ –≤—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ 100 –∑–∞–ø—Ä–æ—Å–æ–≤ –º–æ–¥–µ–ª—å—é-—É—á–∏—Ç–µ–ª–µ–º:", total_time_teacher_model)

–û–±—â–µ–µ –≤—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ 100 –∑–∞–ø—Ä–æ—Å–æ–≤ –º–æ–¥–µ–ª—å—é-—É—á–∏—Ç–µ–ª–µ–º: 8.364979028701782


In [29]:
from tqdm import tqdm

# –ü–û–î–°–ß–ï–¢ –ú–ï–¢–†–ò–ö
data_test_X = data['test']['text'][::50]
data_test_y = data['test']['label'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i, **tokenizer_kwargs)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:22<00:00, 22.60it/s]


{'accuracy': 0.924}

In [31]:
pipe = pipeline("text-classification", model="./student_model", tokenizer="distilbert-base-uncased")
#WARMUP
for _ in range(10):
  _ = pipe(sample_input,  **tokenizer_kwargs)

#INFERENCE
start = time.time()
for _ in range(100):
  _ = pipe(sample_input, **tokenizer_kwargs)
total_time_student_model = time.time()-start

print("–û–±—â–µ–µ –≤—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ 100 –∑–∞–ø—Ä–æ—Å–æ–≤ –º–æ–¥–µ–ª—å—é-—Å—Ç—É–¥–µ–Ω—Ç–æ–º:", total_time_student_model)

–û–±—â–µ–µ –≤—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ 100 –∑–∞–ø—Ä–æ—Å–æ–≤ –º–æ–¥–µ–ª—å—é-—Å—Ç—É–¥–µ–Ω—Ç–æ–º: 7.468947172164917


In [32]:
decrease_in_time = (total_time_teacher_model - total_time_student_model) / total_time_teacher_model

print(f'–ú–æ–¥–µ–ª—å —Å—Ç—É–¥–µ–Ω—Ç–∞ –∫–ª–∞—Å—Å–∏—Ñ–∏—Ü–∏—Ä—É–µ—Ç –±—ã—Å—Ç—Ä–µ–µ –Ω–∞ {decrease_in_time*100:.2f} %')

–ú–æ–¥–µ–ª—å —Å—Ç—É–¥–µ–Ω—Ç–∞ –∫–ª–∞—Å—Å–∏—Ñ–∏—Ü–∏—Ä—É–µ—Ç –±—ã—Å—Ç—Ä–µ–µ –Ω–∞ 10.71 %


In [33]:
# –ü–û–î–°–ß–ï–¢ –ú–ï–¢–†–ò–ö
data_test_X = data['test']['text'][::50]
data_test_y = data['test']['label'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i, **tokenizer_kwargs)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

  0%|          | 0/500 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:21<00:00, 22.94it/s]


{'accuracy': 0.94}

In [34]:
# Raw student-model
pipe = pipeline("text-classification", model="./raw_student_model", tokenizer="distilbert-base-uncased")

# –ü–û–î–°–ß–ï–¢ –ú–ï–¢–†–ò–ö
data_test_X = data['test']['text'][::50]
data_test_y = data['test']['label'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i, **tokenizer_kwargs)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:22<00:00, 21.92it/s]


{'accuracy': 0.478}

In [35]:
model_preds

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
