# Import

In [1]:
# 🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗

In [2]:
!pip install transformers



In [3]:
! pip install datasets



In [4]:
import transformers
from transformers import BertModel, AutoTokenizer, BertTokenizer, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from torch import nn, optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

HF_HUB_MODEL = 'prajjwal1/bert-medium'

# 0. Dataset Preparation

Сделаем шаффл датасета, чтобы у нас были примеры разных классов (вроде как в датасете они идут по поряку). На всякий случай укажем сид, но мы будем использовать они и те же данные для всех моделей

In [5]:
from datasets import load_dataset
imdb_dataset = load_dataset('imdb')

small_train_dataset = imdb_dataset["train"].shuffle(seed=42).select(range(5000))
small_test_dataset = imdb_dataset["test"].shuffle(seed=42).select(range(1000))

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Разделим датасет

In [6]:
train_texts = small_train_dataset["text"]
train_labels = small_train_dataset["label"]
test_texts = small_test_dataset["text"][:1000]
test_labels = small_test_dataset["label"][:1000]

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(HF_HUB_MODEL)

Downloading:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Токенизируем датасет используя padding и truncation, так же укажем max_len для последовательности. Наша модель работает с последовательностями дины 512

In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_token_type_ids=False, return_attention_mask=True, max_length=512)
val_encodings = tokenizer(val_texts,truncation=True, padding=True, return_token_type_ids=False, return_attention_mask=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_token_type_ids=False, return_attention_mask=True, max_length=512)

In [9]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

## Metric

Будем использовать binary f-score так как у нас бинарная классификация

In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"

## Training arguments

Добавим evaluation_strategy = steps, каждые 100 шагов мы будем делать валидацию. Тут также содержатся все гиперпараметры, которые будут использоваться для всех моделей

In [12]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    report_to=None,
    evaluation_strategy = 'steps'
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# 1. Basic SentimentClassifier 

## Model

Немного изменим класс SentimentClassifier чтобы засунуть его в Trainer (раньше он не умел работать с labels которые требуются для Trainer)

In [13]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(HF_HUB_MODEL)
        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward( # сорс код из BertModel
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=False,
    ):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            return_dict=False)
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.out(pooled_output)
        
        loss = None # сорс код из BertModel, нужно было сделать кастомный лосс
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
        output = (logits,)
        return ((loss,) + output) if loss is not None else output

## Train

In [14]:
model1 = SentimentClassifier(2) 
model1 = model1.to(device)

Downloading:   0%|          | 0.00/159M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-medium were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
trainer = Trainer(
    model=model1,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics = compute_metrics    # metrics to evaluate
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6937,0.617841,0.681,0.728511,0.639761,0.84585
200,0.5116,0.472773,0.785,0.813206,0.725581,0.924901
300,0.4057,0.416051,0.845,0.833154,0.914894,0.764822
400,0.4178,0.418789,0.841,0.845781,0.830476,0.86166
500,0.3916,0.365908,0.84,0.828326,0.906103,0.762846
600,0.3552,0.616473,0.799,0.829517,0.726597,0.966403
700,0.3116,0.409561,0.874,0.876953,0.866795,0.887352
800,0.2687,0.329048,0.882,0.884314,0.877432,0.891304
900,0.2506,0.359364,0.886,0.887795,0.884314,0.891304
1000,0.2709,0.371197,0.886,0.888672,0.878378,0.899209


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share 

TrainOutput(global_step=1000, training_loss=0.38774579620361327, metrics={'train_runtime': 258.0585, 'train_samples_per_second': 31.001, 'train_steps_per_second': 3.875, 'total_flos': 0.0, 'train_loss': 0.38774579620361327, 'epoch': 2.0})

## Test

In [16]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'test_loss': 0.3712855279445648,
 'test_accuracy': 0.89,
 'test_f1': 0.8877551020408164,
 'test_precision': 0.8841463414634146,
 'test_recall': 0.8913934426229508,
 'test_runtime': 6.8724,
 'test_samples_per_second': 145.509,
 'test_steps_per_second': 9.167,
 'epoch': 2.0}

# 2. SentimentClassifier with CLS token

## Model

В предыдущий класс добавим CLS токен. CLS токен это первый токен из last_hidden_state

In [17]:
class CLSSentimentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(HF_HUB_MODEL)
        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.out = nn.Linear(self.bert.config.hidden_size + 512, n_classes) # так как добавляем CLS токен, надо расширить размерносить выхода на hidden_size
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=False,
    ):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            return_dict=False)
        

        CLS_token = outputs[0][:,0,:] # CLS это первый токен last_hidden_state
        pooled_output = self.linear(self.dropout(outputs[1])) 
        stacked = torch.hstack([CLS_token, pooled_output]) # склеим CLS token и pooled output

        logits = self.out(stacked)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
        output = (logits,)
        return ((loss,) + output) if loss is not None else output

## Train

In [18]:
model2 = CLSSentimentClassifier(2) 
model2 = model2.to(device)

loading configuration file https://huggingface.co/prajjwal1/bert-medium/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/288b0ee1e79a7c3fe770ab8a84ece013c573e7d226ccb5d9ffad317b3419faac.4344f82f77799c092b30b2e0d3749c809f82df14c5993e43dbbdc52f5a0d86e0
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/prajjwal1/bert-medium/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/dabb6f3bc29449f038f41cb09eb1a693eee2

In [19]:
trainer = Trainer(
    model=model2,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics = compute_metrics    # metrics to evaluate
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6855,0.585503,0.725,0.744186,0.702988,0.790514
200,0.4851,0.441349,0.824,0.839416,0.779661,0.909091
300,0.4101,0.431481,0.835,0.844193,0.808318,0.883399
400,0.443,0.433221,0.851,0.859301,0.822785,0.899209
500,0.4039,0.32726,0.858,0.859684,0.859684,0.859684
600,0.3141,0.424729,0.851,0.860878,0.815929,0.911067
700,0.3429,0.387718,0.867,0.865521,0.886128,0.84585
800,0.256,0.334733,0.877,0.873065,0.913607,0.835968
900,0.2552,0.405889,0.885,0.888889,0.869565,0.909091
1000,0.2632,0.395804,0.888,0.888668,0.894,0.883399


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share 

TrainOutput(global_step=1000, training_loss=0.38589393615722656, metrics={'train_runtime': 259.7948, 'train_samples_per_second': 30.794, 'train_steps_per_second': 3.849, 'total_flos': 0.0, 'train_loss': 0.38589393615722656, 'epoch': 2.0})

## Test

In [20]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'test_loss': 0.3970975875854492,
 'test_accuracy': 0.882,
 'test_f1': 0.8773388773388774,
 'test_precision': 0.890295358649789,
 'test_recall': 0.8647540983606558,
 'test_runtime': 6.9012,
 'test_samples_per_second': 144.902,
 'test_steps_per_second': 9.129,
 'epoch': 2.0}

# 3. Transformers - BertForSequenceClassification

## Model

Тут ничего сложного: возьмем BertForSequenceClassification и загрузим предобученную модель с Huggingface

In [21]:
from transformers import  BertForSequenceClassification

## Train

In [22]:
model3 = BertForSequenceClassification.from_pretrained(HF_HUB_MODEL)
model3 = model3.to(device)

loading configuration file https://huggingface.co/prajjwal1/bert-medium/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/288b0ee1e79a7c3fe770ab8a84ece013c573e7d226ccb5d9ffad317b3419faac.4344f82f77799c092b30b2e0d3749c809f82df14c5993e43dbbdc52f5a0d86e0
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/prajjwal1/bert-medium/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/dabb6f3bc29449f038f41cb09eb1a693eee2

In [23]:
trainer = Trainer(
    model=model3,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics = compute_metrics    # metrics to evaluate
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6793,0.618526,0.735,0.749764,0.717902,0.784585
200,0.4977,0.349631,0.861,0.862784,0.861933,0.863636
300,0.3943,0.403675,0.84,0.851577,0.802448,0.907115
400,0.3797,0.389311,0.85,0.855769,0.833333,0.879447
500,0.3928,0.374978,0.844,0.832976,0.908879,0.768775
600,0.348,0.434627,0.857,0.871287,0.8,0.956522
700,0.3275,0.421971,0.855,0.84492,0.920746,0.780632
800,0.2929,0.295128,0.891,0.893451,0.883946,0.903162
900,0.2298,0.322991,0.89,0.894636,0.86803,0.922925
1000,0.2625,0.324668,0.895,0.897361,0.887814,0.907115


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.jso

TrainOutput(global_step=1000, training_loss=0.3804571762084961, metrics={'train_runtime': 258.609, 'train_samples_per_second': 30.935, 'train_steps_per_second': 3.867, 'total_flos': 626289328128000.0, 'train_loss': 0.3804571762084961, 'epoch': 2.0})

# Test

In [24]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'test_loss': 0.38930660486221313,
 'test_accuracy': 0.878,
 'test_f1': 0.8747433264887063,
 'test_precision': 0.8765432098765432,
 'test_recall': 0.8729508196721312,
 'test_runtime': 6.8438,
 'test_samples_per_second': 146.118,
 'test_steps_per_second': 9.205,
 'epoch': 2.0}

# 4. SentimentClassifier with aggregated CLS-tokens from layers

## Model

Для того чтобы аггрегировать CLS-токены для нескольких слоев, сделаем mean pooling по слою hidden_state

In [25]:
class AGGCLSSentimentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(HF_HUB_MODEL)
        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.out = nn.Linear(self.bert.config.hidden_size + 512, n_classes) 
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=False,
    ):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            output_hidden_states=True,
                            return_dict=False)
        
        output_hidden_states = outputs[2]
        
        hidden_states = torch.stack(output_hidden_states) # берем слои hidden_state слои
        
        CLS_tokens = torch.mean(hidden_states[:, :, 0], 0) # делаем mean pooling
        pooled_output = self.linear(self.dropout(outputs[1]))
        stacked = torch.hstack([CLS_tokens, pooled_output]) # соединяем mean pooling и pooled output
    

        logits = self.out(stacked)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
        output = (logits,)
        return ((loss,) + output) if loss is not None else output

## Train

In [26]:
model4 = AGGCLSSentimentClassifier(2) 
model4 = model4.to(device)

loading configuration file https://huggingface.co/prajjwal1/bert-medium/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/288b0ee1e79a7c3fe770ab8a84ece013c573e7d226ccb5d9ffad317b3419faac.4344f82f77799c092b30b2e0d3749c809f82df14c5993e43dbbdc52f5a0d86e0
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/prajjwal1/bert-medium/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/dabb6f3bc29449f038f41cb09eb1a693eee2

In [27]:
trainer = Trainer(
    model=model4,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics = compute_metrics    # metrics to evaluate
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.7072,0.647586,0.679,0.658147,0.713626,0.610672
200,0.5325,0.388303,0.839,0.842002,0.836257,0.847826
300,0.4007,0.432419,0.849,0.850643,0.851485,0.849802
400,0.4306,0.525616,0.831,0.848158,0.777595,0.932806
500,0.4,0.34781,0.861,0.857436,0.891258,0.826087
600,0.3612,0.434164,0.847,0.863026,0.788871,0.952569
700,0.3118,0.376082,0.881,0.875393,0.930958,0.826087
800,0.2702,0.336148,0.885,0.886251,0.887129,0.885375
900,0.2621,0.369545,0.89,0.892368,0.883721,0.901186
1000,0.2657,0.353564,0.895,0.896755,0.892368,0.901186


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share 

TrainOutput(global_step=1000, training_loss=0.39420107650756836, metrics={'train_runtime': 260.0432, 'train_samples_per_second': 30.764, 'train_steps_per_second': 3.846, 'total_flos': 0.0, 'train_loss': 0.39420107650756836, 'epoch': 2.0})

## Test

In [28]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'test_loss': 0.3741048574447632,
 'test_accuracy': 0.885,
 'test_f1': 0.8827726809378185,
 'test_precision': 0.8782961460446247,
 'test_recall': 0.8872950819672131,
 'test_runtime': 6.9107,
 'test_samples_per_second': 144.704,
 'test_steps_per_second': 9.116,
 'epoch': 2.0}

# 5. Test on 3 comments

Посмотрим на F-score:

SentimentClassifier: 0.8877

SentimentClassifier + CLS: 0.8773

BertForSequenceClassification: 0.8747

SentimentClassifier + multiple CLS: 0.8827

Лучше всего себя показала модель SentimentClassifier, добавление CLS сделало только хуже. SentimentClassifier + multiple CLS показывают примерно одинаковый результат, обе хороши

Далее буду использовать BertForSequenceClassification чтобы посмотреть, как работают готовые решения с Huggingface




## Spider-Man: No Way Home 5 stars

In [29]:
review = """
Best cinematic experience I've ever had. Cried. Cheered. Cried some more. it's got everything for a Spider-Man fan.
"""

In [30]:
encoding = tokenizer.encode_plus(
  review,
  max_length=512,
  add_special_tokens=True, 
  return_token_type_ids=True,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt',  
  truncation=True
).to(device)

In [31]:
model3(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])[0].cpu().detach().numpy().argmax()

1

1 - класс положительного отзыва, все правильно

## The Wolf of Wall Street 1 star

In [32]:
review = """
Worst movie ever I couldn't get more than 1/3 of the way through the movie before having to turn it off. 
Content was horrible, exploiting everybody, foul and DiCaprio not great. It's not even worth 1 star.
"""

In [33]:
encoding = tokenizer.encode_plus(
  review,
  max_length=512,
  add_special_tokens=True, 
  return_token_type_ids=True,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt',  
  truncation=True
).to(device)

In [34]:
model3(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])[0].cpu().detach().numpy().argmax()

0

0 - класс негативного отзыв, тут тоже все верно

## The Matrix Resurrections 3 star

In [35]:
review = """
The trilogy was amazing, but this one falls flat. Was not as excited as the others. Yes, it was romantic, but as a Matrix movie, I expected more action.
"""

In [36]:
encoding = tokenizer.encode_plus(
  review,
  max_length=512,
  add_special_tokens=True, 
  return_token_type_ids=True,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt',  
  truncation=True
).to(device)

In [37]:
model3(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])[0].cpu().detach().numpy().argmax()

0

Тут интересно было посмотреть на пограничный случай в 3 звезды - нейтральный класс, которого нет при обучении. Модель решила, что этотт отзыв скорее негативный

В принципе, модель работает как и предполагалось