In [1]:
!pip install -q transformers==4.12.2
!pip install -q datasets



In [72]:
import transformers
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

from datasets import load_dataset
from functools import partial
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from textwrap import wrap
from torch import nn
import os

In [20]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

In [21]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 8, 6

### Загрузка данных

Сразу будем загружать не все данные, а только часть, чтобы ускорить процесс обучения и пожалеть бедный колаб

In [22]:
raw_datasets = load_dataset('imdb', split={'train': 'train[:1500]+train[-1500:]', 'test': 'test[:500]+test[-500:]', 'validation': 'test[500:1000]+test[-1000:-500]'})

  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

Токенизируем данные

In [24]:
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type

In [25]:
def preprocess_function(examples, tokenizer):
    result = tokenizer(examples["text"], max_length=256, padding='max_length', truncation=True)
    result["label"] = examples["label"]
    return result

In [26]:
tokenized_datasets = raw_datasets.map(
    partial(preprocess_function, tokenizer=tokenizer),
    batched=True,
    desc="Running tokenizer on dataset"
)

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [28]:
num_labels = raw_datasets['train'].to_pandas()['label'].nunique()
num_labels

2

In [29]:
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

Опишем функцию для подсчета метрик, которые хотим увидеть при оценке модели:

In [30]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Модель 1: SentimentClassifier

Простой классификатор навещенный поферх BERT'а как с семинара, но с небольшим изменением для того, чтобы пихнуть в Trainer

In [31]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
    def forward(self, input_ids, attention_mask, labels=None):
        last_hidden_state, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False)
       
        logits = self.out(self.drop(pooled_output))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.n_classes), labels.view(-1))

        output = (logits,)
        return ((loss,) + output) if loss is not None else output

In [32]:
model = SentimentClassifier(num_labels)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type

In [33]:
# заморозим некоторые слои
freeze_layers = list(range(5))
for layer_id in freeze_layers:
    for param in list(model.bert.encoder.layer[layer_id].parameters()):
        param.requires_grad = False

Зададим параметры обучения с помощью TrainingArguments:

In [34]:
training_args = TrainingArguments(
    output_dir='./results/sentiment-classifier',
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Тренируем

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)

train_results = trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `SentimentClassifier.forward` and have been ignored: token_type_ids, text.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5039,0.686726,0.733,0.780608,0.662483,0.95
2,0.3583,0.49491,0.827,0.834766,0.798903,0.874


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifier.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results/sentiment-classifier/checkpoint-375/tokenizer_config.json
Special tokens file saved in ./results/sentiment-classifier/checkpoint-375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifier.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./res

In [36]:
pd.DataFrame([train_results.metrics]).T

Unnamed: 0,0
train_runtime,207.1713
train_samples_per_second,28.962
train_steps_per_second,3.62
total_flos,0.0
train_loss,0.491637
epoch,2.0


Посмотрим качество на тесте

In [37]:
test_results = trainer.predict(test_dataset=tokenized_datasets['test'])

The following columns in the test set  don't have a corresponding argument in `SentimentClassifier.forward` and have been ignored: token_type_ids, text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [38]:
pd.DataFrame([test_results.metrics]).T

Unnamed: 0,0
test_loss,0.419586
test_accuracy,0.844
test_f1,0.848837
test_precision,0.823308
test_recall,0.876
test_runtime,11.0579
test_samples_per_second,90.433
test_steps_per_second,11.304


## Модель 2: SentimentClassifier with CLS

Тот же классификатор, но с добавлением эмбеддинга [CLS] токена с последнего слоя, который мы добавляем путем конкатенации (можно и среднее, конечно, но я рещила что так будет лучше)

In [39]:
class SentimentClassifierCLS(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size*2, n_classes)
  
    def forward(self, input_ids, attention_mask, labels=None):
        last_hidden_state, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False)
       

        cls = last_hidden_state[:,0,:]
        stacked_layers = torch.hstack([cls, pooled_output])

        logits = self.out(self.drop(stacked_layers))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.n_classes), labels.view(-1))

        output = (logits,)
        return ((loss,) + output) if loss is not None else output

In [40]:
model = SentimentClassifierCLS(num_labels)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type

In [41]:
# заморозим некоторые слои
freeze_layers = list(range(5))
for layer_id in freeze_layers:
  for param in list(model.bert.encoder.layer[layer_id].parameters()):
    param.requires_grad = False

Зададим параметры обучения с помощью TrainingArguments:

In [42]:
training_args = TrainingArguments(
    output_dir='./results/sentiment-classifier_cls',
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Тренируем

In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)

train_results = trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `SentimentClassifierCLS.forward` and have been ignored: token_type_ids, text.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5048,0.761136,0.688,0.758514,0.618687,0.98
2,0.3038,0.471862,0.83,0.840226,0.792553,0.894


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifierCLS.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier_cls/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results/sentiment-classifier_cls/checkpoint-375/tokenizer_config.json
Special tokens file saved in ./results/sentiment-classifier_cls/checkpoint-375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifierCLS.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier_cls/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer conf

In [44]:
pd.DataFrame([train_results.metrics]).T

Unnamed: 0,0
train_runtime,206.7153
train_samples_per_second,29.025
train_steps_per_second,3.628
total_flos,0.0
train_loss,0.492177
epoch,2.0


Посмотрим на качество на тесте

In [45]:
test_results = trainer.predict(test_dataset=tokenized_datasets['test'])

The following columns in the test set  don't have a corresponding argument in `SentimentClassifierCLS.forward` and have been ignored: token_type_ids, text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [46]:
pd.DataFrame([test_results.metrics]).T

Unnamed: 0,0
test_loss,0.415283
test_accuracy,0.849
test_f1,0.856872
test_precision,0.814414
test_recall,0.904
test_runtime,10.5801
test_samples_per_second,94.517
test_steps_per_second,11.815


## Модель 3: BertForSequenceClassification

Предобученный BERT для классификации

In [47]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type

Зададим параметры обучения с помощью TrainingArguments:

In [48]:
training_args = TrainingArguments(
    output_dir='./results/bert-for-sequence-classification',
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Тренируем

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)

train_results = trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4889,0.601487,0.711,0.766747,0.64276,0.95
2,0.3413,0.572957,0.822,0.829175,0.797048,0.864


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/bert-for-sequence-classification/checkpoint-375
Configuration saved in ./results/bert-for-sequence-classification/checkpoint-375/config.json
Model weights saved in ./results/bert-for-sequence-classification/checkpoint-375/pytorch_model.bin
tokenizer config file saved in ./results/bert-for-sequence-classification/checkpoint-375/tokenizer_config.json
Special tokens file saved in ./results/bert-for-sequence-classification/checkpoint-375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./resu

In [50]:
pd.DataFrame([train_results.metrics]).T

Unnamed: 0,0
train_runtime,234.8511
train_samples_per_second,25.548
train_steps_per_second,3.194
total_flos,789333200000000.0
train_loss,0.5226143
epoch,2.0


Смотрим результаты на тесте

In [51]:
test_results = trainer.predict(test_dataset=tokenized_datasets['test'])

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [52]:
pd.DataFrame([test_results.metrics]).T

Unnamed: 0,0
test_loss,0.553569
test_accuracy,0.815
test_f1,0.820563
test_precision,0.79661
test_recall,0.846
test_runtime,10.7053
test_samples_per_second,93.411
test_steps_per_second,11.676


## *Модель 4: SentimentClassifier with CLS tokens from all layers

Наш классификатор как модель 2, только берем [CLS] токен не с посленего слоя, а среднее по всем слоям 

In [53]:
class SentimentClassifierPooledCLS(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size*2, n_classes)
  
    def forward(self, input_ids, attention_mask, labels=None):
        _, pooled_output, hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
            output_hidden_states=True)
       
        hidden_states = torch.stack(hidden_states)
        hidden_cls = hidden_states[:,:,1,:]
        hidden_cls = hidden_cls.mean(axis=0)

        stacked_layers = torch.hstack([hidden_cls, pooled_output])

        logits = self.out(self.drop(stacked_layers))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.n_classes), labels.view(-1))

        output = (logits,)
        return ((loss,) + output) if loss is not None else output

In [54]:
model = SentimentClassifierPooledCLS(num_labels)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type

In [55]:
# заморозим некоторые слои
freeze_layers = list(range(5))
for layer_id in freeze_layers:
    for param in list(model.bert.encoder.layer[layer_id].parameters()):
        param.requires_grad = False

Зададим параметры обучения:

In [56]:
training_args = TrainingArguments(
    output_dir='./results/sentiment-classifier_pooled_cls',
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Тренируем

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)

train_results = trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `SentimentClassifierPooledCLS.forward` and have been ignored: token_type_ids, text.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5089,0.617051,0.722,0.770248,0.656338,0.932
2,0.3859,0.532918,0.811,0.820171,0.782214,0.862


The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifierPooledCLS.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier_pooled_cls/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results/sentiment-classifier_pooled_cls/checkpoint-375/tokenizer_config.json
Special tokens file saved in ./results/sentiment-classifier_pooled_cls/checkpoint-375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `SentimentClassifierPooledCLS.forward` and have been ignored: token_type_ids, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/sentiment-classifier_pooled_cls/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state di

In [58]:
pd.DataFrame([train_results.metrics]).T

Unnamed: 0,0
train_runtime,208.3822
train_samples_per_second,28.793
train_steps_per_second,3.599
total_flos,0.0
train_loss,0.516331
epoch,2.0


Смотрим результаты на тесте

In [59]:
test_results = trainer.predict(test_dataset=tokenized_datasets['test'])

The following columns in the test set  don't have a corresponding argument in `SentimentClassifierPooledCLS.forward` and have been ignored: token_type_ids, text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [60]:
pd.DataFrame([test_results.metrics]).T

Unnamed: 0,0
test_loss,0.440089
test_accuracy,0.822
test_f1,0.832075
test_precision,0.7875
test_recall,0.882
test_runtime,10.5767
test_samples_per_second,94.547
test_steps_per_second,11.818


## Итоги?
Как видим, в целом, все варианты моделей добиваются высокого качества на данных.

Лучше всего себя показала модель `SentimentClassifierCLS`, которая использует конкатенацию эмбеддинга CLS токена с выходом из пулинга, что говорит о том, что CLS токен сам по себе аггрегирует полезную для классификации информацию. Интересно, что модель, использующая эмбеддинги CLS токена со всех слоев выдает качество хуже. Потенциально на других слоях CLS токен аггрегирует информацию не о (скажем так) семантике, которая важна дя определения тональности, а чем-то еще (ср. работы по пробингу, которые говорят о том что модель собирает поверхностную информацию на начальных слоях, синтаксическую на средних и семантическую на верхних), отсюда слои скорее сбивают классификатор. 

## 5. Тестируем модель на отзывах GooglePlay
Чтобы не заморачиваться с загрузкой модели, давайте посмотрим на последнюю

In [63]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.4.0.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.4.0-py3-none-any.whl size=14775 sha256=3f8771858133a96b0aacfeab0d30c57620b16a6ef93e629f845ba0770aec5b28
  Stored in directory: /root/.cache/pip/wheels/fb/c3/0e/c4d8ff8bfcb0461afff199471449f642179b74968c15b7a69c
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.4.0


In [64]:
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

Downloading...
From: https://drive.google.com/uc?id=1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
To: /kaggle/working/reviews.csv
100%|███████████████████████████████████████| 7.17M/7.17M [00:00<00:00, 212MB/s]


In [65]:
df = pd.read_csv("reviews.csv")

In [66]:
def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else: 
        return 2

df['sentiment'] = df.score.apply(to_sentiment)

In [67]:
class_names = ['negative', 'neutral', 'positive']

Рандомно выберем по одному отзыву на класс

In [68]:
neg = df[df.sentiment==0].sample(1)['content'].to_list()
neut = df[df.sentiment==1].sample(1)['content'].to_list()
pos = df[df.sentiment==2].sample(1)['content'].to_list()

Берем модельку

In [69]:
model = trainer.model.to('cpu')

Проверяем:

In [73]:
tokenized = tokenizer(neg, return_tensors='pt')
logits = model(tokenized['input_ids'], tokenized['attention_mask'])[0]
pred = torch.argmax(F.softmax(logits, dim=1))
print('Text:', neg[0])
print(f'True label:    0\nPred label:    {pred}')

Text: Not a movable app
True label:    0
Pred label:    0


In [74]:
tokenized = tokenizer(neut, return_tensors='pt')
logits = model(tokenized['input_ids'], tokenized['attention_mask'])[0]
pred = torch.argmax(F.softmax(logits, dim=1))
print('Text:', neut[0])
print(f'True label:    1\nPred label:    {pred}')

Text: It alright
True label:    1
Pred label:    0


In [75]:
tokenized = tokenizer(pos, return_tensors='pt')
logits = model(tokenized['input_ids'], tokenized['attention_mask'])[0]
pred = torch.argmax(F.softmax(logits, dim=1))
print('Text:', pos[0])
print(f'True label:    2\nPred label:    {pred}')

Text: This is a great organizational tool. It literally combines all my relevant tasks in one place.
True label:    2
Pred label:    1


**NB:** модель тренировалась на 2 класса, а данные гугла рассчитаны на 3, так что не совсем правильно использовать эту модель для этих данных, но раз уж такое задание

В остальном видим, что модель хорошо справдяется и на этих данных:

- дает отрицательную полярность для негативного отзыва
- дает положительную полярность для положительного отзыва
- дает положительную полярность для нейтрального отзыва, с положительной оценкой