# Задание
Возьмите данные отзывов о фильмах сайта Кинопоиск ру из соревнования https://www.kaggle.com/mikhailklemin/kinopoisks-movies-reviews 

Обучите реккурентную нейронную сеть с различными вариантами embeddig:
* собственный embedding
* word2vec
* GLOVE

после чего дообучите нейронную сеть Bert на
* 10%
* 20%
* 50% 
обучающих примеров

Попробуйте добиться точности выше, чем у рекуррентной нейронной сети.

## Import

In [1]:
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import yaml
from catalyst.utils import set_global_seed, prepare_cudnn
from transformers import AutoTokenizer, AutoConfig, AutoModel
from catalyst.callbacks.metrics.accuracy import AccuracyCallback
from catalyst.dl import (
    CheckpointCallback,
    OptimizerCallback,
    SchedulerCallback,
    SupervisedRunner,
)

def get_project_root() -> Path:
    return Path("").parent.parent

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class_names = ["Negative", "Neutral", "Positive"]
class_names_converter = {
    'neg': 'Negative',
    'pos': 'Positive',
    'neu': 'Neutral',
}

def name_to_id(name):
    return class_names.index(name)

In [3]:
MODEL_NAME = 'cointegrated/rubert-tiny2'
BATCH_SIZE = 16 # 32 не влезает в 10GB VRAM
SEED = 123
NUM_CLASSES = 3
LR = 3e-5

### Класс для токенизации

In [4]:
class TextClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = None,
        model_name: str = None,
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # no easily handle unknown target values
            self.label_dict = dict(zip(sorted(set(labels)), range(len(set(labels)))))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        self.sep_label = self.tokenizer.special_tokens_map['sep_token']
        self.sep_vid = self.tokenizer.vocab[self.sep_label]
        self.cls_label = self.tokenizer.special_tokens_map['cls_token']
        self.cls_vid = self.tokenizer.vocab[self.cls_label]
        self.pad_label = self.tokenizer.special_tokens_map['pad_token']
        self.pad_vid = self.tokenizer.vocab[self.pad_label]

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """
        # encoding the text
        x = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus(
            text=x,
            ##text_pair=text_pair, 
            ##text_target=text_target, 
            ##text_pair_target=text_pair_target, 
            add_special_tokens=True,
            #  Pad to a maximum length specified with the argument max_length
            #  or to the maximum acceptable input length for the model if that argument is not provided.
            padding="max_length", 
            # Truncate to a maximum length specified with the argument max_length 
            # or to the maximum acceptable input length for the model if that argument is not provided. 
            # This will truncate token by token, 
            # removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided.
            truncation=True,
            # Controls the maximum length to use by one of the truncation/padding parameters.
            max_length=self.max_seq_length, 
            # return pytorch tensor
            return_tensors="pt", 
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        # for Catalyst, there needs to be a key called features
        output_dict["features"] = output_dict["input_ids"].squeeze(0)
        del output_dict["input_ids"]

        output_dict["token_type_ids"] = output_dict["token_type_ids"].squeeze(0)

        # encoding target
        if self.labels is not None:
            y = self.labels[index]
            y_encoded = torch.Tensor([self.label_dict.get(y, -1)]).long().squeeze(0)
            output_dict["targets"] = y_encoded
            
        return output_dict

In [5]:
def read_data(path_name) -> Tuple[dict, dict]:
    """
    A custom function that reads data from CSV files, creates PyTorch datasets and
    data loaders. The output is provided to be easily used with Catalyst

    :param params: a dictionary read from the config.yml file
    :return: a tuple with 2 dictionaries
    """
    # reading CSV files to Pandas dataframes
    train_df = pd.read_csv(f'Kinopoisk_train_{path_name}%.csv')
    test_df = pd.read_csv(f'Kinopoisk_eval_{path_name}%.csv')

    # creating PyTorch Datasets
    
    train_dataset = TextClassificationDataset(
        texts=train_df['review'].values.tolist(),
        labels=train_df['sentiment'].values.tolist(),
        max_seq_length=256,
        model_name=MODEL_NAME,
    )

    test_dataset = TextClassificationDataset(
        texts=test_df['review'].values.tolist(),
        labels=test_df['sentiment'].values.tolist(),
        max_seq_length=256,
        model_name=MODEL_NAME,
    )

    set_global_seed(SEED)

    # creating PyTorch data loaders and placing them in dictionaries (for Catalyst)
    train_val_loaders = {
        "train": DataLoader(
            dataset=train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
        ),
        "valid": DataLoader(
            dataset=test_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
        ),
    }

    del train_df
    del test_df
    del train_dataset
    del test_dataset

    return train_val_loaders

### Класс для обучения

In [8]:
class BertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(
        self, 
        pretrained_model_name: str, 
        num_classes: int = None, 
        dropout: float = 0.3
    ):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
        """
        super().__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.dropout = nn.Dropout(dropout)

        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, **kwargs):
        attention_mask = kwargs['attention_mask']
        features = kwargs['features']
        token_type_ids = kwargs['token_type_ids']
        head_mask = None
        """Compute class probabilities for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size ([bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class scores
        """
        assert attention_mask is not None, "attention mask is none"

        # taking BERTModel output
        # see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
        bert_output = self.model(
            input_ids=features, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids, 
            head_mask=head_mask)
        # we only need the hidden state here and don't need
        # transformer output, so index 0

        seq_output = bert_output[0]  # (bs, seq_len, dim)
        del bert_output, features, attention_mask, token_type_ids, head_mask
        # mean pooling, i.e. getting average representation of all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)
        del pooled_output

        return scores

## Pre-Convert Datasets

In [None]:
dataset_path = Path('../datasets/kinopoisk/')

def standardize_text(df, content_field):
    df[content_field] = df[content_field].str.replace(r"http\S+", "")
    df[content_field] = df[content_field].str.replace(r"@\S+", "")
    df[content_field] = df[content_field].str.replace(
        r"[^А-Яа-яA-Za-z0-9Ёё(),!?@\'\`\"\_\n]", " ")
    df[content_field] = df[content_field].str.replace(r"[Ёё]", "е")
    df[content_field] = df[content_field].str.replace(r"[\t\n]", "")
    df[content_field] = df[content_field].str.replace(r"[^А-Яа-яa-zA-Z]", " ")
    df[content_field] = df[content_field].str.lower()
    return df

In [16]:
for perc in [0.1, 0.2, 0.5]:
    df = pd.DataFrame(columns=['review', 'sentiment'])
    
    for class_path in dataset_path.iterdir():
        if class_path.is_dir():
            dirs = np.array(list(class_path.iterdir()))
            np.random.shuffle(dirs)
            rews_fhs = np.random.choice(dirs, round(len(dirs)*perc))
            print(f'len = {rews_fhs.shape}')
            print(class_names_converter[class_path.name])
            for rew_fh in rews_fhs:
                with open(Path(rew_fh), encoding='utf-8') as f:
                    review = f.read()
                    current_df = pd.DataFrame(
                        {'review': [review], 'sentiment': class_names_converter[class_path.name]})
                    df = pd.concat([df, current_df], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)

    df = standardize_text(df, "review")
    df['sentiment'] = df['sentiment'].map(name_to_id)

    train_dataset, eval_dataset = train_test_split(df, test_size = 0.2)

    train_dataset.to_csv(f'Kinopoisk_train_{perc:.0%}.csv')
    eval_dataset.to_csv(f'Kinopoisk_eval_{perc:.0%}.csv')

len = (1983,)
Negative
len = (2470,)
Neutral
len = (8714,)
Positive


  df[content_field] = df[content_field].str.replace(r"http\S+", "")
  df[content_field] = df[content_field].str.replace(r"@\S+", "")
  df[content_field] = df[content_field].str.replace(
  df[content_field] = df[content_field].str.replace(r"[Ёё]", "е")
  df[content_field] = df[content_field].str.replace(r"[\t\n]", "")
  df[content_field] = df[content_field].str.replace(r"[^А-Яа-яa-zA-Z]", " ")


len = (3965,)
Negative
len = (4941,)
Neutral
len = (17428,)
Positive


  df[content_field] = df[content_field].str.replace(r"http\S+", "")
  df[content_field] = df[content_field].str.replace(r"@\S+", "")
  df[content_field] = df[content_field].str.replace(
  df[content_field] = df[content_field].str.replace(r"[Ёё]", "е")
  df[content_field] = df[content_field].str.replace(r"[\t\n]", "")
  df[content_field] = df[content_field].str.replace(r"[^А-Яа-яa-zA-Z]", " ")


len = (9914,)
Negative
len = (12352,)
Neutral
len = (43569,)
Positive


  df[content_field] = df[content_field].str.replace(r"http\S+", "")
  df[content_field] = df[content_field].str.replace(r"@\S+", "")
  df[content_field] = df[content_field].str.replace(
  df[content_field] = df[content_field].str.replace(r"[Ёё]", "е")
  df[content_field] = df[content_field].str.replace(r"[\t\n]", "")
  df[content_field] = df[content_field].str.replace(r"[^А-Яа-яa-zA-Z]", " ")


## Train

In [9]:
num_epochs = [1, 2, 4]
model_names = ['10', '20', '50']
runner = []
for num_epochs, model_name in zip(num_epochs, model_names):
    # загружаем датасет нужного размера
    train_val_loaders = read_data(model_name)

    # загружаем предтренированную модель из HF
    model = BertForSequenceClassification(
        pretrained_model_name=MODEL_NAME,
        num_classes=NUM_CLASSES,
    )

    param_optimizer = list(model.model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    # обнуляем сид
    set_global_seed(SEED)
    prepare_cudnn(deterministic=True)

    runner = SupervisedRunner(input_key=("features", "attention_mask", "token_type_ids"))
    # тренировка
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=train_val_loaders,
        callbacks=[
            AccuracyCallback(num_classes=NUM_CLASSES, input_key="logits", target_key="targets"),
            OptimizerCallback(accumulation_steps=4, metric_key="loss"),
            SchedulerCallback(loader_key="valid", metric_key="loss"),
            CheckpointCallback(logdir="logdir", loader_key="valid", metric_key="loss", minimize=True),
        ],
        logdir="logdir",
        num_epochs=num_epochs,
        verbose=True,
    )
    torch.cuda.empty_cache()
    torch.save(model, f'model_{model_name}%.pt')
    del model, runner, criterion, optimizer, scheduler, train_val_loaders

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1/1 * Epoch (train): 100%|██████████| 659/659 [00:30<00:00, 21.86it/s, accuracy01=0.600, loss=0.637, lr=3.000e-05, 

train (1/1) accuracy01: 0.6559384790771108 | accuracy01/std: 0.12504121922786524 | loss: 0.8617919843714994 | loss/mean: 0.8617919843714994 | loss/std: 0.16782431436785505 | lr: 3e-05 | momentum: 0.9


1/1 * Epoch (valid): 100%|██████████| 165/165 [00:05<00:00, 31.80it/s, accuracy01=0.200, loss=1.368, lr=3.000e-05, momentum=0.900]


valid (1/1) accuracy01: 0.6636294609072898 | accuracy01/std: 0.12052975597766279 | loss: 0.7986804979175172 | loss/mean: 0.7986804979175172 | loss/std: 0.13749004082896044 | lr: 3e-05 | momentum: 0.9
* Epoch (1/1) lr: 3e-05 | momentum: 0.9
Top models:
logdir/model.0001.pth	0.7987


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1/2 * Epoch (train): 100%|██████████| 1317/1317 [00:57<00:00, 22.75it/s, accuracy01=0.455, loss=1.181, lr=3.000e-05

train (1/2) accuracy01: 0.6727583424383646 | accuracy01/std: 0.12344518444846372 | loss: 0.7989554702853663 | loss/mean: 0.7989554702853663 | loss/std: 0.18282527998282225 | lr: 3e-05 | momentum: 0.9


1/2 * Epoch (valid): 100%|██████████| 330/330 [00:12<00:00, 27.19it/s, accuracy01=1.000, loss=0.219, lr=3.000e-05, momentum=0.900]


valid (1/2) accuracy01: 0.7102715018036834 | accuracy01/std: 0.11763777609222056 | loss: 0.6934037586334241 | loss/mean: 0.6934037586334241 | loss/std: 0.192717224524619 | lr: 3e-05 | momentum: 0.9
* Epoch (1/2) lr: 3e-05 | momentum: 0.9


2/2 * Epoch (train): 100%|██████████| 1317/1317 [00:59<00:00, 22.22it/s, accuracy01=0.727, loss=0.575, lr=3.000e-05, momentum=0.900]


train (2/2) accuracy01: 0.7273935539107812 | accuracy01/std: 0.11401886684541465 | loss: 0.6564741386987897 | loss/mean: 0.6564741386987897 | loss/std: 0.18795131919977795 | lr: 3e-05 | momentum: 0.9


2/2 * Epoch (valid): 100%|██████████| 330/330 [00:12<00:00, 27.06it/s, accuracy01=1.000, loss=0.272, lr=3.000e-05, momentum=0.900]


valid (2/2) accuracy01: 0.728498196316689 | accuracy01/std: 0.11195397154947952 | loss: 0.6516227998293529 | loss/mean: 0.6516227998293529 | loss/std: 0.19285006019220513 | lr: 3e-05 | momentum: 0.9
* Epoch (2/2) lr: 3e-05 | momentum: 0.9
Top models:
logdir/model.0002.pth	0.6516


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1/4 * Epoch (train): 100%|██████████| 3292/3292 [02:29<00:00, 22.03it/s, accuracy01=0.667, loss=0.691, lr=3.000e-05

train (1/4) accuracy01: 0.7011278195534 | accuracy01/std: 0.11991643035608987 | loss: 0.7255106063641287 | loss/mean: 0.7255106063641287 | loss/std: 0.19090411759039225 | lr: 3e-05 | momentum: 0.9


1/4 * Epoch (valid): 100%|██████████| 823/823 [00:34<00:00, 23.91it/s, accuracy01=0.667, loss=0.731, lr=3.000e-05, momentum=0.900]


valid (1/4) accuracy01: 0.7310700995137868 | accuracy01/std: 0.11241682628514992 | loss: 0.6453891860159463 | loss/mean: 0.6453891860159463 | loss/std: 0.20229306745918227 | lr: 3e-05 | momentum: 0.9
* Epoch (1/4) lr: 3e-05 | momentum: 0.9


2/4 * Epoch (train): 100%|██████████| 3292/3292 [02:38<00:00, 20.71it/s, accuracy01=0.583, loss=0.750, lr=3.000e-05, momentum=0.900]


train (2/4) accuracy01: 0.7581453634175741 | accuracy01/std: 0.10454674585165724 | loss: 0.5854744199034438 | loss/mean: 0.5854744199034438 | loss/std: 0.179694417392816 | lr: 3e-05 | momentum: 0.9


2/4 * Epoch (valid): 100%|██████████| 823/823 [00:34<00:00, 23.56it/s, accuracy01=0.867, loss=0.596, lr=3.000e-05, momentum=0.900]


valid (2/4) accuracy01: 0.7445127972213321 | accuracy01/std: 0.10689175895339627 | loss: 0.6159893971301804 | loss/mean: 0.6159893971301804 | loss/std: 0.18552633861610127 | lr: 3e-05 | momentum: 0.9
* Epoch (2/4) lr: 3e-05 | momentum: 0.9


3/4 * Epoch (train): 100%|██████████| 3292/3292 [02:38<00:00, 20.73it/s, accuracy01=0.833, loss=0.364, lr=3.000e-05, momentum=0.900]


train (3/4) accuracy01: 0.8116123642529952 | accuracy01/std: 0.0985156907378575 | loss: 0.4693079384532854 | loss/mean: 0.4693079384532854 | loss/std: 0.1781956542713888 | lr: 3e-05 | momentum: 0.9


3/4 * Epoch (valid): 100%|██████████| 823/823 [00:34<00:00, 23.71it/s, accuracy01=0.667, loss=0.873, lr=3.000e-05, momentum=0.900]


valid (3/4) accuracy01: 0.7652464494796095 | accuracy01/std: 0.10977876813860052 | loss: 0.6358669336024836 | loss/mean: 0.6358669336024836 | loss/std: 0.2897278886732773 | lr: 3e-05 | momentum: 0.9
* Epoch (3/4) lr: 3e-05 | momentum: 0.9


4/4 * Epoch (train): 100%|██████████| 3292/3292 [02:47<00:00, 19.70it/s, accuracy01=0.750, loss=0.431, lr=3.000e-05, momentum=0.900]


train (4/4) accuracy01: 0.8713450292397652 | accuracy01/std: 0.08454109581111163 | loss: 0.33952026477257163 | loss/mean: 0.33952026477257163 | loss/std: 0.16731665275732086 | lr: 3e-05 | momentum: 0.9


4/4 * Epoch (valid): 100%|██████████| 823/823 [00:35<00:00, 23.13it/s, accuracy01=0.667, loss=0.879, lr=3.000e-05, momentum=0.900]


valid (4/4) accuracy01: 0.788790157233844 | accuracy01/std: 0.10160383827029872 | loss: 0.6363295485410774 | loss/mean: 0.6363295485410774 | loss/std: 0.3250218898072485 | lr: 3e-05 | momentum: 0.9
* Epoch (4/4) lr: 3e-05 | momentum: 0.9
Top models:
logdir/model.0002.pth	0.6160


## Eval

In [10]:
for model_name in model_names:
    # and running inference
    torch.cuda.empty_cache()

    # read and process data
    train_val_loaders = read_data(model_name)   

    runner = SupervisedRunner(input_key=("features", "attention_mask", "token_type_ids"))
    runner.model = torch.load(f"model_{model_name}%.pt")
    print(f"=== Model: {f'model_{model_name}%.pt'}")
    # getting validation metrics
    metrics = runner.evaluate_loader(
        loader=train_val_loaders["valid"],
        callbacks=[AccuracyCallback(input_key="logits", target_key="targets")],
    )
    print(metrics)
    print()

=== Model: model_10%.pt
valid (1/1) accuracy01: 0.6636294609072898 | accuracy01/std: 0.12052975597766279
* Epoch (1/1) 
defaultdict(None, {'accuracy01': 0.6636294609072898, 'accuracy01/std': 0.12052975597766279})

=== Model: model_20%.pt
valid (1/1) accuracy01: 0.728498196316689 | accuracy01/std: 0.11195397154947952
* Epoch (1/1) 
defaultdict(None, {'accuracy01': 0.728498196316689, 'accuracy01/std': 0.11195397154947952})

=== Model: model_50%.pt
valid (1/1) accuracy01: 0.788790157233844 | accuracy01/std: 0.10160383827029872
* Epoch (1/1) 
defaultdict(None, {'accuracy01': 0.788790157233844, 'accuracy01/std': 0.10160383827029872})



Результат модели LSTM, созданной ранее в ДЗ (34 LSTM, GRU) показал точность в 73% на тестовой выборке.  

Fine-tune BERT с предобучением и токенизатором от ruBERT   в 79% при обучении на 50% датасете. 

Обучение на 20% размере датасета показывает такую же точность модели как и LTSM.