# Info

Дообучение (fine-tuning) модели BERT для задачи классификации текстов.

# Settings

In [None]:
# --- Input locations ---
# Root working directory; every other path below is derived from it.
GDRIVE_DIR = r'/content/drive/MyDrive/DS/20230314_ke-intern-test/'
DATASET_DIR = f'{GDRIVE_DIR}dataset/'

# .npz archives with the train / validation arrays ('input_ids', 'labels')
TRAIN_NPZ = f'{GDRIVE_DIR}tokens_rubert_train.npz'
VAL_NPZ = f'{GDRIVE_DIR}tokens_rubert_val.npz'

# --- Pretrained checkpoint that gets fine-tuned ---
BERT_MODEL_NAME = 'DeepPavlov/rubert-base-cased-sentence'

# --- Destination for saved fine-tuned models ---
MODELS_DIR = f'{GDRIVE_DIR}models/'

# --- Seed handed to the training arguments ---
SEED = 1

# Init

## Installation

In [None]:
!pip install -q transformers

## Imports

In [None]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import torch

## Definitions

In [None]:
# Select the compute device and report it: CUDA when PyTorch detects a GPU,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('CPU')

GPU: Tesla T4


In [None]:
#@title  { form-width: "1px", display-mode: "form" }
#@markdown ```python
#@markdown class Dataset(inputs)
#@markdown ```

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a dict of indexable containers.

    `inputs` maps a field name to a container; sample `idx` is a dict with
    the same keys holding the `idx`-th element of every container.
    """

    def __init__(self, inputs):
        """Store the dict of fields; it must contain the key 'input_ids'."""
        self.inputs = inputs

    def __len__(self):
        """Number of samples, taken from the length of the 'input_ids' field."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return one sample as ``{field_name: field_values[idx]}``."""
        sample = {}
        for field_name, field_values in self.inputs.items():
            sample[field_name] = field_values[idx]
        return sample

# Main

## Loading data

In [None]:
# Open the .npz archives for the training and validation splits.
train_npz, val_npz = np.load(TRAIN_NPZ), np.load(VAL_NPZ)

# Extract the 'input_ids' and 'labels' arrays from each archive.
input_ids_train, labels_train = (train_npz[key] for key in ('input_ids', 'labels'))
input_ids_val, labels_val = (val_npz[key] for key in ('input_ids', 'labels'))

# Cell output: shapes of the four arrays.
tuple(arr.shape for arr in (input_ids_train, labels_train, input_ids_val, labels_val))

((81120, 300), (81120,), (10000, 300), (10000,))

In [None]:
# Create pytorch datasets

# Token id treated as padding when building the attention mask.
# NOTE(review): assumed to match the tokenizer's pad token id — confirm.
PAD_TOKEN_ID = 0


def _build_inputs(input_ids, labels, pad_token_id=PAD_TOKEN_ID):
    """Convert NumPy token ids and labels into the tensor dict fed to the model.

    Args:
        input_ids: NumPy array of token ids.
        labels: NumPy array of class labels.
        pad_token_id: token id whose positions get a 0 in the attention mask.

    Returns:
        dict with keys 'input_ids' (int32 tensor), 'attention_mask'
        (uint8 tensor, 1 where the token differs from `pad_token_id`)
        and 'labels' (int64 tensor).
    """
    return {
        'input_ids': torch.tensor(input_ids.astype(np.int32)),
        'attention_mask': torch.tensor(input_ids != pad_token_id, dtype=torch.uint8),
        # int64 is required here to convert to torch.long dtype
        'labels': torch.tensor(labels.astype(np.int64)),
    }


# Both splits go through the same conversion so their dtypes cannot drift apart.
train_inputs = _build_inputs(input_ids_train, labels_train)
val_inputs = _build_inputs(input_ids_val, labels_val)

train_ds = Dataset(train_inputs)
val_ds = Dataset(val_inputs)

# Example of training sample
train_ds[0]

{'input_ids': tensor([   101,  94934,  31091,  46754,  35127,  48675,  43485,    869,  61248,
          33460,  28221,    192,  39362,  31694,  35633,   6301,  54119,  68524,
            814,    106,  79588,  32145,    869,  16337,  54384,   3187,  29697,
           1703,  82941,  31231,   1706,   1766,  36260,   7993,    114,  72792,
            132,  83057,   7471,    851,  19998,   2630,  14269,  24737,  60689,
            869,  16337,  54384,   3187,   2068,  34035,   2748,  27339,    128,
           4427,  11992,   2190,  39843,    851,  89585,  35260,  21953,    132,
            100,  52837,  14444, 112072,   9450,   1469,  10189,  63154,   3521,
          16729,  25377,  38156,    128,   1997,  13231,    875,   3660,   6818,
           7462,  38741,    866,  16729,    132,   7638,  10271,   3998,   5022,
          24856,  89769,    128,   3622,  22571,  45628,   3247,   1516,  45051,
            132,   7638,  56861,    128,  13717,  24935,   1516,  46758,    128,
          27519

In [None]:
# Number of target classes, assuming labels are integers numbered from 0.
# ndarray.max() is vectorized (builtin max() iterates the array element by
# element in Python), and int(...) turns the NumPy scalar into a plain Python
# int before adding 1, so the addition cannot wrap around in a narrow dtype.
num_labels = int(labels_train.max()) + 1
num_labels  # number of classes for the classification task

845

## Language model

In [None]:
import transformers
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import TrainingArguments, Trainer

In [None]:
# Load the pretrained checkpoint with a classification head for `num_labels`
# classes, then move the model to the selected device.
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=num_labels,
)
model = model.to(device)

# Tokenizer loaded from the same checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# One [name, shape] row per named parameter tensor of the model.
rows = [[name, list(tensor.shape)] for name, tensor in model.named_parameters()]

layers = pd.DataFrame(rows, columns=['layer_name', 'layer_shape'])
# Element count of each tensor = product of its dimensions.
layers['layer_size'] = layers['layer_shape'].map(np.prod)
layers  # list of the model's layers

Unnamed: 0,layer_name,layer_shape,layer_size
0,bert.embeddings.word_embeddings.weight,"[119547, 768]",91812096
1,bert.embeddings.position_embeddings.weight,"[512, 768]",393216
2,bert.embeddings.token_type_embeddings.weight,"[2, 768]",1536
3,bert.embeddings.LayerNorm.weight,[768],768
4,bert.embeddings.LayerNorm.bias,[768],768
...,...,...,...
196,bert.encoder.layer.11.output.LayerNorm.bias,[768],768
197,bert.pooler.dense.weight,"[768, 768]",589824
198,bert.pooler.dense.bias,[768],768
199,classifier.weight,"[845, 768]",648960


## Training

In [None]:
from sklearn.metrics import f1_score

In [None]:
def compute_metrics(pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the single key 'F1'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'F1': f1_score(pred.label_ids, predicted_classes, average='weighted')}

In [None]:
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',          # output directory
    logging_dir='./logs',            # directory for the logs

    # Optimisation settings
    num_train_epochs=3,              # number of training epochs
    learning_rate=1e-5,              # learning rate
    weight_decay=0.01,               # weight decay
    per_device_train_batch_size=8,   # per-device batch size during training
    per_device_eval_batch_size=8,    # per-device batch size during validation

    # Validate, log and save once per epoch (a fixed step count is also possible)
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,     # whether to load the best model after training

    # Reproducibility
    seed=SEED,
)

In [None]:
# Wire model, data, arguments and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configured Trainer; the returned object is kept so
# it can be displayed in the next cell.
train_output = trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,3.2205,1.900797,0.6314
2,1.5229,1.28108,0.737314
3,1.1232,1.147934,0.765561


In [None]:
# Display what trainer.train() returned (global step, training loss, run metrics).
train_output

TrainOutput(global_step=30420, training_loss=1.9555304833888068, metrics={'train_runtime': 15738.3222, 'train_samples_per_second': 15.463, 'train_steps_per_second': 1.933, 'total_flos': 3.7801964113056e+16, 'train_loss': 1.9555304833888068, 'epoch': 3.0})

In [None]:
# Batch size selection:

# Batch = 16
#  Epoch	Training Loss	Validation Loss	F1
#  1	6.235500	5.874439	0.025794
#  2	5.684100	5.628022	0.038680

# Batch = 8
#  Epoch	Training Loss	Validation Loss	F1
#  1	5.976200	5.492195	0.054494
#  2	5.258400	5.249204	0.084588

In [None]:
## https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/28
## Release GPU memory:
# import gc
# globals().pop('model', None)
# gc.collect()
# torch.cuda.empty_cache()
# !nvidia-smi -q -d memory

## Save model

In [None]:
# Local directory for the fine-tuned model. The numeric suffix matches the
# validation F1 reported for the last epoch (0.765561) and is hard-coded.
# NOTE(review): update it by hand after retraining.
model_name = 'fine-tune-bert_0_765561'
# Write the model files and then the tokenizer files into that directory; the
# tokenizer call is last so its returned file paths are shown as the cell output.
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

('fine-tune-bert_0_765561/tokenizer_config.json',
 'fine-tune-bert_0_765561/special_tokens_map.json',
 'fine-tune-bert_0_765561/vocab.txt',
 'fine-tune-bert_0_765561/added_tokens.json',
 'fine-tune-bert_0_765561/tokenizer.json')

In [None]:
# Copy the saved model directory into MODELS_DIR, then list MODELS_DIR.
# NOTE(review): presumably requires MODELS_DIR to exist already and the paths
# to contain no spaces (they are interpolated unquoted) — confirm.
!cp -r {model_name} {MODELS_DIR + model_name}
!ls -l {MODELS_DIR}

total 8
drwx------ 2 root root 4096 Mar 17 14:08 fine-tune-bert-0_084588
drwx------ 2 root root 4096 Mar 17 19:36 fine-tune-bert_0_765561


In [None]:
# Copy the local `logs` directory to 'logs/20230317-1' under GDRIVE_DIR.
# NOTE(review): the run folder name is hard-coded — change it for a new run.
!cp -r logs {GDRIVE_DIR + 'logs/20230317-1'}