# Data load & prepare libs

In [1]:
!pip install catboost -q
!pip install transformers -q
!pip install datasets -q

In [2]:
# Загружаем библиотеки
import numpy as np
import pandas as pd
import itertools
import random
import re
from tqdm import notebook
import time
import itertools


# Модели и инструменты
from scipy.special import softmax
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LassoLars, RidgeClassifier
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostClassifier, CatBoostRegressor

# Нейронки
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
# Проверяем доступность GPU
torch.cuda.is_available()

True

In [4]:
device = torch.device("cuda")

In [5]:
df_train = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")

In [6]:
df_train.head(5)

Unnamed: 0.1,Unnamed: 0,text,rate
0,0,Story of a man who has unnatural feelings for ...,3
1,1,Robert DeNiro plays the most unbelievably inte...,1
2,2,"I saw the capsule comment said ""great acting.""...",1
3,3,If I had not read Pat Barker's 'Union Street' ...,4
4,4,This fanciful horror flick has Vincent Price p...,4


In [7]:
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False

SEED = 42
seed_all(SEED)

In [8]:
df_val, df_test = train_test_split(df_test, test_size=.5, random_state=SEED)

train_text = df_train['text'].values
y_train = df_train['rate'].values

val_text = df_val['text'].values
y_val = df_val['rate'].values

test_text = df_test['text'].values
y_test = df_test['rate'].values

In [9]:
def reg_cls_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    bin_true = np.where(y_true > 5, 1, 0)
    bin_pred = np.where(y_pred > 5, 1, 0)
    acc = accuracy_score(bin_true, bin_pred)
    return mse, mae, acc

def print_metrics(y_true, y_pred):
    mse, mae, acc = reg_cls_metrics(y_true, y_pred)
    print(f'mse: {mse}\nmae: {mae}\nacc: {acc}')

# Pertained -> regressor/cls

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def namestr(obj):
    namespace = globals()
    return [name for name in namespace if namespace[name] is obj][0]
    
def get_embedings(model, tokenizer, df_corpus, batch_size=100):

    start_time = time.time()
    tokenized = df_corpus['text'].apply(lambda x: tokenizer.encode(x, 
                                                        max_length=512,
                                                        padding='max_length',
                                                        truncation=True,
                                                        add_special_tokens=True))
    tokenized = np.array(list(tokenized))
    attention_mask = np.where(tokenized != 0, 1, 0)
    print(f'Tokenized time for {namestr(df_corpus)}: {time.time() - start_time:.2f}s')

    embeddings = []
    for i in notebook.tqdm(range(tokenized.shape[0] // batch_size)):
        slice_top = batch_size*(i+1)
        if slice_top == tokenized.shape[0] // batch_size * batch_size:
            slice_top = tokenized.shape[0]
        batch = torch.LongTensor(tokenized[batch_size*i:slice_top])
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:slice_top])
        batch = batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        batch_embeddings = batch_embeddings[0][:,0,:].to('cpu')
        embeddings.append(batch_embeddings.numpy())
    return np.concatenate(embeddings)

In [None]:
train_features = get_embedings(model, tokenizer, df_train, batch_size=100)
val_features = get_embedings(model, tokenizer, df_val, batch_size=100)

Tokenized time for df_train: 47.72s


  0%|          | 0/250 [00:00<?, ?it/s]

Tokenized time for df_val: 20.67s


  0%|          | 0/125 [00:00<?, ?it/s]

In [None]:
ls_result = []

linreg = LinearRegression().fit(train_features, y_train)
pred = linreg.predict(val_features)
print('LinearRegression\n' + '-'*20)
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['LinearRegression', mse, mae, acc])

LinearRegression
--------------------
mse: 4.8134902063989635
mae: 1.73754529296875
acc: 0.86424


In [None]:
res_score = np.inf
for alpha in notebook.tqdm(np.logspace(-6, 2, num=20)):
    model = Lasso(alpha=alpha, random_state=SEED).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
lasso = Lasso(alpha=alpha_best, random_state=SEED).fit(train_features, y_train)
pred = lasso.predict(val_features)
print(f'Lasso\nalpha: {alpha_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['Lasso', mse, mae, acc])

  0%|          | 0/20 [00:00<?, ?it/s]

Lasso
alpha: 0.00012742749857031334
--------------------
mse: 4.791671351353627
mae: 1.7358503594398498
acc: 0.8648


In [None]:
res_score = np.inf
for alpha in notebook.tqdm(np.logspace(-6, 2, num=100)):
    model = Ridge(alpha=alpha, random_state=SEED).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
ridge = Ridge(alpha=alpha_best, random_state=SEED).fit(train_features, y_train)
pred = ridge.predict(val_features)
print(f'Ridge\nalpha: {alpha_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['Ridge', mse, mae, acc])

  0%|          | 0/100 [00:00<?, ?it/s]

Ridge
alpha: 6.135907273413176
--------------------
mse: 4.776531610744256
mae: 1.734499118461609
acc: 0.86704


In [None]:
model = CatBoostRegressor(verbose=False, 
                        random_state=SEED)
model.fit(train_features, y_train, eval_set=(val_features, y_val))
pred = model.predict(val_features)
print(f'CatBoostRegressor\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['CatBoostRegressor', mse, mae, acc])

CatBoostRegressor
--------------------
mse: 4.81536900686281
mae: 1.6962333202259037
acc: 0.85592


In [None]:
res_score = np.inf
for n_neighbors in notebook.tqdm(range(2, 13)):
    model = KNeighborsRegressor(n_neighbors=n_neighbors).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        n_neighbors_best = n_neighbors
neigh = KNeighborsRegressor(n_neighbors=n_neighbors_best).fit(train_features, y_train)
pred = neigh.predict(val_features)
print(f'KNeighborsRegressor\nn_neighbors_best: {n_neighbors_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['KNeighborsRegressor', mse, mae, acc])

  0%|          | 0/11 [00:00<?, ?it/s]

KNeighborsRegressor
n_neighbors_best: 12
--------------------
mse: 6.872704444444445
mae: 2.0732266666666668
acc: 0.79024


In [None]:
params_list = list(itertools.product(np.logspace(-6, 1, num=10),
                                     np.linspace(0.1, 0.9, num=9)))
res_score = np.inf
for alpha, l1_ratio in notebook.tqdm(params_list):
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=SEED).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
        l1_ratio_best = l1_ratio
elastic = ElasticNet(alpha=alpha_best, l1_ratio=l1_ratio_best, random_state=SEED).fit(train_features, y_train)
pred = elastic.predict(val_features)
print(f'ElasticNet\nalpha: {alpha_best}\nl1_ratio: {l1_ratio_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['ElasticNet', mse, mae, acc])

  0%|          | 0/90 [00:00<?, ?it/s]

ElasticNet
alpha: 0.00021544346900318845\l1_ratio: 0.1
--------------------
mse: 4.776438493778181
mae: 1.7341281745529176
acc: 0.8668


In [None]:
params_list = list(itertools.product(np.logspace(-6, 1, num=10),
                                     [True, False]))
res_score = np.inf
for alpha, normalize in notebook.tqdm(params_list):
    model = LassoLars(alpha=alpha, normalize=normalize, random_state=SEED).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
        normalize_best = normalize
elastic = LassoLars(alpha=alpha_best, normalize=normalize_best, random_state=SEED).fit(train_features, y_train)
pred = elastic.predict(val_features)
print(f'LassoLars\nalpha: {alpha_best}\nnormalize: {normalize_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['LassoLars', mse, mae, acc])

  0%|          | 0/20 [00:00<?, ?it/s]

LassoLars
alpha: 1e-06
ormalize: False
--------------------
mse: 4.87488632791581
mae: 1.7598282445812226
acc: 0.86536


In [None]:
res_score = np.inf
for C in notebook.tqdm(np.logspace(-6, 2, num=20)):
    model = LinearSVR(C=C, dual=False, loss='squared_epsilon_insensitive', random_state=SEED).fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        C_best = C
lin_svm = LinearSVR(C=C_best, dual=False, loss='squared_epsilon_insensitive', random_state=SEED).fit(train_features, y_train)
pred = lin_svm.predict(val_features)
print(f'LinearSVR\nC: {C_best}\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['LinearSVR', mse, mae, acc])

  0%|          | 0/20 [00:00<?, ?it/s]

LinearSVR
C: 0.11288378916846883
--------------------
mse: 4.77960358947158
mae: 1.7343398406819766
acc: 0.86648


In [None]:
svm = SVR().fit(train_features, y_train)
pred = svm.predict(val_features)
print(f'SVR\n{"-"*20}')
print_metrics(y_val, pred)
mse, mae, acc = reg_cls_metrics(y_val, pred)
ls_result.append(['SVR', mse, mae, acc])

SVR
--------------------
mse: 5.015660732741902
mae: 1.768242034085415
acc: 0.86304


In [None]:
y_val_bin = np.where(y_val > 5, 1, 0)
y_train_bin = np.where(y_train > 5, 1, 0)
res_score = np.inf
for alpha in notebook.tqdm(np.logspace(-6, 2, num=100)):
    model = RidgeClassifier(alpha=alpha, random_state=SEED).fit(train_features, y_train_bin)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val_bin, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
ridge = RidgeClassifier(alpha=alpha_best, random_state=SEED).fit(train_features, y_train_bin)
pred_bin = ridge.predict(val_features)
acc = accuracy_score(y_val_bin, pred_bin)
print(f'RidgeClassifier\nalpha: {alpha_best}\n{"-"*20}\nacc: {acc}')


  0%|          | 0/100 [00:00<?, ?it/s]

RidgeClassifier
alpha: 6.135907273413176
--------------------
acc: 0.87392


In [None]:
pd.DataFrame(ls_result, columns=['model', 'mse', 'mae', 'acc'])

Unnamed: 0,model,mse,mae,acc
0,LinearRegression,4.81349,1.737545,0.86424
1,Lasso,4.791671,1.73585,0.8648
2,Ridge,4.776532,1.734499,0.86704
3,CatBoostRegressor,4.815369,1.696233,0.85592
4,KNeighborsRegressor,6.872704,2.073227,0.79024
5,ElasticNet,4.776438,1.734128,0.8668
6,LassoLars,4.874886,1.759828,0.86536
7,LinearSVR,4.779604,1.73434,0.86648
8,SVR,5.015661,1.768242,0.86304


Best result shows Ridge regressor, separate model for binary classify shows better results

In [None]:
res_score = np.inf
for alpha in notebook.tqdm(np.logspace(-6, 3, num=100)):
    model = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha=alpha, random_state=SEED))])
    model.fit(train_features, y_train)
    pred = model.predict(val_features)
    score = mean_squared_error(y_val, pred)
    if res_score > score:
        res_score = score
        alpha_best = alpha
ridge = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha=alpha_best, random_state=SEED))])
ridge.fit(train_features, y_train)
pred = ridge.predict(val_features)
print(f'Ridge (scale) \nalpha: {alpha_best}\n{"-"*20}')
print_metrics(y_val, pred)

  0%|          | 0/100 [00:00<?, ?it/s]

Ridge (scale) 
alpha: 533.6699231206302
--------------------
mse: 4.780092177648449
mae: 1.7353514581680298
acc: 0.86704


# Fine tune (classification)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [None]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

In [None]:
labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])

In [None]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)
    return {'mse': mse, 'mae': mae, 'acc': acc, 'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w}

In [None]:
training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = 5, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay =0.01, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = 1e-5, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

In [None]:
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics = compute_metrics)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3910
  Number of trainable parameters = 66959624


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.5211,1.36127,4.1196,1.15768,0.92088,2.890535,1.168956,0.91936
2,1.3024,1.315891,3.32888,1.04424,0.92792,2.615739,1.098636,0.9256
3,1.2087,1.288532,3.3324,1.01768,0.93104,2.521636,1.0429,0.92784
4,1.1394,1.287568,3.2852,1.00392,0.93224,2.495398,1.019696,0.92952
5,1.0925,1.289701,3.23304,0.99544,0.9324,2.506436,1.015866,0.9304


***** Running Evaluation *****
  Num examples = 12500
  Batch size = 32
Saving model checkpoint to /content/distilbert-base-uncased-cls/checkpoint-782
Configuration saved in /content/distilbert-base-uncased-cls/checkpoint-782/config.json
Model weights saved in /content/distilbert-base-uncased-cls/checkpoint-782/pytorch_model.bin
tokenizer config file saved in /content/distilbert-base-uncased-cls/checkpoint-782/tokenizer_config.json
Special tokens file saved in /content/distilbert-base-uncased-cls/checkpoint-782/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 12500
  Batch size = 32
Saving model checkpoint to /content/distilbert-base-uncased-cls/checkpoint-1564
Configuration saved in /content/distilbert-base-uncased-cls/checkpoint-1564/config.json
Model weights saved in /content/distilbert-base-uncased-cls/checkpoint-1564/pytorch_model.bin
tokenizer config file saved in /content/distilbert-base-uncased-cls/checkpoint-1564/tokenizer_config.json
Special tokens file

TrainOutput(global_step=3910, training_loss=1.2528266457950368, metrics={'train_runtime': 6633.5877, 'train_samples_per_second': 18.843, 'train_steps_per_second': 0.589, 'total_flos': 1.6560196608e+16, 'train_loss': 1.2528266457950368, 'epoch': 5.0})

# Fine tune (regression)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

In [None]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["label"] = torch.tensor([self.labels[idx]]).to(torch.float32)
        return item

    def __len__(self):
        return len(self.labels)
    
train_dataset = Data(tokens_train, y_train)
val_dataset = Data(tokens_val, y_val)

In [None]:
def compute_metrics(pred):
    labels = np.round(pred.label_ids.flatten())
    preds = pred.predictions.flatten()
    mse, mae, acc = reg_cls_metrics(labels, preds)
    return {'mse': mse, 'mae': mae, 'acc': acc}

In [None]:
training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls-reg', #Выходной каталог
    num_train_epochs = 5, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay =0.01, #Понижение весов
    logging_dir = '/content/logs-reg', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = 1e-5, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

In [None]:
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
trainer = RegressionTrainer(model=model,
                            tokenizer = tokenizer,
                            args = training_args,
                            train_dataset = train_dataset,
                            eval_dataset = val_dataset,
                            compute_metrics = compute_metrics)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3910
  Number of trainable parameters = 66954241


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc
1,13.1514,12.166946,12.15139,3.281101,0.50464
2,12.0629,12.140621,12.12794,3.279782,0.50464
3,12.0581,12.140239,12.127345,3.279831,0.50464
4,12.0592,12.141824,12.12615,3.280005,0.50464
5,12.0516,12.140584,12.123675,3.279674,0.50464


***** Running Evaluation *****
  Num examples = 12500
  Batch size = 32
Saving model checkpoint to /content/distilbert-base-uncased-cls-reg/checkpoint-782
Configuration saved in /content/distilbert-base-uncased-cls-reg/checkpoint-782/config.json
Model weights saved in /content/distilbert-base-uncased-cls-reg/checkpoint-782/pytorch_model.bin
tokenizer config file saved in /content/distilbert-base-uncased-cls-reg/checkpoint-782/tokenizer_config.json
Special tokens file saved in /content/distilbert-base-uncased-cls-reg/checkpoint-782/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 12500
  Batch size = 32
Saving model checkpoint to /content/distilbert-base-uncased-cls-reg/checkpoint-1564
Configuration saved in /content/distilbert-base-uncased-cls-reg/checkpoint-1564/config.json
Model weights saved in /content/distilbert-base-uncased-cls-reg/checkpoint-1564/pytorch_model.bin
tokenizer config file saved in /content/distilbert-base-uncased-cls-reg/checkpoint-1564/token

TrainOutput(global_step=3910, training_loss=12.276623691256393, metrics={'train_runtime': 7227.5044, 'train_samples_per_second': 17.295, 'train_steps_per_second': 0.541, 'total_flos': 1.6558129536e+16, 'train_loss': 12.276623691256393, 'epoch': 5.0})

# Fine tuning final model

In [None]:
EPOCHS = 5
LR = 1e-6
WD = 0.01

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)
    return {'mse': mse, 'mae': mae, 'acc': acc, 'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_dataset,
                eval_dataset = val_dataset,
                compute_metrics = compute_metrics)



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.9349,1.719828,8.36816,1.80944,0.87416,6.322771,2.219115,0.8568
2,1.639,1.559535,6.27728,1.51568,0.88792,4.306636,1.657795,0.88632
3,1.5377,1.496121,5.37384,1.37784,0.8968,3.817031,1.519077,0.8908
4,1.4956,1.469862,5.16592,1.34112,0.9004,3.621419,1.451766,0.896


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.9349,1.719828,8.36816,1.80944,0.87416,6.322771,2.219115,0.8568
2,1.639,1.559535,6.27728,1.51568,0.88792,4.306636,1.657795,0.88632
3,1.5377,1.496121,5.37384,1.37784,0.8968,3.817031,1.519077,0.8908
4,1.4956,1.469862,5.16592,1.34112,0.9004,3.621419,1.451766,0.896
5,1.4787,1.462712,5.14176,1.33536,0.90192,3.567168,1.430436,0.89592


TrainOutput(global_step=3910, training_loss=1.6171887800211797, metrics={'train_runtime': 7025.0071, 'train_samples_per_second': 17.794, 'train_steps_per_second': 0.557, 'total_flos': 1.6560196608e+16, 'train_loss': 1.6171887800211797, 'epoch': 5.0})

In [None]:
EPOCHS = 5
LR = 1e-4
WD = 0.01

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)
    return {'mse': mse, 'mae': mae, 'acc': acc, 'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_dataset,
                eval_dataset = val_dataset,
                compute_metrics = compute_metrics)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.4761,1.339078,4.33952,1.16464,0.9088,3.065161,1.189497,0.92032
2,1.1485,1.287695,3.43056,1.03216,0.92128,2.738172,1.057487,0.9228
3,0.8899,1.372527,3.13168,0.99552,0.92632,2.680583,1.029467,0.92304


KeyboardInterrupt: ignored

In [None]:
EPOCHS = 5
LR = 1e-5
WD = 0.001

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)
    return {'mse': mse, 'mae': mae, 'acc': acc, 'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_dataset,
                eval_dataset = val_dataset,
                compute_metrics = compute_metrics)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.521,1.360652,4.1168,1.15648,0.92088,2.886591,1.168523,0.91904
2,1.3021,1.315197,3.33624,1.04424,0.92776,2.615301,1.097297,0.92544


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W
1,1.521,1.360652,4.1168,1.15648,0.92088,2.886591,1.168523,0.91904
2,1.3021,1.315197,3.33624,1.04424,0.92776,2.615301,1.097297,0.92544
3,1.2083,1.288982,3.32472,1.01656,0.93112,2.524922,1.043496,0.92784


In [None]:
EPOCHS = 15
LR = 1e-5
WD = 0.001

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)

    preds_avg = np.array([(i+j)/2 for i, j in zip(preds, preds_w)])
    mse_avg, mae_avg, acc_avg = reg_cls_metrics(labels, preds_avg)

    mse_w_int, mae_w_int, _ = reg_cls_metrics(labels, preds_w.round())
    mse_avg_int, mae_avg_int, _ = reg_cls_metrics(labels, preds_avg.round())

    return {'mse': mse, 'mae': mae, 'acc': acc, 
            'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w,
            'mse_avg': mse_avg, 'mae_avg': mae_avg, 'acc_avg': acc_avg, 
            'mse_w_int': mse_w_int, 'mae_w_int': mae_w_int,
            'mse_avg_int': mse_avg_int, 'mae_avg_int': mae_avg_int}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_dataset,
                eval_dataset = val_dataset,
                compute_metrics = compute_metrics)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W,Mse Avg,Mae Avg,Acc Avg,Mse W Int,Mae W Int,Mse Avg Int,Mae Avg Int
1,1.5194,1.360474,4.16408,1.16152,0.92008,2.896933,1.169314,0.91856,3.265112,1.138517,0.92016,3.00256,1.16896,3.34664,1.07704
2,1.295,1.309393,3.25656,1.03192,0.92912,2.591414,1.083827,0.92752,2.772669,1.046022,0.92912,2.66928,1.04528,2.9144,1.0128
3,1.1862,1.297228,3.54488,1.03528,0.92816,2.617359,1.037203,0.92456,2.922566,1.025768,0.928,2.69256,0.98408,3.0532,0.98728
4,1.091,1.298633,3.0888,0.97984,0.93256,2.464736,1.010111,0.93056,2.668668,0.989934,0.93264,2.55232,0.95856,2.81416,0.96552
5,1.0003,1.321527,2.91712,0.96416,0.93288,2.433807,1.012178,0.9324,2.582883,0.984729,0.93304,2.5044,0.96408,2.71256,0.95752
6,0.9254,1.35462,3.1652,0.98184,0.93032,2.603935,0.99602,0.92888,2.804143,0.985872,0.93032,2.69248,0.94912,2.94696,0.96808
7,0.851,1.401606,3.14856,0.982,0.93,2.62845,1.00428,0.92752,2.81204,0.990357,0.92968,2.72472,0.96808,2.94072,0.97224
8,0.7913,1.459038,3.08872,0.97864,0.92912,2.645438,1.000118,0.92744,2.801825,0.98742,0.92888,2.73696,0.9688,2.9228,0.97224
9,0.7345,1.502671,3.17768,0.98088,0.93104,2.678766,0.988723,0.93016,2.863589,0.982299,0.93096,2.77408,0.95744,2.9892,0.97096
10,0.6876,1.550816,3.20488,0.99144,0.92944,2.714683,0.999468,0.92848,2.898093,0.99318,0.92968,2.81848,0.9716,3.016,0.98064


TrainOutput(global_step=11730, training_loss=0.8652653947002544, metrics={'train_runtime': 20583.6655, 'train_samples_per_second': 18.218, 'train_steps_per_second': 0.57, 'total_flos': 4.9680589824e+16, 'train_loss': 0.8652653947002544, 'epoch': 15.0})

In [None]:
EPOCHS = 15
LR = 5e-6
WD = 0.001

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)

    preds_avg = np.array([(i+j)/2 for i, j in zip(preds, preds_w)])
    mse_avg, mae_avg, acc_avg = reg_cls_metrics(labels, preds_avg)

    mse_w_int, mae_w_int, _ = reg_cls_metrics(labels, preds_w.round())
    mse_avg_int, mae_avg_int, _ = reg_cls_metrics(labels, preds_avg.round())

    return {'mse': mse, 'mae': mae, 'acc': acc, 
            'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w,
            'mse_avg': mse_avg, 'mae_avg': mae_avg, 'acc_avg': acc_avg, 
            'mse_w_int': mse_w_int, 'mae_w_int': mae_w_int,
            'mse_avg_int': mse_avg_int, 'mae_avg_int': mae_avg_int}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train = tokenizer.batch_encode_plus(
    train_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_cls = np.array([labels2id[i] for i in y_train])
y_val_cls = np.array([labels2id[i] for i in y_val])
    
train_dataset = Data(tokens_train, y_train_cls)
val_dataset = Data(tokens_val, y_val_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_dataset,
                eval_dataset = val_dataset,
                compute_metrics = compute_metrics)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W,Mse Avg,Mae Avg,Acc Avg,Mse W Int,Mae W Int,Mse Avg Int,Mae Avg Int
1,1.6044,1.402119,4.6676,1.24792,0.91336,3.09602,1.238478,0.91232,3.514774,1.198278,0.91344,3.19032,1.23592,3.51096,1.12504
2,1.3628,1.345776,3.75216,1.1128,0.92152,2.836456,1.162165,0.91904,3.080064,1.116642,0.92152,2.932,1.14768,3.17592,1.0716
3,1.294,1.314241,3.61096,1.07848,0.9268,2.629744,1.097493,0.92512,2.916464,1.071661,0.92672,2.71816,1.06968,3.02824,1.02376
4,1.2359,1.30605,3.36176,1.03872,0.92968,2.560222,1.067474,0.9272,2.795299,1.041788,0.9296,2.6528,1.02432,2.92744,1.00808
5,1.1822,1.293359,3.20416,1.0072,0.9316,2.498574,1.049725,0.9296,2.707319,1.019914,0.93168,2.58304,1.00272,2.84616,0.98568
6,1.1391,1.303916,3.23552,1.00528,0.9296,2.554853,1.041597,0.9272,2.774148,1.01688,0.92968,2.64576,0.99456,2.90928,0.98304
7,1.0926,1.312677,3.51184,1.0328,0.9288,2.670958,1.034247,0.92464,2.962156,1.026708,0.92832,2.76928,0.98816,3.09936,0.99296
8,1.0577,1.315493,3.22056,0.99896,0.9296,2.574995,1.026688,0.928,2.790137,1.008385,0.92952,2.65472,0.97712,2.9368,0.98064
9,1.025,1.317208,3.2664,0.99584,0.93016,2.595534,1.0161,0.92824,2.827602,1.001231,0.93024,2.69248,0.97056,2.96096,0.97344


KeyboardInterrupt: ignored

# Final evaluate

In [None]:
train_text_full = np.concatenate([train_text, val_text])
y_train_full = np.concatenate([y_train, y_val])

In [None]:
EPOCHS = 5
LR = 1e-5
WD = 0.001

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    preds_w = np.array([sum([id2labels[i] * w for i, w in enumerate(softmax(arr))]) for arr in raw_preds])
    mse_w, mae_w, acc_w = reg_cls_metrics(labels, preds_w)

    preds_avg = np.array([(i+j)/2 for i, j in zip(preds, preds_w)])
    mse_avg, mae_avg, acc_avg = reg_cls_metrics(labels, preds_avg)

    mse_w_int, mae_w_int, _ = reg_cls_metrics(labels, preds_w.round())
    mse_avg_int, mae_avg_int, _ = reg_cls_metrics(labels, preds_avg.round())

    return {'mse': mse, 'mae': mae, 'acc': acc, 
            'mse_w': mse_w, 'mae_w': mae_w, 'acc_w': acc_w,
            'mse_avg': mse_avg, 'mae_avg': mae_avg, 'acc_avg': acc_avg, 
            'mse_w_int': mse_w_int, 'mae_w_int': mae_w_int,
            'mse_avg_int': mse_avg_int, 'mae_avg_int': mae_avg_int}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens_train_full = tokenizer.batch_encode_plus(
    train_text_full.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)
tokens_test = tokenizer.batch_encode_plus(
    test_text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y_train_full_cls = np.array([labels2id[i] for i in y_train_full])
y_test_cls = np.array([labels2id[i] for i in y_test])
    
train_full_dataset = Data(tokens_train_full, y_train_full_cls)
test_dataset = Data(tokens_test, y_test_cls)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_full_dataset,
                eval_dataset = test_dataset,
                compute_metrics = compute_metrics)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W,Mse Avg,Mae Avg,Acc Avg,Mse W Int,Mae W Int,Mse Avg Int,Mae Avg Int
1,1.4638,1.321453,4.07208,1.12536,0.92504,2.714463,1.108029,0.92192,3.14221,1.094149,0.92504,2.80448,1.076,3.23552,1.03664


Epoch,Training Loss,Validation Loss,Mse,Mae,Acc,Mse W,Mae W,Acc W,Mse Avg,Mae Avg,Acc Avg,Mse W Int,Mae W Int,Mse Avg Int,Mae Avg Int
1,1.4638,1.321453,4.07208,1.12536,0.92504,2.714463,1.108029,0.92192,3.14221,1.094149,0.92504,2.80448,1.076,3.23552,1.03664
2,1.2615,1.244213,3.31752,0.99384,0.936,2.347465,1.007259,0.93632,2.64011,0.984126,0.936,2.4312,0.95536,2.75128,0.9364
3,1.1635,1.226197,3.13664,0.95344,0.93752,2.324952,0.975992,0.93584,2.577352,0.954485,0.9376,2.39504,0.92112,2.7156,0.92072
4,1.0948,1.226417,3.1044,0.94632,0.93536,2.35028,0.974568,0.93312,2.596923,0.952995,0.93552,2.4352,0.92192,2.74312,0.92136
5,1.051,1.232837,3.12552,0.94616,0.93736,2.348415,0.954687,0.93592,2.616845,0.944351,0.93744,2.43208,0.90168,2.76808,0.91928


TrainOutput(global_step=5860, training_loss=1.2069243447365614, metrics={'train_runtime': 9629.1539, 'train_samples_per_second': 19.472, 'train_steps_per_second': 0.609, 'total_flos': 2.4840294912e+16, 'train_loss': 1.2069243447365614, 'epoch': 5.0})

In [None]:
preds = trainer.predict(test_dataset).predictions

In [None]:
softmax(preds, axis=-1)

array([[1.39110703e-02, 4.62415479e-02, 1.79338396e-01, ...,
        8.54830164e-03, 2.95900623e-03, 4.13391739e-03],
       [1.40812490e-02, 2.53013708e-02, 1.40481696e-01, ...,
        5.32333963e-02, 1.12186419e-02, 1.24833416e-02],
       [1.18795652e-02, 2.35717855e-02, 9.42182392e-02, ...,
        9.32840630e-02, 2.08880901e-02, 1.39411315e-02],
       ...,
       [9.00902152e-01, 6.55059516e-02, 2.02105157e-02, ...,
        1.01747585e-03, 7.29460327e-04, 2.22341786e-03],
       [2.53424561e-03, 2.23011058e-03, 1.62485242e-03, ...,
        1.37565553e-01, 2.11420968e-01, 6.06024325e-01],
       [2.15396145e-03, 1.75934145e-03, 1.28248346e-03, ...,
        8.98417011e-02, 1.55613959e-01, 7.29256690e-01]], dtype=float32)

In [None]:
parameters = {'alpha': np.logspace(-8, 2, num=100)}
model = Ridge(random_state=42)
result = GridSearchCV(model, parameters, n_jobs=-1, cv=5, 
                      scoring='neg_mean_absolute_error').fit(preds, y_test)
print(f'MAE: {-result.best_score_}')
for _, alpha_best in result.best_params_.items():
    pass
print(f'alpha_best: {alpha_best}')

MAE: 1.0740163829612732
alpha_best: 1e-08


In [None]:
parameters = {'alpha': np.logspace(-8, 2, num=100)}
model = Ridge(random_state=42)
result = GridSearchCV(model, parameters, n_jobs=-1, cv=5, 
                      scoring='neg_mean_absolute_error').fit(softmax(preds, axis=-1), y_test)
print(f'MAE: {-result.best_score_}')
for _, alpha_best in result.best_params_.items():
    pass
print(f'alpha_best: {alpha_best}')

MAE: 1.006515598602295
alpha_best: 1e-08


In [None]:
y_test_bin = np.where(y_test > 5, 1, 0)

parameters = {'alpha': np.logspace(-8, 2, num=100)}
model = RidgeClassifier(random_state=42)
result = GridSearchCV(model, parameters, n_jobs=-1, cv=5, 
                      scoring='accuracy').fit(preds, y_test_bin)
print(f'accuracy: {result.best_score_}')
for _, alpha_best in result.best_params_.items():
    pass
print(f'alpha_best: {alpha_best}')

accuracy: 0.9379200000000001
alpha_best: 62.80291441834247


In [None]:
y_test_bin = np.where(y_test > 5, 1, 0)

parameters = {'alpha': np.logspace(-8, 2, num=100)}
model = RidgeClassifier(random_state=42)
result = GridSearchCV(model, parameters, n_jobs=-1, cv=5, 
                      scoring='accuracy').fit(softmax(preds, axis=-1), y_test_bin)
print(f'accuracy: {result.best_score_}')
for _, alpha_best in result.best_params_.items():
    pass
print(f'alpha_best: {alpha_best}')

accuracy: 0.938
alpha_best: 39.44206059437648


# Final fit

In [10]:
text = np.concatenate([train_text, val_text, test_text])
y = np.concatenate([y_train, y_val, y_test])

In [11]:
EPOCHS = 5
LR = 1e-5
WD = 0.001

class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
        
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    labels = np.array([id2labels[i] for i in labels])
    raw_preds = pred.predictions
    preds = raw_preds.argmax(-1)
    preds = np.array([id2labels[i] for i in preds])
    mse, mae, acc = reg_cls_metrics(labels, preds)

    return {'mse': mse, 'mae': mae, 'acc': acc}
    
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8).to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens = tokenizer.batch_encode_plus(
    text.astype('str'),
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

labels2id = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
id2labels = {0:1, 1:2, 2:3, 3:4, 4:7, 5:8, 6:9, 7:10}

y = np.array([labels2id[i] for i in y])
    
dataset = Data(tokens, y)

training_args = TrainingArguments(
    output_dir = '/content/distilbert-base-uncased-cls', #Выходной каталог
    num_train_epochs = EPOCHS, #Кол-во эпох для обучения
    per_device_train_batch_size = 32, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 32, #Размер пакета для каждого устройства во время валидации
    weight_decay = WD, #Понижение весов
    logging_dir = '/content/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = LR, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=SEED)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [12]:
trainer = Trainer(model=model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = dataset,
                eval_dataset = dataset,
                compute_metrics = compute_metrics)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Acc
1,1.4264,1.229917,2.9554,0.94528,0.94658
2,1.2256,1.097695,2.0425,0.74798,0.96434
3,1.1219,1.018625,1.63756,0.66304,0.97406
4,1.058,0.981679,1.52522,0.63254,0.97836
5,1.0172,0.951037,1.33972,0.58448,0.98122


TrainOutput(global_step=7815, training_loss=1.169823515849928, metrics={'train_runtime': 16324.8512, 'train_samples_per_second': 15.314, 'train_steps_per_second': 0.479, 'total_flos': 3.3120393216e+16, 'train_loss': 1.169823515849928, 'epoch': 5.0})

In [14]:
trainer.save_model('model_nlp')

In [15]:
!zip -r '/content/model_nlp.zip' '/content/model_nlp'

  adding: content/model_nlp/ (stored 0%)
  adding: content/model_nlp/tokenizer_config.json (deflated 45%)
  adding: content/model_nlp/config.json (deflated 54%)
  adding: content/model_nlp/training_args.bin (deflated 48%)
  adding: content/model_nlp/pytorch_model.bin (deflated 8%)
  adding: content/model_nlp/special_tokens_map.json (deflated 42%)
  adding: content/model_nlp/vocab.txt (deflated 53%)


In [16]:
from google.colab import files

files.download('/content/model_nlp.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>