In [1]:
import pandas as pd
import torch
import numpy as np

from sklearn.metrics import accuracy_score


from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback, IntervalStrategy
from meat_dataset import MeatDataset
from utils import create_label_encoder

  from .autonotebook import tqdm as notebook_tqdm


# Чтение данных

In [2]:
# Load the train and test splits; both CSV files use ';' as the field separator.
train_df = pd.read_csv("train.csv", sep=';')
test_df = pd.read_csv("test.csv", sep=';')

In [3]:
# Derive the label -> code and code -> label mappings from the distinct 'mtype'
# values of the training data, then print both mappings (one per line).
labels_to_code, code_to_labels = create_label_encoder(
    pd.unique(train_df['mtype'])
)
print(labels_to_code, code_to_labels, sep="\n")

{'Баранина': 0, 'Говядина': 1, 'Индейка': 2, 'Кура': 3, 'Свинина': 4, 'Цыпленок': 5}
{0: 'Баранина', 1: 'Говядина', 2: 'Индейка', 3: 'Кура', 4: 'Свинина', 5: 'Цыпленок'}


# Подготовка модели

In [4]:
# The tokenizer and the encoder weights are loaded from the same checkpoint name.
tokenizer_path = 'cointegrated/rubert-tiny'
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

model_path = 'cointegrated/rubert-tiny'
# One output unit per meat type. The label mappings are stored in the model
# config so that a checkpoint saved after training carries the real class names
# instead of generic LABEL_<n> placeholders.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(labels_to_code),
    id2label=code_to_labels,
    label2id=labels_to_code,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wrap each dataframe in a MeatDataset (train first, then test); both share the
# same tokenizer and label -> code mapping.
train_dataset, test_dataset = (
    MeatDataset(frame, tokenizer, labels_to_code)
    for frame in (train_df, test_df)
)

In [6]:
# Directory used for the Trainer output and, later, for the saved model/tokenizer.
SAVE_PATH = "model_data"

training_args = TrainingArguments(
    output_dir=SAVE_PATH,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate on a step interval (no eval_steps is given, so the library default
    # applies) and restore the best checkpoint once training finishes.
    eval_strategy=IntervalStrategy.STEPS,
    load_best_model_at_end=True,
)

# Collator that pads the examples of each batch with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [7]:
def compute_metrics(pred):
    """Compute the accuracy of one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class indices).

    Returns:
        dict: A single-entry mapping ``{'accuracy': value}`` where ``value`` is
        ``accuracy_score`` of the reference labels against the arg-max classes.
    """
    # The predicted class of each row is the index of its highest score.
    predicted_classes = np.argmax(pred.predictions, axis=1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [8]:
# NOTE(review): the test split is passed as eval_dataset, so early stopping and
# best-checkpoint selection are driven by the same data that is later reported as
# the test score. A separate validation split would keep that score unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of two evaluation rounds.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

# Persist the tokenizer and the trained model side by side under SAVE_PATH.
tokenizer.save_pretrained(SAVE_PATH)
model.save_pretrained(SAVE_PATH)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  3%|▎         | 500/16440 [02:18<1:16:10,  3.49it/s]

{'loss': 0.8441, 'grad_norm': 1.4511138200759888, 'learning_rate': 1.9391727493917275e-05, 'epoch': 0.3}


                                                     
  3%|▎         | 500/16440 [02:43<1:16:10,  3.49it/s]

{'eval_loss': 0.3028925359249115, 'eval_accuracy': 0.9057177615571776, 'eval_runtime': 24.9513, 'eval_samples_per_second': 131.776, 'eval_steps_per_second': 16.472, 'epoch': 0.3}


  6%|▌         | 1000/16440 [05:09<1:14:10,  3.47it/s]

{'loss': 0.226, 'grad_norm': 0.43839478492736816, 'learning_rate': 1.878345498783455e-05, 'epoch': 0.61}


                                                      
  6%|▌         | 1000/16440 [05:35<1:14:10,  3.47it/s]

{'eval_loss': 0.15762774646282196, 'eval_accuracy': 0.9534671532846716, 'eval_runtime': 25.6382, 'eval_samples_per_second': 128.246, 'eval_steps_per_second': 16.031, 'epoch': 0.61}


  9%|▉         | 1500/16440 [08:05<1:14:42,  3.33it/s] 

{'loss': 0.151, 'grad_norm': 2.1301376819610596, 'learning_rate': 1.8175182481751824e-05, 'epoch': 0.91}


                                                      
  9%|▉         | 1500/16440 [08:30<1:14:42,  3.33it/s]

{'eval_loss': 0.1319664865732193, 'eval_accuracy': 0.9571167883211679, 'eval_runtime': 25.8339, 'eval_samples_per_second': 127.274, 'eval_steps_per_second': 15.909, 'epoch': 0.91}


 12%|█▏        | 2000/16440 [11:00<1:11:13,  3.38it/s] 

{'loss': 0.1339, 'grad_norm': 2.82753849029541, 'learning_rate': 1.75669099756691e-05, 'epoch': 1.22}


                                                      
 12%|█▏        | 2000/16440 [11:26<1:11:13,  3.38it/s]

{'eval_loss': 0.11931801587343216, 'eval_accuracy': 0.9607664233576643, 'eval_runtime': 25.8106, 'eval_samples_per_second': 127.39, 'eval_steps_per_second': 15.924, 'epoch': 1.22}


 15%|█▌        | 2500/16440 [13:56<1:08:23,  3.40it/s] 

{'loss': 0.1257, 'grad_norm': 7.21980094909668, 'learning_rate': 1.6958637469586377e-05, 'epoch': 1.52}


                                                      
 15%|█▌        | 2500/16440 [14:22<1:08:23,  3.40it/s]

{'eval_loss': 0.12025241553783417, 'eval_accuracy': 0.9607664233576643, 'eval_runtime': 25.9292, 'eval_samples_per_second': 126.807, 'eval_steps_per_second': 15.851, 'epoch': 1.52}


 18%|█▊        | 3000/16440 [16:53<1:06:58,  3.34it/s] 

{'loss': 0.1306, 'grad_norm': 0.03434718772768974, 'learning_rate': 1.635036496350365e-05, 'epoch': 1.82}


                                                      
 18%|█▊        | 3000/16440 [17:19<1:06:58,  3.34it/s]

{'eval_loss': 0.11570734530687332, 'eval_accuracy': 0.9613746958637469, 'eval_runtime': 26.0962, 'eval_samples_per_second': 125.995, 'eval_steps_per_second': 15.749, 'epoch': 1.82}


 21%|██▏       | 3500/16440 [19:50<1:04:47,  3.33it/s] 

{'loss': 0.1139, 'grad_norm': 0.019529622048139572, 'learning_rate': 1.5742092457420927e-05, 'epoch': 2.13}


                                                      
 21%|██▏       | 3500/16440 [20:16<1:04:47,  3.33it/s]

{'eval_loss': 0.11090797185897827, 'eval_accuracy': 0.9610705596107056, 'eval_runtime': 26.3123, 'eval_samples_per_second': 124.96, 'eval_steps_per_second': 15.62, 'epoch': 2.13}


 24%|██▍       | 4000/16440 [22:47<1:01:35,  3.37it/s] 

{'loss': 0.1121, 'grad_norm': 3.8831281661987305, 'learning_rate': 1.51338199513382e-05, 'epoch': 2.43}


                                                      
 24%|██▍       | 4000/16440 [23:13<1:01:35,  3.37it/s]

{'eval_loss': 0.11286475509405136, 'eval_accuracy': 0.9631995133819952, 'eval_runtime': 26.1509, 'eval_samples_per_second': 125.732, 'eval_steps_per_second': 15.716, 'epoch': 2.43}


 27%|██▋       | 4500/16440 [25:44<1:01:33,  3.23it/s] 

{'loss': 0.1017, 'grad_norm': 0.7016041278839111, 'learning_rate': 1.4525547445255475e-05, 'epoch': 2.74}


                                                      
 27%|██▋       | 4500/16440 [26:10<1:01:33,  3.23it/s]

{'eval_loss': 0.11438106745481491, 'eval_accuracy': 0.9628953771289538, 'eval_runtime': 26.0229, 'eval_samples_per_second': 126.35, 'eval_steps_per_second': 15.794, 'epoch': 2.74}


 27%|██▋       | 4500/16440 [26:11<1:09:29,  2.86it/s]


{'train_runtime': 1571.4383, 'train_samples_per_second': 83.681, 'train_steps_per_second': 10.462, 'train_loss': 0.21544961802164714, 'epoch': 2.74}


# Метрики, полученные на тестовых данных

In [9]:
# Run the trained model over the test split; the result also carries the metrics.
predictions = trainer.predict(test_dataset=test_dataset)

100%|██████████| 411/411 [00:25<00:00, 16.17it/s]


In [10]:
# Display the metrics gathered by trainer.predict (loss, accuracy, throughput).
predictions.metrics

{'test_loss': 0.11090797185897827,
 'test_accuracy': 0.9610705596107056,
 'test_runtime': 25.4973,
 'test_samples_per_second': 128.955,
 'test_steps_per_second': 16.119}