# Loading dataset

In [181]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Считываем базу знаний
knowledge_base = pd.read_excel('/kaggle/input/faq-vseros/01__.xlsx')
knowledge_base.rename(columns={'Вопрос из БЗ': 'question', 'Ответ из БЗ': 'answer'}, inplace=True)

# Считываем реальные кейсы использования
real_cases = pd.read_excel('/kaggle/input/faq-vseros/02__.xlsx')
real_cases.rename(columns={'Вопрос пользователя': 'question', 'Ответ сотрудника': 'answer'}, inplace=True)

# Считываем эталонные вопросы-ответы из реальных кейсов
neighbors_best_faq = real_cases[['Вопрос из БЗ', 'Ответ из БЗ', 'Классификатор 1 уровня', 'Классификатор 2 уровня']]
neighbors_best_faq.rename(columns={'Вопрос из БЗ': 'question', 'Ответ из БЗ': 'answer'}, inplace=True)

negative_examples = pd.read_excel('/kaggle/input/faq-vseros/modified_questions_answers.xlsx')
negative_examples = negative_examples.assign(target1='ОТСУТСТВУЕТ', target2='Отсутствует')
negative_examples.rename(columns={'target1': 'Классификатор 1 уровня', 'target2': 'Классификатор 2 уровня'}, inplace=True)

data = pd.concat([
    knowledge_base[['question', 'answer', 'Классификатор 1 уровня', 'Классификатор 2 уровня']],
    real_cases[['question', 'answer', 'Классификатор 1 уровня', 'Классификатор 2 уровня']],
    neighbors_best_faq[['question', 'answer', 'Классификатор 1 уровня', 'Классификатор 2 уровня']],
    negative_examples[['question', 'answer', 'Классификатор 1 уровня', 'Классификатор 2 уровня']]
])

data.head(5)
len(data)

2228

In [182]:
# Распределение классов
data['Классификатор 1 уровня'].value_counts()

Классификатор 1 уровня
УПРАВЛЕНИЕ АККАУНТОМ                  537
ОТСУТСТВУЕТ                           368
ВИДЕО                                 337
ТРАНСЛЯЦИЯ                            239
ПРЕДЛОЖЕНИЯ                           225
МОНЕТИЗАЦИЯ                           200
МОДЕРАЦИЯ                             184
ДОСТУП К RUTUBE                        70
СОТРУДНИЧЕСТВО ПРОДВИЖЕНИЕ РЕКЛАМА     28
БЛАГОТВОРИТЕЛЬНОСТЬ ДОНАТЫ             28
ПОИСК                                  12
Name: count, dtype: int64

## Разобьем выборку на train/val

In [183]:
# Создание простых переходов от айдишника к лэйблу и обратно
id2label = data['Классификатор 1 уровня'].value_counts().to_dict()
label2id = {k: i for i, k in enumerate(id2label)}
id2label = {i: k for i, k in enumerate(id2label)}

In [184]:
from sklearn.model_selection import train_test_split

texts = [row['question'] for _, row in data.iterrows()]
labels = [label2id[row['Классификатор 1 уровня']] for _, row in data.iterrows()]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2)

## Создадим токены для датасета

In [185]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [186]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

## Определим кастомный датасет

In [187]:
import torch
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [188]:
from torch.utils.data import DataLoader

# Создадим датасеты
train_dataset = CustomTextDataset(train_encodings, train_labels)
test_dataset = CustomTextDataset(test_encodings, test_labels)

# Metrics

In [189]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Вычисляем метрики для каждого класса
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None)
    
    # Вычисляем средние значения метрик
    macro_f1 = np.mean(f1)
    macro_precision = np.mean(precision)
    macro_recall = np.mean(recall)
    
    # Вычисляем общую точность
    accuracy = accuracy_score(labels, predictions)
    
    # Вычисляем weighted F1-score
    weighted_f1 = precision_recall_fscore_support(labels, predictions, average='weighted')[2]

    return {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'weighted_f1': weighted_f1
    }


# Train

In [198]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased", num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [201]:
from transformers import TrainerCallback

class ContiguousTensorCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        model = kwargs.get('model')
        for param in model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

In [204]:
training_args = TrainingArguments(
    output_dir="my_baseline_classifier_with_negative_examples",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    logging_strategy="epoch",
    logging_steps=1,
    metric_for_best_model="weighted_f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[ContiguousTensorCallback()]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,Weighted F1
1,0.4539,0.583944,0.825112,0.588712,0.588802,0.603118,0.810483
2,0.3181,0.456596,0.856502,0.656269,0.699836,0.652434,0.850323
3,0.2017,0.412636,0.883408,0.700169,0.723639,0.689876,0.878016
4,0.1098,0.380122,0.903587,0.857781,0.921186,0.842039,0.900717
5,0.0761,0.443889,0.890135,0.850046,0.922218,0.828648,0.886434
6,0.0473,0.359208,0.926009,0.921661,0.944279,0.906959,0.924569
7,0.0383,0.382114,0.921525,0.911294,0.928318,0.900239,0.921456
8,0.0353,0.400134,0.899103,0.888627,0.903576,0.881448,0.899318
9,0.0239,0.493533,0.894619,0.876168,0.908614,0.871898,0.89133
10,0.0232,0.41852,0.908072,0.898723,0.91474,0.888538,0.908033


TrainOutput(global_step=1120, training_loss=0.07493555082806519, metrics={'train_runtime': 524.5762, 'train_samples_per_second': 67.941, 'train_steps_per_second': 2.135, 'total_flos': 1483634564573040.0, 'train_loss': 0.07493555082806519, 'epoch': 20.0})

In [205]:
eval_results = trainer.evaluate()
eval_results

{'eval_loss': 0.40618661046028137,
 'eval_accuracy': 0.9125560538116592,
 'eval_macro_f1': 0.8838651655262233,
 'eval_macro_precision': 0.9170878152099688,
 'eval_macro_recall': 0.8722324195419116,
 'eval_weighted_f1': 0.9119735274488326,
 'eval_runtime': 1.9274,
 'eval_samples_per_second': 231.399,
 'eval_steps_per_second': 7.264,
 'epoch': 20.0}

In [215]:
import torch
import os

def save_model_manually(model, tokenizer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    # Сохраняем состояние модели
    state_dict = model.state_dict()
    for key, param in state_dict.items():
        if isinstance(param, torch.Tensor):
            state_dict[key] = param.detach().cpu().contiguous()
    
    torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
    
    # Сохраняем конфигурацию модели
    model.config.save_pretrained(output_dir)
    
    # Сохраняем токенизатор
    tokenizer.save_pretrained(output_dir)


In [216]:
save_model_manually(model, tokenizer, 'rubert_check')

# Inference модели

In [268]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/rubert_check")
inputs = tokenizer('Как создать трансляцию?Как создать трансляцию?Как создать трансляцию?Как создать трансляцию?', return_tensors="pt")

In [269]:
import torch

model.eval()
with torch.no_grad():
    outputs = model(**inputs.to(torch.device('cuda')))

In [270]:
predictions = torch.softmax(outputs.logits, dim=1)
predicted_class = torch.argmax(predictions, dim=1).item()
print(predictions)
id2label[predicted_class]

tensor([[1.2143e-04, 1.3427e-04, 3.6648e-04, 9.9828e-01, 2.5789e-04, 9.4947e-05,
         7.8016e-05, 5.8378e-05, 2.6627e-04, 2.2711e-04, 1.1868e-04]],
       device='cuda:0')


'ТРАНСЛЯЦИЯ'