In [1]:
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install scikit_learn
# !pip install transformers[torch]

In [2]:
# import huggingface_hub
# huggingface_hub.login()

In [3]:
import json
import os
import re
from glob import glob
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import torch
import transformers
from datasets import load_dataset
from sklearn.metrics import f1_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, EarlyStoppingCallback)
from sklearn.model_selection import StratifiedShuffleSplit
from datasets import Dataset

In [4]:
# Build train.csv / val.csv once; the whole cell is skipped when train.csv already exists.
if not os.path.exists("train.csv"):
    train = pd.read_csv("prime99_train.csv")
    # Not used by the split below.
    # NOTE(review): confirm whether this read is still needed here.
    test = pd.read_csv("prime99_test.csv")
    # One shuffle split holding out 10% for validation, stratified on the "Тема" column.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    train_index, val_index = next(splitter.split(train, train["Тема"]))
    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc only matches when the frame has a default RangeIndex.
    val = train.iloc[val_index]
    train = train.iloc[train_index]
    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)

In [5]:
def tokenize_function(examples):
    """Tokenize the "Текст инцидента" field of ``examples`` with the notebook tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    Returns whatever the tokenizer call returns.
    """
    incident_texts = examples["Текст инцидента"]
    return tokenizer(
        incident_texts,
        truncation=True,
        padding='max_length',
        max_length=256,
    )

def create_dataset(data, split):
    """Convert a pandas frame into a tokenized Hugging Face ``Dataset``.

    Args:
        data: DataFrame passed to ``Dataset.from_pandas``.
        split: split name forwarded to ``Dataset.from_pandas``.

    Returns:
        The dataset after ``tokenize_function`` has been mapped over it in batches.
    """
    untokenized = Dataset.from_pandas(data, split=split)
    return untokenized.map(tokenize_function, batched=True)

# Checkpoint name used for the tokenizer here and for the model in the training cell.
model_ckpt = "xlm-roberta-base"
# `problem_type` is a model-configuration option (it is passed where the model is
# loaded); it has no meaning for a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [6]:
def compute_metrics(eval_pred):
    """Compute macro, micro and weighted F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with ``argmax`` along axis 1 to obtain the predicted class ids.

    Returns:
        dict with the keys ``f1_macro``, ``f1_micro`` and ``f1_weighted``.
    """
    predictions, labels = eval_pred
    # The predicted ids do not depend on the averaging mode, so compute them once.
    predicted_ids = np.argmax(predictions, axis=1)
    return {
        f"f1_{average}": f1_score(labels, predicted_ids, average=average)
        for average in ("macro", "micro", "weighted")
    }

In [35]:
# def train_model(train, val, test, column="Исполнитель"):
# Target column; its values are integer-encoded by an encoder fitted on the
# full prime99_train.csv file.
column = "Группа тем"
start_df = pd.read_csv("prime99_train.csv")
label_encoder = LabelEncoder()
label_encoder.fit(start_df[column])

# Deliberately not named `datasets`: that name is the imported `datasets` module
# and rebinding it to a list would shadow the module for the rest of the session.
split_names = ["train", "val", "prime99_test"]
dataset_dict = dict()
for split_name in split_names:
    df = pd.read_csv(f"{split_name}.csv")
    df["label"] = label_encoder.transform(df[column])
    hf_dataset = create_dataset(df, split_name)
    dataset_dict[split_name] = hf_dataset

# One output per class known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label_encoder.classes_),
    problem_type="single_label_classification").to('cuda')


batch_size = 32

# Evaluate and checkpoint once per epoch; the best checkpoint by `f1_macro`
# (a key returned by compute_metrics) is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="multiclass_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=5,
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model,
    args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["val"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
# Evaluate on the validation set once before any training step, then train.
trainer.evaluate()
trainer.train()

In [15]:
# Imported in this cell rather than in the notebook's import cell.
from torch import nn
# Element-wise sigmoid module, read as a global by `inference` below.
sigmoid = nn.Sigmoid()

def inference(model, message, **kwargs):
    """Return per-class sigmoid scores for a single message.

    Args:
        model: callable invoked as ``model(input_ids, attention_mask)`` whose
            result is indexed with ``"logits"``.
        message: text passed to ``tokenize``.
        **kwargs: forwarded unchanged to ``tokenize`` (e.g. ``device``).

    Returns:
        NumPy array holding the scores of the first item in the batch. The
        sigmoid is applied element-wise, so the scores are not normalized to
        sum to 1; their argmax is the same as that of the raw logits.
    """
    input_ids, attention_mask = tokenize(message, **kwargs)
    # Prediction needs no gradients; disabling autograd avoids building a
    # computation graph on every call.
    with torch.no_grad():
        preds = model(input_ids, attention_mask)
        scores = sigmoid(preds["logits"])[0]
    return scores.cpu().detach().numpy()

def tokenize(message, device="cuda", **kwargs):
    """Encode one message into ``input_ids`` and ``attention_mask`` tensors.

    The text is truncated and padded to 256 tokens with special tokens added.

    Args:
        message: text to encode.
        device: device the resulting tensors are moved to.
        **kwargs: accepted but not used by this function.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of ``torch.long`` tensors on
        ``device`` (presumably shaped ``(1, 256)`` — TODO confirm).
    """
    encoded = tokenizer.encode_plus(
        message,
        None,  # no second sequence
        add_special_tokens=True,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="np",
    )

    def _as_long_on_device(values):
        # Convert the NumPy output to an int64 tensor and move it to `device`.
        return torch.tensor(values, dtype=torch.long).to(device)

    return (
        _as_long_on_device(encoded["input_ids"]),
        _as_long_on_device(encoded["attention_mask"]),
    )

In [19]:
# Number of distinct classes known to the fitted label encoder.
len(label_encoder.classes_)

26

In [21]:
# Weighted F1 of `model2` on the test file: predict a class id per incident text,
# decode all ids back to label names in one call, then score against `column`.
test = pd.read_csv("prime99_test.csv")
pred_ids = [np.argmax(inference(model2, text))
            for text in test["Текст инцидента"].values]
preds = list(label_encoder.inverse_transform(pred_ids))
f1_score(test[column], preds, average="weighted")

0.7504600839262756

In [26]:
# Class names held by the fitted encoder; a name's position is its encoded id.
label_encoder.classes_

array(['Безопасность', 'Благоустройство', 'Газ и топливо',
       'Государственная собственность', 'Дороги', 'ЖКХ',
       'Здравоохранение/Медицина', 'Коронавирус', 'Культура',
       'МФЦ "Мои документы"', 'Мобилизация', 'Мусор/Свалки/ТКО',
       'Образование', 'Общественный транспорт',
       'Памятники и объекты культурного наследия',
       'Погребение и похоронное дело', 'Роспотребнадзор',
       'Связь и телевидение', 'Социальное обслуживание и защита',
       'Спецпроекты', 'Строительство и архитектура', 'Торговля',
       'Физическая культура и спорт', 'Экология', 'Экономика и бизнес',
       'Электроснабжение'], dtype=object)

In [29]:
# Map each class name to its position in the encoder's class list.
# NOTE(review): `bert_label_encoder` is not defined in any visible cell —
# presumably the same object as `label_encoder`; confirm, otherwise this cell
# fails on a fresh kernel.
bert_classes = {c: ci for ci, c in enumerate(bert_label_encoder.classes_)}

In [33]:
# NOTE(review): `group_roberta_model` and `input_text` are assigned in cells that
# appear further down the notebook, so this cell fails on a top-to-bottom run.
bert_probas = inference(group_roberta_model, input_text)

In [34]:
# Display the per-class scores stored in `bert_probas`.
bert_probas

array([0.5637781 , 0.726879  , 0.23757683, 0.16903396, 0.6920174 ,
       0.8380316 , 0.99354136, 0.8742636 , 0.23495382, 0.13958792,
       0.2907984 , 0.8348023 , 0.48864707, 0.44516757, 0.12066784,
       0.1485628 , 0.1616542 , 0.784488  , 0.8539515 , 0.37535444,
       0.3281864 , 0.08632873, 0.16671015, 0.10712508, 0.1850831 ,
       0.22988942], dtype=float32)

In [31]:
# One-word sample query used by the ad-hoc inference cells.
input_text = "медицина"

In [25]:
# Spot check: per-class scores from `model2` for a one-word query.
inference(model2, "медицина")

array([0.5637781 , 0.726879  , 0.23757683, 0.16903396, 0.6920174 ,
       0.8380316 , 0.99354136, 0.8742636 , 0.23495382, 0.13958792,
       0.2907984 , 0.8348023 , 0.48864707, 0.44516757, 0.12066784,
       0.1485628 , 0.1616542 , 0.784488  , 0.8539515 , 0.37535444,
       0.3281864 , 0.08632873, 0.16671015, 0.10712508, 0.1850831 ,
       0.22988942], dtype=float32)

In [30]:
# Second name for the model object held in `model2`.
group_roberta_model = model2

In [None]:
# Same call as the earlier `bert_probas` cell, with the query text written inline.
bert_probas = inference(group_roberta_model, "медицина")

In [23]:
import pickle

# Persist the fitted label encoder so predicted ids can be decoded elsewhere.
with open("theme_group_label_encoder.pcl", "wb") as encoder_file:
    encoder_file.write(pickle.dumps(label_encoder))

In [10]:
# datasets = ["train", "val", "prime99_test"]
# dataset_dict = dict()
# for dataset in datasets:
#     df = pd.read_csv(f"{dataset}.csv")
#     preds = [label_encoder.inverse_transform([np.argmax(inference(model, text))])[0] for text in df["Текст инцидента"].values]
#     preds = [p + ";" for p in preds]
#     df["Текст инцидента"] = preds + df["Текст инцидента"]
#     df.to_csv(f"{dataset}.csv", index=None)

In [11]:
# Upload the in-memory `model` to the Hugging Face Hub repository named below.
# NOTE(review): this repository name lacks the "-group" suffix of the repository
# `model2` is loaded from — confirm which trained model this cell should publish.
model.push_to_hub("denis-gordeev/xlm-roberta-base-theme")

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/denis-gordeev/xlm-roberta-base-theme/commit/e7626226016e18df6d6c1227994adfc20bed1873', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='e7626226016e18df6d6c1227994adfc20bed1873', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Load the "xlm-roberta-base-theme-group" classifier from the Hub onto the GPU.
# NOTE(review): `model2` is used by cells that appear earlier in the notebook; on a
# top-to-bottom run this cell has to execute before them.
model2 = AutoModelForSequenceClassification.from_pretrained(
    "denis-gordeev/xlm-roberta-base-theme-group",
    problem_type="single_label_classification").to('cuda')