In [1]:
import pandas as pd
import mlflow
from sklearn.model_selection import train_test_split, ParameterGrid
import torch
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, default_data_collator, DebertaV2Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
from tqdm.notebook import tqdm

#models = [('sberbank-ai/ruRoberta-large', 512), ('sberbank-ai/sbert_large_nlu_ru', 512),
#          ('sberbank-ai/sbert_large_mt_nlu_ru', 512), ('sberbank-ai/ruBert-large', 512),
#          ('sberbank-ai/ruBert-base', 512), ('cointegrated/rubert-tiny2', 2048),
##          ('DeepPavlov/rubert-base-cased-conversational', 512), ('cointegrated/LaBSE-en-ru', 512),
models =          [('microsoft/mdeberta-v3-base', 512), ('vicgalle/xlm-roberta-large-xnli-anli', 512),
         ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512), ('facebook/bart-large-mnli', 1024)]



* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
class Model:
    def __init__(self, model_name, model_length, batch_size):
        self.model_name = model_name.split('/')[-1]
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to("cuda")
        self.model_length = model_length
        self.batch_size = batch_size
        
    def create_embeddings(self, texts):
        batches = [texts[i:i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
    
        embeddings = []
        with torch.no_grad():
    
            for batch in tqdm(batches):
                encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=self.model_length)
                encoded_input = {key: val.to('cuda') for key, val in encoded_input.items()}
    
                model_output = self.model(**encoded_input)
                
                embedding = model_output.last_hidden_state[:, 0, :]
                embedding = torch.nn.functional.normalize(embedding)
                
                sentence_embeddings = embedding.tolist()
                embeddings.extend(sentence_embeddings)
    
                torch.cuda.empty_cache()
        
        return embeddings

    def create_datasets(self, train, test):
        train_, test_ = train.copy(), test.copy()

        train_embeddings = pd.DataFrame(self.create_embeddings(train_["Текст инцидента"].to_list()))
        test_embeddings = pd.DataFrame(self.create_embeddings(test_["Текст инцидента"].to_list()))

        train_ = pd.concat([train_, train_embeddings], axis=1)
        test_ = pd.concat([test_, test_embeddings], axis=1)

        train_.to_csv(f"datasets/{self.model_name}_train.csv", index=False)
        test_.to_csv(f"datasets/{self.model_name}_test.csv", index=False) 

        return train_, test_

In [3]:
data = pd.read_csv("data/train.csv")
data.head(1)

Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема
0,Лысьвенский городской округ,Благоустройство,"Добрый день. Сегодня, 20.08.22, моя мать шла ...",★ Ямы во дворах


In [4]:
column = "Группа тем"
train, test = train_test_split(data,
                               test_size=0.33, random_state=42, stratify=data[column])

train, test = train.reset_index(drop=True), test.reset_index(drop=True)

In [5]:
train.head(1)

Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема
0,АО ПРО ТКО,Мусор/Свалки/ТКО,Краснокамск Новостикраснокамск Объявлениякрас...,★ Уборка/Вывоз мусора


In [6]:
param_grid = {
    'iterations': [100, 250, 500]
}
grid = list(ParameterGrid(param_grid))

In [7]:
mlflow.set_experiment('embedding')

<Experiment: artifact_location='file:///workspace/mlruns/277205432312933511', creation_time=1700778858948, experiment_id='277205432312933511', last_update_time=1700778858948, lifecycle_stage='active', name='embedding', tags={}>

In [8]:
for model_ in models:
    model_name, model_length = model_
    model = Model(model_name, model_length, 16)

    train_, test_ = model.create_datasets(train, test) 
    
    for params in grid:
        try:
            with mlflow.start_run(nested=True):
                catboost_model = CatBoostClassifier(**params, verbose=0, random_seed=42)
                catboost_model.fit(train_.drop(columns=["Текст инцидента", "Группа тем", "Исполнитель", "Тема"]), train_["Группа тем"])
    
                predictions = catboost_model.predict(test_.drop(columns=["Текст инцидента", "Группа тем", "Исполнитель", "Тема"]))
                
                accuracy = accuracy_score(test_["Группа тем"], predictions)
                f1 = f1_score(test_["Группа тем"], predictions, average='weighted') 
                report = classification_report(test_["Группа тем"], predictions, output_dict=True)
                report_text =  classification_report(test_["Группа тем"], predictions)
    
                
                mlflow.log_metric("report_accuracy", report['accuracy'])
                mlflow.log_metric("macro avg_precision", report['macro avg']['precision'])
                mlflow.log_metric("macro avg_recall", report['macro avg']['recall'])
                mlflow.log_metric("macro avg_f1-score", report['macro avg']['f1-score'])
                mlflow.log_metric("weighted avg_precision", report['weighted avg']['precision'])
                mlflow.log_metric("weighted avg_recall", report['weighted avg']['recall'])
                mlflow.log_metric("weighted avg_f1-score", report['weighted avg']['f1-score'])
    
                mlflow.log_text(report_text, "classification_report.txt")
                
                mlflow.set_tag("embedding_name", model_name)
                mlflow.set_tag("embedding_size", model_length)
                mlflow.set_tag("dataset_name", "clear_v2")
                mlflow.set_tag("model_name", "catboost")
                mlflow.catboost.log_model(catboost_model, "model")
                mlflow.log_params(params)
                mlflow.log_metrics({'accuracy': accuracy, 'f1-weighted': f1})
        except Exception as e:
            error_name = type(e).__name__
            print(f"Caught an error: {error_name}")



  0%|          | 0/944 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weigh

  0%|          | 0/944 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/944 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/944 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacty of 23.65 GiB of which 877.12 MiB is free. Process 3868217 has 10.87 GiB memory in use. Process 3868312 has 8.08 GiB memory in use. Process 3925111 has 3.82 GiB memory in use. Of the allocated memory 6.59 GiB is allocated by PyTorch, and 1.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF