In [34]:
import torch
import torch.nn as nn
import pandas as pd

from typing import List

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

In [35]:
def concat(paths: List[str]) -> pd.DataFrame:
    """Read every CSV file in ``paths`` and stack the rows into one DataFrame.

    Args:
        paths: Locations of the CSV files, read in the given order.

    Returns:
        A single frame with the rows of all files, one file after another.
        Row labels are taken over from the individual files unchanged (no
        renumbering), so the result can contain repeated index values.
    """
    frames = list(map(pd.read_csv, paths))
    return pd.concat(frames)

In [36]:
# Base dataset; it is split into train/test parts further down.
data = pd.read_csv("../train_dataset_train_variant2.csv")

# Additional training rows read from four CSV files and stacked into one frame
# (judging by the file names these are model-generated / paraphrased samples —
# TODO confirm their provenance).
concated_new_train_data = concat(paths=[
    "../generated_train_saiga_v1.csv", "../generated_train_saiga_v2.csv",
    "../generated_train_mistral.csv", "../generated_train_paraphraser.csv",
])

In [45]:
# Hold out 33% of the rows as the test split, with a fixed seed for reproducibility.
# The stratification key is the string concatenation of the "Группа тем" and "Тема"
# columns, so the split is stratified on the combined (group, topic) label.
# NOTE(review): the two columns are joined without a separator, so two different
# pairs could in principle produce the same key — confirm this cannot happen here.
train, test = train_test_split(data, random_state=42, test_size=0.33, stratify=data["Группа тем"] + data["Тема"])

In [46]:
# Append the extra training rows to the train split and renumber both splits
# 0..n-1, so the row labels are unique and contiguous.
# NOTE: this cell rebinds `train` from itself, so running it twice appends the
# extra rows twice — re-run the split cell first when re-executing.
train = pd.concat([train, concated_new_train_data], ignore_index=True)
test = test.reset_index(drop=True)

In [47]:
class EmbeddingsGenerator:
    """Compute mean-pooled sentence embeddings with a Hugging Face encoder.

    The tokenizer and model are loaded once in ``__init__``; texts are then
    embedded in batches and can be attached to DataFrames as extra columns.
    """

    def __init__(self, model_name, model_length, batch_size, device=None):
        """Load the tokenizer and the encoder and move the encoder to ``device``.

        Args:
            model_name: Identifier passed to ``from_pretrained``; only the part
                after the last ``/`` is kept in ``self.model_name``.
            model_length: Maximum token length used when truncating inputs.
            batch_size: Number of texts tokenized and encoded per forward pass.
            device: Torch device for the model and the inputs. ``None`` (the
                default) selects ``"cuda"`` when a GPU is available and
                ``"cpu"`` otherwise.
        """
        # Previously the device was hard-coded to "cuda", which fails on
        # machines without a GPU; on a GPU machine the default is unchanged.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.model_name = model_name.split('/')[-1]
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model_length = model_length
        self.batch_size = batch_size

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked tokens.

        Args:
            model_output: Model output whose first element holds the token
                embeddings (assumed to be shaped (batch, tokens, hidden)).
            attention_mask: Mask with one entry per token; zero entries are
                excluded from the average.

        Returns:
            One pooled vector per sequence (the token axis is summed out).
        """
        token_embeddings = model_output[0]  # first element holds all token embeddings
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked sequence does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def create_embeddings(self, texts):
        """Embed ``texts`` batch by batch.

        Args:
            texts: List of strings to embed.

        Returns:
            A list with one embedding (a list of floats) per input text, in
            input order. An empty input yields an empty list.
        """
        batches = [texts[i:i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
        on_cuda = str(self.device).startswith("cuda")

        embeddings = []
        # Inference only: no gradients are needed.
        with torch.no_grad():
            for batch in tqdm(batches):
                encoded_input = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    max_length=self.model_length,
                )
                # The inputs must live on the same device as the model.
                encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}

                model_output = self.model(**encoded_input)
                embedding = self.mean_pooling(model_output, encoded_input['attention_mask'])

                # tolist() copies the values into plain Python floats.
                embeddings.extend(embedding.tolist())

                if on_cuda:
                    # Release cached GPU memory between batches, as before.
                    torch.cuda.empty_cache()

        return embeddings

    def _with_embeddings(self, frame):
        """Return a new frame: ``frame`` plus one column per embedding dimension."""
        vectors = self.create_embeddings(frame["Текст инцидента"].to_list())
        # concat(axis=1) aligns rows on index labels. Building the embedding
        # frame on the caller's index keeps every vector next to its own text,
        # even when the index is not 0..n-1 (which used to mis-pair rows and
        # introduce NaNs).
        embedding_columns = pd.DataFrame(vectors, index=frame.index)
        return pd.concat([frame, embedding_columns], axis=1)

    def create_datasets(self, train, test):
        """Attach embeddings of the "Текст инцидента" column to both frames.

        Args:
            train: DataFrame containing a "Текст инцидента" text column.
            test: DataFrame containing a "Текст инцидента" text column.

        Returns:
            ``(train_, test_)``: new DataFrames with the original columns
            followed by integer-named embedding columns. The inputs are not
            modified and their index labels are preserved.
        """
        return self._with_embeddings(train), self._with_embeddings(test)

In [48]:
# Build the embedding generator for the "sberbank-ai/sbert_large_mt_nlu_ru" checkpoint:
# inputs are truncated to at most 512 tokens and encoded 32 texts at a time.
embedding_generator = EmbeddingsGenerator(
    model_name="sberbank-ai/sbert_large_mt_nlu_ru", model_length=512, batch_size=32
)

In [49]:
# Embed the "Текст инцидента" column of both splits and append the vectors as extra columns.
# Each split gets its own progress bar (one tick per batch).
train_with_embeddings, test_with_embeddings = embedding_generator.create_datasets(train, test)

  0%|          | 0/605 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

In [51]:
# Persist both embedded splits as CSV; the row index is not written.
# NOTE(review): assumes the "data/" directory already exists — confirm.
train_with_embeddings.to_csv("data/train_with_embeddings_v2.csv", index=False)
test_with_embeddings.to_csv("data/test_with_embeddings_v2.csv", index=False)

In [53]:
# Inspect the topic labels ("Тема") of the held-out test split.
test["Тема"]

0                          Хамство медицинских работников
1                            Ремонт спортивных учреждений
2       ★ Нарушение правил очистки дорог от снега и на...
3       Нехватка или сокращение врачей и медицинских у...
4       ★ Нарушение правил уборки от снега и наледи вн...
                              ...                        
7430                            Оказание гос. соц. помощи
7431                              Ямы и выбоины на дороге
7432    ★ Оказание медицинской помощи не в полном объе...
7433        Нехватка материально-технического обеспечения
7434               Завышение платы за коммунальные услуги
Name: Тема, Length: 7435, dtype: object

In [54]:
# Rebind `train` and `test` to frames read from previously saved CSV files;
# from here on they no longer refer to the splits created earlier in this notebook.
# NOTE(review): these files are not written by any cell visible here — confirm
# which step produces them.
train, test = pd.read_csv("data/row_data/final_train.csv"), pd.read_csv("data/row_data/final_test.csv")

In [56]:
# Stack the reloaded train and test frames into one dataset (train rows first).
# Row labels are kept as they are in each frame, so they may repeat.
full = pd.concat([train, test])

In [58]:
# Save the combined dataset as CSV; the (possibly repeating) row index is not written.
full.to_csv("data/row_data/final_full.csv", index=False)