In [36]:
import torch
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

# Notebook-wide compute device: use the GPU when one is visible to PyTorch,
# otherwise fall back to the CPU so the notebook still runs (slowly) on
# machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [37]:
# Checkpoint used for entailment scoring. Judging by its name this is a
# three-way NLI classifier (TODO confirm label set against model.config.id2label).
model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Place the model on the notebook-wide DEVICE rather than repeating a
# hard-coded "cuda" literal, so the device is configured in exactly one place.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint).to(DEVICE)

In [38]:
# Load the train and test splits from their CSV files in a single unpacking.
train, test = map(pd.read_csv, ("data/final_train.csv", "data/final_test.csv"))

In [39]:
def get_entailment_score(query: str, premise: str) -> float:
    """Return the probability of the "entailment" label for a text pair.

    The pair is tokenized as ``(query, premise)`` (``query`` is the first
    segment, ``premise`` the second), truncated to at most 512 tokens, and
    classified with the module-level ``model``.

    Args:
        query: First text of the pair.
        premise: Second text of the pair.

    Returns:
        The softmax probability assigned to the label named "entailment" in
        ``model.config.id2label`` (a NumPy scalar taken from the probability
        vector).

    Raises:
        KeyError: If ``model.config.id2label`` has no "entailment" label.
    """
    with torch.inference_mode():
        encoded_pair = tokenizer(
            query,
            premise,
            return_tensors='pt',
            truncation=True,
            max_length=512,
        )
        encoded_pair = encoded_pair.to(model.device)
        logits = model(**encoded_pair).logits
        # Only one pair is scored, so keep the first (and only) row.
        probabilities = torch.softmax(logits, -1).cpu().numpy()[0]

    # Re-key the probability vector by human-readable label name.
    probability_by_label = {}
    for class_id, label in model.config.id2label.items():
        probability_by_label[label] = probabilities[class_id]

    return probability_by_label["entailment"]


def get_entailments_scores(data: pd.DataFrame) -> list:
    """Compute one entailment score per row of ``data``.

    For every row, the incident text is scored against the string
    ``"<Группа тем>, <Тема>"`` using ``get_entailment_score``. Rows are
    processed one at a time, in frame order.

    Args:
        data: Frame with the columns "Текст инцидента", "Группа тем" and
            "Тема".

    Returns:
        A list of scores (not a DataFrame), one per row, in row order.
        An empty frame yields an empty list.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Iterate over the three columns directly instead of ``iterrows()``, which
    # builds a throwaway Series for every row.
    rows = zip(data["Текст инцидента"], data["Группа тем"], data["Тема"])

    return [
        get_entailment_score(incident, group + ", " + theme)
        for incident, group, theme in tqdm(rows, total=len(data))
    ]

In [54]:
class EmbeddingsGenerator:
    """Produce mean-pooled sentence embeddings with a Hugging Face encoder.

    The tokenizer and model are loaded once in the constructor; texts are then
    encoded in fixed-size batches and the token embeddings are averaged over
    the non-padding positions.
    """

    def __init__(self, model_name, model_length, batch_size, device=None):
        """Load the tokenizer and the encoder and move the encoder to ``device``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
                Only the part after the last "/" is kept in ``self.model_name``.
            model_length: Maximum number of tokens per text (longer inputs are
                truncated by the tokenizer).
            batch_size: Number of texts encoded per forward pass.
            device: Device for the model and its inputs. Defaults to "cuda"
                when CUDA is available and to "cpu" otherwise.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model_name = model_name.split('/')[-1]
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.device = device
        self.model_length = model_length
        self.batch_size = batch_size

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, ignoring positions masked out as padding.

        Args:
            model_output: Model output whose first element holds the token
                embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding; it is
                broadcast over the embedding dimension.

        Returns:
            One pooled embedding per sequence.
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked sequence does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def create_embeddings(self, texts):
        """Encode ``texts`` batch by batch and return their embeddings.

        Args:
            texts: Sliceable sequence of strings.

        Returns:
            A list with one embedding (a list of floats) per input text, in
            input order. An empty input yields an empty list.
        """
        batches = [texts[i:i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
        # Works for both string devices and torch.device objects.
        on_cuda = str(self.device).startswith("cuda")

        embeddings = []
        with torch.no_grad():
            for batch in tqdm(batches):
                encoded_input = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    max_length=self.model_length,
                )
                # Inputs must live on the same device as the model.
                encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}

                model_output = self.model(**encoded_input)
                embedding = self.mean_pooling(model_output, encoded_input['attention_mask'])
                embeddings.extend(embedding.tolist())

                # Release cached GPU blocks between batches; pointless on CPU.
                if on_cuda:
                    torch.cuda.empty_cache()

        return embeddings

    def create_datasets(self, train, test):
        """Return copies of ``train`` and ``test`` extended with embedding columns.

        The column "Текст инцидента" of each frame is embedded and the
        embedding dimensions are appended as extra columns (labelled 0, 1, ...).
        The input frames are not modified.

        Args:
            train: Training frame containing the column "Текст инцидента".
            test: Test frame containing the column "Текст инцидента".

        Returns:
            A ``(train_with_embeddings, test_with_embeddings)`` tuple.
        """
        # Build the embedding frames on the *same index* as their source frame.
        # With a default RangeIndex on the embeddings, ``pd.concat(axis=1)``
        # would align by label and emit NaN-padded extra rows for any input
        # whose index is not 0..n-1 (e.g. after filtering or a split).
        train_embeddings = pd.DataFrame(
            self.create_embeddings(train["Текст инцидента"].to_list()), index=train.index
        )
        test_embeddings = pd.DataFrame(
            self.create_embeddings(test["Текст инцидента"].to_list()), index=test.index
        )

        # concat builds new frames, so no defensive copies of the inputs are needed.
        train_ = pd.concat([train, train_embeddings], axis=1)
        test_ = pd.concat([test, test_embeddings], axis=1)

        return train_, test_

In [40]:
# Entailment score for every training row (scored one row at a time).
entailments = get_entailments_scores(data=train)

  0%|          | 0/16852 [00:00<?, ?it/s]

In [70]:
# Sentence encoder used to embed the incident texts.
embedding_generator = EmbeddingsGenerator(
    model_name="DeepPavlov/rubert-base-cased-sentence",
    model_length=512,
    batch_size=32,
)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [71]:
# Append the embedding columns to both splits.
train_with_embeddings, test_with_embeddings = embedding_generator.create_datasets(
    train=train,
    test=test,
)

  0%|          | 0/527 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

In [72]:
# Name the output files after the encoder that produced them, using the short
# name stored on the generator, so the file names cannot drift out of sync
# with the model configured above.
train_with_embeddings.to_csv(
    f"data/train_with_embeddings_{embedding_generator.model_name}.csv", index=False
)
test_with_embeddings.to_csv(
    f"data/test_with_embeddings_{embedding_generator.model_name}.csv", index=False
)