In [3]:
import pandas as pd

import torch
import torch.nn as nn

import torch.nn.functional as F

import pickle

from transformers import AutoModel, AutoTokenizer

from tqdm.notebook import tqdm

from typing import Tuple, List, Dict

# Device used for the encoder, the classifier and the tokenized inputs.
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
class Layer(nn.Module):
    """Hidden block: a square Linear projection, then BatchNorm1d, then ReLU.

    Args:
        dim: number of input features; the output has the same width.
        device: device on which the Linear and BatchNorm1d parameters are created.
    """

    def __init__(self, dim: int, device: str = "cuda"):
        super().__init__()

        projection = nn.Linear(dim, dim, device=device)
        normalisation = nn.BatchNorm1d(dim, device=device)
        activation = nn.ReLU()

        # The attribute name and the order inside the Sequential determine the
        # state_dict keys ("linear.0.*", "linear.1.*"), so both are kept as is.
        self.linear = nn.Sequential(projection, normalisation, activation)

    def forward(self, x):
        """Run ``x`` through the Linear -> BatchNorm1d -> ReLU stack."""
        return self.linear(x)
    
    
class HierarchicalSoftmax(nn.Module):
    """Apply an independent softmax to each index group defined by ``tree``.

    The groups are: the list of all tree keys (the "heads"), followed by the
    list of indexes stored under each key, in key order.

    Args:
        tree: mapping from a head column index to the list of column indexes
            stored under that head.
    """

    def __init__(self, tree: Dict[int, List[int]]):
        super().__init__()

        self.tree = tree

        heads = list(tree.keys())
        # First group: the heads themselves; then one group per head.
        self.specified_indexes_list = [heads] + [tree[head] for head in heads]

    def forward(self, x):
        """Return a copy of ``x`` where every group of columns is softmax-normalised.

        The input tensor is not modified; the groups are processed one after
        another on the copy.
        """
        normalised = x.clone()

        for group in self.specified_indexes_list:
            normalised[:, group] = F.softmax(normalised[:, group], dim=1)

        return normalised


class Model(nn.Module):
    """Feed-forward classifier: input projection, ``n_layers`` hidden blocks, output head.

    Args:
        embed_dim: width of the input feature vectors.
        hidden_dim: width of the hidden representation.
        num_classes: number of output logits.
        n_layers: how many ``Layer`` blocks sit between input and output.
        device: device on which the parameters are created.
    """

    def __init__(self, embed_dim: int, hidden_dim: int, num_classes: int, n_layers: int = 1, device: str = "cuda"):
        super().__init__()

        self.input = nn.Linear(embed_dim, hidden_dim, device=device)
        self.relu = nn.ReLU()

        hidden_blocks = [Layer(hidden_dim, device) for _ in range(n_layers)]
        self.layers = nn.ModuleList(hidden_blocks)

        self.output = nn.Linear(hidden_dim, num_classes, device=device)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        hidden = self.relu(self.input(x))

        for block in self.layers:
            hidden = block(hidden)

        return self.output(hidden)

In [5]:
# load model, encoder, tree and reversed mapping

# Build the classifier directly on DEVICE (rather than on the constructor's
# default device) so that DEVICE is the single switch deciding where weights live.
# NOTE(review): embed_dim=1524 has to equal the width of the tensors later fed to
# the model; confirm it against the encoder output / any extra features added elsewhere.
model = Model(embed_dim=1524, num_classes=221, n_layers=1, hidden_dim=1024, device=DEVICE).to(DEVICE)
# map_location remaps the stored tensors onto DEVICE while deserializing, so a
# checkpoint saved from a different device can still be loaded.
model.load_state_dict(torch.load("model/final_model_for_deploying_roma", map_location=DEVICE))

# SECURITY: pickle can execute arbitrary code while loading; only load files
# from a trusted source. Context managers close the file handles promptly.
with open("model/tree", "rb") as tree_file:
    tree = pickle.load(tree_file)
with open("model/reversed_mapping", "rb") as mapping_file:
    reversed_mapping = pickle.load(mapping_file)

# Text encoder and its tokenizer, both loaded from the same pretrained checkpoint name.
encoder = AutoModel.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")

In [6]:
# Put the classifier into evaluation mode before any prediction is made; its
# hidden Layer blocks contain BatchNorm1d, whose behaviour depends on this mode.
model.eval()

def predict(output: torch.Tensor, tree: Dict[int, List[int]]):
    """Pick the most probable group and, inside it, the most probable theme.

    The raw scores are first normalised with ``HierarchicalSoftmax`` (one
    softmax over the group heads, one per group of themes). For every row the
    best head is selected, then the best theme among that head's children.

    Args:
        output: 2-D tensor of raw scores, one row per sample; columns are
            indexed by the keys and values of ``tree``.
        tree: mapping from a group (head) column index to the list of theme
            column indexes that belong to it.

    Returns:
        A list with one ``[group_index, theme_index]`` pair per row, both
        expressed as column indexes of ``output``.
    """
    softmax = HierarchicalSoftmax(tree)

    output = softmax(output)

    tree_heads = list(tree.keys())

    # argmax over the head columns yields a *position* inside ``tree_heads``,
    # not a tree key; it is translated back to the key below. (Using the
    # position as a key is only correct when the keys are exactly 0..n-1 in order.)
    head_positions = output[:, tree_heads].argmax(dim=1).tolist()

    indexes = []
    for row, position in enumerate(tqdm(head_positions)):
        group_index = tree_heads[position]
        theme_columns = tree[group_index]

        # Only this row is inspected; there is no need to take the argmax of
        # the whole batch on every iteration.
        theme_position = output[row, theme_columns].argmax().item()

        indexes.append([group_index, theme_columns[theme_position]])

    return indexes


def make_embeddings(encoder, tokenizer, texts):
    """Encode ``texts`` and return the token-averaged last hidden state.

    The texts are tokenized (padded, truncated to at most 512 tokens), moved to
    ``DEVICE``, passed through ``encoder`` without gradient tracking, and the
    last hidden state is averaged over dimension 1.

    NOTE(review): the mean runs over every position of dimension 1, which
    includes padding positions when ``texts`` is a batch of unequal-length
    strings — confirm whether an attention-mask-weighted mean was intended.
    """
    with torch.no_grad():
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encoded = encoded.to(DEVICE)

        hidden_states = encoder(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)

    return pooled


def make_predict(model: Model, 
                 input_tensors: torch.Tensor, 
                 tree: Dict[int, List[int]],
                 reversed_mapping: Dict[int, str]) -> List[Dict[str, str]]:
    """Classify ``input_tensors`` and translate the result into readable labels.

    Args:
        model: classifier producing one row of raw scores per input row.
        input_tensors: batch of feature vectors fed to ``model``.
        tree: mapping from a group column index to its theme column indexes,
            forwarded to ``predict``.
        reversed_mapping: mapping from a column index to its label text.

    Returns:
        One dict per input row with the keys ``"group"`` (label of the chosen
        group with the "Группа: " prefix removed) and ``"theme"`` (label of
        the chosen theme).
    """
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        scores = model(input_tensors)

    predictions = predict(scores, tree)

    return [
        {
            "group": reversed_mapping[group_index].replace("Группа: ", ""),
            "theme": reversed_mapping[theme_index],
        }
        for group_index, theme_index in predictions
    ]

In [7]:
# Read the test set from a semicolon-separated CSV file in the working directory.
submission_test = pd.read_csv("test.csv", sep=";")

In [8]:
# Select the incident-text column of the test set; these are the texts that get embedded.
submission_test_texts = submission_test["Текст инцидента"]

In [None]:
# Collect one embedding tensor per incident text, in the order of submission_test_texts.
my_embeddings = []

# Each element is encoded by its own make_embeddings call (no batching across texts).
for text in tqdm(submission_test_texts):
    my_embeddings.append(make_embeddings(encoder, tokenizer, text))

    # Release unused cached GPU memory after every text.
    # NOTE(review): calling this on every iteration may slow the loop down — confirm it is needed.
    torch.cuda.empty_cache()

  0%|          | 0/9743 [00:00<?, ?it/s]