In [1]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [2]:
def get_logger(filename):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

def preprocess(df):
    df["Текст инцидента"] = df["Текст инцидента"].apply(lambda x: " ".join(re.findall(r"[а-яА-Я0-9 ёЁ\-\.,?!+a-zA-Z]+", x)))

    return df

In [3]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['Текст инцидента'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs

In [4]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc_exec = nn.Linear(self.config.hidden_size, 10)
        self.fc_topic = nn.Linear(self.config.hidden_size, 26)
        self.fc_subtopic = nn.Linear(self.config.hidden_size, 195)
        self._init_weights(self.fc_exec)
        self._init_weights(self.fc_topic)
        self._init_weights(self.fc_subtopic)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output_exec = self.fc_exec(feature)
        output_topic = self.fc_topic(feature)
        output_subtopic = self.fc_subtopic(feature)
        
        return output_exec, output_topic, output_subtopic

In [5]:
m = nn.Softmax(dim=1)

def inference_fn(test_loader, model, device):
    exec_preds = []
    topic_preds = []
    subtopic_preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            exec_pred, topic_pred, subtopic_pred = model(inputs)
        
        exec_pred = m(exec_pred)
        topic_pred = m(topic_pred)
        subtopic_pred = m(subtopic_pred)
        
        exec_preds.append(exec_pred.to('cpu').numpy())
        topic_preds.append(topic_pred.to('cpu').numpy())
        subtopic_preds.append(subtopic_pred.to('cpu').numpy())
    
    exec_predictions = np.concatenate(exec_preds)
    topic_predictions = np.concatenate(topic_preds)
    subtopic_predictions = np.concatenate(subtopic_preds)
    return exec_predictions, topic_predictions, subtopic_predictions

In [6]:
test_path = "test.csv"

# me5

In [7]:
# ====================================================
# CFG
# ====================================================
class CFG:
    num_workers=8
    path="output_me5"
    config_path=os.path.join(path, 'config.pth')
    model="intfloat/multilingual-e5-large"
    gradient_checkpointing=False
    batch_size=64
    target_cols=['Исполнитель', 'Группа тем', 'Тема']
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    max_len=512

CFG.tokenizer = AutoTokenizer.from_pretrained(os.path.join(CFG.path, 'tokenizer'))
LOGGER = get_logger(os.path.join(CFG.path, 'inference'))

In [8]:
oof_df = pd.read_pickle(os.path.join(CFG.path, 'oof_df.pkl'))
oof_df.head()
labels = oof_df[CFG.target_cols].values
preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values

exec_score = f1_score(labels[:, 0], preds[:, 0], average="weighted")
topic_score = f1_score(labels[:, 1], preds[:, 1], average="weighted")
subtopic_score = f1_score(labels[:, 2], preds[:, 2], average="weighted")
score = (exec_score + topic_score + subtopic_score) / 3
target_score = (topic_score + subtopic_score) / 2
LOGGER.info(f'Score: {score:.4f} Target_score: {target_score} Scores: {exec_score}, {topic_score}, {subtopic_score}')

Score: 0.7171 Target_score: 0.679446758195246 Scores: 0.7923219900503915, 0.80921062969516, 0.549682886695332


In [9]:
test = pd.read_csv(test_path, delimiter=";")#.sample(n=1000).reset_index(drop=True)
test = preprocess(test)

if CFG.model == "intfloat/multilingual-e5-large":
    test["Текст инцидента"] = test["Текст инцидента"].apply(lambda x: "query: " + x)

test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['Текст инцидента'].values]
test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
print(f"test.shape: {test.shape}")
display(test.head())

Token indices sequence length is longer than the specified maximum sequence length for this model (732 > 512). Running this sequence through the model will result in indexing errors


test.shape: (9743, 3)


Unnamed: 0,id,Текст инцидента,tokenize_length
0,9504,query: Нет,6
1,9378,query: да,6
2,1834,query: Здравствуйте!,7
3,8783,query: Вот ещё,7
4,4657,query: Центр холодно,7


In [10]:
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
me5_exec_predictions = []
me5_topic_predictions = []
me5_subtopic_predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(os.path.join(CFG.path, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    exec_prediction, topic_prediction, subtopic_prediction = inference_fn(test_loader, model, device)
    
    me5_exec_predictions.append(exec_prediction)
    me5_topic_predictions.append(topic_prediction)
    me5_subtopic_predictions.append(subtopic_prediction)
    del model, state, exec_prediction, topic_prediction, subtopic_prediction; gc.collect()
    torch.cuda.empty_cache()

me5_exec_predictions = np.mean(me5_exec_predictions, axis=0)
me5_topic_predictions = np.mean(me5_topic_predictions, axis=0)
me5_subtopic_predictions = np.mean(me5_subtopic_predictions, axis=0)

  0%|          | 0/153 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenize

# sbert

In [11]:
# ====================================================
# CFG
# ====================================================
class CFG:
    num_workers=8
    path="output_sbert"
    config_path=os.path.join(path, 'config.pth')
    model="ai-forever/sbert_large_mt_nlu_ru"
    gradient_checkpointing=False
    batch_size=64
    target_cols=['Исполнитель', 'Группа тем', 'Тема']
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    max_len=512

CFG.tokenizer = AutoTokenizer.from_pretrained(os.path.join(CFG.path, 'tokenizer'))
LOGGER = get_logger(os.path.join(CFG.path, 'inference'))

In [12]:
oof_df = pd.read_pickle(os.path.join(CFG.path, 'oof_df.pkl'))
oof_df.head()
labels = oof_df[CFG.target_cols].values
preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values

exec_score = f1_score(labels[:, 0], preds[:, 0], average="weighted")
topic_score = f1_score(labels[:, 1], preds[:, 1], average="weighted")
subtopic_score = f1_score(labels[:, 2], preds[:, 2], average="weighted")
score = (exec_score + topic_score + subtopic_score) / 3
target_score = (topic_score + subtopic_score) / 2
LOGGER.info(f'Score: {score:.4f} Target_score: {target_score} Scores: {exec_score}, {topic_score}, {subtopic_score}')

Score: 0.6970 Target_score: 0.6620021369201331 Scores: 0.766880231985079, 0.8013798113888421, 0.5226244624514241
Score: 0.6970 Target_score: 0.6620021369201331 Scores: 0.766880231985079, 0.8013798113888421, 0.5226244624514241


In [13]:
test = pd.read_csv(test_path, delimiter=";")#.sample(n=1000).reset_index(drop=True)
test = preprocess(test)

if CFG.model == "intfloat/multilingual-e5-large":
    test["Текст инцидента"] = test["Текст инцидента"].apply(lambda x: "query: " + x)

test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['Текст инцидента'].values]
test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
print(f"test.shape: {test.shape}")
display(test.head())

test.shape: (23128, 5)


Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема,tokenize_length
0,ИГЖН ПК,ЖКХ,,★ Наледь и сосульки на кровле,2
1,Город Пермь,Памятники и объекты культурного наследия,,Памятники и объекты культурного наследия,2
2,Лысьвенский городской округ,Благоустройство,,★ Ненадлежащее содержание зеленых насаждений (...,2
3,Александровский муниципальный округ Пермского ...,ЖКХ,,Плохое качество воды,2
4,Губахинский городской округ,Дороги,,Ремонт/строительство мостов,2


In [14]:
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
sbert_exec_predictions = []
sbert_topic_predictions = []
sbert_subtopic_predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(os.path.join(CFG.path, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    exec_prediction, topic_prediction, subtopic_prediction = inference_fn(test_loader, model, device)
    
    sbert_exec_predictions.append(exec_prediction)
    sbert_topic_predictions.append(topic_prediction)
    sbert_subtopic_predictions.append(subtopic_prediction)
    del model, state, exec_prediction, topic_prediction, subtopic_prediction; gc.collect()
    torch.cuda.empty_cache()

sbert_exec_predictions = np.mean(sbert_exec_predictions, axis=0)
sbert_topic_predictions = np.mean(sbert_topic_predictions, axis=0)
sbert_subtopic_predictions = np.mean(sbert_subtopic_predictions, axis=0)

  0%|          | 0/362 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note t

# labse

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    num_workers=8
    path="output_labse"
    config_path=os.path.join(path, 'config.pth')
    model="cointegrated/LaBSE-en-ru"
    gradient_checkpointing=False
    batch_size=64
    target_cols=['Исполнитель', 'Группа тем', 'Тема']
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    max_len=512

CFG.tokenizer = AutoTokenizer.from_pretrained(os.path.join(CFG.path, 'tokenizer'))
LOGGER = get_logger(os.path.join(CFG.path, 'inference'))

In [None]:
oof_df = pd.read_pickle(os.path.join(CFG.path, 'oof_df.pkl'))
oof_df.head()
labels = oof_df[CFG.target_cols].values
preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values

exec_score = f1_score(labels[:, 0], preds[:, 0], average="weighted")
topic_score = f1_score(labels[:, 1], preds[:, 1], average="weighted")
subtopic_score = f1_score(labels[:, 2], preds[:, 2], average="weighted")
score = (exec_score + topic_score + subtopic_score) / 3
target_score = (topic_score + subtopic_score) / 2
LOGGER.info(f'Score: {score:.4f} Target_score: {target_score} Scores: {exec_score}, {topic_score}, {subtopic_score}')

Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505
Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505
Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505
Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505
Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505


In [None]:
test = pd.read_csv(test_path, delimiter=";")#.sample(n=1000).reset_index(drop=True)
test = preprocess(test)

if CFG.model == "intfloat/multilingual-e5-large":
    test["Текст инцидента"] = test["Текст инцидента"].apply(lambda x: "query: " + x)

test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['Текст инцидента'].values]
test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
print(f"test.shape: {test.shape}")
display(test.head())

test.shape: (1000, 5)


Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема,tokenize_length
0,Лысьвенский городской округ,Благоустройство,,★ Ненадлежащее содержание зеленых насаждений (...,2
1,АО ПРО ТКО,Мусор/Свалки/ТКО,,★ Уборка/Вывоз мусора,2
2,Министерство социального развития ПК,Социальное обслуживание и защита,Здравствуйте,Дети и многодетные семьи,3
3,Министерство здравоохранения,Здравоохранение/Медицина,Здравствуйте,Здравоохранение прочее,3
4,Министерство здравоохранения,Здравоохранение/Медицина,Да,★ Просьбы о лечении,3


In [None]:
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
labse_exec_predictions = []
labse_topic_predictions = []
labse_subtopic_predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(os.path.join(CFG.path, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    exec_prediction, topic_prediction, subtopic_prediction = inference_fn(test_loader, model, device)
    
    labse_exec_predictions.append(exec_prediction)
    labse_topic_predictions.append(topic_prediction)
    labse_subtopic_predictions.append(subtopic_prediction)
    del model, state, exec_prediction, topic_prediction, subtopic_prediction; gc.collect()
    torch.cuda.empty_cache()

labse_exec_predictions = np.mean(labse_exec_predictions, axis=0)
labse_topic_predictions = np.mean(labse_topic_predictions, axis=0)
labse_subtopic_predictions = np.mean(labse_subtopic_predictions, axis=0)

  0%|          | 0/16 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note th

# ensemble

In [None]:
print("exec", nn.Softmax(dim=0)(torch.tensor([0.7923219900503915, 0.766880231985079 ,0.7566052420202363])))
print("topic", nn.Softmax(dim=0)(torch.tensor([0.80921062969516, 0.8013798113888421 ,0.7871673740925216])))
print("subtopic", nn.Softmax(dim=0)(torch.tensor([0.549682886695332, 0.5226244624514241 ,0.5037623216318505])))

exec tensor([0.3402, 0.3316, 0.3282])
topic tensor([0.3367, 0.3340, 0.3293])
subtopic tensor([0.3415, 0.3324, 0.3262])


In [None]:
# CV
# me5 Score: 0.7171 Target_score: 0.679446758195246 Scores: 0.7923219900503915, 0.80921062969516, 0.549682886695332
# sbert Score: 0.6970 Target_score: 0.6620021369201331 Scores: 0.766880231985079, 0.8013798113888421, 0.5226244624514241
# labse Score: 0.6825 Target_score: 0.645464847862186 Scores: 0.7566052420202363, 0.7871673740925216, 0.5037623216318505

final_exec_predictions = 0.3402*me5_exec_predictions + 0.3316*sbert_exec_predictions + 0.3282*labse_exec_predictions
final_topic_predictions = 0.3367*me5_topic_predictions + 0.3340*sbert_topic_predictions + 0.3293*labse_topic_predictions
final_subtopic_predictions = 0.3415*me5_subtopic_predictions + 0.3324*sbert_subtopic_predictions + 0.3262*labse_subtopic_predictions

In [None]:
final_labels_exec = [np.argmax(el) for el in final_exec_predictions]
final_labels_topic = [np.argmax(el) for el in final_topic_predictions]
final_labels_subtopic = [np.argmax(el) for el in final_subtopic_predictions]

In [None]:
import pickle

with open ("output_me5/executor_le.pkl", "rb") as f:
    exec_le = pickle.load(f)

with open ("output_sbert/executor_le.pkl", "rb") as f:
    sbert_exec_le = pickle.load(f)

final_labels_exec = exec_le.inverse_transform(final_labels_exec)
final_labels_exec

array(['Лысьвенский городской округ',
       'Министерство социального развития ПК',
       'Министерство социального развития ПК',
       'Министерство социального развития ПК', 'Город Пермь',
       'Министерство социального развития ПК',
       'Лысьвенский городской округ',
       'Министерство социального развития ПК', 'Город Пермь',
       'Лысьвенский городской округ',
       'Министерство социального развития ПК', 'Город Пермь',
       'Лысьвенский городской округ', 'Губахинский городской округ',
       'Город Пермь', 'Министерство здравоохранения', 'АО ПРО ТКО',
       'АО ПРО ТКО', 'Город Пермь', 'АО ПРО ТКО',
       'Бардымский муниципальный округ Пермского края',
       'Лысьвенский городской округ',
       'Александровский муниципальный округ Пермского края',
       'Министерство здравоохранения', 'Министерство здравоохранения',
       'ИГЖН ПК', 'Лысьвенский городской округ',
       'Министерство социального развития ПК',
       'Министерство здравоохранения', 'АО ПРО ТКО

In [58]:
dict(zip(exec_le.classes_, exec_le.transform(exec_le.classes_)))

{'АО ПРО ТКО': 0,
 'Александровский муниципальный округ Пермского края': 1,
 'Бардымский муниципальный округ Пермского края': 2,
 'Город Пермь': 3,
 'Губахинский городской округ': 4,
 'ИГЖН ПК': 5,
 'Лысьвенский городской округ': 6,
 'Министерство здравоохранения': 7,
 'Министерство образования': 8,
 'Министерство социального развития ПК': 9}

In [57]:
dict(zip(sbert_exec_le.classes_, sbert_exec_le.transform(sbert_exec_le.classes_)))

array(['АО ПРО ТКО', 'Александровский муниципальный округ Пермского края',
       'Бардымский муниципальный округ Пермского края', 'Город Пермь',
       'Губахинский городской округ', 'ИГЖН ПК',
       'Лысьвенский городской округ', 'Министерство здравоохранения',
       'Министерство образования', 'Министерство социального развития ПК'],
      dtype='<U50')

In [None]:
f1_score(test["Исполнитель"].tolist(), final_labels_exec, average="weighted")

0.36100790295310176

In [46]:
test

Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема,tokenize_length
0,Лысьвенский городской округ,Благоустройство,,★ Ненадлежащее содержание зеленых насаждений (...,2
1,АО ПРО ТКО,Мусор/Свалки/ТКО,,★ Уборка/Вывоз мусора,2
2,Министерство социального развития ПК,Социальное обслуживание и защита,Здравствуйте,Дети и многодетные семьи,3
3,Министерство здравоохранения,Здравоохранение/Медицина,Здравствуйте,Здравоохранение прочее,3
4,Министерство здравоохранения,Здравоохранение/Медицина,Да,★ Просьбы о лечении,3
...,...,...,...,...,...
995,Министерство здравоохранения,Здравоохранение/Медицина,Долго думала писать или нет! br История такая!...,Содержание больниц,458
996,Лысьвенский городской округ,Благоустройство,Ситуация по этим качелям следующая br 1. Качел...,★ Ненадлежащее состояние игровых элементов на ...,465
997,Министерство образования,Образование,Доброго времени суток. Хотел бы чтобы это сооб...,Безопасность образовательного процесса,468
998,Министерство социального развития ПК,Социальное обслуживание и защита,"Ирина Десятерик, 27 мая в 12 43 br Добрый день...",Дети и многодетные семьи,480


In [10]:
test[CFG.target_cols] = predictions
submission = submission.drop(columns=CFG.target_cols).merge(test[['text_id'] + CFG.target_cols], on='text_id', how='left')
display(submission.head())
submission[['text_id'] + CFG.target_cols].to_csv('submission.csv', index=False)

NameError: name 'predictions' is not defined