In [None]:
from transformers import set_seed

# Seed the random number generators through transformers' `set_seed` helper
# so that re-running the notebook starts from the same seed.
set_seed(228)

In [None]:
import pickle

def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables used when the submission is assembled:
# `stared_dict` is consulted with .get(theme, theme), `theme_to_group_dict` is indexed by theme.
stared_dict = _load_pickle('stared.dict')
theme_to_group_dict = _load_pickle('theme_to_group_dict.dict')

In [None]:
# CSV inputs: the training frame (its `theme` column supplies the label set)
# and the test frame that predictions are produced for.
TRAIN_DF_PATH = 'preproc_dataset.csv'
TEST_DF_PATH = 'test.csv'
# Column of the test CSV holding the raw text that is cleaned into `text`.
TEXT_COLUMN = 'Текст инцидента'

# Pickled artifacts: the TF-IDF object (used via .transform) and the classifier (used via .predict).
TF_IDF_PATH = 'final_tf_idf.pkl'
CLASSIFIER_PATH = 'final_model.pkl'

In [None]:
import pandas as pd

# Longest text prefix, in characters, that is kept for each test row.
MAX_TEXT_CHARS = 768

dataset_df = pd.read_csv(TRAIN_DF_PATH)

test_df = pd.read_csv(TEST_DF_PATH, sep=';')
# Strip one leading apostrophe, then cut every text to MAX_TEXT_CHARS characters.
# read_csv parses empty cells as NaN (a float), so they are replaced with '' first;
# `startswith` is used instead of indexing `x[0]`, which raised IndexError on ''
# and TypeError on NaN.
test_df['text'] = (
    test_df[TEXT_COLUMN]
    .fillna('')
    .map(lambda text: text[1:] if text.startswith("'") else text)
    .str[:MAX_TEXT_CHARS]
)
test_df.head()

In [None]:
# Number the distinct `theme` values of the training frame, then invert the
# mapping so labels can be translated in both directions.
id2label = dict(enumerate(dataset_df.theme.unique()))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
from datasets import Dataset

# Wrap only the cleaned `text` column in a `datasets.Dataset`; get_logits iterates over it row by row.
test_dataset = Dataset.from_pandas(test_df[['text']])

In [None]:
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification


# Number of texts tokenized and passed through the model per forward pass in get_logits.
BATCH_SIZE = 8

def batch_iter(iterable, n=1):
    """Yield consecutive batches of up to ``n`` items from ``iterable``.

    Works with any iterable, including generators and other objects that do
    not implement ``len()``.

    Args:
        iterable: Source of items; consumed lazily, in order.
        n: Maximum number of items per batch. Must be at least 1.

    Yields:
        Lists of ``n`` items each; the final list may be shorter when the
        number of items is not a multiple of ``n``. Nothing is yielded for an
        empty iterable.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError(f'batch size must be a positive integer, got {n!r}')

    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            # Start a fresh list so the batch already handed out is never mutated.
            batch = []

    # Flush the trailing partial batch, if any.
    if batch:
        yield batch

def get_logits(model_path, device=None):
    """Compute classification logits for every row of ``test_dataset``.

    Loads the tokenizer and the sequence-classification model from
    ``model_path`` and runs the ``text`` field of ``test_dataset`` through the
    model in batches of ``BATCH_SIZE`` with gradients disabled.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Torch device to run inference on. When ``None`` (default),
            ``'cuda'`` is used if CUDA is available, otherwise ``'cpu'``.

    Returns:
        A list with one NumPy array of logits per dataset row, in dataset order.
    """
    if device is None:
        # Fall back to CPU instead of failing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()

    # Ceiling division so the trailing partial batch is counted by the progress bar.
    n_batches = (len(test_dataset) + BATCH_SIZE - 1) // BATCH_SIZE

    test_logits = []
    with torch.no_grad():
        for batch in tqdm(batch_iter(test_dataset, BATCH_SIZE), total=n_batches):
            texts = [row['text'] for row in batch]
            tokenized = tokenizer(texts, return_tensors='pt', truncation=True, padding=True).to(device)
            batch_logits = model(**tokenized).logits.cpu()
            # extend() walks the first axis, adding one logits vector per input text.
            test_logits.extend(batch_logits.numpy())
    return test_logits

In [None]:
# Test-set logits from the first LaBSE checkpoint. The "GROUP" suffix in the path suggests it
# is the theme-group classifier — TODO confirm against the training run.
test_logits_LaBSE = get_logits('LaBSE-en-ru_checkpoint-6795_GROUP-20231125T092338Z-001/LaBSE-en-ru_checkpoint-6795_GROUP')

In [None]:
# Test-set logits from the second checkpoint; the variable name suggests it is the
# theme-level LaBSE model — TODO confirm.
test_logits_LaBSE_theme = get_logits('checkpoint-5500')

In [None]:
# Pair the test texts with each model's per-row logits: one frame per checkpoint,
# both with the columns `text` and `logits`.
X_test_1, X_test_2 = (
    pd.DataFrame({'text': test_dataset['text'], 'logits': model_logits})
    for model_logits in (test_logits_LaBSE, test_logits_LaBSE_theme)
)

In [None]:
# Restore the TF-IDF object and the final classifier from their pickle files.
# NOTE: unpickling can execute arbitrary code; these files must come from a trusted source.
with open(TF_IDF_PATH, 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

with open(CLASSIFIER_PATH, 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.sparse import hstack as sparse_hstack
from torch.nn.functional import softmax


# Sparse TF-IDF representation of the test texts.
tfidf_test = tfidf.transform(X_test_1.text)

# Turn each model's logits into per-row probabilities (softmax along dim=1).
softmax_blocks = [
    softmax(torch.tensor(frame.logits.to_list()), dim=1).numpy()
    for frame in (X_test_1, X_test_2)
]

# Column-wise concatenation: TF-IDF features first, then both probability blocks in order.
test_features = sparse_hstack([tfidf_test, *softmax_blocks])

In [None]:
# Predict one label per test row from the stacked feature matrix; the labels are
# later written to the 'Тема' column of the submission.
y_pred = classifier.predict(test_features)

In [None]:
# Build the submission from a fresh copy of the test CSV.
submission_df = pd.read_csv(TEST_DF_PATH, sep=';')

# Replace predicted labels that have an entry in `stared_dict`; all others are kept as is.
# Computed once and reused for both output columns so they cannot drift apart.
final_themes = [stared_dict.get(theme, theme) for theme in y_pred]

submission_df['Группа тем'] = [theme_to_group_dict[theme] for theme in final_themes]
submission_df['Тема'] = final_themes
# The raw text column is not part of the submission; refer to it through the shared constant.
submission_df = submission_df.drop(columns=[TEXT_COLUMN])

In [None]:
# Display the assembled submission frame for a visual sanity check before it is saved.
submission_df

In [None]:
# Write the submission as a ';'-separated UTF-8 CSV without the index column.
submission_df.to_csv(
    'submission.csv',
    sep=';',
    index=False,
    encoding='utf-8',
)