In [1]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [2]:
def get_logger(filename):
    """Return the module logger, writing bare messages to the console and a file.

    Args:
        filename: Path prefix of the log file; ``.log`` is appended to it.

    Returns:
        The ``logging.Logger`` named after this module, set to INFO, with
        exactly one stream handler and one file handler attached.

    Calling this function again (for example when the notebook cell is
    re-run) replaces the handlers attached by the previous call instead of
    stacking new ones, so every message is emitted once per destination.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object for the same name, so handlers from an
    # earlier call are still attached; detach and close them to avoid
    # duplicated log lines and leaked file handles.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch generators with one value.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed passed to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual seeders are independent of each other, so one loop covers them.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

def preprocess(df):
    """Strip unsupported characters from the ``text`` column.

    Every run of allowed characters (Cyrillic and Latin letters, digits,
    space and ``- . , ? ! +``) is kept. Anything else acts as a separator,
    and the kept runs are re-joined with a single space.

    Args:
        df: DataFrame with a ``text`` column. The column is overwritten on
            the passed frame (same in-place behaviour as before).

    Returns:
        The same DataFrame object, so the call can be chained or reassigned.

    Missing values (``None``/``NaN``) become empty strings and other
    non-string values are converted with ``str`` before cleaning, instead of
    raising ``TypeError`` inside ``re.findall``.
    """
    # "ёЁ" is listed explicitly because it lies outside the а-я / А-Я ranges.
    allowed_runs = re.compile(r"[а-яА-Я0-9 ёЁ\-\.,?!+a-zA-Z]+")

    def _clean(value):
        if not isinstance(value, str):
            # Empty CSV cells arrive as NaN/None; treat them as empty text.
            if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
                return ""
            value = str(value)
        return " ".join(allowed_runs.findall(value))

    df["text"] = df["text"].apply(_clean)

    return df

In [3]:
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: Config object exposing ``tokenizer`` (a Hugging Face tokenizer)
            and, optionally, ``max_len``. When ``max_len`` is absent the
            previous hard-coded limit of 512 is used.
        text: The string to encode.

    Returns:
        The tokenizer output mapping with every value converted to a
        ``torch.long`` tensor, truncated and padded to the maximum length.
    """
    max_length = getattr(cfg, "max_len", 512)
    # `padding="max_length"` is the supported spelling of the deprecated
    # `pad_to_max_length=True`, and calling the tokenizer directly replaces
    # the deprecated `encode_plus`; the produced encoding is the same.
    # NOTE(review): since every sample is already padded to `max_length`, a
    # collator configured with padding='longest' has nothing left to shorten.
    inputs = cfg.tokenizer(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Inference dataset over the ``text`` column; rows are tokenized on access."""

    def __init__(self, cfg, df):
        """Store the config and the raw texts taken from ``df['text']``."""
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return what ``prepare_input`` produces for the text at index ``item``."""
        text = self.texts[item]
        return prepare_input(self.cfg, text)

In [12]:
def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states over positions where ``attention_mask`` is non-zero.

    Args:
        last_hidden_states: Hidden-state tensor; dim 1 is the one summed over
            (expected layout: batch, sequence, hidden).
        attention_mask: Mask tensor aligned with the first two dims of
            ``last_hidden_states``; zero entries are excluded from the mean.

    Returns:
        Tensor with dim 1 reduced: the masked sum divided by the per-row
        count of non-zero mask entries.
    """
    ignored_positions = ~attention_mask.unsqueeze(-1).bool()
    kept_states = last_hidden_states.masked_fill(ignored_positions, 0.0)
    tokens_per_row = attention_mask.sum(dim=1).unsqueeze(-1)
    return kept_states.sum(dim=1) / tokens_per_row


class CustomModel(nn.Module):
    """Transformer backbone followed by masked mean pooling and an 11-output linear head."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the classification head.

        Args:
            cfg: Config object; ``cfg.model`` and ``cfg.gradient_checkpointing``
                are read here.
            config_path: Optional path to a saved model config loaded with
                ``torch.load``. When ``None`` the config is fetched for
                ``cfg.model`` and its dropout fields are set to zero.
            pretrained: When true the backbone weights are loaded with
                ``AutoModel.from_pretrained``; otherwise the backbone is
                built from the config alone.
        """
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            # NOTE(review): torch.load unpickles the file; only point this at
            # a config produced by a trusted training run.
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(
                cfg.model, output_hidden_states=True)
            # Zero all four dropout-related config fields.
            for dropout_field in ("hidden_dropout", "hidden_dropout_prob",
                                  "attention_dropout", "attention_probs_dropout_prob"):
                setattr(self.config, dropout_field, 0.)
            LOGGER.info(self.config)

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.fc = nn.Linear(self.config.hidden_size, 11)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range``; Linear biases and the Embedding
        padding row are zeroed; LayerNorm is reset to weight 1 and bias 0.
        Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature_me5(self, inputs):
        """Run the backbone and mean-pool its last hidden state using the attention mask."""
        hidden_states = self.model(**inputs).last_hidden_state
        return average_pool(hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the linear head's output for the pooled features of ``inputs``."""
        return self.fc(self.feature_me5(inputs))

In [13]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    Args:
        test_loader: Sized iterable of batches; each batch is a mapping whose
            values support ``.to(device)``.
        model: Module called as ``model(batch)``; it is switched to eval mode
            and moved to ``device``.
        device: Device the model and every batch are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    with torch.no_grad():
        for inputs in progress:
            # Move the batch tensors onto the target device, key by key.
            for key in list(inputs.keys()):
                inputs[key] = inputs[key].to(device)
            batch_outputs.append(model(inputs).cpu().numpy())

    return np.concatenate(batch_outputs)

In [14]:
# CSV file to run inference on; it is read with pd.read_csv in a later cell.
test_path = "dataset/sample.csv"

In [15]:
class CFG:
    """Static settings shared by the inference cells of this notebook."""

    # Artifact locations
    path = "output_intfloat-multilingual-e5-large-extradata_baseline"
    config_path = os.path.join(path, 'config.pth')

    # Backbone
    model = "intfloat/multilingual-e5-large"
    gradient_checkpointing = False
    max_len = 512

    # Data loading
    num_workers = 8
    batch_size = 32

    # Targets and folds
    target_cols = ['class']
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    seed = 42

# Load the tokenizer stored under <CFG.path>/tokenizer and attach it to the
# config class, where prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(os.path.join(CFG.path, 'tokenizer'))
# Global logger writing to the console and to <CFG.path>/inference.log.
# CustomModel.__init__ reads this global when it is built without a config_path.
LOGGER = get_logger(os.path.join(CFG.path, 'inference'))

In [16]:
# Re-score the stored out-of-fold predictions with macro F1.
oof_df = pd.read_pickle(os.path.join(CFG.path, 'oof_df.pkl'))

labels = oof_df[CFG.target_cols].values
# One "pred" entry per target column (a single column with the current CFG).
preds = oof_df[["pred"] * len(CFG.target_cols)].values

score = f1_score(labels[:, 0], preds[:, 0], average="macro")
LOGGER.info(f'Score: {score:.4f}')

Score: 0.9946
Score: 0.9946


In [17]:
# Load the evaluation CSV and clean its text column.
test = preprocess(pd.read_csv(test_path))

# The multilingual-e5 checkpoint gets a "query: " prefix on every text.
if CFG.model == "intfloat/multilingual-e5-large":
    test["text"] = "query: " + test["text"]

# Token count per text (no truncation requested here), used to order the rows
# from shortest to longest.
test['tokenize_length'] = test['text'].map(
    lambda text: len(CFG.tokenizer(text)['input_ids']))
test = test.sort_values('tokenize_length').reset_index(drop=True)
print(f"test.shape: {test.shape}")
display(test.head())

Token indices sequence length is longer than the specified maximum sequence length for this model (1423 > 512). Running this sequence through the model will result in indexing errors


test.shape: (501, 3)


Unnamed: 0,class,text,tokenize_length
0,application,query: Директору ООО Мармелад Денисову Олег...,76
1,application,query: Директору ООО Рога Денисову Антону Ви...,80
2,application,query: Директору ООО Рога и копыта Денисову ...,81
3,application,query: Директору ООО Цветы жизни Денисову Ол...,84
4,application,query: Директору ООО Роща леса Денисову Олег...,88


In [18]:
# Batches are built in the (length-sorted) row order of `test`; shuffle stays
# off so the predictions line up with the rows.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

# Score the test set with every fold checkpoint, one model in memory at a time.
predictions = []
for fold in CFG.trn_fold:
    state = torch.load(
        os.path.join(CFG.path, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
        map_location=torch.device('cpu'),
    )
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])

    predictions.append(inference_fn(test_loader, model, device))

    # Drop the fold's model and checkpoint before loading the next one.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# Average the raw model outputs across folds.
predictions = np.mean(predictions, axis=0)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [19]:
# Predicted class index per row: position of the largest fold-averaged score.
final_labels = list(np.argmax(predictions, axis=1))
final_labels[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [20]:
# Map class indices back to their string labels with the encoder stored next
# to the other model artifacts (presumably a fitted label encoder).
# The path is derived from CFG.path so it cannot drift from the directory the
# checkpoints are read from; `os` and `pickle` come from the top import cell.
# NOTE: pickle.load can execute arbitrary code from the file; only load an
# encoder written by a trusted training run.
with open(os.path.join(CFG.path, "executor_le.pkl"), "rb") as f:
    exec_le = pickle.load(f)

final_labels_exec = exec_le.inverse_transform(final_labels)
final_labels_exec[:10]

array(['application', 'application', 'application', 'application',
       'application', 'application', 'application', 'application',
       'application', 'application'], dtype='<U14')

In [21]:
# Weighted-average F1 of the decoded predictions against the `class` column of `test`.
# NOTE(review): the out-of-fold check earlier uses average="macro" while this
# uses "weighted"; confirm the two are meant to differ before comparing them.
f1_score(test["class"].tolist(), final_labels_exec, average="weighted")

0.9960096767812957

In [None]:
# Pair every scored text with its decoded predicted label and save the result.
# The notebook never defines a `submission` frame and `test` has no `text_id`
# column, so the file is built directly from `test`. Rows were re-sorted by
# token length, so they are identified by their (cleaned, prefixed) text
# rather than by position. The ground-truth `class` column of `test` is left
# untouched instead of being overwritten with raw model scores.
submission = pd.DataFrame({
    'text': test['text'].values,
    CFG.target_cols[0]: final_labels_exec,
})
display(submission.head())
submission.to_csv('submission.csv', index=False)