In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
# Silence every Python warning for the rest of the session. This also hides
# pandas deprecation / FutureWarning messages raised by later cells.
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
# Widen the pandas display limits so that frames shown in this notebook are not
# truncated: up to 500 rows, up to 500 columns, 1000 characters per line.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from scipy.special import softmax
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
# Record the library versions in the cell output for reproducibility.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
# IPython line magic (not plain Python): sets the environment variable
# TOKENIZERS_PARALLELISM to "false" for this kernel process.
%env TOKENIZERS_PARALLELISM=false

# Use the GPU when PyTorch reports that CUDA is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any of the cells visible in this
# notebook, which only blends predictions that were saved to disk beforehand.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.24.0
env: TOKENIZERS_PARALLELISM=false


In [3]:
# ====================================================
# Data Loading
# ====================================================
# Read the three CSV tables from the notebook's working directory.
# `train` is only used to detect ambiguous texts; `test` receives the blended
# predictions; `submission` is the sample file, shown here for reference.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# Sanity check: shape and first rows of the inference input ...
print(f"test.shape: {test.shape}")
display(test.head())

# ... and of the sample submission.
print(f"submission.shape: {submission.shape}")
display(submission.head())

test.shape: (26260, 2)


Unnamed: 0,oid,text
0,749208109,СПОЧНО СООБЩЕСТВО ПРОДАЕТСЯ ЗА 1300Р ЗА ПОКУПК...
1,452466036,Естественное восстановление после тяжелой трен...
2,161038103,Тема нарядов продолжается Одна из британских ж...
3,663621910,Привет Избранный. Ты спрашиваешь себя ЧТО здес...
4,566255305,КОРОЛЬ ПЯТИСОТНИКОВ В ДЕЛЕ Андрей Рублев успеш...


submission.shape: (2626, 2)


Unnamed: 0,oid,category
0,1622114,athletics
1,1663817,autosport
2,3174332,basketball
3,3469228,extreme
4,3905302,boardgames


In [4]:
# clear data
#
# A text that appears in `train` under two or more distinct categories is
# treated as noise. Test rows carrying such a text are dropped, followed by
# exact (oid, text) duplicates.
# NOTE: a later cell writes the prediction arrays into `test` positionally, so
# the row count and row order produced here must match those arrays.

# text with different labels
categories_by_text = train.groupby("text")["category"].agg(list)
distinct_label_counts = categories_by_text.map(lambda labels: len(set(labels)))
noise_text = set(distinct_label_counts.index[distinct_label_counts >= 2])

is_noise = test.text.isin(noise_text)
test = test[~is_noise].drop_duplicates(subset=['oid', 'text'])

In [11]:
def _read_npy(path):
    """Return the array stored in the ``.npy`` file at ``path``."""
    with open(path, 'rb') as npy_file:
        return np.load(npy_file)


# One saved prediction array per model run directory.
predictions_rbc = _read_npy("rubert-base-cased_7epoch_clear_unique/predictions_clear_unique.npy")
predictions_rbcs = _read_npy("rubert-base-cased-sentence_7epoch_clear_unique/predictions_clear_unique.npy")
predictions_rlr = _read_npy("ruRoBERTa_large_rucola_7epoch_clear_unique/predictions_clear_unique.npy")
predictions_slnr = _read_npy("sbert_large_nlu_ru_7epoch_clear_unique/predictions_clear_unique.npy")

In [12]:
# Normalise every model's saved scores along the last axis with softmax
# (presumably the class axis of raw logits -- TODO confirm against the code
# that wrote the .npy files).
# NOTE: the names are rebound to the softmax output, so running this cell a
# second time would apply softmax to already-normalised values.
predictions_rbc, predictions_rbcs, predictions_rlr, predictions_slnr = [
    softmax(raw_scores, axis=-1)
    for raw_scores in (predictions_rbc, predictions_rbcs, predictions_rlr, predictions_slnr)
]

In [13]:
# Average each model's probabilities over the leading axis (presumably one
# slice per fold/checkpoint -- TODO confirm), which removes that axis.
# NOTE: the names are rebound to the averaged result, so running this cell a
# second time would average over the next axis instead.
predictions_rbc, predictions_rbcs, predictions_rlr, predictions_slnr = [
    np.mean(member_probs, axis=0)
    for member_probs in (predictions_rbc, predictions_rbcs, predictions_rlr, predictions_slnr)
]

In [14]:
# Weighted blend of the four models. The hard-coded weights sum to 1; each one
# is paired with the averaged probabilities of the model it applies to.
# NOTE(review): how these weights were obtained is not shown in this notebook.
weighted_members = (
    (0.329740665, predictions_rbc),
    (0.053, predictions_rbcs),
    (0.288835, predictions_rlr),
    (0.328424335, predictions_slnr),
)
# Terms are accumulated left to right, in the order listed above.
predictions = sum(weight * member_probs for weight, member_probs in weighted_members)
predictions.shape

(25893, 13)

In [15]:
# Restore the pickled `le` object; a later cell calls `le.inverse_transform`
# to turn predicted class indices into category names (presumably a fitted
# sklearn LabelEncoder -- TODO confirm).
# NOTE: unpickling executes code stored in the file; only load trusted artifacts.
label_encoder_path = "rubert_base_cased/le.pkl"
with open(label_encoder_path, "rb") as encoder_file:
    le = pickle.load(encoder_file)

In [16]:
# For every test row take the class with the highest blended probability:
# `category` is its decoded name, `prob` the probability itself. The arrays are
# attached positionally, so `predictions` must have one row per row of `test`.
top_class_idx = np.argmax(predictions, axis=-1)
test = test.assign(
    category=le.inverse_transform(top_class_idx),
    prob=np.max(predictions, axis=-1),
)
test.head()

Unnamed: 0,oid,text,category,prob
0,749208109,СПОЧНО СООБЩЕСТВО ПРОДАЕТСЯ ЗА 1300Р ЗА ПОКУПК...,esport,0.795556
1,452466036,Естественное восстановление после тяжелой трен...,extreme,0.491316
2,161038103,Тема нарядов продолжается Одна из британских ж...,tennis,0.999789
3,663621910,Привет Избранный. Ты спрашиваешь себя ЧТО здес...,esport,0.843497
4,566255305,КОРОЛЬ ПЯТИСОТНИКОВ В ДЕЛЕ Андрей Рублев успеш...,tennis,0.999825


In [17]:
# Collapse the per-text predictions into one category per `oid`.
#
# Step 1: add up `prob` for every (oid, category) pair. The native `.sum()`
#         reduction is used instead of `agg(sum)`: passing the Python builtin
#         is deprecated in recent pandas and would otherwise only work via an
#         implicit builtin -> "sum" substitution.
# Step 2: within each oid keep the (oid, category) row with the largest total.
#
# The result is indexed by (oid, category); its `prob` column holds the summed
# probability of the winning category, so values above 1 are expected.
# NOTE: `test` is rebound to the aggregated frame, so this cell cannot be
# re-run without first re-running the cells that build the per-text `test`.
prob_per_oid_category = (
    test[["oid", "category", "prob"]]
    .groupby(["oid", "category"])
    .sum()
)
winning_rows = prob_per_oid_category.groupby("oid")["prob"].idxmax()
test = prob_per_oid_category.loc[winning_rows]
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,prob
oid,category,Unnamed: 2_level_1
1622114,athletics,6.208379
1663817,autosport,6.499804
3174332,basketball,8.556559
3469228,extreme,2.688659
3905302,boardgames,8.638161


In [18]:
# After the aggregation, `test` is indexed by (oid, category) and `prob` is its
# only column. Dropping it leaves just the index, which `index=True` writes out
# as the two columns of the submission file.
submission_rows = test.drop(columns=["prob"])
submission_rows.to_csv("ensemble_v6.csv", index=True)