In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import torch
import torch.nn as nn
from tqdm import tqdm

import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

#### Данные

In [19]:
def preprocess(df):
    df['activation_date'] = pd.to_datetime(df['activation_date'])

    df['day'] = df['activation_date'].dt.day
    df['month'] = df["activation_date"].dt.month
    df['year'] = df["activation_date"].dt.year
    df['weekday'] = df['activation_date'].dt.weekday
    df["dayofyear"] = df['activation_date'].dt.dayofyear
    df.drop(columns=['activation_date', 'item_id'], inplace=True)
    df['param_1'] = df['param_1'].fillna('')
    df['param_2'] = df['param_2'].fillna('')
    df['param_3'] = df['param_3'].fillna('')
    df['description'] = df['description'].fillna('')
    return df

test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/train.csv')
item_id = test.item_id
train = preprocess(train)
test = preprocess(test)

### Табличные данные

### Catboost

In [3]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['deal_probability', 'image', 'param_1', 'param_2', 'param_3', 'title', 'description', 'region', 'city', 'parent_category_name', 'category_name']), train['deal_probability'], test_size=0.2, random_state=42)

In [4]:
model = CatBoostRegressor(loss_function='RMSE')

model.fit(X_train, y_train, eval_set=(X_val,y_val), use_best_model=True, verbose=100, cat_features=['user_type', 'user_id'])

Learning rate set to 0.155054
0:	learn: 0.2567373	test: 0.2563903	best: 0.2563903 (0)	total: 315ms	remaining: 5m 14s
100:	learn: 0.2387694	test: 0.2371287	best: 0.2371287 (100)	total: 17.3s	remaining: 2m 33s
200:	learn: 0.2372134	test: 0.2356789	best: 0.2356789 (200)	total: 34.4s	remaining: 2m 16s
300:	learn: 0.2363882	test: 0.2349105	best: 0.2349105 (300)	total: 52.6s	remaining: 2m 2s
400:	learn: 0.2358867	test: 0.2344587	best: 0.2344587 (400)	total: 1m 10s	remaining: 1m 45s
500:	learn: 0.2355119	test: 0.2341662	best: 0.2341661 (499)	total: 1m 28s	remaining: 1m 28s
600:	learn: 0.2352342	test: 0.2339664	best: 0.2339664 (600)	total: 1m 46s	remaining: 1m 10s
700:	learn: 0.2349980	test: 0.2338060	best: 0.2338060 (700)	total: 2m 3s	remaining: 52.6s
800:	learn: 0.2347932	test: 0.2336635	best: 0.2336635 (800)	total: 2m 20s	remaining: 34.9s
900:	learn: 0.2346339	test: 0.2335708	best: 0.2335708 (900)	total: 2m 36s	remaining: 17.2s
999:	learn: 0.2344694	test: 0.2334613	best: 0.2334613 (999)	tot

<catboost.core.CatBoostRegressor at 0x2c3e56c70>

In [6]:
model.save_model("catboost_model_tabular.cbm")
result = np.clip(model.predict(test.drop(columns=['image', 'image', 'param_1', 'param_2', 'param_3', 'title', 'description', 'region', 'city', 'parent_category_name', 'category_name'])), 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result}).to_csv("../results/catboost_model_tabular.csv", index=0)

In [21]:
result = pd.read_csv("../results/catboost_model_tabular.csv").deal_probability.values

### RNN

In [39]:
model = nn.Sequential(
    nn.Linear(8, 24),
    nn.ReLU(),
    nn.Linear(24, 12),
    nn.ReLU(),
    nn.Linear(12, 6),
    nn.ReLU(),
    nn.Linear(6, 1)
)
checkpoint = torch.load("models/Sequential_7_0.81_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [40]:
user_type_dict = {'Private': 0, 'Company': 1, 'Shop': 2}
y_pred_rnn = []
len_test = test.shape[0]
for i, row in tqdm(test.iterrows(), total=len_test):
    tabular = torch.tensor([row["item_seq_number"], row["day"], row["month"], row["year"], row["weekday"], row["dayofyear"], user_type_dict[row["user_type"]], 0.0 if row["price"] is None else row["price"]])
    tabular = torch.nan_to_num(tabular, nan=0.0)
    y_pred_rnn.append(float(model(tabular)))

100%|██████████| 508438/508438 [00:32<00:00, 15637.18it/s]


In [41]:
result_rnn = np.clip(y_pred_rnn, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result_rnn}).to_csv("../results/rnn-tabular.csv", index=0)

Результат: 0.27903

### Текст

### Transformer

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
class TransformerModelWithAttention(nn.Module):
    def __init__(self, input_dim = 1024, hidden_dim=128, num_heads = 4, num_layers = 8, dropout = 0.1):
        super(TransformerModelWithAttention, self).__init__()
        self.in_layer = nn.Linear(input_dim, hidden_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 10000, hidden_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model = hidden_dim, nhead = num_heads, dim_feedforward = hidden_dim, dropout = dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)
        self.fc_out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = x.to(device)
        x = self.in_layer(x)
        batch_size, seq_len, _ = x.size()
        x = x + self.positional_encoding[:, :seq_len, :]
        encoder_output = self.transformer_encoder(x)
        x = encoder_output.mean(dim = 1)
        return self.fc_out(x).flatten()

In [12]:
model = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2)
checkpoint = torch.load("models/TransformerModelWithAttention_7_0.74_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [22]:
import os
jina_list = sorted(os.listdir('../data/jina'), key= lambda x: int(x.replace("jina_test_", "")))

In [23]:
import pickle
import io

class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)

In [25]:
user_type_dict = {'Private': 0, 'Company': 1, 'Shop': 2}
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
for i, row in tqdm(test.iterrows(), total=len_test):
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    text_embedding = jina_emb[i % 10000].unsqueeze(0)
    y_pred.append(float(model(text_embedding.float())))

100%|██████████| 508438/508438 [10:53<00:00, 778.00it/s] 


In [30]:
result_text = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result_text}).to_csv("../results/transformer-text.csv", index=0)

Результат: 0.28390

LSTM

In [46]:
class LSTM(nn.Module):
    def __init__(self, input_size = 1024, hidden_size = 64, num_layers = 2, dropout = 0.1, bidirectional=True):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional=bidirectional
        )
        if bidirectional:
            self.fc = nn.Linear(2 * hidden_size, 1)
        else:
            self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        if self.lstm.bidirectional:
            h0, c0 = torch.zeros(2 * self.num_layers, len(x), self.hidden_size).to(device), torch.zeros(2 * self.num_layers, len(x), self.hidden_size).to(device)
        else:
            h0, c0 = torch.zeros(self.num_layers, len(x), self.hidden_size).to(device), torch.zeros(self.num_layers, len(x), self.hidden_size).to(device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        if self.lstm.bidirectional:
            out = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        else:
            out = out[:, -1, :]
        out = self.fc(out)
        return out

In [47]:
model = LSTM().to(device)
checkpoint = torch.load("models/LSTM_7_0.88_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [48]:
user_type_dict = {'Private': 0, 'Company': 1, 'Shop': 2}
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
for i, row in tqdm(test.iterrows(), total=len_test):
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    tabular = torch.tensor([row["item_seq_number"], row["day"], row["month"], row["year"], row["weekday"], row["dayofyear"], user_type_dict[row["user_type"]], 0.0 if row["price"] is None else row["price"]])
    tabular = tabular.unsqueeze(0).unsqueeze(2).expand(-1, -1, 1024)
    tabular = torch.nan_to_num(tabular,nan=0.0)
    text_embedding = jina_emb[i % 10000].unsqueeze(0)
    emb_concat = torch.concat((tabular, text_embedding), 1)
    y_pred.append(float(model(emb_concat)))

100%|██████████| 508438/508438 [1:15:35<00:00, 112.10it/s] 


In [49]:
result_text = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result_text}).to_csv("../results/lstm-text.csv", index=0)

Результат: 0.27610

### Catboost + Transformer

In [35]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result + result_text) / 2.0}).to_csv("../results/decision_catboost_transformer.csv", index=0)

Результат: 0.26810

взвешенный результат

In [38]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result + 2.0 * result_text) / 3.0}).to_csv("../results/decision_catboost_transformer_2_1.csv", index=0)

Результат: 0.26360

### RNN + Transformer

In [42]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result_rnn + result_text) / 2.0}).to_csv("../results/decision_rnn_transformer.csv", index=0)

Результат: 0.25335

взвешенный результат

In [43]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (2.0 * result_rnn + result_text) / 3.0}).to_csv("../results/decision_rnn_transformer_2_1.csv", index=0)

Результат: 0.25573

### Catboost + LSTM

In [50]:
result = pd.read_csv("../results/catboost_model_tabular.csv").deal_probability.values

In [55]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result + result_text) / 2.0}).to_csv("../results/decision_catboost_lstm.csv", index=0)

Результат: 0.26472

взвешенный результат

In [57]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result + 2 * result_text) / 3.0}).to_csv("../results/decision_catboost_lstm_1_2.csv", index=0)

Результат: 0.25867

### RNN + LSTM

In [53]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result_rnn + result_text) / 2.0}).to_csv("../results/decision_rnn_lstm.csv", index=0)

Результат: 0.27541

взвешенный результат

In [58]:
pd.DataFrame({'item_id': item_id, 'deal_probability': (result_rnn + 2.0 * result_text) / 3.0}).to_csv("../results/decision_rnn_lstm_1_2.csv", index=0)

Результат: 0.27516