In [129]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

import warnings
# Suppress UserWarning and FutureWarning messages for the rest of the session
# so they do not clutter the cell outputs.
for warn in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=warn)

In [None]:
def preprocess(df):
    """Build the model-ready feature frame from a raw listings table.

    The input frame is left untouched: all work happens on a new frame, so the
    cell can be re-run on the same raw data without failing on columns that a
    previous run already removed.

    Steps:
      * parse ``activation_date`` and expand it into ``day``, ``month``,
        ``year``, ``weekday`` and ``dayofyear`` columns (appended at the end);
      * drop ``activation_date`` and ``item_id``;
      * replace missing values in ``param_1``, ``param_2``, ``param_3`` and
        ``description`` with empty strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``activation_date``, ``item_id``, ``param_1``,
        ``param_2``, ``param_3`` and ``description``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    activation = pd.to_datetime(df['activation_date'])

    # `drop` without `inplace` returns a new frame, so the caller's data is
    # never modified by the assignments below.
    out = df.drop(columns=['activation_date', 'item_id'])

    # Calendar features derived from the activation timestamp.
    out['day'] = activation.dt.day
    out['month'] = activation.dt.month
    out['year'] = activation.dt.year
    out['weekday'] = activation.dt.weekday
    out['dayofyear'] = activation.dt.dayofyear

    # Replace missing text with empty strings in these free-text columns.
    for column in ('param_1', 'param_2', 'param_3', 'description'):
        out[column] = out[column].fillna('')

    return out

# Read the raw train and test tables.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# preprocess() removes `item_id`, so keep the test ids for the submission
# files before the frames are transformed.
item_id = test['item_id']

train, test = preprocess(train), preprocess(test)

In [77]:
# Feature matrix: everything except the target and the `image` column.
train_features = train.drop(columns=['deal_probability', 'image'])
train_target = train['deal_probability']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [79]:
# Baseline: RMSE-optimised CatBoost regressor; only the loss function is set
# explicitly and no validation set is passed to `fit`.
model = CatBoostRegressor(loss_function='RMSE')

# Columns handled by CatBoost as categorical and as text features.
baseline_fit_options = dict(
    cat_features=['user_type', 'region', 'city', 'parent_category_name', 'category_name', 'user_id'],
    text_features=['param_1', 'param_2', 'param_3', 'title', 'description'],
    verbose=100,
)

model.fit(X_train, y_train, **baseline_fit_options)

Learning rate set to 0.125559
0:	learn: 0.2560008	total: 1.4s	remaining: 23m 13s
100:	learn: 0.2311637	total: 1m 25s	remaining: 12m 44s
200:	learn: 0.2286115	total: 2m 43s	remaining: 10m 49s
300:	learn: 0.2271049	total: 4m 3s	remaining: 9m 25s
400:	learn: 0.2259709	total: 5m 22s	remaining: 8m 1s
500:	learn: 0.2251311	total: 6m 44s	remaining: 6m 42s
600:	learn: 0.2244398	total: 8m	remaining: 5m 19s
700:	learn: 0.2238571	total: 9m 18s	remaining: 3m 58s
800:	learn: 0.2233213	total: 10m 40s	remaining: 2m 39s
900:	learn: 0.2228750	total: 11m 54s	remaining: 1m 18s
999:	learn: 0.2224401	total: 13m 10s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x345abc9a0>

In [None]:
# Persist the fitted baseline model.
model.save_model("catboost_model.cbm")

# Score the test set (without `image`) and clamp the predictions to [0, 1]
# before writing the submission file.
result = model.predict(test.drop(columns=['image']))
baseline_submission = pd.DataFrame({
    'item_id': item_id,
    'deal_probability': np.clip(result, 0, 1),
})
baseline_submission.to_csv("../results/catboost.csv", index=False)

Результат: 0.33028

In [102]:
# Second run: same regressor, but the hold-out split is passed as an evaluation
# set and `use_best_model=True` is requested.
model = CatBoostRegressor(loss_function='RMSE')

eval_fit_options = dict(
    eval_set=(X_val, y_val),
    use_best_model=True,
    verbose=100,
    cat_features=['user_type', 'region', 'city', 'parent_category_name', 'category_name', 'user_id'],
    text_features=['param_1', 'param_2', 'param_3', 'title', 'description'],
)

model.fit(X_train, y_train, **eval_fit_options)

Learning rate set to 0.155054
0:	learn: 0.2550980	test: 0.2547325	best: 0.2547325 (0)	total: 1.12s	remaining: 18m 40s
100:	learn: 0.2304103	test: 0.2293768	best: 0.2293768 (100)	total: 1m 17s	remaining: 11m 32s
200:	learn: 0.2278975	test: 0.2269961	best: 0.2269961 (200)	total: 2m 29s	remaining: 9m 55s
300:	learn: 0.2263355	test: 0.2256411	best: 0.2256411 (300)	total: 3m 44s	remaining: 8m 42s
400:	learn: 0.2252344	test: 0.2246647	best: 0.2246647 (400)	total: 5m	remaining: 7m 28s
500:	learn: 0.2243428	test: 0.2239676	best: 0.2239676 (500)	total: 6m 20s	remaining: 6m 18s
600:	learn: 0.2236491	test: 0.2234614	best: 0.2234614 (600)	total: 7m 45s	remaining: 5m 9s
700:	learn: 0.2230237	test: 0.2230561	best: 0.2230561 (700)	total: 9m 2s	remaining: 3m 51s
800:	learn: 0.2224853	test: 0.2227070	best: 0.2227070 (800)	total: 10m 14s	remaining: 2m 32s
900:	learn: 0.2220028	test: 0.2224447	best: 0.2224447 (900)	total: 11m 35s	remaining: 1m 16s
999:	learn: 0.2215728	test: 0.2222553	best: 0.2222553 (99

<catboost.core.CatBoostRegressor at 0x4e82c8be0>

In [103]:
# Persist the model that was fitted with the evaluation set.
model.save_model("catboost_model2.cbm")

# Score the test set (without `image`) and clamp the predictions to [0, 1]
# before writing the submission file.
result = model.predict(test.drop(columns=['image']))
eval_submission = pd.DataFrame({
    'item_id': item_id,
    'deal_probability': np.clip(result, 0, 1),
})
eval_submission.to_csv("../results/catboost2.csv", index=False)

Результат: 0.3456

In [134]:
# The search uses the full training table: all features except the target and `image`.
X = train.drop(columns=['deal_probability', 'image'])
y = train['deal_probability']

pool_train = Pool(
    data=X,
    label=y,
    cat_features=['user_type', 'region', 'city', 'parent_category_name', 'category_name', 'user_id'],
    text_features=['param_1', 'param_2', 'param_3', 'title', 'description'],
)

# Hyperparameter grid: 4 learning rates x 4 depths x 5 L2 strengths = 80 candidates.
grid = {
    'learning_rate': [0.03, 0.05, 0.1, 0.15],
    'depth': [4, 6, 10, 16],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

model = CatBoostRegressor(loss_function='RMSE')

# NOTE(review): per the CatBoost docs, `search_by_train_test_split=True` scores
# candidates on a single train/test split of the pool — confirm for the
# installed version.
grid_search_result = model.grid_search(
    grid,
    X=pool_train,
    search_by_train_test_split=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.2923811	test: 0.2913955	best: 0.2913955 (0)	total: 298ms	remaining: 4m 57s
1:	learn: 0.2897441	test: 0.2887585	best: 0.2887585 (1)	total: 508ms	remaining: 4m 13s
2:	learn: 0.2872370	test: 0.2862527	best: 0.2862527 (2)	total: 608ms	remaining: 3m 22s
3:	learn: 0.2848640	test: 0.2838793	best: 0.2838793 (3)	total: 809ms	remaining: 3m 21s
4:	learn: 0.2826067	test: 0.2816243	best: 0.2816243 (4)	total: 900ms	remaining: 2m 59s
5:	learn: 0.2804702	test: 0.2794882	best: 0.2794882 (5)	total: 964ms	remaining: 2m 39s
6:	learn: 0.2784149	test: 0.2774300	best: 0.2774300 (6)	total: 1.08s	remaining: 2m 32s
7:	learn: 0.2764887	test: 0.2755071	best: 0.2755071 (7)	total: 1.18s	remaining: 2m 25s
8:	learn: 0.2746610	test: 0.2736838	best: 0.2736838 (8)	total: 1.28s	remaining: 2m 20s
9:	learn: 0.2728934	test: 0.2719147	best: 0.2719147 (9)	total: 1.34s	remaining: 2m 13s
10:	learn: 0.2712213	test: 0.2702413	best: 0.2702413 (10)	total: 1.45s	remaining: 2m 10s
11:	learn: 0.2696622	test: 0.2686811	best

In [135]:
# Persist the estimator on which the grid search was run.
model.save_model("catboost_model_grid_search.cbm")

# Score the test set (without `image`) and clamp the predictions to [0, 1]
# before writing the submission file.
result = model.predict(test.drop(columns=['image']))
grid_search_submission = pd.DataFrame({
    'item_id': item_id,
    'deal_probability': np.clip(result, 0, 1),
})
grid_search_submission.to_csv("../results/catboost_grid_search.csv", index=False)

Результат: 0.35807