## Импорт необходимых библиотек и настройка среды

In [80]:
import os
import pickle
import collections
import datetime
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split, cross_val_score
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import r2_score

import warnings

# Seed passed to every train/test split and cross-validation call in this notebook.
RANDOM_STATE = 42

# Pandas display settings: print at most 20 rows, but show every column and
# never truncate the text inside a cell.
pd.options.display.max_rows = 20
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries in use.
warnings.filterwarnings("ignore")

## Загрузка датасетов

In [2]:
#train_df = pd.read_csv('train_features_v1.csv', index_col='document_id')

In [3]:
# Read the wide training CSV in 100-row chunks (so tqdm can report progress)
# and concatenate once at the end. Calling pd.concat inside the loop re-copies
# the whole accumulated frame on every iteration.
train_chunks = list(
    tqdm(pd.read_csv('train_features_v6.1.csv', index_col='document_id', chunksize=100))
)
# Keep the original fallback of an empty frame when the reader yields no chunks.
train_df = pd.concat(train_chunks) if train_chunks else pd.DataFrame()
# The chunk list is no longer needed; release the duplicate copy of the data.
del train_chunks

0it [00:00, ?it/s]

In [4]:
# Sanity check: (n_rows, n_columns) of the loaded training frame.
train_df.shape

(7000, 35692)

In [5]:
#train_df.head(3)

In [6]:
# Read the wide test CSV in 100-row chunks (so tqdm can report progress) and
# concatenate once at the end. Calling pd.concat inside the loop re-copies the
# whole accumulated frame on every iteration.
test_chunks = list(
    tqdm(pd.read_csv('test_features_v6.1.csv', index_col='document_id', chunksize=100))
)
# Keep the original fallback of an empty frame when the reader yields no chunks.
test_df = pd.concat(test_chunks) if test_chunks else pd.DataFrame()
# The chunk list is no longer needed; release the duplicate copy of the data.
del test_chunks

0it [00:00, ?it/s]

In [7]:
# Sanity check: (n_rows, n_columns) of the loaded test frame.
test_df.shape

(3000, 35689)

In [8]:
#test_df.head(3)

## Проверка корреляции признаков

## Разбиение на признаки и целевые переменные

In [9]:
# The three target columns; they are removed from the training frame to
# obtain the feature matrix.
to_drop_train = [
    'views',
    'depth',
    'full_reads_percent',
]

In [81]:
# Feature matrix: every column of the training frame except the targets.
train_x = train_df.drop(columns=to_drop_train)

# Targets on their original scale, one NumPy array per metric.
train_y_views = train_df['views'].to_numpy()
train_y_depth = train_df['depth'].to_numpy()
train_y_full_reads_percent = train_df['full_reads_percent'].to_numpy()

# Log-target variant used for the alternative experiment. When it is enabled,
# the predictions have to be passed through np.exp before submission (see the
# commented lines in the cell that builds the submission frame).
# train_y_views = [np.log(x) for x in train_df['views'].values]
# train_y_depth = [np.log(x) for x in train_df['depth'].values]
# train_y_full_reads_percent = [np.log(x) for x in train_df['full_reads_percent'].values]

In [11]:
# plt.figure(figsize=(24, 12))
# plt.scatter(range(len(train_y_full_reads_percent)), train_y_full_reads_percent)
# plt.show()

# CatBoost

## views

### Создание выборок

In [82]:
# Hold out 20% of the labelled rows for scoring the views model; the split is seeded.
(
    X_train_views,
    X_test_views,
    y_train_views,
    y_test_views,
) = train_test_split(
    train_x,
    train_y_views,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [83]:
# All labelled rows with the views target; used for cross-validation and the final refit.
pool_full_views = Pool(data=train_x, label=train_y_views)

In [84]:
# Training part of the split, with labels.
pool_train_views = Pool(data=X_train_views, label=y_train_views)
# Hold-out features twice: with labels (eval set during fitting) and
# without labels (input to predict when computing R2).
pool_eval_views = Pool(data=X_test_views, label=y_test_views)
pool_test_views = Pool(data=X_test_views)

### Кросс-валидация

In [85]:
# CatBoost settings for cross-validation: RMSE as the loss, R2 as the reported metric.
params_cv = dict(eval_metric='R2', loss_function='RMSE')

In [86]:
# Seeded 5-fold cross-validation of the views target on all labelled rows;
# progress is logged every 10 iterations and the metric widget is shown.
cv_views = cv(
    pool=pool_full_views,
    params=params_cv,
    fold_count=5,
    seed=RANDOM_STATE,
    verbose=10,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: -0.0823229	test: -0.0497877	best: -0.0497877 (0)	total: 139ms	remaining: 2m 18s
10:	learn: 0.1498043	test: 0.2088296	best: 0.2088296 (10)	total: 1.63s	remaining: 2m 26s
20:	learn: 0.3161744	test: 0.3896570	best: 0.3896570 (20)	total: 2.91s	remaining: 2m 15s
30:	learn: 0.4377678	test: 0.5236563	best: 0.5236563 (30)	total: 4.15s	remaining: 2m 9s
40:	learn: 0.5269907	test: 0.6210829	best: 0.6210829 (40)	total: 5.53s	remaining: 2m 9s
50:	learn: 0.5905622	test: 0.6878155	best: 0.6878155 (50)	total: 6.86s	remaining: 2m 7s
60:	learn: 0.6383157	test: 0.7370681	best: 0.7370681 (60)	total: 8.19s	remaining: 2m 6s
70:	learn: 0.6693694	test: 0.7689250	best: 0.7689250 (70)	total: 9.38s	remaining: 2m 2s
80:	learn: 0.7028621	test: 0.7986095	best: 0.7986095 (80)	total: 10.7s	remaining: 2m 1s
90:	learn: 0.7225733	test: 0.8162463	best: 0.8162463 (90)	total: 11.9s	remaining: 1m 58s
100:	learn: 0.7394392	test: 0.8312145	best: 0.8312145 (100)	total: 13.1s	remaining: 1m 56s
1

In [87]:
# Per-iteration CV results: mean/std of R2 and RMSE on the train and test folds.
cv_views

Unnamed: 0,iterations,test-R2-mean,test-R2-std,train-R2-mean,train-R2-std,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,-0.108683,0.099183,-0.072824,0.010319,96745.852484,23289.495486,98752.240937,4788.818816
1,1,-0.083853,0.103838,-0.044291,0.010453,95581.663694,22807.071986,97429.733769,4717.474882
2,2,-0.060166,0.108242,-0.016830,0.012636,94460.639568,22360.244154,96135.541357,4560.184775
3,3,-0.036771,0.113466,0.010485,0.013906,93328.118609,21870.116087,94832.803303,4445.590252
4,4,-0.014936,0.118012,0.035818,0.015274,92265.724720,21442.745135,93608.756498,4355.534448
...,...,...,...,...,...,...,...,...,...
995,995,0.752494,0.150568,0.956151,0.003702,42083.191992,10016.242659,19920.234923,345.276844
996,996,0.752512,0.150508,0.956192,0.003692,42083.662233,10016.686858,19911.057999,341.547467
997,997,0.752532,0.150471,0.956226,0.003688,42082.953685,10017.690295,19903.470787,343.215399
998,998,0.752514,0.150509,0.956254,0.003686,42083.723123,10016.941983,19897.030531,345.232168


### Трейн-тест сплит

In [88]:
# Regressor for the views target. Only the reported metric (R2) is set;
# every other hyperparameter is left at its CatBoost default.
cbt_reg_views = CatBoostRegressor(eval_metric='R2')

In [89]:
# Fit on the training part of the split, logging every iteration (verbose=1)
# and tracking R2 on the hold-out pool.
# NOTE(review): pool_eval_views holds the same rows that are later scored with
# r2_score. If CatBoost keeps the best iteration on eval_set (its default when
# one is given), the hold-out R2 below is optimistically biased — confirm.
cbt_reg_views.fit(pool_train_views, eval_set=pool_eval_views, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.066736
0:	learn: 0.0535519	test: 0.0452251	best: 0.0452251 (0)	total: 121ms	remaining: 2m 1s
1:	learn: 0.1027684	test: 0.0904876	best: 0.0904876 (1)	total: 239ms	remaining: 1m 59s
2:	learn: 0.1453105	test: 0.1307937	best: 0.1307937 (2)	total: 372ms	remaining: 2m 3s
3:	learn: 0.1875752	test: 0.1662258	best: 0.1662258 (3)	total: 490ms	remaining: 2m 1s
4:	learn: 0.2268033	test: 0.2021235	best: 0.2021235 (4)	total: 615ms	remaining: 2m 2s
5:	learn: 0.2626736	test: 0.2347414	best: 0.2347414 (5)	total: 736ms	remaining: 2m 1s
6:	learn: 0.2957902	test: 0.2646562	best: 0.2646562 (6)	total: 860ms	remaining: 2m 2s
7:	learn: 0.3191609	test: 0.2826577	best: 0.2826577 (7)	total: 982ms	remaining: 2m 1s
8:	learn: 0.3605070	test: 0.3248814	best: 0.3248814 (8)	total: 1.1s	remaining: 2m 1s
9:	learn: 0.3870477	test: 0.3466640	best: 0.3466640 (9)	total: 1.24s	remaining: 2m 2s
10:	learn: 0.4115465	test: 0.3685626	best: 0.3685626 (10)	total: 1.36s	remaining: 2m 2s
11:	learn: 0.4445992	t

<catboost.core.CatBoostRegressor at 0x1b36e80a250>

### Feature importance

### R2-score

In [90]:
# Score the split-trained views model on the hold-out rows.
y_test_views_pred = cbt_reg_views.predict(pool_test_views)
views_r2 = r2_score(y_true=y_test_views, y_pred=y_test_views_pred)
# Hold-out R2 recorded from earlier runs:
#   0.7323492933679034 - log-transformed target
#   0.8248165727632922 - raw ("norm") target
views_r2

0.8248165727632922

### Обучение на полной выборке

In [30]:
# Refit the same estimator on all labelled rows before predicting the test set.
# No eval_set is passed, so only the learn metric is logged.
cbt_reg_views.fit(pool_full_views, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.055681
0:	learn: 0.0488154	total: 123ms	remaining: 2m 3s
1:	learn: 0.0974906	total: 253ms	remaining: 2m 6s
2:	learn: 0.1424515	total: 374ms	remaining: 2m 4s
3:	learn: 0.1807272	total: 493ms	remaining: 2m 2s
4:	learn: 0.2210438	total: 630ms	remaining: 2m 5s
5:	learn: 0.2554513	total: 760ms	remaining: 2m 5s
6:	learn: 0.2861674	total: 881ms	remaining: 2m 5s
7:	learn: 0.3215688	total: 1s	remaining: 2m 4s
8:	learn: 0.3492550	total: 1.14s	remaining: 2m 5s
9:	learn: 0.3768135	total: 1.27s	remaining: 2m 5s
10:	learn: 0.4040711	total: 1.39s	remaining: 2m 5s
11:	learn: 0.4291182	total: 1.52s	remaining: 2m 5s
12:	learn: 0.4522107	total: 1.69s	remaining: 2m 8s
13:	learn: 0.4716817	total: 1.86s	remaining: 2m 11s
14:	learn: 0.4901922	total: 2.12s	remaining: 2m 19s
15:	learn: 0.5088273	total: 2.27s	remaining: 2m 19s
16:	learn: 0.5254545	total: 2.51s	remaining: 2m 25s
17:	learn: 0.5463384	total: 2.66s	remaining: 2m 25s
18:	learn: 0.5651989	total: 2.81s	remaining: 2m 25s
19:	lear

<catboost.core.CatBoostRegressor at 0x1b3391e5d90>

### Подбор гиперпараметров с помощью Optuna

## depth

### Создание выборок

In [91]:
# Hold out 20% of the labelled rows for scoring the depth model; the split is seeded.
(
    X_train_depth,
    X_test_depth,
    y_train_depth,
    y_test_depth,
) = train_test_split(
    train_x,
    train_y_depth,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [92]:
# All labelled rows with the depth target; used for cross-validation and the final refit.
pool_full_depth = Pool(data=train_x, label=train_y_depth)

In [93]:
# Training part of the split, with labels.
pool_train_depth = Pool(data=X_train_depth, label=y_train_depth)
# Hold-out features twice: with labels (eval set during fitting) and
# without labels (input to predict when computing R2).
pool_eval_depth = Pool(data=X_test_depth, label=y_test_depth)
pool_test_depth = Pool(data=X_test_depth)

### Кросс-валидация

In [94]:
# Seeded 5-fold cross-validation of the depth target on all labelled rows
# (RMSE loss, R2 reported); progress is logged every 10 iterations.
cv_depth = cv(
    pool=pool_full_depth,
    params={'eval_metric': 'R2', 'loss_function': 'RMSE'},
    fold_count=5,
    seed=RANDOM_STATE,
    verbose=10,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: -273.1007416	test: -266.6045217	best: -266.6045217 (0)	total: 139ms	remaining: 2m 18s
10:	learn: -148.7752491	test: -145.0799017	best: -145.0799017 (10)	total: 1.49s	remaining: 2m 14s
20:	learn: -81.0396423	test: -78.8885693	best: -78.8885693 (20)	total: 2.86s	remaining: 2m 13s
30:	learn: -44.1285138	test: -42.8749044	best: -42.8749044 (30)	total: 4.44s	remaining: 2m 18s
40:	learn: -24.0319521	test: -23.2825539	best: -23.2825539 (40)	total: 5.93s	remaining: 2m 18s
50:	learn: -13.0791677	test: -12.6247475	best: -12.6247475 (50)	total: 7.31s	remaining: 2m 16s
60:	learn: -7.1126257	test: -6.8341697	best: -6.8341697 (60)	total: 8.66s	remaining: 2m 13s
70:	learn: -3.8309615	test: -3.6565456	best: -3.6565456 (70)	total: 9.99s	remaining: 2m 10s
80:	learn: -1.9617226	test: -1.8501915	best: -1.8501915 (80)	total: 11.4s	remaining: 2m 9s
90:	learn: -0.8965408	test: -0.8274887	best: -0.8274887 (90)	total: 12.7s	remaining: 2m 6s
100:	learn: -0.2766259	test: -0.23925

In [95]:
# Per-iteration CV results: mean/std of R2 and RMSE on the train and test folds.
cv_depth

Unnamed: 0,iterations,test-R2-mean,test-R2-std,train-R2-mean,train-R2-std,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,-272.397895,10.417972,-271.798073,2.586005,1.073718,0.002481,1.073725,0.000601
1,1,-256.355761,9.776006,-255.775612,2.438665,1.041743,0.002493,1.041716,0.000584
2,2,-241.241958,9.172825,-240.702321,2.290336,1.010692,0.002505,1.010678,0.000570
3,3,-227.033309,8.587571,-226.525184,2.158713,0.980607,0.002515,0.980589,0.000572
4,4,-213.652971,8.034022,-213.171771,2.038254,0.951406,0.002556,0.951379,0.000539
...,...,...,...,...,...,...,...,...,...
995,995,0.804364,0.035633,0.892726,0.001953,0.028657,0.002683,0.021292,0.000260
996,996,0.804377,0.035632,0.892775,0.001934,0.028656,0.002683,0.021288,0.000259
997,997,0.804367,0.035633,0.892798,0.001930,0.028656,0.002683,0.021285,0.000258
998,998,0.804365,0.035627,0.892852,0.001907,0.028657,0.002682,0.021280,0.000257


### Трейн-тест сплит

In [96]:
# Regressor for the depth target. Only the reported metric (R2) is set;
# every other hyperparameter is left at its CatBoost default.
cbt_reg_depth = CatBoostRegressor(eval_metric='R2')

In [97]:
# Fit on the training part of the split, logging every iteration (verbose=1)
# and tracking R2 on the hold-out pool.
# NOTE(review): pool_eval_depth holds the same rows that are later scored with
# r2_score. If CatBoost keeps the best iteration on eval_set (its default when
# one is given), the hold-out R2 below is optimistically biased — confirm.
cbt_reg_depth.fit(pool_train_depth, eval_set=pool_eval_depth, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.066736
0:	learn: 0.0720307	test: 0.0610962	best: 0.0610962 (0)	total: 124ms	remaining: 2m 3s
1:	learn: 0.1342443	test: 0.1179819	best: 0.1179819 (1)	total: 250ms	remaining: 2m 4s
2:	learn: 0.1877654	test: 0.1676446	best: 0.1676446 (2)	total: 377ms	remaining: 2m 5s
3:	learn: 0.2350659	test: 0.2116423	best: 0.2116423 (3)	total: 509ms	remaining: 2m 6s
4:	learn: 0.2762507	test: 0.2497455	best: 0.2497455 (4)	total: 637ms	remaining: 2m 6s
5:	learn: 0.3132704	test: 0.2840172	best: 0.2840172 (5)	total: 762ms	remaining: 2m 6s
6:	learn: 0.3469979	test: 0.3156463	best: 0.3156463 (6)	total: 893ms	remaining: 2m 6s
7:	learn: 0.3829583	test: 0.3496292	best: 0.3496292 (7)	total: 1.02s	remaining: 2m 6s
8:	learn: 0.4108564	test: 0.3757704	best: 0.3757704 (8)	total: 1.16s	remaining: 2m 7s
9:	learn: 0.4363134	test: 0.4025883	best: 0.4025883 (9)	total: 1.28s	remaining: 2m 7s
10:	learn: 0.4587141	test: 0.4233778	best: 0.4233778 (10)	total: 1.41s	remaining: 2m 6s
11:	learn: 0.4770230	t

<catboost.core.CatBoostRegressor at 0x1b40ebd1370>

### Feature importance

### R2-score

In [98]:
# Score the split-trained depth model on the hold-out rows.
y_test_depth_pred = cbt_reg_depth.predict(pool_test_depth)
depth_r2 = r2_score(y_true=y_test_depth, y_pred=y_test_depth_pred)
# Hold-out R2 recorded from earlier runs:
#   0.8186146500917406 - log-transformed target
#   0.80569282594943   - raw ("norm") target
depth_r2

0.80569282594943

### Обучение на полной выборке

In [33]:
# Refit the same estimator on all labelled rows before predicting the test set.
# No eval_set is passed, so only the learn metric is logged.
cbt_reg_depth.fit(pool_full_depth, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.055681
0:	learn: 0.0582668	total: 168ms	remaining: 2m 48s
1:	learn: 0.1121360	total: 314ms	remaining: 2m 36s
2:	learn: 0.1626305	total: 454ms	remaining: 2m 30s
3:	learn: 0.2059433	total: 591ms	remaining: 2m 27s
4:	learn: 0.2457977	total: 720ms	remaining: 2m 23s
5:	learn: 0.2802294	total: 841ms	remaining: 2m 19s
6:	learn: 0.3110091	total: 971ms	remaining: 2m 17s
7:	learn: 0.3406055	total: 1.13s	remaining: 2m 19s
8:	learn: 0.3650184	total: 1.29s	remaining: 2m 22s
9:	learn: 0.3892554	total: 1.46s	remaining: 2m 24s
10:	learn: 0.4122128	total: 1.6s	remaining: 2m 24s
11:	learn: 0.4313609	total: 1.74s	remaining: 2m 23s
12:	learn: 0.4506563	total: 1.9s	remaining: 2m 23s
13:	learn: 0.4670046	total: 2.05s	remaining: 2m 24s
14:	learn: 0.4856875	total: 2.17s	remaining: 2m 22s
15:	learn: 0.5002274	total: 2.3s	remaining: 2m 21s
16:	learn: 0.5156487	total: 2.42s	remaining: 2m 20s
17:	learn: 0.5304751	total: 2.55s	remaining: 2m 19s
18:	learn: 0.5436745	total: 2.68s	remaining: 2m

<catboost.core.CatBoostRegressor at 0x1b3391d8310>

### Подбор гиперпараметров с помощью Optuna

## full_reads_percent

### Создание выборок

In [99]:
# Hold out 20% of the labelled rows for scoring the full_reads_percent model;
# the split is seeded.
(
    X_train_full_reads_percent,
    X_test_full_reads_percent,
    y_train_full_reads_percent,
    y_test_full_reads_percent,
) = train_test_split(
    train_x,
    train_y_full_reads_percent,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [100]:
# All labelled rows with the full_reads_percent target; used for
# cross-validation and the final refit.
pool_full_full_reads_percent = Pool(data=train_x, label=train_y_full_reads_percent)

In [101]:
# Training part of the split, with labels.
pool_train_full_reads_percent = Pool(
    data=X_train_full_reads_percent,
    label=y_train_full_reads_percent,
)
# Hold-out features twice: with labels (eval set during fitting) and
# without labels (input to predict when computing R2).
pool_eval_full_reads_percent = Pool(
    data=X_test_full_reads_percent,
    label=y_test_full_reads_percent,
)
pool_test_full_reads_percent = Pool(data=X_test_full_reads_percent)

### Кросс-валидация

In [102]:
# Seeded 5-fold cross-validation of the full_reads_percent target on all
# labelled rows (RMSE loss, R2 reported); progress is logged every 10 iterations.
cv_full_reads_percent = cv(
    pool=pool_full_full_reads_percent,
    params={'eval_metric': 'R2', 'loss_function': 'RMSE'},
    fold_count=5,
    seed=RANDOM_STATE,
    verbose=10,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: -10.6605488	test: -11.0260398	best: -11.0260398 (0)	total: 161ms	remaining: 2m 41s
10:	learn: -5.7361875	test: -5.8970665	best: -5.8970665 (10)	total: 2.52s	remaining: 3m 46s
20:	learn: -3.0139296	test: -3.0795805	best: -3.0795805 (20)	total: 4.08s	remaining: 3m 10s
30:	learn: -1.4956283	test: -1.5193193	best: -1.5193193 (30)	total: 6.5s	remaining: 3m 23s
40:	learn: -0.6462285	test: -0.6486232	best: -0.6486232 (40)	total: 8.91s	remaining: 3m 28s
50:	learn: -0.1645803	test: -0.1608666	best: -0.1608666 (50)	total: 10.8s	remaining: 3m 21s
60:	learn: 0.1113308	test: 0.1175740	best: 0.1175740 (60)	total: 13.1s	remaining: 3m 21s
70:	learn: 0.2695811	test: 0.2754068	best: 0.2754068 (70)	total: 15.1s	remaining: 3m 18s
80:	learn: 0.3626387	test: 0.3676287	best: 0.3676287 (80)	total: 16.6s	remaining: 3m 8s
90:	learn: 0.4189529	test: 0.4232830	best: 0.4232830 (90)	total: 18.1s	remaining: 3m
100:	learn: 0.4541638	test: 0.4572848	best: 0.4572848 (100)	total: 19.7s	r

In [103]:
# Per-iteration CV results: mean/std of R2 and RMSE on the train and test folds.
cv_full_reads_percent

Unnamed: 0,iterations,test-R2-mean,test-R2-std,train-R2-mean,train-R2-std,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,-10.750259,0.449882,-10.731344,0.116591,35.059064,0.232641,35.059285,0.056009
1,1,-10.110484,0.424166,-10.091620,0.108705,34.091373,0.235414,34.089990,0.053210
2,2,-9.505678,0.400458,-9.487191,0.100202,33.150612,0.242415,33.148151,0.046896
3,3,-8.936668,0.371533,-8.919300,0.098913,32.240750,0.239661,32.238106,0.045792
4,4,-8.405805,0.347362,-8.388275,0.095939,31.367989,0.243521,31.363276,0.039577
...,...,...,...,...,...,...,...,...,...
995,995,0.565951,0.028498,0.728804,0.005185,6.740474,0.328607,5.330607,0.068965
996,996,0.565976,0.028535,0.728966,0.005152,6.740265,0.328774,5.329017,0.068481
997,997,0.565983,0.028506,0.729101,0.005118,6.740220,0.328591,5.327683,0.068299
998,998,0.566025,0.028524,0.729268,0.005147,6.739893,0.328836,5.326047,0.068434


### Трейн-тест сплит

In [104]:
# Regressor for the full_reads_percent target. Only the reported metric (R2)
# is set; every other hyperparameter is left at its CatBoost default.
cbt_reg_full_reads_percent = CatBoostRegressor(eval_metric='R2')

In [105]:
# Fit on the training part of the split, logging every iteration (verbose=1)
# and tracking R2 on the hold-out pool.
# NOTE(review): pool_eval_full_reads_percent holds the same rows that are later
# scored with r2_score. If CatBoost keeps the best iteration on eval_set (its
# default when one is given), the hold-out R2 below is optimistically biased — confirm.
cbt_reg_full_reads_percent.fit(pool_train_full_reads_percent, eval_set=pool_eval_full_reads_percent, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.066736
0:	learn: 0.0480391	test: 0.0486975	best: 0.0486975 (0)	total: 142ms	remaining: 2m 21s
1:	learn: 0.0954891	test: 0.0968081	best: 0.0968081 (1)	total: 295ms	remaining: 2m 26s
2:	learn: 0.1363027	test: 0.1394144	best: 0.1394144 (2)	total: 459ms	remaining: 2m 32s
3:	learn: 0.1748534	test: 0.1795126	best: 0.1795126 (3)	total: 720ms	remaining: 2m 59s
4:	learn: 0.2079036	test: 0.2127850	best: 0.2127850 (4)	total: 923ms	remaining: 3m 3s
5:	learn: 0.2405333	test: 0.2458249	best: 0.2458249 (5)	total: 1.09s	remaining: 3m
6:	learn: 0.2686234	test: 0.2744636	best: 0.2744636 (6)	total: 1.32s	remaining: 3m 7s
7:	learn: 0.2927578	test: 0.3005690	best: 0.3005690 (7)	total: 1.56s	remaining: 3m 13s
8:	learn: 0.3148884	test: 0.3237443	best: 0.3237443 (8)	total: 1.73s	remaining: 3m 10s
9:	learn: 0.3328549	test: 0.3429199	best: 0.3429199 (9)	total: 1.88s	remaining: 3m 6s
10:	learn: 0.3503570	test: 0.3611052	best: 0.3611052 (10)	total: 2.01s	remaining: 3m
11:	learn: 0.3645732	t

<catboost.core.CatBoostRegressor at 0x1b4198f4100>

### Feature importance

### R2-score

In [106]:
# Score the split-trained full_reads_percent model on the hold-out rows.
y_test_full_reads_percent_pred = cbt_reg_full_reads_percent.predict(pool_test_full_reads_percent)
full_reads_percent_r2 = r2_score(
    y_true=y_test_full_reads_percent,
    y_pred=y_test_full_reads_percent_pred,
)
# Hold-out R2 recorded from earlier runs:
#   0.6208278499435618 - log-transformed target
#   0.5944432696994856 - raw ("norm") target
full_reads_percent_r2

0.5944432696994856

### Обучение на полной выборке

In [36]:
# Refit the same estimator on all labelled rows before predicting the test set.
# No eval_set is passed, so only the learn metric is logged.
cbt_reg_full_reads_percent.fit(pool_full_full_reads_percent, verbose=1, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.055681
0:	learn: 0.0469253	total: 155ms	remaining: 2m 35s
1:	learn: 0.0864705	total: 303ms	remaining: 2m 31s
2:	learn: 0.1241028	total: 444ms	remaining: 2m 27s
3:	learn: 0.1578486	total: 593ms	remaining: 2m 27s
4:	learn: 0.1883066	total: 736ms	remaining: 2m 26s
5:	learn: 0.2161277	total: 887ms	remaining: 2m 27s
6:	learn: 0.2417633	total: 1.03s	remaining: 2m 25s
7:	learn: 0.2639054	total: 1.17s	remaining: 2m 24s
8:	learn: 0.2852370	total: 1.31s	remaining: 2m 23s
9:	learn: 0.3054747	total: 1.45s	remaining: 2m 23s
10:	learn: 0.3236921	total: 1.58s	remaining: 2m 22s
11:	learn: 0.3393662	total: 1.72s	remaining: 2m 21s
12:	learn: 0.3533347	total: 1.85s	remaining: 2m 20s
13:	learn: 0.3669791	total: 2s	remaining: 2m 20s
14:	learn: 0.3789275	total: 2.13s	remaining: 2m 20s
15:	learn: 0.3898379	total: 2.27s	remaining: 2m 19s
16:	learn: 0.4010850	total: 2.41s	remaining: 2m 19s
17:	learn: 0.4100957	total: 2.54s	remaining: 2m 18s
18:	learn: 0.4178165	total: 2.68s	remaining: 2m

<catboost.core.CatBoostRegressor at 0x1b3391d8fa0>

### Подбор гиперпараметров с помощью Optuna

## Test

### Метрика сплита

In [107]:
# Weighted hold-out score: 0.4 * views + 0.3 * depth + 0.3 * full_reads_percent.
# NOTE(review): the weights presumably mirror the competition metric — confirm.
weighted_r2_terms = (
    0.4 * views_r2,
    0.3 * depth_r2,
    0.3 * full_reads_percent_r2,
)
metric_r2 = weighted_r2_terms[0] + weighted_r2_terms[1] + weighted_r2_terms[2]
# Value recorded with raw ("norm") targets: 0.7499674577999915
metric_r2

0.7499674577999915

### Предикты для теста

In [37]:
# Unlabelled test features wrapped in a CatBoost Pool; shared by all three models.
test_pool = Pool(data=test_df)

In [38]:
# Predict views for the test set with the most recently fitted views model
# (the full-data refit when the notebook is run top to bottom).
y_test_pred_views = cbt_reg_views.predict(test_pool)
y_test_pred_views

array([13510.42176737,  5382.00277818, 15272.91943429, ...,
        6876.22532719,  7220.53463169, 11664.0709583 ])

In [39]:
# Predict depth for the test set with the most recently fitted depth model
# (the full-data refit when the notebook is run top to bottom).
y_test_pred_depth = cbt_reg_depth.predict(test_pool)
y_test_pred_depth

array([1.19233809, 1.0521861 , 1.05646852, ..., 1.06562283, 1.16057602,
       1.14161652])

In [40]:
# Predict full_reads_percent for the test set with the most recently fitted
# model (the full-data refit when the notebook is run top to bottom).
y_test_pred_full_reads_percent = cbt_reg_full_reads_percent.predict(test_pool)
y_test_pred_full_reads_percent

array([49.78046415, 36.39371942, 24.24868866, ..., 37.67751212,
       41.85417688, 34.17614718])

In [72]:
# Collect the three prediction arrays into one submission frame that reuses
# the test set's document_id index.
test_finale = pd.DataFrame(
    {
        'views': y_test_pred_views,
        'depth': y_test_pred_depth,
        'full_reads_percent': y_test_pred_full_reads_percent,
    },
    index=test_df.index,
)
# When the models were trained on log targets, undo the transform here:
# test_finale['views'] = test_finale['views'].apply(lambda x: np.exp(x))
# test_finale['depth'] = test_finale['depth'].apply(lambda x: np.exp(x))
# test_finale['full_reads_percent'] = test_finale['full_reads_percent'].apply(lambda x: np.exp(x))
test_finale

Unnamed: 0_level_0,views,depth,full_reads_percent
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw,13510.421767,1.192338,49.780464
628c22b89a79470e553f594bQS5CqzXYRnmDdR2LaSreEw,5382.002778,1.052186,36.393719
627cb3249a7947ebdd752865XVsoyrUOT8OJJg2_finJhw,15272.919434,1.056469,24.248689
628618629a7947d4927eb812upfii3whSSuMXCqcqF8VbQ,4359.569413,1.060327,33.089957
620e76109a7947235623695b5hzCiIHdSYKQIr8WAM18bw,92138.852901,1.144804,32.146828
...,...,...,...
623ba6bc9a79470214cb9418x4OZQ1IjTb27nPAgv5wAZg,38299.769635,1.170954,33.430131
624fda1f9a79471a9bd102eetuRTrLPGSZyNbc9_YcBAeQ,28083.135831,1.129606,28.967310
6257a6a99a79477eea75b86dtBr3UG9ORW-gOybkVKX6ww,6876.225327,1.065623,37.677512
624866d59a7947421716ab46WsGDKyGjTJGRh14eR7UVvw,7220.534632,1.160576,41.854177


In [73]:
# Summary statistics of the predictions. The recorded output shows a negative
# minimum for views, i.e. the raw-target model can predict below zero.
test_finale.describe()

Unnamed: 0,views,depth,full_reads_percent
count,3000.0,3000.0,3000.0
mean,30563.65,1.10644,34.316491
std,88189.46,0.05768,7.624225
min,-5127.644,1.029515,5.296252
25%,8546.969,1.059351,29.45195
50%,16038.87,1.079597,34.363404
75%,34487.5,1.149175,39.01649
max,2557494.0,1.801867,57.359423


### Объединение предиктов на log-таргетах и без

In [42]:
# Snapshot of the predictions made with raw ("norm") targets.
# NOTE(review): this must be executed while test_finale holds the raw-target
# predictions. Run top to bottom in one pass, test_norm and test_log are
# identical copies, so blending relies on re-running the notebook with the
# log-target lines toggled.
test_norm = test_finale.copy()

In [26]:
# Snapshot of the predictions made with log-transformed targets.
# NOTE(review): this must be executed while test_finale holds the log-target
# predictions (after np.exp). Run top to bottom in one pass, it simply copies
# the same frame as test_norm — the execution counts suggest it was run in an
# earlier pass.
test_log = test_finale.copy()

In [77]:
# Blend the raw-target ("norm") and log-target ("log") predictions in one
# vectorised step instead of a row-by-row iterrows()/.at loop.
# NOTE(review): test_log and test_norm must come from two different runs of the
# models (log targets vs. raw targets) for the blend to be meaningful.
target_columns = ['views', 'depth', 'full_reads_percent']

# Align both sources to the rows of test_finale. Like the original .at lookups,
# .loc raises KeyError if a document_id is missing from either frame.
log_preds = test_log.loc[test_finale.index, target_columns]
norm_preds = test_norm.loc[test_finale.index, target_columns]

# Default blend: plain average of the two predictions for every target.
blended = (log_preds + norm_preds) / 2

# The raw-target model can predict negative views; for those rows use the
# log-model prediction alone. The check reads the raw-target predictions
# rather than test_finale, so re-running this cell gives the same result.
negative_views = norm_preds['views'] < 0
blended['views'] = blended['views'].mask(negative_views, log_preds['views'])

test_finale = blended

0it [00:00, ?it/s]

In [78]:
# Show the blended predictions.
test_finale

Unnamed: 0_level_0,views,depth,full_reads_percent
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw,12689.745113,1.191363,49.985800
628c22b89a79470e553f594bQS5CqzXYRnmDdR2LaSreEw,5656.283466,1.052377,35.364026
627cb3249a7947ebdd752865XVsoyrUOT8OJJg2_finJhw,12520.080916,1.055785,25.272617
628618629a7947d4927eb812upfii3whSSuMXCqcqF8VbQ,4475.928541,1.060577,32.396272
620e76109a7947235623695b5hzCiIHdSYKQIr8WAM18bw,84398.901506,1.141041,31.935816
...,...,...,...
623ba6bc9a79470214cb9418x4OZQ1IjTb27nPAgv5wAZg,29545.029190,1.172076,33.513991
624fda1f9a79471a9bd102eetuRTrLPGSZyNbc9_YcBAeQ,29830.399551,1.129129,28.978327
6257a6a99a79477eea75b86dtBr3UG9ORW-gOybkVKX6ww,5949.237432,1.064446,37.347326
624866d59a7947421716ab46WsGDKyGjTJGRh14eR7UVvw,6476.549527,1.159703,41.378622


In [79]:
# Summary statistics after blending; in the recorded output the minimum of
# views is positive again.
test_finale.describe()

Unnamed: 0,views,depth,full_reads_percent
count,3000.0,3000.0,3000.0
mean,27577.56,1.106168,34.001463
std,86076.63,0.057613,7.621929
min,264.3348,1.028425,5.171532
25%,7392.868,1.058951,29.120185
50%,14080.27,1.079072,34.002803
75%,29747.35,1.14886,38.823202
max,2534975.0,1.806004,56.802779


In [71]:
# Write the final predictions to a timestamped CSV so that earlier submissions
# are never overwritten.
submission_dir = './submissions'
# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(submission_dir, exist_ok=True)
submission_filename = 'submission_{}.csv'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
test_finale.to_csv(f'{submission_dir}/{submission_filename}')
print('Submission saved to {}'.format(submission_filename))

Submission saved to submission_2022-07-20_21-57-31.csv
