# CatBoost

In [1]:
import numpy as np
import pandas as pd
import gc
import time
import pickle
import catboost as cat

In [2]:
# Load the precomputed feature matrix and split it by month index
# (date_block_num): blocks below 33 are used for training, block 33 for
# validation, and block 34 is the month to predict (no target is taken for it).
data = pd.read_pickle('feature_matrix.pickle')

is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

X_train = data.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = data.loc[is_train, 'item_cnt_month']
X_valid = data.loc[is_valid].drop(columns=['item_cnt_month'])
Y_valid = data.loc[is_valid, 'item_cnt_month']
X_test = data.loc[is_test].drop(columns=['item_cnt_month'])

# The full matrix is not used again; release it before training.
del data, is_train, is_valid, is_test
gc.collect();

In [3]:
# Hyperparameters for the CatBoost regressor, collected in one mapping so they
# can be read (and tuned) at a glance.
model_params = dict(
    depth=12,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    # od_type / od_wait configure CatBoost's overfitting detector
    # (see the CatBoost training-parameter docs for exact semantics).
    od_type='Iter',
    od_wait=10,
    allow_writing_files=False,
)
model = cat.CatBoostRegressor(**model_params)

In [4]:
# Wrap the training and validation frames in CatBoost pools, declaring which
# columns are categorical. The last expression shows the elapsed seconds.
ts = time.time()
# Columns 0..7 are passed to CatBoost as categorical features.
# NOTE(review): these are positional indices, so they depend on the column
# order of feature_matrix.pickle — confirm against X_train.columns.
categorical_features_indices = list(range(8))
train_pool = cat.Pool(X_train, Y_train, cat_features=categorical_features_indices)
validation_pool = cat.Pool(X_valid, Y_valid, cat_features=categorical_features_indices)
time.time() - ts

237.93007731437683

In [5]:
# Fit the model on the training pool, evaluating on the validation pool as it
# goes. The last expression shows the wall-clock seconds spent in fit().
fit_started = time.time()
model.fit(
    train_pool,
    eval_set=validation_pool,
    use_best_model=True,
    verbose=True,
)
time.time() - fit_started

0:	learn: 1.2048859	test: 1.1514314	best: 1.1514314 (0)	total: 47.8s	remaining: 13h 15m 8s
1:	learn: 1.1877153	test: 1.1387998	best: 1.1387998 (1)	total: 1m 36s	remaining: 13h 25m 43s
2:	learn: 1.1713530	test: 1.1254620	best: 1.1254620 (2)	total: 2m 23s	remaining: 13h 16m 22s
3:	learn: 1.1556407	test: 1.1141334	best: 1.1141334 (3)	total: 3m 10s	remaining: 13h 10m 19s
4:	learn: 1.1406679	test: 1.1027046	best: 1.1027046 (4)	total: 3m 57s	remaining: 13h 6m 11s
5:	learn: 1.1263202	test: 1.0928490	best: 1.0928490 (5)	total: 4m 40s	remaining: 12h 55m 6s
6:	learn: 1.1125700	test: 1.0823018	best: 1.0823018 (6)	total: 5m 22s	remaining: 12h 43m 38s
7:	learn: 1.0993354	test: 1.0727829	best: 1.0727829 (7)	total: 6m 9s	remaining: 12h 42m 44s
8:	learn: 1.0867246	test: 1.0636401	best: 1.0636401 (8)	total: 6m 54s	remaining: 12h 41m 30s
9:	learn: 1.0734588	test: 1.0555030	best: 1.0555030 (9)	total: 7m 42s	remaining: 12h 43m 56s
10:	learn: 1.0577549	test: 1.0482193	best: 1.0482193 (10)	total: 8m 35s	rem

86:	learn: 0.7015211	test: 0.9117415	best: 0.9117415 (86)	total: 1h 14m 33s	remaining: 13h 2m 26s
87:	learn: 0.7005225	test: 0.9115034	best: 0.9115034 (87)	total: 1h 15m 28s	remaining: 13h 2m 7s
88:	learn: 0.6998746	test: 0.9111423	best: 0.9111423 (88)	total: 1h 16m 14s	remaining: 13h 20s
89:	learn: 0.6992503	test: 0.9108445	best: 0.9108445 (89)	total: 1h 16m 59s	remaining: 12h 58m 29s
90:	learn: 0.6987136	test: 0.9106184	best: 0.9106184 (90)	total: 1h 17m 44s	remaining: 12h 56m 34s
91:	learn: 0.6979345	test: 0.9102269	best: 0.9102269 (91)	total: 1h 18m 32s	remaining: 12h 55m 12s
92:	learn: 0.6971368	test: 0.9098078	best: 0.9098078 (92)	total: 1h 19m 23s	remaining: 12h 54m 13s
93:	learn: 0.6963715	test: 0.9096761	best: 0.9096761 (93)	total: 1h 20m 10s	remaining: 12h 52m 48s
94:	learn: 0.6958372	test: 0.9094474	best: 0.9094474 (94)	total: 1h 20m 56s	remaining: 12h 51m 8s
95:	learn: 0.6950504	test: 0.9093419	best: 0.9093419 (95)	total: 1h 21m 47s	remaining: 12h 50m 13s
96:	learn: 0.69435

169:	learn: 0.6605599	test: 0.8948310	best: 0.8948310 (169)	total: 2h 26m 17s	remaining: 11h 54m 16s
170:	learn: 0.6602480	test: 0.8947859	best: 0.8947859 (170)	total: 2h 27m 9s	remaining: 11h 53m 22s
171:	learn: 0.6598950	test: 0.8946027	best: 0.8946027 (171)	total: 2h 28m 4s	remaining: 11h 52m 47s
172:	learn: 0.6591846	test: 0.8943568	best: 0.8943568 (172)	total: 2h 29m 7s	remaining: 11h 52m 50s
173:	learn: 0.6589479	test: 0.8942756	best: 0.8942756 (173)	total: 2h 29m 49s	remaining: 11h 51m 15s
174:	learn: 0.6587494	test: 0.8942499	best: 0.8942499 (174)	total: 2h 30m 44s	remaining: 11h 50m 38s
175:	learn: 0.6582475	test: 0.8942718	best: 0.8942499 (174)	total: 2h 31m 40s	remaining: 11h 50m 7s
176:	learn: 0.6580682	test: 0.8942502	best: 0.8942499 (174)	total: 2h 32m 23s	remaining: 11h 48m 33s
177:	learn: 0.6578175	test: 0.8942346	best: 0.8942346 (177)	total: 2h 33m 17s	remaining: 11h 47m 54s
178:	learn: 0.6575480	test: 0.8940318	best: 0.8940318 (178)	total: 2h 34m 6s	remaining: 11h 46m

251:	learn: 0.6384376	test: 0.8874686	best: 0.8874686 (251)	total: 3h 37m 2s	remaining: 10h 44m 12s
252:	learn: 0.6382909	test: 0.8873781	best: 0.8873781 (252)	total: 3h 37m 47s	remaining: 10h 43m 1s
253:	learn: 0.6380964	test: 0.8873850	best: 0.8873781 (252)	total: 3h 38m 39s	remaining: 10h 42m 12s
254:	learn: 0.6378949	test: 0.8873265	best: 0.8873265 (254)	total: 3h 39m 30s	remaining: 10h 41m 18s
255:	learn: 0.6375853	test: 0.8868764	best: 0.8868764 (255)	total: 3h 40m 34s	remaining: 10h 41m 2s
256:	learn: 0.6373866	test: 0.8869296	best: 0.8868764 (255)	total: 3h 41m 27s	remaining: 10h 40m 14s
257:	learn: 0.6370071	test: 0.8869097	best: 0.8868764 (255)	total: 3h 42m 23s	remaining: 10h 39m 34s
258:	learn: 0.6368230	test: 0.8868291	best: 0.8868291 (258)	total: 3h 43m 17s	remaining: 10h 38m 49s
259:	learn: 0.6366929	test: 0.8868098	best: 0.8868098 (259)	total: 3h 44m 5s	remaining: 10h 37m 48s
260:	learn: 0.6364390	test: 0.8868198	best: 0.8868098 (259)	total: 3h 45m 10s	remaining: 10h 37

334:	learn: 0.6229754	test: 0.8837895	best: 0.8837029 (332)	total: 4h 47m 8s	remaining: 9h 29m 59s
335:	learn: 0.6228025	test: 0.8836569	best: 0.8836569 (335)	total: 4h 47m 57s	remaining: 9h 29m 4s
336:	learn: 0.6222319	test: 0.8836319	best: 0.8836319 (336)	total: 4h 48m 59s	remaining: 9h 28m 33s
337:	learn: 0.6220706	test: 0.8836755	best: 0.8836319 (336)	total: 4h 49m 56s	remaining: 9h 27m 52s
338:	learn: 0.6219704	test: 0.8836323	best: 0.8836319 (336)	total: 4h 50m 48s	remaining: 9h 27m 1s
339:	learn: 0.6218593	test: 0.8835679	best: 0.8835679 (339)	total: 4h 51m 40s	remaining: 9h 26m 11s
340:	learn: 0.6217332	test: 0.8835512	best: 0.8835512 (340)	total: 4h 52m 29s	remaining: 9h 25m 15s
341:	learn: 0.6216711	test: 0.8835109	best: 0.8835109 (341)	total: 4h 53m 15s	remaining: 9h 24m 12s
342:	learn: 0.6215090	test: 0.8833041	best: 0.8833041 (342)	total: 4h 54m 18s	remaining: 9h 23m 44s
343:	learn: 0.6213370	test: 0.8832866	best: 0.8832866 (343)	total: 4h 55m 13s	remaining: 9h 22m 58s
344

417:	learn: 0.6095901	test: 0.8811299	best: 0.8811299 (417)	total: 5h 58m 25s	remaining: 8h 19m 2s
418:	learn: 0.6095148	test: 0.8811553	best: 0.8811299 (417)	total: 5h 59m 11s	remaining: 8h 18m 4s
419:	learn: 0.6094524	test: 0.8811901	best: 0.8811299 (417)	total: 6h 1s	remaining: 8h 17m 10s
420:	learn: 0.6093654	test: 0.8811586	best: 0.8811299 (417)	total: 6h 54s	remaining: 8h 16m 21s
421:	learn: 0.6091748	test: 0.8812886	best: 0.8811299 (417)	total: 6h 1m 57s	remaining: 8h 15m 45s
422:	learn: 0.6090493	test: 0.8812822	best: 0.8811299 (417)	total: 6h 2m 45s	remaining: 8h 14m 49s
423:	learn: 0.6088229	test: 0.8812618	best: 0.8811299 (417)	total: 6h 3m 45s	remaining: 8h 14m 10s
424:	learn: 0.6086451	test: 0.8812682	best: 0.8811299 (417)	total: 6h 4m 43s	remaining: 8h 13m 26s
425:	learn: 0.6085572	test: 0.8812397	best: 0.8811299 (417)	total: 6h 5m 34s	remaining: 8h 12m 34s
426:	learn: 0.6084262	test: 0.8812348	best: 0.8811299 (417)	total: 6h 6m 22s	remaining: 8h 11m 38s
427:	learn: 0.608

25985.124575138092

In [6]:
# model = pickle.load(open('cat_model.pickle', 'rb'))

In [7]:
# Predict for the validation and test months, limiting every prediction to
# the range [0, 20].
Y_pred = np.clip(model.predict(X_valid), 0, 20)
Y_test = np.clip(model.predict(X_test), 0, 20)

In [9]:
# Pickling the whole model failed with "cannot serialize a bytes object larger
# than 4 GiB", so only the predictions are persisted.
# NOTE(review): pickle protocol 4 lifts that size limit, and CatBoost also has
# its own model.save_model(); neither has been tried here.
# pickle.dump(model, open('cat_model.pickle', 'wb'))

# Despite the file name, cat_train.pickle holds Y_pred, i.e. the clipped
# predictions on X_valid. Context managers make sure both files are flushed
# and closed.
with open('cat_train.pickle', 'wb') as valid_pred_file:
    pickle.dump(Y_pred, valid_pred_file)
with open('cat_test.pickle', 'wb') as test_pred_file:
    pickle.dump(Y_test, test_pred_file)

In [10]:
# Print every feature with its importance score, highest score first
# (ties fall back to the feature name, also in descending order).
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked = list(zip(feature_importances, feature_names))
ranked.sort(reverse=True)
for importance, column in ranked:
    print('{}: {}'.format(column, importance))

item_id: 14.062595780547444
shop_item_avg_item_cnt_1: 12.60464648890531
date_block_num: 9.824421395933172
shop_id: 8.917941673690395
item_cnt_month_1: 8.699023967091634
shop_item_sum_orders_1: 6.044162921485703
subtype_code: 4.293535987044791
shop_item_avg_item_cnt_2: 3.8835379759225694
date_item_avg_item_cnt_1: 2.8658550033357844
orders_1: 2.073678561868295
month: 1.8875848326075508
date_cat_sum_orders_1: 1.732331809214151
date_cat_sum_orders_2: 1.3494435214809526
date_item_sum_orders_1: 1.2701972229025529
date_cat_avg_item_price_1: 1.1484250134220992
item_category_id: 1.1144911662629042
item_shop_last_sale: 1.0824121332187993
item_cnt_month_2: 0.9273395552963921
type_code: 0.9236932975271995
date_item_city_avg_item_cnt_1: 0.8116523082146452
date_shop_avg_item_price_1: 0.6845823851970417
date_item_sum_orders_4: 0.638507711083966
shop_item_sum_orders_2: 0.6089194215662694
date_cat_avg_item_cnt_5: 0.5596726116898486
date_cat_avg_item_cnt_2: 0.5124602383478061
date_item_avg_item_price_1:

In [11]:
# Write the submission file: one row per test ID with its predicted monthly
# item count.
# NOTE(review): assumes Y_test is in the same row order as test.csv.gz —
# confirm against how the feature matrix was built.
test = pd.read_csv('test.csv.gz', compression='gzip')
test = test.set_index('ID')
submission_columns = {
    "ID": test.index,
    "item_cnt_month": Y_test,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('cat_submission.csv', index=False)

Leaderboard (LB) score: 1.00913