In [2]:
import numpy as np
import pandas as pd
import gc
import time
import catboost as cat

In [3]:
# Load the prepared feature matrix and split it by month block:
#   blocks below 33 -> training, block 33 -> validation, block 34 -> test.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load a 'matrix.pickle' that comes from a trusted source.
data = pd.read_pickle('matrix.pickle')

block_num = data['date_block_num']
# Columns that must not be fed to the model: the split key and the target.
non_feature_cols = ['date_block_num', 'item_cnt_month']

train_part = data[block_num < 33]
valid_part = data[block_num == 33]

X_train = train_part.drop(columns=non_feature_cols)
Y_train = train_part['item_cnt_month']
X_valid = valid_part.drop(columns=non_feature_cols)
Y_valid = valid_part['item_cnt_month']
# No target is taken for block 34; only the features are kept.
X_test = data[block_num == 34].drop(columns=non_feature_cols)

# Release the full matrix and the temporaries so the memory can be reclaimed.
del data, block_num, train_part, valid_part
gc.collect();

In [4]:
# Regressor configuration. iterations, learning_rate and depth are not passed,
# so the library's own defaults are used for them.
m2_params = dict(
    loss_function='RMSE',        # objective optimised during training
    eval_metric='RMSE',          # metric reported for the eval set
    random_seed=1,               # fixed seed
    od_type='Iter',              # overfitting-detector mode
    od_wait=20,                  # overfitting-detector wait, in iterations
    allow_writing_files=False,   # do not let CatBoost write its own files
)
m2 = cat.CatBoostRegressor(**m2_params)

In [8]:
# Wrap the training and validation frames in CatBoost Pools, timing the step.
ts = time.time()

# Positional indices of the columns CatBoost should treat as categorical.
# The same list is used for both pools so they are interpreted identically.
# NOTE(review): this assumes the first seven columns of X_train / X_valid are
# the categorical ones - TODO confirm against the column order in matrix.pickle.
# (The original line was a chained assignment that also bound a stray global
# named `cat_features`; only the intended name is assigned here.)
categorical_features_indices = list(range(7))

train_pool = cat.Pool(X_train, Y_train, cat_features=categorical_features_indices)
validation_pool = cat.Pool(X_valid, Y_valid, cat_features=categorical_features_indices)

# Elapsed seconds; shown as the cell's output.
time.time() - ts

In [9]:
# Train on the training pool while evaluating on the validation pool.
ts = time.time()
m2.fit(
    train_pool,
    eval_set=validation_pool,
    use_best_model=True,   # see the CatBoost docs for how the final model is chosen
    verbose=True,          # emit the training log
)
# Elapsed seconds; shown as the cell's output.
time.time() - ts

0:	learn: 1.2070210	test: 1.1518657	best: 1.1518657 (0)	total: 14.7s	remaining: 4h 4m 55s
1:	learn: 1.1919130	test: 1.1393345	best: 1.1393345 (1)	total: 27.3s	remaining: 3h 47m 18s
2:	learn: 1.1773662	test: 1.1275545	best: 1.1275545 (2)	total: 39.6s	remaining: 3h 39m 21s
3:	learn: 1.1634251	test: 1.1159462	best: 1.1159462 (3)	total: 53.3s	remaining: 3h 41m 11s
4:	learn: 1.1501061	test: 1.1049050	best: 1.1049050 (4)	total: 1m 5s	remaining: 3h 36m 42s
5:	learn: 1.1375395	test: 1.0964606	best: 1.0964606 (5)	total: 1m 17s	remaining: 3h 32m 38s
6:	learn: 1.1254322	test: 1.0868314	best: 1.0868314 (6)	total: 1m 29s	remaining: 3h 32m 17s
7:	learn: 1.1138787	test: 1.0778750	best: 1.0778750 (7)	total: 1m 42s	remaining: 3h 31m 39s
8:	learn: 1.1026831	test: 1.0693217	best: 1.0693217 (8)	total: 1m 54s	remaining: 3h 30m 23s
9:	learn: 1.0919851	test: 1.0621615	best: 1.0621615 (9)	total: 2m 8s	remaining: 3h 31m 33s
10:	learn: 1.0817896	test: 1.0558952	best: 1.0558952 (10)	total: 2m 21s	remaining: 3h 3

951.0694074630737

In [10]:
# Print every feature with its importance score, highest score first.
feature_importances = m2.get_feature_importance(train_pool)
feature_names = X_train.columns

# Pairs are (score, name), so sorting orders by score and breaks ties by name.
ranked_pairs = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_pairs:
    print('{name}: {score}'.format(name=name, score=score))

item_cnt_month_1: 24.225534780326747
shop_item_avg_item_cnt_1: 22.228063153010083
shop_item_sum_orders_1: 10.964440490331027
item_id: 5.589649858568318
shop_item_avg_item_cnt_2: 5.586771343784058
date_cat_avg_item_cnt_1: 5.307225822214774
shop_id: 4.091608173520044
month: 3.9410108535018216
date_item_avg_item_cnt_1: 3.736552357283839
date_item_sum_orders_1: 2.4636828584466666
date_item_city_avg_item_cnt_1: 2.1891192507643744
date_cat_sum_orders_2: 1.7151302767936933
subtype_code: 1.5308308203174397
shop_item_avg_item_cnt_3: 1.0224249483597059
orders_1: 0.8348838816740805
date_cat_sum_orders_4: 0.6255158107374446
date_cat_avg_item_cnt_4: 0.5054777680016567
item_cnt_month_2: 0.480645378746465
shop_item_sum_orders_2: 0.46471580580874283
date_shop_avg_item_cnt_4: 0.4283643779988445
date_cat_sum_orders_5: 0.36205157452778314
date_shop_sum_orders_4: 0.32459867938975107
item_category_id: 0.2852085219989559
shop_item_avg_item_cnt_4: 0.25977468895407235
date_cat_avg_item_cnt_3: 0.25233416961547

In [11]:
# Predict the test block and write the submission file.
test = pd.read_csv('test.csv.gz', compression='gzip').set_index('ID')

# Clip predictions into the [0, 20] range before submitting.
predictions = m2.predict(X_test)
Y_test = predictions.clip(0, 20)

# NOTE(review): this pairs predictions with IDs purely by position, so it
# presumably relies on X_test rows being in the same order as test.csv - verify.
submission = pd.DataFrame(
    {
        "ID": test.index,
        "item_cnt_month": Y_test,
    }
)
submission.to_csv('submission.csv', index=False)

In [12]:
# Persist the fitted model to 'cat.model' using the "cbm" format.
m2.save_model('cat.model', format='cbm')

Leaderboard (LB) score for this submission: 1.03672