In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Register the parent directory with jupytools' syspath helper.
# NOTE(review): presumably this extends sys.path so the project-local `dataset`
# and `extract_features` modules can be imported — confirm against jupytools.
import jupytools.syspath
jupytools.syspath.add('..')

In [32]:
import catboost as cb
import feather
import pandas as pd
from IPython.display import FileLink

from dataset import load, load_sample, Subset
from extract_features import extend_with_event_data, prepare, baseline_features

In [17]:
# Load the training subset. `load` returns three objects here, unpacked as the
# raw event data, the targets and the specs.
trn_data, trn_target, trn_specs = load(Subset.Train)

(11341042, 11) (17690, 7) (386, 3) 

In [18]:
# For the test subset `load` yields exactly one object; the list pattern unpacks
# it and fails loudly if the number of returned items ever changes.
[tst_data] = load(Subset.Test)

(1156414, 11) 

In [19]:
# Build the training feature table in a single pipeline:
#   1. enrich the raw events,
#   2. compute the baseline features against the training targets,
#   3. keep only the last row per (game_session, installation_id) pair,
#   4. renumber the rows from zero (feather needs a default index).
X_trn = (
    prepare(extend_with_event_data(trn_data), baseline_features, targets=trn_target)
    .drop_duplicates(subset=['game_session', 'installation_id'], keep='last')
    .reset_index(drop=True)
)
# Cache the result on disk so the expensive extraction does not have to be repeated.
X_trn.to_feather('X_trn.feather')

HBox(children=(IntProgress(value=0, max=11341042), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




In [20]:
# Build the test feature table with the same pipeline as the training one, but
# without targets. Only the first 10000 rows of the test events are processed,
# so the result covers just a slice of the test set.
X_tst = (
    prepare(extend_with_event_data(tst_data.iloc[:10000]), baseline_features)
    .drop_duplicates(subset=['game_session', 'installation_id'], keep='last')
    .reset_index(drop=True)
)
# Cache the result on disk next to the training features.
X_tst.to_feather('X_tst.feather')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [21]:
# Persist only the `accuracy_group` column of the targets as the label file.
# NOTE(review): these labels are later paired with the rows of X_trn purely by
# position (only the row counts are asserted equal) — confirm that
# `prepare(..., targets=trn_target)` emits rows in trn_target order.
trn_target[['accuracy_group']].to_feather('y.feather')

In [22]:
# Reload the cached feature tables from disk (train first, then test).
X_trn, X_tst = (
    feather.read_dataframe(path) for path in ('X_trn.feather', 'X_tst.feather')
)
# Labels are stored as a one-column frame; take that column as a plain array.
y = feather.read_dataframe('y.feather')['accuracy_group'].values

In [23]:
# Sanity check: there must be exactly one label per training feature row.
assert len(X_trn) == len(y)

In [25]:
# CatBoost settings shared by the cross-validation run and the final fit.
params = {
    # Objective and reported metrics: regression on the target value.
    'loss_function': 'RMSE',
    'custom_metric': ['RMSE', 'MAE'],
    'eval_metric': 'RMSE',
    # Upper bound on boosting rounds; early stopping is expected to end sooner.
    'iterations': 10000,
    'random_seed': 1,
    'max_depth': 6,
    # Stop once the eval metric has not improved for this many rounds.
    'early_stopping_rounds': 100,
    # Train on GPU, using devices 0 and 1.
    'task_type': 'GPU',
    'devices': '0:1',
    # Log every 100th iteration.
    'verbose': 100,
}

In [28]:
# Detach the two identifier columns so that only model features remain in X_trn.
# `pop` removes the columns from X_trn in place, so this cell cannot be re-run
# without first rebuilding or reloading X_trn.
X_trn_game_session, X_trn_installation_id = (
    X_trn.pop('game_session'),
    X_trn.pop('installation_id'),
)
# Every remaining object-dtype column is passed to CatBoost as a categorical feature.
cat_cols = list(X_trn.select_dtypes(include=object).columns)

In [29]:
# Cross-validate the shared parameter set on the full training table; the
# iteration at which early stopping triggers guides the final round count.
cv_results = cb.cv(
    pool=cb.Pool(data=X_trn, label=y, cat_features=cat_cols),
    params=params,
)

0:	learn: 2.2188636	test: 2.2188564	best: 2.2188564 (0)
100:	learn: 1.2207201	test: 1.2276457	best: 1.2276457 (100)
200:	learn: 1.1979649	test: 1.2132414	best: 1.2132414 (200)
300:	learn: 1.1892059	test: 1.2096814	best: 1.2096814 (300)
400:	learn: 1.1795952	test: 1.2069756	best: 1.2069730 (399)
500:	learn: 1.1711018	test: 1.2051687	best: 1.2051687 (500)	total: 24.6s	remaining: 7m 47s
600:	learn: 1.1642015	test: 1.2042917	best: 1.2042917 (600)
700:	learn: 1.1582282	test: 1.2037230	best: 1.2037230 (700)
800:	learn: 1.1532424	test: 1.2033621	best: 1.2033610 (798)
900:	learn: 1.1481440	test: 1.2028954	best: 1.2028954 (900)	total: 44.4s	remaining: 7m 28s
1000:	learn: 1.1441578	test: 1.2026445	best: 1.2025958 (967)
1100:	learn: 1.1400081	test: 1.2024775	best: 1.2024050 (1087)
1200:	learn: 1.1361584	test: 1.2023568	best: 1.2023568 (1200)
1300:	learn: 1.1330122	test: 1.2023920	best: 1.2023233 (1228)
Stopped by overfitting detector  (100 iterations wait)


In [30]:
# Final fit on all training rows: same settings as the CV run, but with a fixed
# number of boosting rounds instead of the 10000-round ceiling.
train_params = {**params, 'iterations': 1100}
booster = cb.train(
    pool=cb.Pool(data=X_trn, label=y, cat_features=cat_cols),
    params=train_params,
)
# Serialise the fitted model in CatBoost's native binary format.
booster.save_model('catboost_baseline.cbm')

0:	learn: 1.2552883	total: 18.2ms	remaining: 20s
100:	learn: 1.2067514	total: 1.62s	remaining: 16s
200:	learn: 1.1958430	total: 3.22s	remaining: 14.4s
300:	learn: 1.1899509	total: 4.81s	remaining: 12.8s
400:	learn: 1.1834323	total: 6.38s	remaining: 11.1s
500:	learn: 1.1764399	total: 7.93s	remaining: 9.48s
600:	learn: 1.1713608	total: 9.52s	remaining: 7.9s
700:	learn: 1.1670411	total: 11.1s	remaining: 6.31s
800:	learn: 1.1628529	total: 12.6s	remaining: 4.72s
900:	learn: 1.1594639	total: 14.2s	remaining: 3.14s
1000:	learn: 1.1565485	total: 15.8s	remaining: 1.56s
1099:	learn: 1.1535957	total: 17.4s	remaining: 0us


In [33]:
# Render a link to the saved model file in the cell output (FileLink must stay
# the last expression of the cell for the link to be displayed).
FileLink('catboost_baseline.cbm')

In [None]:
# Detach the identifier columns from the test features, mirroring what was done
# to the training table. `pop` mutates X_tst in place, so this cell is not
# re-runnable without reloading X_tst.
X_tst_game_session, X_tst_installation_id = (
    X_tst.pop('game_session'),
    X_tst.pop('installation_id'),
)
# Score the remaining feature columns with the fitted model.
preds = booster.predict(X_tst)

In [None]:
# Re-attach the identifiers to the raw model outputs, one row per scored test row.
# Relies on `pandas` being imported as `pd` in the notebook's import cell.
# The identifier Series and the prediction array come from the same X_tst frame,
# so they have the same length and line up row for row.
preds_raw = pd.DataFrame({
    'game_session': X_tst_game_session,
    'installation_id': X_tst_installation_id,
    'prediction': preds,
})

In [None]:
# Collapse the per-session predictions into one label per installation:
# average the raw predictions, then round to the nearest integer class.
# Only the `prediction` column is aggregated: `preds_raw` also carries the
# string column `game_session`, and taking the mean of it raises TypeError on
# pandas >= 2.0 (older versions silently dropped it with a warning).
# NOTE(review): rounded values are not clipped — confirm the regressor cannot
# produce values that round outside the valid label range.
submit = (
    preds_raw
    .groupby('installation_id', as_index=False)['prediction']
    .mean()
    .assign(accuracy_group=lambda df: df['prediction'].round().astype(int))
    .drop(columns=['prediction'])
)

In [None]:
# Write the submission table as CSV, omitting the DataFrame index column.
submit.to_csv('submit_smoketest.csv', index=False)

In [None]:
# Render a link to the written submission file in the cell output.
FileLink('submit_smoketest.csv')