In [None]:
# !cp /kaggle/input/data-bowl-2019-external-data/*.py /kaggle/working

In [1]:
%reload_ext autoreload
%autoreload 2
import warnings
import jupytools.syspath
def ignore(*args, **kwargs): pass
warnings.warn = ignore
jupytools.syspath.add('..')

In [25]:
from collections import Counter, OrderedDict
from functools import partial

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import GroupKFold
from tqdm.auto import tqdm

import bundle
import features as F
import selection
import utils as U
from dataset import load, load_sample, Subset
from encode import encode
from training import train, inference, submit, EnsembleTrainer, get_default_config
from meta import compute_meta_data
from metric import optimize_rounding_bounds, make_cappa_metric
from normalize import normalize

In [3]:
sample = False
if U.on_kaggle():
    U.log('Loading test set only.')
    tst_data = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
else:
    if sample:
        U.log('Warning: loading train and test data sample.')
        trn_data, _, _ = load_sample(Subset.Train, 500_000)
        [tst_data] = load_sample(Subset.Test, 500_000)
    else:
        U.log('Loading train and test.')
        trn_data, trn_spec, trn_targ = load(Subset.Train)
        [tst_data] = load(Subset.Test)

Loading train and test.
(11341042, 11) (17690, 7) (386, 3) (1156414, 11) 

## Preparing

In [4]:
transform = U.combine(
    partial(F.add_feature_combinations, pairs=[('title', 'event_code')]),
    partial(F.add_datetime, column='timestamp', prefix='ts'),
)

if U.on_kaggle():
    U.log('Transforming test data only.')
    X_tst = transform(tst_data.copy())
    U.log(X_tst.shape)
else:
    U.log('Transforming train and test data.')
    X_tst = transform(tst_data.copy())
    X_trn = transform(trn_data.copy())
    U.log(X_trn.shape, X_tst.shape)

Transforming train and test data.
(11341042, 19) (1156414, 19)


In [5]:
if U.on_kaggle():
    U.log('Reading pre-computed meta from disk.')
    meta = bundle.meta()
else:
    U.log('Computing meta using train and test datasets.')
    meta = compute_meta_data(X_trn, X_tst)
    U.log('Saving computed meta on disk.')
    bundle.save_meta(meta, 'meta')

Computing meta using train and test datasets.
Saving computed meta on disk.


In [7]:
extractor = F.FeaturesExtractor([
    F.CountingFeatures(meta),
    F.PerformanceFeatures(meta),
    F.VarietyFeatures(meta),
    F.EventDataFeatures(meta)
])

algo = F.InMemoryAlgorithm(extractor, meta, num_workers=12)

cat_cols = ['session_title']

if U.on_kaggle():
    U.log('Preparing test dataset.')
    X_tst = algo.run(X_tst, test=True)
    encoders = bundle.encoders()
    X_tst, _ = encode(X_tst, cat_cols, encoders=encoders)
else:
    U.log('Preparing train and test datasets.')
    X_trn = algo.run(X_trn)
    X_tst = algo.run(X_tst, test=True)
    X_trn, encoders = encode(X_trn, cat_cols)
    X_tst, _ = encode(X_tst, cat_cols, encoders=encoders)
    bundle.save(encoders, 'encoders')

Preparing train and test datasets.
Running algorithm in train mode.


HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))


Running algorithm in test mode.


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [8]:
if U.on_kaggle():
    U.log('Running post-processing on test set only.')
    F.add_user_wise_features(X_tst, meta)
else:
    U.log('Running post-processing on train and test sets.')
    F.add_user_wise_features(X_trn, meta)
    F.add_user_wise_features(X_tst, meta)

Running post-processing on train and test sets.


In [12]:
X_trn.to_pickle('/tmp/X_trn.pickle')

In [13]:
X_tst.to_pickle('/tmp/X_tst.pickle')

In [None]:
group_col = 'session_title'
U.log(f'Normalizing dataset using column for grouping: {group_col}')
norm_dataset = X_tst if U.on_kaggle() else X_trn
cnt_cols = U.starts_with(norm_dataset.columns, 'cnt_')
normalize(norm_dataset, cnt_cols, grouping_key=group_col)

## Experiments

In [24]:
X_trn = pd.read_pickle('/tmp/X_trn.pickle')
X_tst = pd.read_pickle('/tmp/X_tst.pickle')
cappa = make_cappa_metric(X_trn['accuracy_group'])
features = [c for c in X_trn.columns 
            if c not in ('installation_id', 'game_session', 'accuracy_group')]

In [27]:
X_trn

Unnamed: 0,installation_id,game_session,session_title,cnt_Scrub-A-Dub_4010,cnt_Bubble Bath_4020,cnt_Bubble Bath_4045,cnt_Crystals Rule_3121,cnt_Flower Waterer (Activity)_4030,cnt_Pan Balance_3121,cnt_Sandcastle Builder (Activity)_4090,...,event_weight_8,event_weight_9,event_weight_10,event_weight_11,event_weight_12,user_session_cnt,user_duration_mean,user_title_nunique,user_events_sum,user_events_mean
0,0006a69f,901acc108f55a5a1,0,1,0,0,0,50,0,2,...,0,0,0,0,0,5,40.766667,2,647,1553.0
1,0006a69f,77b8ee947eb84b4e,1,1,0,0,2,50,0,2,...,0,0,0,0,0,5,40.766667,2,1143,1553.0
2,0006a69f,6bdf9623adc94d89,0,1,0,0,2,50,0,2,...,0,0,0,0,0,5,40.766667,2,1230,1553.0
3,0006a69f,9501794defd84e4d,0,2,2,2,2,83,0,2,...,0,0,0,0,0,5,40.766667,2,2159,1553.0
4,0006a69f,a9ef3ecb3d1acc6a,1,2,2,2,9,83,0,2,...,0,0,0,0,0,5,40.766667,2,2586,1553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17685,ffc90c32,460e8bdc2822b340,4,1,2,2,10,18,8,0,...,0,0,0,0,0,6,33.011111,5,1963,1049.5
17686,ffd2871d,b05a02b52d5c1f4c,2,0,2,2,0,0,0,3,...,0,0,0,0,0,1,0.000000,1,888,888.0
17687,ffeb0b1b,dadd1a4d8ac68ab0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,3,71.500000,2,932,1274.0
17688,ffeb0b1b,a6885ab824fbc32c,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,71.500000,2,1358,1274.0


In [16]:
model_type = 'lightgbm'
trainer = EnsembleTrainer(algo=model_type, cv_metrics={'cappa': cappa})
fold = GroupKFold(n_splits=5)
config = get_default_config(model_type)
result = trainer.train(X_trn, features=features, fold=fold, config=config)

Running k-fold 1 of 5
Training until validation scores don't improve for 100 rounds.
[100]	trn's rmse: 1.03411	val's rmse: 1.0518
[200]	trn's rmse: 0.963705	val's rmse: 1.00042
[300]	trn's rmse: 0.9273	val's rmse: 0.984439
[400]	trn's rmse: 0.90143	val's rmse: 0.97657
[500]	trn's rmse: 0.880191	val's rmse: 0.972421
[600]	trn's rmse: 0.861611	val's rmse: 0.970126
[700]	trn's rmse: 0.844955	val's rmse: 0.969127
[800]	trn's rmse: 0.829251	val's rmse: 0.968091
[900]	trn's rmse: 0.814835	val's rmse: 0.967568
[1000]	trn's rmse: 0.801119	val's rmse: 0.9676
Early stopping, best iteration is:
[911]	trn's rmse: 0.813303	val's rmse: 0.967408
Running k-fold 2 of 5
Training until validation scores don't improve for 100 rounds.
[100]	trn's rmse: 1.03258	val's rmse: 1.06245
[200]	trn's rmse: 0.962772	val's rmse: 1.0114
[300]	trn's rmse: 0.927185	val's rmse: 0.993386
[400]	trn's rmse: 0.902046	val's rmse: 0.985297
[500]	trn's rmse: 0.881377	val's rmse: 0.98158
[600]	trn's rmse: 0.863598	val's rmse: 0.

In [17]:
cappa(X_trn['accuracy_group'].values, result.oof)

0.5973933709198882

In [None]:
# features from event_data
# https://www.kaggle.com/c/data-science-bowl-2019/discussion/124028

# truncated validation
# https://www.kaggle.com/ragnar123/truncated-val
# https://www.kaggle.com/c/data-science-bowl-2019/discussion/120790

# params = {'boosting_type': 'gbdt', 
#           'metric': 'rmse', 
#           'objective': 'regression', 
#           'eval_metric': 'cappa', 
#           'n_jobs': -1, 
#           'seed': 42, 
#           'num_leaves': 26, 
#           'learning_rate': 0.077439684887749, 
#           'max_depth': 33, 
#           'lambda_l1': 3.27791989030057, 
#           'lambda_l2': 1.3047627805931334, 
#           'bagging_fraction': 0.896924978584253, 
#           'bagging_freq': 1, 
#           'colsample_bytree': 0.8710772167017853}

In [None]:
columns = X_trn.columns.tolist()
cnt_cols = U.starts_with(columns, 'cnt_')
perf_cols = U.starts_with(columns, 'perf_')
var_cols = U.starts_with(columns, 'var_')
user_cols = U.starts_with(columns, 'user_')
event_cols = U.starts_with(columns, 'event_')
cat_cols = ['session_title']
cols = cnt_cols + perf_cols + var_cols + user_cols + event_cols + cat_cols

features_groups = [
    ('cnt+perf+var', cnt_cols + perf_cols + var_cols),
    ('cnt+perf+user', cnt_cols + perf_cols + user_cols),
    ('cnt+perf+cat', cnt_cols + perf_cols + cat_cols),
    ('cnt+perf+var+user+cat', cnt_cols + perf_cols + var_cols + user_cols + cat_cols),
    ('event+cnt+perf', event_cols + cnt_cols + perf_cols),
    ('event+cnt+perf+user', event_cols + cnt_cols + perf_cols + user_cols),
    ('event+cnt+perf+var', event_cols + cnt_cols + perf_cols + var_cols),
    ('event+cnt+pert+cat', event_cols + cnt_cols + perf_cols + cat_cols),
    ('event+cnt+perf+user+var+cat', 
     event_cols + cnt_cols + perf_cols + user_cols + var_cols + cat_cols),
    ('all', cols),
    ('all-event', [c for c in cols if c not in event_cols])
]

In [None]:
from sklearn.model_selection import GroupKFold
from training import EnsembleTrainer, get_default_config

model_type = 'lightgbm'
trainer = EnsembleTrainer(algo=model_type, cv_metrics={'cappa': cappa})
fold = GroupKFold(n_splits=5)
config = get_default_config(model_type)
U.set_nested(config, 'model_params.feature_fraction', 0.8)
U.set_nested(config, 'model_params.bagging_fraction', 0.75)
U.set_nested(config, 'model_params.bagging_freq', 1)

results = []
for col_group, features in features_groups:
    U.log(f'Training columns group: {col_group}')
    result = trainer.train(X_trn, features=features, fold=fold, config=config)
    results.append((col_group, result))

In [None]:
report = pd.DataFrame([
    OrderedDict([('features', name)] + list(result.cv.items())) 
    for name, result in results])
cappa_cols = U.starts_with(report.columns, 'cv_cappa')
report['mean'] = report[cappa_cols].mean(axis=1)
report['std'] = report[cappa_cols].std(axis=1)
report.sort_values(by=['mean'], inplace=True, ascending=False)
report.reset_index(drop=True, inplace=True)
report.to_csv(f'report_{U.now()}.csv', index=False)

In [None]:
def highlight_best(col):
    if not col.name.startswith('cv_cappa_'):
        return [''] * len(col)
    is_best = col.index == col.argmax()
    return ['background-color: salmon' if idx else '' for idx in is_best]

In [None]:
report.style.apply(highlight_best)

In [8]:
import json
sample = trn_data.sample(1_000_000)
event_data = pd.io.json.json_normalize(sample.event_data.apply(json.loads))

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
from dataset import existing_info
stat_info = existing_info(event_data).T.reset_index()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
f, ax = plt.subplots(figsize=(8, 12))
ax = sns.barplot(x='Percent', y='index', data=stat_info.head(40))
ax.set_title('Most Frequent Features')
ax.set_ylabel('Features')
ax.grid(True, linestyle='dotted')

In [None]:
freq_keys = stat_info['index'][stat_info['Percent'] >= 5.].tolist()

In [None]:
freq_keys

In [None]:
round_df = event_data['round'].fillna(0).value_counts().reset_index()
round_df['index'] = round_df['index'].astype(int)
round_df.sort_values(by='index', inplace=True)
round_df.plot(x='index', y='round')

In [None]:
np.isnan(event_data['coordinates.x'].iloc[3])

In [None]:
event_data['coordinates.x'].fillna(0)

In [None]:
event_data['media_type'].fillna('unknown').value_counts()

In [None]:
event_data['source'].fillna('N/A').value_counts()

In [None]:
levels = event_data['level'].fillna(0)
pd.cut(levels, [-np.inf, 3, 5, 8, 13, 21, np.inf], labels=[0, 1, 2, 3, 4, 5]).value_counts()

In [15]:
event_data['size'].fillna(0).value_counts()

0.0    950451
3.0     15747
1.0     11840
2.0     11134
4.0      6200
5.0      3457
6.0      1171
Name: size, dtype: int64

In [16]:
event_data['weight'].fillna(0).value_counts()

0.0     942524
3.0      23416
1.0      21119
6.0       3747
2.0       3524
4.0       3268
8.0        838
7.0        446
5.0        388
10.0       323
9.0        267
11.0        75
12.0        65
Name: weight, dtype: int64

In [None]:
# round: float with NaNs
# coordinates.(x|y): float with NaNs, but presumably integer
# coordinates.stage_(width|height): float with NaNs, but presumably categorical
#
# description: text messages, try to convert into categoricals?
#     (event_data['description'].fillna('n/a').value_counts()
#      .rename('count').reset_index().rename(columns={'index': 'text'}))
#
# identifier: some string, probably concatenated with commas; most simple is to compute len
# event_data['identifier'].fillna('n/a').str.split(',').apply(len)
#
# media_type: categorical string
# event_data['media_type'].fillna('n/a').value_counts()
#
# duration: should already exist in the features set
# total_duration: probably also present in the features
#
# source: also some kind of categorical string
# event_data['source'].fillna('n/a').value_counts()
#
# level: integer feature, can be used as a cumulative metric (?)
# event_data['level'].fillna('n/a').value_counts()
#
# correct: defines if attempt was done, is already used in feature processing
# 
# size: integer feature
# event_data['size'].fillna('n/a').value_counts()
#
# weight: one more integer feature
# event_data['weight'].fillna('n/a').value_counts()

## Features Selection

In [18]:
selector = selection.FeatureSelection(
    rules=[
        ('nonzero', selection.non_zero_rows_and_cols),
    ],
    ignore_cols=[
        'accuracy_group', 
        'installation_id', 
        'game_session'
    ]
)

if U.on_kaggle():
    U.log('Loading relevant features list from disk.')
    features = bundle.features()
else:
    U.log('Deriving relevant features from train dataset.')
    features = selector.select(X_trn)
    bundle.save(features, 'features')

Deriving relevant features from train dataset.
Excluding from consideration: ['accuracy_group', 'installation_id', 'game_session']
Applying feature selection rule: nonzero
Selected features: 979 of 1000
Keeping only features, selected by every rule.
Final number of features changed from 1000 to 979


In [19]:
if U.on_kaggle():
    U.log('Loading relevant features list from disk.')
    features = bundle.features()
else:
    columns = features
    cnt_cols = U.starts_with(columns, 'cnt_')
    perf_cols = U.starts_with(columns, 'perf_')
    var_cols = U.starts_with(columns, 'var_')
    user_cols = U.starts_with(columns, 'user_')
    ts_cols = U.starts_with(columns, 'ts_')
    event_cols = U.starts_with(columns, 'event_')
    cat_cols = ['session_title']
    features = cnt_cols + perf_cols + user_cols + var_cols + event_cols + cat_cols
    bundle.save(features, 'features')
U.log(f'Total number of features: {len(features)}')

Total number of features: 979


## Submission

In [20]:
algo = 'lightgbm'
version = '019'

if U.on_kaggle():
    U.log('Inference on Kaggle.')
    features = bundle.features()
    bounds = bundle.bounds()
    predicted = inference(X_tst, features, bounds=bounds, model=algo, version=version)
    U.log('Saving predictions on disk.')
    filename = submit(predicted)
    submit_df = pd.read_csv(filename)
    U.log('First 20 submission rows:')
    display(submit_df.head(20))
    
else:
    U.log(f'Training model: {algo}')
    cappa = make_cappa_metric(X_trn['accuracy_group'])
    trainer = EnsembleTrainer(algo=algo, cv_metrics={'cappa': cappa})
    fold = GroupKFold(n_splits=5)
    config = get_default_config(algo)
    U.set_nested(config, 'model_params.feature_fraction', 0.8)
    U.set_nested(config, 'model_params.bagging_fraction', 0.75)
    U.set_nested(config, 'model_params.bagging_freq', 1)
    result = trainer.train(X_trn, features=features, fold=fold, config=config)
    U.log('Saving the trained models')
    bundle.save(result.models, f'models_{algo}_{version}')
    U.log('Saving the optimal rounding bounds')
    bounds = optimize_rounding_bounds(result.oof, X_trn['accuracy_group'].values)
    U.log(f'Optimal bounds: {bounds}')
    bundle.save(bounds, 'bounds')

Training model: lightgbm
Running k-fold 1 of 5
Training until validation scores don't improve for 100 rounds.
[100]	trn's rmse: 1.0398	val's rmse: 1.0605
[200]	trn's rmse: 0.965855	val's rmse: 1.00443
[300]	trn's rmse: 0.927935	val's rmse: 0.986512
[400]	trn's rmse: 0.901667	val's rmse: 0.978045
[500]	trn's rmse: 0.880518	val's rmse: 0.973423
[600]	trn's rmse: 0.862394	val's rmse: 0.970418
[700]	trn's rmse: 0.845752	val's rmse: 0.969073
[800]	trn's rmse: 0.830479	val's rmse: 0.968149
[900]	trn's rmse: 0.816013	val's rmse: 0.967966
[1000]	trn's rmse: 0.802506	val's rmse: 0.967866
Early stopping, best iteration is:
[947]	trn's rmse: 0.809451	val's rmse: 0.967623
Running k-fold 2 of 5
Training until validation scores don't improve for 100 rounds.
[100]	trn's rmse: 1.03836	val's rmse: 1.06915
[200]	trn's rmse: 0.964114	val's rmse: 1.01315
[300]	trn's rmse: 0.927431	val's rmse: 0.994165
[400]	trn's rmse: 0.901946	val's rmse: 0.985458
[500]	trn's rmse: 0.881392	val's rmse: 0.980967
[600]	trn

In [28]:
if not U.on_kaggle():
    import os
    features = bundle.features()
    bounds = bundle.bounds()
    filename = submit(inference(X_tst, features, bounds, model=algo, version=version))
    assert os.path.exists(filename)
    assert pd.read_csv(filename).shape[0] == 1000
    bundle.package(folder='/home/ck/data/bowl2019/external/')

Running inference on dataset of shape: 979
Loading external models: lightgbm v019.
Running models on test data...
Averaging ensemble predictions.
Rounding predictions using optimal bounds.
Converting predictions into submission file.
Running locally.
(1000, 2) Packaging training results into dataset.
/tmp/bowl2019/meta.joblib --> /home/ck/data/bowl2019/external/meta.joblib
/tmp/bowl2019/models_lightgbm_019.joblib --> /home/ck/data/bowl2019/external/models_lightgbm_019.joblib
/tmp/bowl2019/models_lightgbm_017.joblib --> /home/ck/data/bowl2019/external/models_lightgbm_017.joblib
/tmp/bowl2019/models_lightgbm_018.joblib --> /home/ck/data/bowl2019/external/models_lightgbm_018.joblib
/tmp/bowl2019/models_lightgbm_016.joblib --> /home/ck/data/bowl2019/external/models_lightgbm_016.joblib
/tmp/bowl2019/bounds.joblib --> /home/ck/data/bowl2019/external/bounds.joblib
/tmp/bowl2019/features.joblib --> /home/ck/data/bowl2019/external/features.joblib
/tmp/bowl2019/encoders.joblib --> /home/ck/data/

In [29]:
pd.read_csv('submission.csv')['accuracy_group'].value_counts()

3    348
2    334
0    180
1    138
Name: accuracy_group, dtype: int64