In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import jupytools.syspath
jupytools.syspath.add('..')

In [13]:
from collections import namedtuple, defaultdict, Counter, OrderedDict
from itertools import chain
from multiprocessing import cpu_count
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from basedir import TRAIN, TEST
from dataset import load, load_sample, Subset
from metric import qwk

In [5]:
trn_data, trn_targ, trn_spec = load(Subset.Train)

(11341042, 11) (17690, 7) (386, 3) 

In [6]:
[tst_data] = load(Subset.Test)

(1156414, 11) 

In [7]:
def unique(dataframes, key):
    return list(set(chain(*[df[key].unique().tolist() for df in dataframes])))

In [8]:
def count(dataframe, key):
    return dataframe[key].value_counts().index

In [9]:
def named_tuple(name, **params):
    from collections import namedtuple
    return namedtuple(name, params.keys())(**params)

In [10]:
def as_list(seq): return list(sorted(seq))

In [11]:
def clean_up(trn, tst, target):
    trn, tst, target = [x.copy() for x in (trn, tst, target)]
    trn['title_event_code'] = trn['title'].str.cat(trn['event_code'].astype(str), '_')
    tst['title_event_code'] = tst['title'].str.cat(tst['event_code'].astype(str), '_')
    data = [trn, tst]
    
    title_event_codes = unique(data, key='title_event_code')
    titles = unique(data, key='title')
    event_codes = unique(data, key='event_code')
    event_ids = unique(data, key='event_id')
    worlds = unique(data, key='world')
    
    title_enc = {x: i for i, x in enumerate(titles)}
    title_dec = dict(enumerate(titles))
    world_enc = {x: i for i, x in enumerate(worlds)}
    
    trn_asm = trn.query('type == "Assessment"')
    tst_asm = tst.query('type == "Assessment"')
    assessment_titles = set(count(trn_asm, 'title')).union(count(tst_asm, 'title'))

    trn['title'] = trn['title'].map(title_enc)
    tst['title'] = tst['title'].map(title_enc)
    trn['world'] = trn['world'].map(world_enc)
    tst['world'] = tst['world'].map(world_enc)
    target['title'] = target['title'].map(title_enc)
    
    win_code_enc = dict(zip(title_enc.values(), [4100 for _ in title_enc]))
    win_code_enc[title_enc['Bird Measurer (Assessment)']] = 4110
    
    trn['timestamp'] = pd.to_datetime(trn['timestamp'])
    tst['timestamp'] = pd.to_datetime(tst['timestamp'])
    
    data = named_tuple('Data', x_train=trn, y_train=target, x_test=tst)
    
    meta = named_tuple('Meta', title_event_codes=title_event_codes,
                       titles=as_list(titles), event_codes=as_list(event_codes), 
                       event_ids=as_list(event_ids), worlds=as_list(worlds),
                       assessment_titles=as_list(assessment_titles),
                       win_code_enc=win_code_enc, title_enc=title_enc,
                       title_dec=title_dec, world_enc=world_enc)
    
    return data, meta

In [14]:
data, meta = clean_up(trn_data, tst_data, trn_targ)

In [15]:
meta.assessment_titles

['Bird Measurer (Assessment)',
 'Cart Balancer (Assessment)',
 'Cauldron Filler (Assessment)',
 'Chest Sorter (Assessment)',
 'Mushroom Sorter (Assessment)']

In [16]:
def savediv(a, b, fallback=0): 
    return a/b if b != 0 else fallback

In [17]:
def to_accuracy_group(accuracy):
    return (0 if accuracy == 0 else 
            3 if accuracy == 1 else 
            2 if accuracy == 0.5 else
            1)

In [18]:
def init_dict(keys, init_value=0):
    return {k: init_value for k in keys}

In [19]:
def extract_features(user, meta, test=False):
    last_activity = 0
    last_session_time_sec = 0
    acc_accuracy_group = 0
    acc_accuracy = 0
    acc_attempts_correct = 0
    acc_attempts_incorrect = 0
    acc_actions = 0
    session_no = 0

    last_accuracy_title = init_dict([f'acc_{t}' for t in meta.assessment_titles], -1)
    accuracy_groups = init_dict([0, 1, 2, 3])
    user_activities_cnt = init_dict(['Clip', 'Activity', 'Assessment', 'Game'])
    event_code_cnt = init_dict(meta.event_codes)
    event_id_cnt = init_dict(meta.event_ids)
    title_cnt = init_dict(meta.titles)
    title_event_code_cnt = init_dict(meta.title_event_codes)
    
    time_first_activity = float(user['timestamp'].values[0])
    assessments = []
    durations = []
    
    for _, session in user.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_installation_id = session['installation_id'].iloc[0]
        session_title_text = meta.title_dec[session_title]
        
        if session_type == 'Assessment' and (test or len(session) > 1):
            attempts = session.query(f'event_code == {meta.win_code_enc[session_title]}')
            t_attempts = attempts['event_data'].str.contains('true').sum()
            f_attempts = attempts['event_data'].str.contains('false').sum()

            features = OrderedDict()
            features.update(user_activities_cnt.copy())
            features.update(last_accuracy_title.copy())
            features.update(event_code_cnt.copy())
            features.update(event_id_cnt.copy())
            features.update(title_cnt.copy())
            features.update(title_event_code_cnt.copy())
            features.update(last_accuracy_title.copy())
            
            features['installation_id'] = session_installation_id
            features['session_title'] = session_title
            features['acc_attempts_correct'] = acc_attempts_correct
            features['acc_attempts_incorrect'] = acc_attempts_incorrect
            acc_attempts_correct += t_attempts
            acc_attempts_incorrect += f_attempts
            
            features['duration_mean'] = np.mean(durations) if durations else 0
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)
            
            features['acc_accuracy'] = savediv(acc_accuracy, session_no)
            accuracy = savediv(t_attempts, t_attempts + f_attempts)
            acc_accuracy += accuracy
            last_accuracy_title[f'acc_{session_title_text}'] = accuracy
            features['accuracy_group'] = to_accuracy_group(accuracy)
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            features['acc_accuracy_group'] = savediv(acc_accuracy_group, session_no)
            acc_accuracy_group += features['accuracy_group']
            features['acc_actions'] = acc_actions
            
            if test or (t_attempts + f_attempts) > 0:
                assessments.append(features)
            
            session_no += 1
            
        def update_counters(counter, column):
            num_of_sessions = Counter(session[column])
            for k1 in num_of_sessions.keys():
                k2 = meta.title_dec[k1] if column == 'title' else k1
                counter[k2] += num_of_sessions[k1]
        
        update_counters(event_code_cnt, 'event_code')
        update_counters(event_id_cnt, 'event_id')
        update_counters(title_cnt, 'title')
        update_counters(title_event_code_cnt, 'title_event_code')
        
        acc_actions += len(session)
        if last_activity != session_type:
            user_activities_cnt[session_type] += 1
            last_activity = session_type
    
    return [assessments[-1]] if test else assessments

In [20]:
def wrap_tqdm(data, wrap=True, **params):
    return tqdm(data, **params) if wrap else data

In [21]:
def extract_features_from_groups(data, key, meta, 
                                 test=False, pbar=True,
                                 num_workers=cpu_count()):
    
    grouped = data.groupby(key, sort=False)
    groups = (g for _, g in grouped)
    n_total = grouped.ngroups
    groups = wrap_tqdm(
        groups, wrap=pbar, total=n_total,
        desc='Test' if test else 'Train')
    with Parallel(num_workers) as p:
        results = p(delayed(extract_features)(g, meta, test) for g in groups)
    return pd.DataFrame(list(chain(*results)))

In [22]:
def create_train_test(trn, tst, meta, pbar=True, num_workers=cpu_count()):
    trn = extract_features_from_groups(
        trn, meta=meta, key='installation_id', num_workers=num_workers)
    tst = extract_features_from_groups(
        tst, meta=meta, key='installation_id', num_workers=num_workers,
        test=True)
    return trn, tst

In [23]:
X_trn, X_tst = create_train_test(data.x_train, data.x_test, meta)

HBox(children=(IntProgress(value=0, description='Train', max=17000, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='Test', max=1000, style=ProgressStyle(description_width='initi…




In [24]:
def update_with_post_processing_features(trn, tst, meta):
    for df in (trn, tst):
        grouped = df.groupby('installation_id')
        df['installation_session_count'] = grouped['Clip'].transform('count')
        df['installation_duration_mean'] = grouped['duration_mean'].transform('mean')
        df['installation_title_nunique'] = grouped['session_title'].transform('nunique')
        df['event_code_count_sum'] = df[list(meta.event_codes)].sum(axis=1)
        df['installation_event_code_count_mean'] = (
            df.groupby('installation_id')['event_code_count_sum'].transform('mean'))

In [25]:
update_with_post_processing_features(X_trn, X_tst, meta)

In [26]:
def get_relevant_features(trn, tst):
    nonzero_rows = trn.sum(axis=1) != 0
    nonzero_cols = trn.sum(axis=0) != 0
    features = trn.loc[nonzero_rows, nonzero_cols].columns.tolist()
    features = (
        [f for f in features if f not in ('accuracy_group', 'installation_id')] +
        [f'acc_{t}' for t in meta.assessment_titles])
    return features

In [27]:
available_features = get_relevant_features(X_trn, X_tst)

In [28]:
len(available_features)

891

In [None]:
from IPython.displayplay import HTML
table = '<thead><th>Feature Name</th></thead>'
for featureavailable_featuresilable_featuresavailable_features:
    table += f'<tr><td>{feature}</td></tr>'
HTML(f'<table>{table}</table>')

In [81]:
from metric import qwk
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

# [-inf, 1.0272319526554337, 1.7321346312545782, 2.238358126272799, inf]

# def eval_qwk_lgb_regr(y_true, y_pred):
#     y_pred = y_pred.copy()
#     y_pred[y_pred <= 1.12232214] = 0
#     y_pred[np.where(np.logical_and(y_pred > 1.12232214, y_pred <= 1.73925866))] = 1
#     y_pred[np.where(np.logical_and(y_pred > 1.73925866, y_pred <= 2.22506454))] = 2
#     y_pred[y_pred > 2.22506454] = 3
#     return 'cappa', qwk(y_true, y_pred), True

def eval_qwk_lgb_regr(y_true, y_pred):
    y_pred = y_pred.copy()
    y_pred[y_pred <= 1.0272319526554337] = 0
    y_pred[np.where(np.logical_and(y_pred > 1.0272319526554337, y_pred <= 1.7321346312545782))] = 1
    y_pred[np.where(np.logical_and(y_pred > 1.7321346312545782, y_pred <= 2.238358126272799))] = 2
    y_pred[y_pred > 2.238358126272799] = 3
    return 'cappa', qwk(y_true, y_pred), True


k = 5
folds = GroupKFold(n_splits=k)
groups = X_trn['installation_id']
X = X_trn[available_features].copy()
y = X_trn['accuracy_group']

In [82]:
params = dict(n_estimators=2000,
              boosting_type='gbdt',
              objective='regression',
              metric='rmse',
              subsample=0.75,
              subsample_freq=1,
              learning_rate=0.04,
              feature_fraction=0.9,
              max_depth=15,
              lambda_l1=1,
              lambda_l2=1)

In [83]:
results = []
models = []
oof = np.zeros(X.shape[0], dtype=np.float32)

for i, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups), 1):
    print(f'Running k-fold {i} of {k}')
    x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    model = lgb.LGBMRegressor(**params)
    model.fit(x_trn, y_trn,
              eval_set=[(x_trn, y_trn), (x_val, y_val)], 
              eval_names=['trn', 'val'],
              eval_metric=eval_qwk_lgb_regr, early_stopping_rounds=200,
              verbose=100, categorical_feature='auto')
    oof[val_idx] = model.predict(x_val)
    models.append(model)

Running k-fold 1 of 5
Training until validation scores don't improve for 200 rounds.
[100]	trn's rmse: 0.908413	trn's cappa: 0.676305	val's rmse: 0.978849	val's cappa: 0.610315
[200]	trn's rmse: 0.842117	trn's cappa: 0.736391	val's rmse: 0.971586	val's cappa: 0.61322
[300]	trn's rmse: 0.792227	trn's cappa: 0.773537	val's rmse: 0.971217	val's cappa: 0.617272
[400]	trn's rmse: 0.752169	trn's cappa: 0.800877	val's rmse: 0.972938	val's cappa: 0.61434
Early stopping, best iteration is:
[261]	trn's rmse: 0.810635	trn's cappa: 0.759768	val's rmse: 0.971524	val's cappa: 0.619668
Running k-fold 2 of 5
Training until validation scores don't improve for 200 rounds.
[100]	trn's rmse: 0.906489	trn's cappa: 0.680689	val's rmse: 0.9855	val's cappa: 0.608238
[200]	trn's rmse: 0.841001	trn's cappa: 0.735247	val's rmse: 0.979902	val's cappa: 0.61491
[300]	trn's rmse: 0.792855	trn's cappa: 0.772451	val's rmse: 0.979159	val's cappa: 0.614939
[400]	trn's rmse: 0.753017	trn's cappa: 0.801028	val's rmse: 0.9

In [84]:
eval_qwk_lgb_regr(y, oof)

('cappa', 0.6036508470866481, True)

In [85]:
import joblib

In [86]:
joblib.dump(models, '/home/ck/data/bowl2019/external/models_lightgbm_002.joblib')

['/home/ck/data/bowl2019/external/models_lightgbm_002.joblib']

In [94]:
joblib.dump(available_features, '/home/ck/data/bowl2019/external/available_features.joblib')

['/home/ck/data/bowl2019/external/available_features.joblib']

In [100]:
joblib.dump(meta._asdict(), '/home/ck/data/bowl2019/external/meta_dict.joblib')

['/home/ck/data/bowl2019/external/meta_dict.joblib']

In [87]:
preds = np.zeros((X_tst.shape[0], len(models)), dtype=np.float32)
for i, model in enumerate(models):
    preds[:, i] = model.predict(X_tst[available_features])
avg_preds = np.mean(preds, axis=1)

In [1]:
1

1

In [77]:
import scipy
def optimize_rounding_bounds(X, y):
    def _loss(coef):
        buckets = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels=[0, 1, 2, 3])
        return -qwk(y, buckets)
    
    init_coef = [0.5, 1.5, 2.5]
    opt_coef = scipy.optimize.minimize(_loss, init_coef, method='nelder-mead')
    optimized = opt_coef['x']
    return [-np.inf] + optimized.tolist() + [np.inf]

In [78]:
assert len(oof) == len(y)

In [79]:
bounds = optimize_rounding_bounds(oof, y)

In [80]:
bounds

[-inf, 1.0272319526554337, 1.7321346312545782, 2.238358126272799, inf]

In [89]:
def round_regressor_predictions(preds, coefs):
    x = preds.copy()
    for i, (lo, hi) in enumerate(zip(coefs[:-1], coefs[1:])):
        x[(x > lo) & (x <= hi)] = i
    return x

In [90]:
y_hat = round_regressor_predictions(avg_preds, bounds)

In [None]:
from dataset import SAMPLE
sample = pd.read_csv(SAMPLE)
sample['accuracy_group'] = y_hat.astype(int)
sample.to_csv('submission.csv', index=False)

In [40]:
import copy
import re

In [32]:
def default(value, fallback=0):
    return value if value is not None else fallback

In [161]:
class DatetimeFeatures:
    def __init__(self, prefix=None, field_name='timestamp', 
                 attributes=('Year', 'Month', 'Week', 'Day', 'Dayofweek'),
                 drop=True, date=True, time=True):
        self.prefix = default(prefix, re.sub('[Dd]ate$', '', field_name))
        self.field_name = field_name
        self.attributes = attributes
        self.drop = drop
        self.date = date
        self.time = time
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        field = X[self.field_name]
        attrs = list(self.attributes)
        if date:
            attrs.append('Date')
        if time:
            attrs.extend(['Hour', 'Minute'])
        for attr in attrs:
            X[f'{prefix}{attr}'] = getattr(field.dt, attr.lower())
        if drop:
            X = X.drop(field_name, axis=1)
        return X
    def fit_transform(self, X, y=None):
        data = copy.deepcopy(X)
        return self.fit(data).transform(data)

In [165]:
class InteractionFeatures:
    def __init__(self, n_interactions=20):
        self.n_interactions = n_interactions
        self.features = None
        self.interact_1 = None
        self.interact_2 = None
    def fit(self, X, y=None):
        substrings = ('sum', 'mean', 'max', 'std', 'attempt')
        self.features = [col for col in X.columns if any(x in col for x in substrings)]
        self.interact_1 = np.random.choice(self.features, self.n_interactions)
        self.interact_2 = np.random.choice(self.features, self.n_interactions)
        return self
    def transform(self, X, y=None):
        for col1 in self.interact_1:
            for col2 in self.interact_2:
                data[f'interact_{col1}_{col2}'] = data[col1] * data[col2]
        return data
    def fit_transform(self, X, y=None, **params):
        data = copy.deepcopy(X)
        return self.fit(data).transform(data)