In [1]:
%reload_ext autoreload
%autoreload 2
import warnings
import jupytools.syspath
# Replace `warnings.warn` with a no-op so that nothing emitted through it shows up
# in the notebook output.
# NOTE(review): this monkeypatch is process-wide and also hides warnings that may
# matter; `warnings.filterwarnings('ignore')` is the conventional alternative.
def ignore(*args, **kwargs): pass
warnings.warn = ignore
jupytools.syspath.add('..')

In [2]:
import os
import re
from collections import defaultdict, Counter, OrderedDict
from functools import partial
from multiprocessing import cpu_count
from os.path import join

import feather
import joblib
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm.auto import tqdm

import utils as U
from basedir import TRAIN, TEST
from dataset import load, load_sample, Subset, to_accuracy_group
from metric import qwk

In [3]:
# Switch between `load_sample` (called with size=100_000) and the full `load`.
use_sample = False
loader = partial(load_sample, size=100_000) if use_sample else load
# The train subset unpacks into three objects, the test subset into exactly one.
trn_data, trn_targ, trn_spec = loader(Subset.Train)
[tst_data] = loader(Subset.Test)

(11341042, 11) (17690, 7) (386, 3) (1156414, 11) 

In [4]:
tmpdir = '/tmp/bowl2019'

In [5]:
def add_feature_combinations(data, pairs):
    """Add string interaction columns for each pair of existing columns.

    For every ``(left, right)`` in ``pairs`` a new column ``'{left}_{right}'`` is
    written into ``data`` holding the two source values cast to ``str`` and
    joined with an underscore.

    Args:
        data: DataFrame that is modified in place.
        pairs: iterable of 2-tuples with the names of the columns to combine.

    Returns:
        The same ``data`` object, to allow chaining.

    Raises:
        AssertionError: if any referenced column is missing from ``data``.
    """
    for left, right in pairs:
        for name in (left, right):
            assert name in data.columns, f'Column not found: {name}'
        left_text = data[left].astype(str)
        right_text = data[right].astype(str)
        data[f'{left}_{right}'] = left_text.str.cat(right_text, sep='_')
    return data

In [6]:
def add_datetime(data, column, prefix=None, with_time=True):
    """Expand a datetime column into calendar-part columns, in place.

    ``data[column]`` is converted with ``pd.to_datetime`` and one column
    ``'{prefix}_{Part}'`` is added per part: Year, Month, Week, Day, Dayofweek
    and, when ``with_time`` is true, Hour and Minute.

    Args:
        data: DataFrame that is modified in place.
        column: name of the column holding the timestamps.
        prefix: prefix of the generated columns; resolved through ``U.default``
            against ``column`` with a trailing 'Date'/'date' removed
            (presumably ``prefix`` wins unless it is None — confirm in utils).
        with_time: whether to also emit the Hour and Minute parts.

    Returns:
        The same ``data`` object, to allow chaining.
    """
    data[column] = pd.to_datetime(data[column])
    prefix = U.default(prefix, re.sub('[Dd]ate$', '', column))
    attrs = ('Year', 'Month', 'Week', 'Day', 'Dayofweek')
    if with_time:
        attrs += ('Hour', 'Minute')
    accessor = data[column].dt
    for attr in attrs:
        name = attr.lower()
        if name == 'week' and hasattr(accessor, 'isocalendar'):
            # `Series.dt.week` is deprecated since pandas 1.1 and removed in 2.0;
            # `isocalendar().week` yields the same ISO week numbers.
            values = accessor.isocalendar().week
            # Convert the nullable UInt32 result to the plain numpy dtypes the
            # old accessor produced (float with NaN when NaT is present).
            values = values.astype('float64' if values.isna().any() else 'int64')
        else:
            values = getattr(accessor, name)
        data[f'{prefix}_{attr}'] = values
    return data

In [7]:
def add_cyclical(data, prefix, features=('Year', 'Month', 'Week', 'Hour', 'Minute'),
                 modulo=None):
    """Add sine/cosine encodings of calendar-part columns, in place.

    For each ``feature`` the column ``'{prefix}_{feature}'`` is mapped onto a
    circle and stored as ``'{prefix}_{feature}_sin'`` / ``'..._cos'``.

    Args:
        data: DataFrame that is modified in place.
        prefix: prefix shared by the source columns.
        features: names of the calendar parts to encode.
        modulo: optional mapping ``feature -> period``; entries given here
            override the built-in periods.

    Returns:
        The same ``data`` object, to allow chaining.
    """
    # Natural period per calendar part. Using one period for everything (the old
    # 23.0 fallback) makes distinct values collide, e.g. hour 0 == hour 23 and
    # minute 0 == minute 23 == minute 46.
    default_periods = {
        'Month': 12.0,
        'Week': 53.0,
        'Day': 31.0,
        'Dayofweek': 7.0,
        'Hour': 24.0,
        'Minute': 60.0,
    }
    periods = {**default_periods, **(modulo or {})}
    for feature in features:
        column = f'{prefix}_{feature}'
        # Parts without a natural cycle (e.g. Year) keep the legacy 23.0 period.
        period = periods.get(feature, 23.0)
        angle = 2*np.pi*data[column] / period
        data[f'{column}_sin'] = np.sin(angle)
        data[f'{column}_cos'] = np.cos(angle)
    return data

In [8]:
# Bundle the three feature-engineering steps into one callable.
# NOTE(review): U.combine presumably applies the steps left to right, passing each
# step's returned frame to the next — add_cyclical reads the ts_* columns that
# add_datetime creates, so the order matters; confirm against utils.
transform = U.combine(
    partial(add_feature_combinations, pairs=[('title', 'event_code')]),
    partial(add_datetime, column='timestamp', prefix='ts'),
    partial(add_cyclical, prefix='ts')
)

In [9]:
# The transform steps assign columns in place, so they are run on copies to keep
# the frames loaded above untouched.
X_trn = transform(trn_data.copy())
X_tst = transform(tst_data.copy())

In [10]:
def compute_meta_data(dataset, *datasets):
    """Collect vocabulary-style meta data shared by one or more datasets.

    Gathers, via ``U.unique``, the values of several categorical columns across
    all given frames, the titles of rows whose ``type`` is "Assessment", and a
    ``win_codes`` mapping from every title to the event code that marks an
    attempt (4100, except 4110 for 'Bird Measurer (Assessment)').

    Args:
        dataset: first DataFrame.
        *datasets: any further DataFrames to include.

    Returns:
        The object built by ``U.named_tuple('Meta', ...)`` with the fields
        ``win_codes`` followed by the collected columns.
    """
    frames = [dataset, *datasets]

    uniq = OrderedDict()
    for name in ('title_event_code', 'title', 'event_code',
                 'event_id', 'world', 'type'):
        uniq[name] = U.unique(frames, column=name)

    assessments = [frame.query('type == "Assessment"') for frame in frames]
    uniq['assessment_titles'] = U.unique(assessments, column='title')

    # Every title wins with 4100 unless listed otherwise.
    win_codes = dict.fromkeys(uniq['title'], 4100)
    win_codes['Bird Measurer (Assessment)'] = 4110

    return U.named_tuple('Meta', win_codes=win_codes, **uniq)

In [11]:
meta = compute_meta_data(X_trn, X_tst)

In [12]:
def attempt_outcomes(session, meta):
    """Count successful and failed attempts within one game session.

    The session's title (taken from its first row) selects the attempt event
    code from ``meta.win_codes`` (4100 when the title is unknown). Among the
    rows with that event code, ``event_data`` strings containing 'true' count
    as positive and those containing 'false' as negative.

    Args:
        session: DataFrame with the rows of a single game session.
        meta: object exposing a ``win_codes`` mapping.

    Returns:
        The object built by ``U.named_tuple('Trial', ...)`` with the fields
        ``pos``, ``neg`` and ``total``.
    """
    title = session['title'].iloc[0]
    win_code = meta.win_codes.get(title, 4100)
    attempts = session.query(f'event_code == {win_code}')
    payloads = attempts['event_data']
    n_correct = payloads.str.contains('true').sum()
    n_incorrect = payloads.str.contains('false').sum()
    return U.named_tuple(
        'Trial', pos=n_correct, neg=n_incorrect, total=(n_correct + n_incorrect))

In [13]:
def session_info(session, meta, test):
    """Summarise one game session.

    Args:
        session: DataFrame with the rows of a single game session; the first
            row supplies the identifiers, title and type.
        meta: meta data object forwarded to ``attempt_outcomes``.
        test: whether the session belongs to the test set.

    Returns:
        The object built by ``U.named_tuple('Info', ...)`` with identifiers, the
        session type, the attempt outcomes (None for non-assessments), the
        elapsed time between first and last row in seconds, and
        ``should_include``: true for assessments that are either in the test
        set or have more than one row and at least one counted attempt.
    """
    session_type = session['type'].iloc[0]
    assessment = session_type == 'Assessment'
    outcomes = attempt_outcomes(session, meta) if assessment else None
    should_include = (
        (assessment and test) or
        (assessment and (len(session) > 1) and outcomes.total > 0))
    duration = session.timestamp.iloc[-1] - session.timestamp.iloc[0]
    # `Timedelta.seconds` is only the seconds component (< 1 day) and silently
    # drops whole days; `total_seconds()` is the full elapsed time.
    elapsed = duration.total_seconds()
    # Keep the integer type of the previous field; leave NaN (from NaT) as is.
    duration_seconds = elapsed if pd.isna(elapsed) else int(elapsed)
    return U.named_tuple(
        name='Info',
        installation_id=session['installation_id'].iloc[0],
        game_session=session['game_session'].iloc[0],
        session_title=session['title'].iloc[0],
        session_type=session_type,
        is_assessment=assessment,
        should_include=should_include,
        outcomes=outcomes,
        duration_seconds=duration_seconds)

In [14]:
# test_user = X_trn.query('installation_id == "0235fe9a"')

In [15]:
def id_features(user, meta, test=False):
    """Emit the identifying columns for every included session of one user.

    Args:
        user: DataFrame with all rows of a single installation.
        meta: meta data object forwarded to ``session_info``.
        test: when true, only the last included session is returned.

    Returns:
        A list of OrderedDicts with ``installation_id``, ``game_session`` and
        ``session_title``; a one-element list in test mode.
    """
    sessions = user.groupby('game_session', sort=False)
    infos = (session_info(session, meta, test) for _, session in sessions)
    rows = [
        OrderedDict([
            ('installation_id', info.installation_id),
            ('game_session', info.game_session),
            ('session_title', info.session_title),
        ])
        for info in infos
        if info.should_include
    ]
    return [rows[-1]] if test else rows

In [16]:
def counting_features(user, meta, test=False):
    """Build running-count features for every included session of one user.

    Sessions are visited in their order of appearance. For an included session
    the counters accumulated from the *preceding* sessions are snapshotted as
    ``cnt_<key>`` features; only afterwards are the counters updated with the
    current session.

    Args:
        user: DataFrame with all rows of a single installation.
        meta: meta data object providing the keys of each counter.
        test: when true, only the last included session is returned.

    Returns:
        A list of OrderedDicts, one per included session; a one-element list in
        test mode.
    """
    by_title_event_code = U.init_dict(meta.title_event_code)
    by_title = U.init_dict(meta.title)
    by_event_code = U.init_dict(meta.event_code)
    by_event_id = U.init_dict(meta.event_id)
    by_activity = U.init_dict(meta.type)

    # Counters that are fed directly from a column of the session rows.
    column_counters = (
        ('title_event_code', by_title_event_code),
        ('title', by_title),
        ('event_code', by_event_code),
        ('event_id', by_event_id),
    )
    # Order in which the counters are merged into a feature row.
    snapshot_order = (by_activity, by_title_event_code, by_title,
                      by_event_code, by_event_id)

    previous_type = None
    rows = []
    for _, session in user.groupby('game_session', sort=False):
        info = session_info(session, meta, test)

        if info.should_include:
            merged = {}
            for counter in snapshot_order:
                merged.update(counter)
            rows.append(OrderedDict(
                (f'cnt_{key}', value) for key, value in merged.items()))

        for column, counter in column_counters:
            for value, n_seen in Counter(session[column]).items():
                # Values not known to the meta data are ignored.
                if value in counter:
                    counter[value] += n_seen

        # Activity types are counted once per run of consecutive sessions.
        if previous_type is None or previous_type != info.session_type:
            by_activity[info.session_type] += 1
            previous_type = info.session_type

    return [rows[-1]] if test else rows

In [17]:
# pd.DataFrame(counting_features(test_user, meta))

In [18]:
def performance_features(user, meta, test=False):
    """Build history-of-performance features for every included session of one user.

    Sessions are visited in their order of appearance. Each feature is written
    from the running state *before* that state is updated with the current
    session, so a row only describes earlier included sessions — except
    ``accuracy_group``, which is derived from the current session's own outcomes.

    Args:
        user: DataFrame with all rows of a single installation.
        meta: meta data object; ``meta.title`` provides the per-title keys.
        test: when true, only the last included session is returned.

    Returns:
        A list of OrderedDicts, one per included session; a one-element list in
        test mode.
    """
    acc_accuracy = 0
    acc_accuracy_group = 0
    acc_correct_attempts = 0
    acc_incorrect_attempts = 0
    acc_actions = 0
    
    durations = []
    # NOTE(review): accuracy_groups is updated below but never written to the
    # feature rows.
    accuracy_groups = U.init_dict([0, 1, 2, 3])
    # Last accuracy seen per title; -1 is passed as the initial value.
    last_accuracy_title = U.init_dict([f'acc_{t}' for t in meta.title], -1)
    
    # Number of rows emitted so far; denominator of the running averages.
    n_rows = 0
    
    rows = []
    for _, session in user.groupby('game_session', sort=False):
        info = session_info(session, meta, test)
        
        if info.should_include:
            features = OrderedDict()
            # Attempt totals of the earlier included sessions.
            features['acc_attempts_pos'] = acc_correct_attempts
            features['acc_attempts_neg'] = acc_incorrect_attempts
            acc_correct_attempts += info.outcomes.pos
            acc_incorrect_attempts += info.outcomes.neg
            
            # Running average of accuracy. NOTE(review): U.savediv presumably
            # guards the division by zero on the first row — confirm in utils.
            features['acc_accuracy'] = U.savediv(acc_accuracy, n_rows)
            accuracy = U.savediv(info.outcomes.pos, info.outcomes.total)
            acc_accuracy += accuracy
            
            # Snapshot the per-title accuracies first, then record this session's.
            features.update(last_accuracy_title)
            last_accuracy_title[f'acc_{info.session_title}'] = accuracy
            
            # Derived from the current session's accuracy, unlike the other fields.
            features['accuracy_group'] = to_accuracy_group(accuracy)
            accuracy_groups[features['accuracy_group']] += 1
            
            features['acc_accuracy_group'] = U.savediv(acc_accuracy_group, n_rows)
            acc_accuracy_group += features['accuracy_group']

            # Number of rows in all sessions (of any type) seen before this one.
            features['acc_actions'] = acc_actions
            
            # Mean duration of the earlier included sessions, 0 when there are none.
            features['duration_mean'] = np.mean(durations) if durations else 0
            durations.append(info.duration_seconds)

            rows.append(features)
            n_rows += 1
        
        # Counted for every session, included or not.
        acc_actions += len(session)
    
    return [rows[-1]] if test else rows

In [19]:
# pd.DataFrame(performance_features(test_user, meta))

In [20]:
def timestamp_features(user, meta, test=False):
    """Build mean/std features of the cyclical timestamp encodings.

    For every included session the ``ts_<part>_<sin|cos>`` values of its rows
    are appended to a per-column history (which therefore spans this and all
    earlier included sessions), and the mean and standard deviation of that
    history are emitted.

    Args:
        user: DataFrame with all rows of a single installation.
        meta: meta data object forwarded to ``session_info``.
        test: when true, only the last included session is returned.

    Returns:
        A list of OrderedDicts, one per included session; a one-element list in
        test mode.
    """
    keys = [
        f'ts_{part}_{angle}'
        for part in ('Year', 'Month', 'Week', 'Hour', 'Minute')
        for angle in ('sin', 'cos')
    ]
    history = defaultdict(list)
    rows = []

    for _, session in user.groupby('game_session', sort=False):
        if not session_info(session, meta, test).should_include:
            continue

        features = OrderedDict()
        for key in keys:
            history[key].extend(session[key].tolist())
            values = history[key]
            features[f'{key}_mean'] = np.mean(values)
            features[f'{key}_std'] = np.std(values)
        rows.append(features)

    return [rows[-1]] if test else rows

In [21]:
# pd.DataFrame(timestamp_features(test_user, meta))

In [22]:
def save_groups_on_disk(data, output_dir='/tmp/bowl2019/groups'):
    """Write one feather file per installation.

    Args:
        data: DataFrame with an ``installation_id`` column.
        output_dir: directory to write into; created when missing.

    Returns:
        The list of written file paths, one per installation, each named after
        its installation id.
    """
    os.makedirs(output_dir, exist_ok=True)
    written = []
    for installation_id, rows in tqdm(data.groupby('installation_id')):
        target = os.path.join(output_dir, installation_id)
        # Feather needs a default index, hence the reset.
        rows.reset_index(drop=True).to_feather(target)
        written.append(target)
    return written

In [23]:
trn_files = save_groups_on_disk(X_trn, output_dir=f'{tmpdir}/trn')

HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




In [24]:
tst_files = save_groups_on_disk(X_tst, output_dir=f'{tmpdir}/tst')

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [25]:
# class FeatureExtractor:
#     def __new__(cls, steps, meta, backend='disk', **params):
#         if backend == 'disk':
#             ext = DiskExtractor
#         elif backend == 'memory':
#             ext = MemoryExtractor
#         else:
#             raise ValueError(f'unknown backend: {backend}')
#         return object.__new__(ext)
    
#     def __init__(self, steps, meta, pbar=True,
#                  test=False, num_workers=cpu_count(),
#                  **params):
#         self.steps = steps
#         self.meta = meta
#         self.pbar = pbar
#         self.test = test
#         self.num_workers = num_workers
    
#     def __call__(self, data):
#         self.extract(data)
        
#     def extract(self, data):
#         raise NotImplementedError()
    
    
# class DiskExtractor(FeatureExtractor):
#     def __init__(self, output_dir='/tmp/bowl2019', files_per_batch=16, **params):
#         super().__init__(**params)
#         self.output_dir = output_dir
#         self.files_per_batch = files_per_batch
        
#     def extract(self, filenames):
#         os.makedirs(self.output_dir, exist_ok=True)
        
#         def _extract(filename):
#             df = feather.read_dataframe(filename)
#             features = [
#                 pd.DataFrame(f(df, self.meta, self.test)) 
#                 for f in self.steps]
#             return pd.concat(features, axis=1)
        
#         def _save(pair):
#             dataframe, filename = pair
#             name = os.path.basename(filename)
#             output_file = os.path.join(self.output_dir, name)
#             dataframe = dataframe.reset_index(drop=True)
#             dataframe.to_pickle(output_file)
#             return output_file
        
#         chunks = list(U.chunks(filenames, self.files_per_batch))
#         if self.pbar:
#             chunks = tqdm(chunks)
        
#         output_files = []
#         n = self.num_workers
#         for chunk in chunks:
#             breakpoint()
#             datasets = U.parallel(_extract, chunk, num_workers=n)
#             saved_files = U.parallel(_save, zip(datasets, chunk), num_workers=n)
#             output_files.extend(saved_files)
        
#         return output_files

In [26]:
# extractor = FeatureExtractor(
#     backend='disk',
#     meta=meta,
#     steps=[
#         id_features,
#         counting_features, 
#         performance_features, 
#         timestamp_features
#     ]
# )

In [27]:
# extractor(trn_files)

In [28]:
def process_from_disk(filenames, meta, funcs, files_per_batch=16, 
                      num_workers=cpu_count(), pbar=True, test=False,
                      output_dir='/tmp/bowl2019/prepared'):
    """Run the feature functions over per-user feather files, batch by batch.

    Each file is read, every function in ``funcs`` is called as
    ``f(frame, meta, test)``, the results are turned into DataFrames and
    concatenated column-wise, and the outcome is pickled into ``output_dir``
    under the input file's base name.

    Args:
        filenames: paths of the per-user feather files.
        meta: meta data object passed to every feature function.
        funcs: feature functions to apply.
        files_per_batch: number of files handed to ``U.parallel`` at a time.
        num_workers: worker count forwarded to ``U.parallel``.
        pbar: whether to show a progress bar over the batches.
        test: flag forwarded to every feature function.
        output_dir: directory to write into; created when missing.

    Returns:
        The list of written pickle paths.
    """
    os.makedirs(output_dir, exist_ok=True)

    def extract(filename):
        user = feather.read_dataframe(filename)
        blocks = [pd.DataFrame(func(user, meta, test)) for func in funcs]
        return pd.concat(blocks, axis=1)

    def save(pair):
        frame, source = pair
        target = os.path.join(output_dir, os.path.basename(source))
        frame.reset_index(drop=True).to_pickle(target)
        return target

    batches = list(U.chunks(filenames, files_per_batch))
    if pbar:
        batches = tqdm(batches)

    output_files = []
    for batch in batches:
        # Extract the whole batch first, then persist it.
        frames = U.parallel(extract, batch, num_workers=num_workers)
        output_files += U.parallel(save, zip(frames, batch), num_workers=num_workers)

    return output_files

In [29]:
# Arguments shared by the train and test feature-extraction runs. The outputs of
# `funcs` are concatenated column-wise in this order by process_from_disk.
params = dict(
    meta=meta, 
    funcs=[
        id_features,
        counting_features, 
        performance_features, 
        timestamp_features
    ]
)

In [None]:
trn_prep_files = process_from_disk(filenames=trn_files, 
                                   output_dir=f'{tmpdir}/trn_prep', 
                                   **params)

In [None]:
tst_prep_files = process_from_disk(filenames=tst_files, 
                                   output_dir=f'{tmpdir}/tst_prep',
                                   test=True, 
                                   **params)

In [31]:
def post_processing(filenames, meta, pbar=True, output_dir='/tmp/bowl2019/post'):
    """Add installation-level aggregate columns to prepared per-user pickles.

    Each non-empty frame receives the columns ``installation_session_count``,
    ``installation_duration_mean``, ``installation_title_nunique``,
    ``installation_events_sum`` and ``installation_events_mean`` and is written
    to ``output_dir`` under its original base name. Empty frames are skipped.

    Args:
        filenames: paths of the pickled per-user frames.
        meta: object whose ``event_code`` lists the codes of the ``cnt_<code>``
            columns summed into the event totals.
        pbar: whether to show a progress bar.
        output_dir: directory to write into; created when missing.

    Returns:
        The list of written file paths.
    """
    os.makedirs(output_dir, exist_ok=True)

    sources = tqdm(filenames) if pbar else filenames
    event_columns = [f'cnt_{code}' for code in meta.event_code]

    processed_files = []
    for source in sources:
        frame = pd.read_pickle(source)
        if frame.empty:
            continue
        # Scalars are broadcast to every row of the installation.
        frame['installation_session_count'] = len(frame['cnt_Clip'])
        frame['installation_duration_mean'] = frame['duration_mean'].mean()
        frame['installation_title_nunique'] = frame['session_title'].nunique()
        frame['installation_events_sum'] = frame[event_columns].sum(axis=1)
        frame['installation_events_mean'] = frame['installation_events_sum'].mean()
        target = os.path.join(output_dir, os.path.basename(source))
        frame.to_pickle(target)
        processed_files.append(target)
    return processed_files

In [None]:
trn_prep_files = post_processing(trn_prep_files, meta, output_dir=f'{tmpdir}/trn_post')

In [None]:
tst_prep_files = post_processing(tst_prep_files, meta, output_dir=f'{tmpdir}/tst_post')

In [34]:
def save_meta(meta, filename): 
    """Persist the meta data as a plain dict with joblib.

    Args:
        meta: object exposing ``_asdict()``.
        filename: destination path.

    Returns:
        ``filename``, unchanged.
    """
    payload = meta._asdict()
    joblib.dump(payload, filename)
    return filename

In [35]:
def load_meta(filename):
    """Load a joblib-stored mapping and rebuild the 'Meta' object from it.

    Args:
        filename: path of the joblib file.

    Returns:
        The object built by ``U.named_tuple('Meta', ...)`` from the stored fields.
    """
    fields = joblib.load(filename)
    return U.named_tuple('Meta', **fields)

In [None]:
save_meta(meta, f'{tmpdir}/meta.joblib')

## Restore Saved

In [36]:
meta = load_meta(f'{tmpdir}/meta.joblib')

In [41]:
files_root = '/tmp/bowl2019/trn_post'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# concatenated dataset (and the order-dependent pd.factorize codes built from it)
# reproducible across runs.
trn_files = [f'{files_root}/{fn}' for fn in sorted(os.listdir(files_root))]

In [38]:
def encode(dataset, columns, encoders=None):
    """Integer-encode categorical columns in place.

    A column that has a non-empty encoder in ``encoders`` is mapped through it;
    values the encoder does not know become -1. Any other column is encoded
    with ``pd.factorize`` and the fitted mapping is stored in ``encoders``.

    Args:
        dataset: DataFrame that is modified in place.
        columns: names of the columns to encode.
        encoders: optional mapping ``column -> {value: code}`` of already
            fitted encoders.

    Returns:
        A tuple ``(dataset, encoders)``.
    """
    encoders = encoders or {}
    for column in columns:
        encoder = encoders.get(column, {})
        if encoder:
            # The -1 fallback has to apply per value. Previously it was the
            # default of `encoders.get(column, -1)`, which never triggers here,
            # so unseen values silently turned into NaN.
            dataset[column] = dataset[column].map(
                lambda value: encoder.get(value, -1))
        else:
            encoded, decoder = pd.factorize(dataset[column])
            dataset[column] = encoded
            encoders[column] = {k:i for i, k in enumerate(decoder)}
    return dataset, encoders

In [43]:
def read_dataset_from_files(filenames):
    """Read pickled DataFrames and stack them row-wise.

    Args:
        filenames: paths of the pickled frames.

    Returns:
        One DataFrame with the rows of all files in the given order and a fresh
        0..n-1 index.
    """
    frames = [pd.read_pickle(path) for path in filenames]
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [44]:
X_trn, title_enc = encode(read_dataset_from_files(trn_files), 
                          columns=['session_title'])

In [45]:
assert X_trn.isna().sum().sum() == 0

In [46]:
title_enc

{'session_title': {'Cart Balancer (Assessment)': 0,
  'Chest Sorter (Assessment)': 1,
  'Cauldron Filler (Assessment)': 2,
  'Mushroom Sorter (Assessment)': 3,
  'Bird Measurer (Assessment)': 4}}

In [48]:
def get_relevant_features(dataset):
    """List the feature columns that carry at least one non-zero value.

    The columns ``accuracy_group``, ``installation_id`` and ``game_session``
    are excluded up front; the remaining columns must be numeric.

    Args:
        dataset: DataFrame with the prepared features.

    Returns:
        The names of the columns, in their original order, that are not
        (numerically) zero everywhere.
    """
    columns = ['accuracy_group', 'installation_id', 'game_session']
    candidates = dataset.drop(columns=columns)
    # Test the values themselves, not the column sum: a column whose entries
    # cancel out (e.g. +1 and -1) sums to zero but is still informative.
    all_zero = np.isclose(candidates.values.astype(float), 0).all(axis=0)
    return candidates.columns[~all_zero].tolist()

In [49]:
rel_feat = get_relevant_features(X_trn)

In [50]:
joblib.dump(rel_feat, f'{tmpdir}/features.joblib')

['/tmp/bowl2019/features.joblib']

In [51]:
joblib.dump(title_enc, f'{tmpdir}/title_enc.joblib')

['/tmp/bowl2019/title_enc.joblib']

In [52]:
X_trn.to_feather(f'{tmpdir}/X_trn.feather')

## Train

In [None]:
import lightgbm as lgb
import scipy
from sklearn.model_selection import GroupKFold

In [None]:
features = joblib.load(f'{tmpdir}/features.joblib')
X_trn = feather.read_dataframe(f'{tmpdir}/X_trn.feather')

In [None]:
# Cross-validation setup: the folds are grouped by installation_id, so all rows of
# one installation land in the same fold.
k = 5
folds = GroupKFold(n_splits=k)
groups = X_trn['installation_id']
# Model inputs are restricted to the stored feature list; the target is the
# accuracy group.
X = X_trn[features].copy()
y = X_trn['accuracy_group']

In [None]:
class RegressionCappa:
    """Custom evaluation metric: QWK of regression outputs after bucketing.

    Attributes are set in ``__init__``; ``bounds`` holds the bucket edges that
    are handed to ``round_regressor_predictions``.
    """

    def __init__(self, bounds):
        self.bounds = bounds

    def lightgbm(self, y_true, y_pred):
        """Return the ``(name, value, is_higher_better)`` triple for LightGBM."""
        labels = round_regressor_predictions(y_pred, self.bounds)
        score = qwk(y_true, labels)
        return 'cappa', score, True

In [None]:
def round_regressor_predictions(preds, coefs):
    """Turn continuous predictions into bucket indices.

    Consecutive entries of ``coefs`` delimit the buckets: a prediction ``p``
    with ``coefs[i] < p <= coefs[i + 1]`` becomes ``i``. Predictions that fall
    into no bucket (e.g. NaN) are left unchanged.

    Args:
        preds: array-like supporting ``copy()`` and boolean-mask assignment
            (numpy array or pandas Series).
        coefs: ascending sequence of bucket edges.

    Returns:
        A copy of ``preds`` holding the bucket indices; the input is not
        modified.
    """
    x = preds.copy()
    for i, (lo, hi) in enumerate(zip(coefs[:-1], coefs[1:])):
        # Build the mask from the untouched predictions. Masking the array that
        # is being overwritten lets an already-assigned label fall into a later
        # bucket and be relabelled.
        x[(preds > lo) & (preds <= hi)] = i
    return x

In [None]:
reg_metric = RegressionCappa([-np.inf, 1., 2., 3., +np.inf])

In [None]:
# Grouped k-fold training pass evaluated with the `reg_metric` bounds. `oof`
# collects, for every row, the prediction of the model that did not train on it.
models = []
oof = np.zeros(X.shape[0], dtype=np.float32)
for i, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups), 1):
    print(f'Running k-fold {i} of {k}')
    x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    model = lgb.LGBMRegressor(n_estimators=1000, 
                              max_depth=15,
                              metric='rmse',
                              objective='regression',
                              learning_rate=1e-2)
    # Both the training and the validation split are evaluated, with the custom
    # metric reported in addition to the configured one.
    model.fit(x_trn, y_trn,
              eval_set=[(x_trn, y_trn), (x_val, y_val)],
              eval_names=['trn', 'val'],
              eval_metric=reg_metric.lightgbm,
              early_stopping_rounds=100,
              verbose=50,
              categorical_feature='auto')
    oof[val_idx] = model.predict(x_val)
    models.append(model)

In [None]:
def optimize_rounding_bounds(X, y):
    """Search the three bucket thresholds that maximise QWK.

    Starting from ``[0.5, 1.5, 2.5]``, Nelder-Mead minimises the negative
    ``qwk`` between ``y`` and the labels obtained by cutting ``X`` at the
    (sorted) candidate thresholds.

    Args:
        X: 1-d continuous predictions.
        y: true labels.

    Returns:
        A list of five ascending bucket edges: ``-inf``, the three optimised
        thresholds, ``+inf``.
    """
    # Imported here because a bare `import scipy` does not guarantee that the
    # `scipy.optimize` submodule has been loaded.
    from scipy.optimize import minimize

    def _edges(coef):
        return [-np.inf] + np.sort(coef).tolist() + [np.inf]

    def _loss(coef):
        buckets = pd.cut(X, _edges(coef), labels=[0, 1, 2, 3])
        return -qwk(y, buckets)

    init_coef = [0.5, 1.5, 2.5]
    opt_coef = minimize(_loss, init_coef, method='nelder-mead')
    # The loss evaluates the thresholds in sorted order, so the result is
    # returned sorted as well; otherwise consumers that pair up neighbouring
    # edges could receive empty or inverted buckets.
    return _edges(opt_coef['x'])

In [None]:
opt_bounds = optimize_rounding_bounds(oof, y)
opt_reg_metric = RegressionCappa(opt_bounds)

In [None]:
joblib.dump(opt_bounds, '/home/ck/data/bowl2019/external/bounds.joblib')

In [None]:
# Grouped k-fold training pass evaluated with the `opt_reg_metric` bounds. This
# cell resets `models` and `oof`, replacing the results of any earlier pass.
# NOTE(review): apart from `eval_metric` this loop duplicates the earlier training
# cell; a shared helper taking the metric as a parameter would remove the copy.
models = []
oof = np.zeros(X.shape[0], dtype=np.float32)
for i, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups), 1):
    print(f'Running k-fold {i} of {k}')
    x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    model = lgb.LGBMRegressor(n_estimators=1000, 
                              max_depth=15,
                              metric='rmse',
                              objective='regression',
                              learning_rate=1e-2)
    model.fit(x_trn, y_trn,
              eval_set=[(x_trn, y_trn), (x_val, y_val)],
              eval_names=['trn', 'val'],
              eval_metric=opt_reg_metric.lightgbm,
              early_stopping_rounds=100,
              verbose=50,
              categorical_feature='auto')
    # Out-of-fold predictions: each row is predicted by the model that did not
    # see it during training.
    oof[val_idx] = model.predict(x_val)
    models.append(model)

In [None]:
joblib.dump(models, f'{tmpdir}/models_lightgbm_004.joblib')

In [None]:
import utils as U
from basedir import TRAIN, TEST
from dataset import load, load_sample, Subset, to_accuracy_group
from metric import qwk

In [None]:
export_dir = '/home/ck/data/bowl2019/external'
!cp /tmp/bowl2019/features.joblib {export_dir}
!cp /tmp/bowl2019/models.joblib {export_dir}
!cp /tmp/bowl2019/meta.joblib {export_dir}
!cp /tmp/bowl2019/train_enc.joblib {export_dir}
!cp ../basedir.py {export_dir}
!cp ../dataset.py {export_dir}
!cp ../metric.py {export_dir}
!cp ../utils.py {export_dir}

In [None]:
title_enc