In [None]:
# !cp /kaggle/input/data-bowl-2019-external-data/*.py /kaggle/working

In [1]:
%reload_ext autoreload
%autoreload 2w
%matplotlib inline
import warnings
import jupytools.syspath
def ignore(*args, **kwargs): pass
warnings.warn = ignore
jupytools.syspath.add('..')

In [2]:
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

import selection
import utils as U
from metric import optimize_rounding_bounds, make_cappa_metric
from training import EnsembleTrainer, get_default_config
from normalize import normalize

In [3]:
import joblib

In [3]:
# Columns handed to the feature selector as `ignore_cols` (two identifiers and
# the competition target), i.e. excluded from feature selection.
non_train_cols = [
    'installation_id',
    'game_session',
    'accuracy_group',
]

def read_dataset():
    """Load the pickled train/test frames and stack them into one dataset.

    Feature selection (the single 'nonzero' rule) runs on the train frame
    only, with ``non_train_cols`` passed as ``ignore_cols``. Afterwards each
    frame gets an ``is_test`` flag (0 for train, 1 for test) and the two
    frames are concatenated, train rows first.

    Returns:
        tuple: ``(dataset, features)`` where ``dataset`` is the concatenated
        frame and ``features`` is whatever ``FeatureSelection.select`` returned.
    """
    train_frame = pd.read_pickle('/tmp/X_trn.pickle')
    test_frame = pd.read_pickle('/tmp/X_tst.pickle')
    nonzero_rule = ('nonzero', selection.non_zero_rows_and_cols)
    feature_selector = selection.FeatureSelection(
        rules=[nonzero_rule], ignore_cols=non_train_cols)
    features = feature_selector.select(train_frame)
    # The flag is assigned only after selection has run on the train frame.
    for frame, flag in ((train_frame, 0), (test_frame, 1)):
        frame['is_test'] = flag
    # NOTE(review): concat is called without ignore_index, so both frames keep
    # their own index labels; if the pickles share labels (e.g. two default
    # RangeIndexes) the result has duplicate labels — prefer .iloc downstream.
    return pd.concat([train_frame, test_frame]), features

## Adversarial CV

In [None]:
# https://www.kaggle.com/poteman/sampling-train-data-and-use-prediction-as-feature

In [4]:
def accuracy(y_true, y_pred, t=0.5):
    """Balanced accuracy of scores binarised at threshold ``t``.

    Args:
        y_true: ground-truth labels, passed through unchanged.
        y_pred: iterable of scores; a score strictly below ``t`` becomes
            class 0, anything else becomes class 1.
        t: decision threshold.

    Returns:
        The value of ``sklearn.metrics.balanced_accuracy_score``.
    """
    from sklearn.metrics import balanced_accuracy_score
    hard_labels = []
    for score in y_pred:
        hard_labels.append(1 if not score < t else 0)
    return balanced_accuracy_score(y_true, hard_labels)

In [5]:
def adv_cv(dataset, features, algo='lightgbm', target='is_test', config=None):
    """Cross-validate a model that predicts ``target`` (adversarial validation).

    Args:
        dataset: frame holding the feature columns and the ``target`` column.
        features: feature names forwarded to the trainer.
        algo: algorithm name given to ``EnsembleTrainer`` and
            ``get_default_config``.
        target: name of the column to predict.
        config: optional config; when falsy the default config for ``algo`` is
            fetched. The config object is modified in place.

    Returns:
        Whatever ``EnsembleTrainer.train`` returns.
    """
    cv_metrics = {'auc': roc_auc_score, 'acc': accuracy}
    trainer = EnsembleTrainer(
        algo=algo, eval_metric='rmse', cv_metrics=cv_metrics)
    splitter = GroupKFold(n_splits=5)
    config = config or get_default_config(algo)
    overrides = (
        ('model_params.feature_fraction', 0.8),
        ('model_params.bagging_fraction', 0.75),
        ('model_params.bagging_freq', 1),
    )
    for path, value in overrides:
        U.set_nested(config, path, value)
    # NOTE(review): `config` is prepared above but is not passed to
    # trainer.train below — confirm whether the trainer is meant to receive it.
    return trainer.train(dataset, features, splitter, target=target)

In [6]:
def plot_feat_imp(fi, n=200, figsize=(45, 15)):
    """Draw a bar chart of the ``n`` largest values in ``fi``.

    Args:
        fi: object supporting ``sort_values``/``head``/``plot.bar`` —
            presumably a pandas Series of feature importances; confirm.
        n: how many of the top entries to plot.
        figsize: matplotlib figure size.
    """
    _, axis = plt.subplots(figsize=figsize, facecolor='white')
    top_entries = fi.sort_values(ascending=False).head(n)
    top_entries.plot.bar(ax=axis)
    axis.set_title('Most Important Features Different Between Train and Test ')

## Train As Is

In [None]:
# Baseline run: adversarial CV on the dataset exactly as read_dataset returns it.
dataset, features = read_dataset()
result_default = adv_cv(dataset, features)

In [None]:
# Bar chart of the largest entries of result_default.fi.
plot_feat_imp(result_default.fi)

In [12]:
import numpy as np
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import joblib

In [19]:
dataset, features = read_dataset()
# Shuffle all rows (a full-length sample, unseeded) and renumber the index.
dataset = dataset.sample(dataset.shape[0]).reset_index(drop=True)

Excluding from consideration: ['installation_id', 'game_session', 'accuracy_group']
Applying feature selection rule: nonzero
Selected features: 1109 of 1145
Keeping only features, selected by every rule.
Final number of features changed from 1145 to 1109


In [None]:
# Adversarial CV with logistic regression: how well can a linear model tell
# train rows (is_test == 0) from test rows (is_test == 1)?
dataset, features = read_dataset()
# Shuffle rows by POSITION. The previous `dataset[index]` indexed columns (not
# rows) with the index labels and raised a KeyError; label-based selection
# would also be unreliable if the concatenated frame has repeated index labels.
index = np.random.permutation(len(dataset))
dataset = dataset.iloc[index]
# Group by installation_id so one installation never spans train and validation.
fold = GroupKFold(n_splits=5)
X = dataset[features]
y = dataset['is_test']
groups = dataset['installation_id']
# Out-of-fold probabilities, aligned positionally with X / y.
oof = np.zeros(X.shape[0], dtype=np.float32)
# Per-fold ROC AUC scores.
cv = []

for i, (trn_idx, val_idx) in enumerate(fold.split(X, y, groups), 1):
    U.log(f'Running fold #{i}')
    x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    logreg = LogisticRegression(n_jobs=12)
    logreg.fit(x_trn, y_trn)
    # Column 1 of predict_proba is the probability of the positive class.
    probs = logreg.predict_proba(x_val)[:, 1]
    oof[val_idx] = probs
    cv.append(roc_auc_score(y_val, probs))

In [25]:
# Feature matrix and adversarial target (the is_test flag set in read_dataset).
X = dataset[features]
y = dataset['is_test']

In [None]:
import joblib
# Recursive feature elimination with cross-validation on the adversarial
# target, using an XGBoost random-forest classifier as the ranking model.
X = dataset[features]
y = dataset['is_test']
# gpu_id=1 pins training to the second GPU device.
estimator = xgb.XGBRFClassifier(
    gpu_id=1, tree_method='gpu_hist', max_depth=6, 
    learning_rate=1, n_estimators=250)
# NOTE(review): cv=5 does not group by installation_id, unlike the GroupKFold
# loops elsewhere in this notebook — confirm this is intended.
selector = RFECV(estimator, cv=5, verbose=1)
selector.fit(X, y)
# Persist the fitted selector so later cells can reload it instead of refitting.
joblib.dump(selector, 'selector.joblib')

In [28]:
# Reload the fitted RFECV selector from the working directory.
selector = joblib.load('selector.joblib')

In [37]:
# Sanity check: the selector must hold exactly one ranking entry per feature.
assert len(selector.ranking_) == len(features)

In [57]:
from collections import defaultdict

In [58]:
# For several ranking cut-offs, measure how well a model restricted to the
# features ranked at or beyond the cut-off can still separate train from test.
dataset, features = read_dataset()
# Shuffle rows by position: a label-based `.loc[index]` multiplies rows when
# the concatenated frame carries repeated index labels.
index = np.random.permutation(len(dataset))
dataset = dataset.iloc[index]
fold = GroupKFold(n_splits=5)

X = dataset[features]
y = dataset['is_test']
groups = dataset['installation_id']

# One row per feature with its RFECV rank, sorted from lowest rank upwards.
ranks = pd.DataFrame({'feature': features, 'ranking': selector.ranking_})
ranks = ranks.sort_values(by=['ranking'])
threshold = (2, 5, 10, 20, 50, 100, 250, 500, 800)
# Maps each threshold to its list of per-fold ROC AUC scores.
cv = defaultdict(list)

for t in threshold:
    U.log(f'Predicting with ranks above : {t}')
    t_feat = ranks[ranks.ranking >= t].feature.tolist()
    oof = np.zeros(X.shape[0], dtype=np.float32)

    for i, (trn_idx, val_idx) in enumerate(fold.split(X, y, groups), 1):
        U.log(f'.. running fold #{i}')
        x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        estimator = xgb.XGBRFClassifier(
            gpu_id=1, tree_method='gpu_hist', max_depth=6,
            learning_rate=1, n_estimators=250)
        # Fit on the training fold only. Fitting on the full X / y (as before)
        # let the model see the validation rows and inflated every fold score.
        estimator.fit(x_trn[t_feat], y_trn)
        oof[val_idx] = estimator.predict_proba(x_val[t_feat])[:, 1]
        cv[t].append(roc_auc_score(y_val, oof[val_idx]))

Excluding from consideration: ['installation_id', 'game_session', 'accuracy_group']
Applying feature selection rule: nonzero
Selected features: 1109 of 1145
Keeping only features, selected by every rule.
Final number of features changed from 1145 to 1109
Predicting with ranks above : 2
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold #4
.. running fold #5
Predicting with ranks above : 5
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold #4
.. running fold #5
Predicting with ranks above : 10
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold #4
.. running fold #5
Predicting with ranks above : 20
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold #4
.. running fold #5
Predicting with ranks above : 50
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold #4
.. running fold #5
Predicting with ranks above : 100
.. running fold #1
.. running fold #2
.. running fold #3
.. running fold 

In [61]:
import bundle

In [62]:
# Save the names of the features whose rank is at least `t`.
# NOTE(review): `t` is whatever the threshold loop variable held when that cell
# last ran (or was interrupted), so the saved list depends on kernel state —
# confirm the intended cut-off and consider making it an explicit constant.
bundle.save(ranks[ranks.ranking >= t].feature.tolist(), 'features')

'/tmp/bowl2019/features.joblib'

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Template XGBoost random-forest classifier; gpu_id=1 pins it to the second GPU.
estimator = xgb.XGBRFClassifier(gpu_id=1, tree_method='gpu_hist',
                                max_depth=6, learning_rate=1,
                                n_estimators=1000)

# NOTE(review): default StratifiedKFold settings, and no grouping by
# installation_id (unlike the GroupKFold loops elsewhere) — confirm intended.
folds = StratifiedKFold()
# Keep only the columns the RFECV selector did NOT mark as supported.
X_sim = X[X.columns[~selector.support_]]
# Per-fold ROC AUC scores.
cv = []

for trn_idx, val_idx in folds.split(X_sim, y):
    x_trn, y_trn = X_sim.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X_sim.iloc[val_idx], y.iloc[val_idx]
    # clone() gives an unfitted copy so folds do not share fitted state.
    trees = clone(estimator)
    trees.fit(x_trn, y_trn)
    probs = trees.predict_proba(x_val)[:, 1]
    cv.append(roc_auc_score(y_val, probs))

In [None]:
# Persist the names of the columns NOT supported by the RFECV selector.
# NOTE(review): this is the same path the earlier bundle.save call reported,
# so that file is presumably overwritten here — confirm.
joblib.dump(X.columns[~selector.support_].tolist(), '/tmp/bowl2019/features.joblib')

## Train With Standard Scaler

In [None]:
def standard_scaler(dataset, features, stats=None):
    """Standardise ``features`` of ``dataset`` in place and return the stats used.

    Args:
        dataset: frame whose columns are overwritten with scaled values.
        features: iterable of column names to scale.
        stats: optional dict ``{feature: {'mean': m, 'std': s}}``. Features
            already present are scaled with the stored values; missing ones
            are computed from ``dataset`` and added to the dict. A dict passed
            in by the caller (even an empty one) is updated in place.

    Returns:
        dict: the statistics used, so they can be re-applied to another frame.
    """
    # `stats or {}` would silently replace a caller-supplied empty dict with a
    # private one, so the caller could never collect the fitted statistics.
    if stats is None:
        stats = {}
    for feature in features:
        if feature not in stats:
            m = dataset[feature].mean()
            # The small epsilon avoids division by zero for constant columns.
            s = dataset[feature].std() + 1e-8
            stats[feature] = {'mean': m, 'std': s}
        dataset[feature] = (dataset[feature] - stats[feature]['mean'])/stats[feature]['std']
    return stats

In [None]:
# Adversarial CV after globally standardising a subset of the features.
dataset, features = read_dataset()
# presumably the features whose names start with 'cnt_' — confirm against utils.starts_with
cnt_features = U.starts_with(features, 'cnt_')
# Scales the selected columns of `dataset` in place.
standard_scaler(dataset, cnt_features)
result_std = adv_cv(dataset, features)

In [None]:
# Bar chart of the largest entries of result_std.fi.
plot_feat_imp(result_std.fi)

## Train With Grouped Normalization: Session Title

In [None]:
def standard_scaler_grouped(dataset, features, grouping_key='session_title'):
    """Standardise ``features`` in place, separately within each group.

    Args:
        dataset: frame whose feature columns are overwritten.
        features: iterable of column names to scale.
        grouping_key: column whose values define the groups; each group is
            centred on its own mean and divided by its own std (plus 1e-8).
    """
    def _zscore(values):
        centre = values.mean()
        spread = values.std()
        return (values - centre)/(spread + 1e-8)

    grouped = dataset.groupby(grouping_key)
    for name in features:
        per_group = grouped[name]
        dataset[name] = per_group.transform(_zscore)

In [None]:
# Adversarial CV after standardising within each group of the default
# grouping key of standard_scaler_grouped ('session_title').
dataset, features = read_dataset()
# presumably the features whose names start with 'cnt_' — confirm against utils.starts_with
cnt_features = U.starts_with(features, 'cnt_')
standard_scaler_grouped(dataset, cnt_features)
result_std_grouped = adv_cv(dataset, features)

In [None]:
# Bar chart of the largest entries of result_std_grouped.fi.
plot_feat_imp(result_std_grouped.fi)

## Train With Grouped Normalization: Installation ID

In [None]:
# Adversarial CV after standardising within each installation_id group.
dataset, features = read_dataset()
# presumably the features whose names start with 'cnt_' — confirm against utils.starts_with
cnt_features = U.starts_with(features, 'cnt_')
standard_scaler_grouped(dataset, cnt_features, grouping_key='installation_id')
result_std_inst = adv_cv(dataset, features)

In [None]:
# Bar chart of the largest entries of result_std_inst.fi.
plot_feat_imp(result_std_inst.fi)

## Adversarial CV with Normalization

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as models

In [None]:
# Report how many GPU devices TensorFlow can see (the model below targets '/GPU:1').
print("Num GPUs Available:", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: sqrt of the mean squared difference of the inputs."""
    backend = tf.keras.backend
    squared_error = backend.square(y_pred - y_true)
    return backend.sqrt(backend.mean(squared_error))

In [None]:
# Column roles for the neural-network adversarial model.
dataset, features = read_dataset()
target_col = 'is_test'
# The single column treated as categorical here.
cat_cols = ['session_title']
# Every other selected feature is treated as numerical.
num_cols = [f for f in features if f not in cat_cols]

In [None]:
# presumably normalises num_cols within each session_title group, in place
# (the return value is not used) — confirm against normalize.normalize
normalize(dataset, num_cols, grouping_key='session_title')

In [None]:
def build_model(num_cols, cat_cols, cat_sizes, 
                output_size, output_act, loss, device='/GPU:1'):
    """Build and compile a two-branch Keras model (numerical + embeddings).

    Args:
        num_cols: names of the numerical columns; their count sets the width
            of the numerical input.
        cat_cols: names of the categorical columns, used by the returned
            input-splitting helper.
        cat_sizes: one ``input_dim`` per categorical input (vocabulary size of
            each embedding), in the same order as ``cat_cols``.
        output_size: number of units in the output layer.
        output_act: activation of the output layer.
        loss: loss passed to ``model.compile`` (a Keras loss name or callable,
            e.g. ``root_mean_squared_error``). Previously this argument was
            ignored and RMSE was always used.
        device: TensorFlow device string the model is built under; the default
            preserves the previous hard-coded '/GPU:1'.

    Returns:
        tuple: ``(model, prepare_input)`` where ``prepare_input(frame)`` turns
        a frame into the list of inputs the model expects.
    """

    def prepare_input(data):
        # One block with all numerical columns, then one array per categorical column.
        return [data[num_cols]] + [data[col].T for col in cat_cols]

    def numerical(input_size):
        # Dense -> BatchNorm -> Dropout branch over the numerical features.
        i = L.Input(shape=(input_size,))
        x = L.Dense(2048, activation='relu', use_bias=False)(i)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.5)(x)
        m = models.Model(inputs=i, outputs=x)
        return m

    def categorical(cat_sizes):
        # One scalar input and one embedding per categorical feature.
        inputs, embeds = [], []
        for cat_size in cat_sizes:
            # Half the cardinality, capped at 50; at least 1 so that tiny
            # vocabularies do not produce a zero-width embedding.
            emb_sz = max(1, min(50, cat_size // 2))
            i = L.Input(shape=(1,))
            x = L.Embedding(output_dim=emb_sz, input_dim=cat_size)(i)
            inputs.append(i)
            embeds.append(x)
        # With a single embedding, `x` already holds it and no concat is needed.
        if len(embeds) > 1:
            x = L.concatenate(embeds)
        x = L.Flatten()(x)
        m = models.Model(inputs=inputs, outputs=x)
        return m

    with tf.device(device):
        num = numerical(len(num_cols))
        cat = categorical(cat_sizes)
        x = L.concatenate(num.outputs + cat.outputs)
        # Three shrinking Dense -> BatchNorm -> Dropout stages.
        for units in (1024, 512, 256):
            x = L.Dense(units, activation='relu', use_bias=False)(x)
            x = L.BatchNormalization()(x)
            x = L.Dropout(0.25)(x)
        x = L.Dense(output_size, activation=output_act)(x)
        model = models.Model(inputs=num.inputs + cat.inputs, outputs=x)
        # Honour the caller's loss instead of always compiling with RMSE.
        model.compile(optimizer='rmsprop', loss=loss)

    return model, prepare_input

In [None]:
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Grouped 5-fold adversarial CV with the neural network; a fresh model is
# built for every fold.
folds = GroupKFold(n_splits=5)
# Shuffle all rows (full-length sample, unseeded).
dataset = dataset.sample(dataset.shape[0])
group = dataset['installation_id']
X, y = dataset[features], dataset['is_test']
# Out-of-fold predictions, aligned positionally with X / y.
oof = np.zeros(len(y), dtype=np.float32)

for i, (trn_idx, val_idx) in enumerate(folds.split(X, y, group), 1):
    print(f'Training fold #{i}')
    x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    # cat_sizes=[5]: presumably the number of distinct session_title codes — TODO confirm
    net, split_input = build_model(cat_cols=cat_cols, num_cols=num_cols, 
                                   cat_sizes=[5], loss='binary_crossentropy',
                                   output_size=1, output_act='sigmoid')
    # Early stopping uses the callback's default monitored quantity with patience 10.
    net.fit(x=split_input(x_trn), y=y_trn,
            validation_data=(split_input(x_val), y_val),
            epochs=50, batch_size=2560,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)])
    probs = net.predict(split_input(x_val))
    # Flatten the predictions to 1-D before storing them.
    oof[val_idx] = probs.ravel()

In [None]:
# Overall out-of-fold ROC AUC of the train-vs-test classifier.
roc_auc_score(y, oof)

In [None]:
from tensorflow.keras.utils import plot_model
# Diagram of the last fold's network, laid out left-to-right with tensor shapes.
plot_model(net, show_shapes=True, rankdir='LR')