## 0. Libraries and Personal Tools

In [4]:
import sys
from os.path import abspath

from multiprocessing import cpu_count
from gc import collect

In [5]:
cpu_count()

48

In [6]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Plot defaults for the whole notebook: a 10x6 inch canvas, then the "fivethirtyeight" theme.
rcParams.update({"figure.figsize": (10, 6)})
plt.style.use("fivethirtyeight")

In [7]:
from pandas import set_option
# Raise pandas' display limits so wide/long frames are not truncated when rendered.
for _option, _value in (
    ("display.max_rows", 200),
    ("display.max_columns", 100),
    ("display.max_colwidth", 200),
):
    set_option(_option, _value)

In [8]:
# add absolute path from root to sys.path to use custom modules
sys.path.insert(0, abspath('..'))

from src.models.train_model import BaseModel

## 1. Build Base Model

In [9]:
# Instantiate the project's model wrapper and load its YAML configuration.
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
# get_data() returns the list of feature columns and the target column label;
# both are used below to index `base_model.data`.
features, target = base_model.get_data()
# Presumably creates `base_model.base_pipeline`, which later cells fit and extend —
# confirm in src.models.train_model.
base_model.build_base_pipeline()

In [10]:
# base_model.base_pipeline

In [11]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(base_model.data)).isna().sum().sum()

## 2. Parameter Optimization

### 2.1. Split Data

In [12]:
from src.utils import create_kf_groups

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold

from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper, CheckpointSaver
from skopt.space import Real, Categorical, Integer

In [13]:
from sklearn.model_selection import GroupShuffleSplit

# First grouped split: 20% of the rows' games (grouped by the "game_num" index level)
# are held out as the validation set; the remaining rows keep features and target together.
gsp = GroupShuffleSplit(test_size=0.20, n_splits=2, random_state=777)
train_index, test_index = next(
    gsp.split(
        base_model.data,
        groups=base_model.data.index.get_level_values("game_num"),
    )
)

X_train = base_model.data[features + [target]].iloc[train_index]

X_valid, y_valid = (
    base_model.data[features].iloc[test_index],
    base_model.data[target].iloc[test_index],
)

In [14]:
# Second grouped split (same splitter, again grouped by "game_num"): the remaining
# training rows are divided into the cross-validation fold and the test fold.
train_index, test_index = next(
    gsp.split(X_train, groups=X_train.index.get_level_values("game_num"))
)

X_train_cv, y_train_cv = (
    X_train[features].iloc[train_index],
    X_train[target].iloc[train_index],
)

X_test, y_test = (
    X_train[features].iloc[test_index],
    X_train[target].iloc[test_index],
)

In [15]:
# del X_train

In [16]:
# Report the size of each split.
for _label, _frame in (("X_train_cv", X_train_cv), ("X_test", X_test), ("X_valid", X_valid)):
    print(f"{_label}.shape: {_frame.shape}")

X_train_cv.shape: (13587279, 54)
X_test.shape: (3381296, 54)
X_valid.shape: (4229461, 54)


In [17]:
base_model.base_pipeline.named_steps.keys()

dict_keys(['preprocessor'])

In [18]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(X_train_cv)).describe().transpose()

In [19]:
# Negative-to-positive class ratio, computed from class counts rounded to the nearest 10,000.
class_counts = y_train_cv.value_counts().round(-4)
scale_pos_w = class_counts[0] / class_counts[1]

In [20]:
base_model.base_pipeline.fit(X_train_cv, y_train_cv)

Pipeline(steps=[('preprocessor',
                 FeatureUnion(transformer_list=[('player',
                                                 ColumnTransformer(transformers=[('demolished',
                                                                                  PlayerDemolished(),
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f1d8df01f50>),
                                                                                 ('position',
                                                                                  Pipeline(steps=[('imputer',
                                                                                                   SimpleImputer(fill_value=0.0,
                                                                                                                 strategy='constant')),
                                                                         

In [21]:
# Transform the cross-validation and test folds with the pipeline fitted above.
# NOTE(review): at this point the pipeline only contains the 'preprocessor' step (the
# "ipca" step is appended in a later cell), so these matrices are presumably not
# PCA-reduced, unlike the data seen by the final pipeline — confirm this is intended.
X_train_trans = base_model.base_pipeline.transform(X_train_cv)
X_test_trans = base_model.base_pipeline.transform(X_test)

### 2.3. Define K-Group-Folds

In [22]:
# Cross-validation setup: a GroupKFold splitter plus one fold-group label per row,
# derived from each row's "game_num" by the project helper `create_kf_groups`.
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

game_num = X_train_cv.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)

In [23]:
groups.value_counts()

a    2756702
b    2730467
c    2694667
d    2678892
e    2726551
dtype: int64

In [24]:
# IncrementalPCA batch size: taken from the config unless it is "auto", in which case
# the mean fold-group size is divided into TOTAL_IPCA_BATCHES batches and rounded to
# the nearest thousand rows.
if base_model.config["model"]["ipca"]["batch_size"] != "auto":
    ipca_batch = base_model.config["model"]["ipca"]["batch_size"]
else:
    TOTAL_IPCA_BATCHES = 50
    mean_group_rows = groups.value_counts().mean()
    ipca_batch = int(round(mean_group_rows / TOTAL_IPCA_BATCHES, -3))
ipca_batch

54000

In [25]:
# Add the IncrementalPCA step (configured from the YAML config) to the pipeline.
# The cell is idempotent: re-running it replaces an existing "ipca" step instead of
# appending a second step with the same name, which would make the pipeline invalid.
ipca_step = IncrementalPCA(
    n_components=base_model.config["model"]["ipca"]["n_components"],
    batch_size=ipca_batch,
    whiten=base_model.config["model"]["ipca"]["whiten"],
)

if "ipca" in base_model.base_pipeline.named_steps:
    base_model.base_pipeline.set_params(ipca=ipca_step)
else:
    base_model.base_pipeline.steps.append(("ipca", ipca_step))


In [None]:
# (Stray, incomplete `cluster.` expression removed: it is a syntax error and this cell
# sits before the cell that creates `cluster`.)

In [26]:
import lightgbm as lgb
from distributed import Client, LocalCluster

# Start a local Dask cluster with default settings and connect a client to it;
# the client is handed to DaskLGBMClassifier further down.
cluster = LocalCluster()
client = Client(cluster)
# Display the client summary (dashboard address, workers, threads, memory).
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33979 instead
  f"Port {expected} is already in use.\n"


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:33979/status,

0,1
Dashboard: http://127.0.0.1:33979/status,Workers: 8
Total threads: 48,Total memory: 128.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36569,Workers: 8
Dashboard: http://127.0.0.1:33979/status,Total threads: 48
Started: Just now,Total memory: 128.00 GiB

0,1
Comm: tcp://127.0.0.1:42419,Total threads: 6
Dashboard: http://127.0.0.1:41819/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:33007,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-jhtep7qk,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-jhtep7qk

0,1
Comm: tcp://127.0.0.1:39285,Total threads: 6
Dashboard: http://127.0.0.1:43163/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:39195,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-7zel2sdq,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-7zel2sdq

0,1
Comm: tcp://127.0.0.1:33309,Total threads: 6
Dashboard: http://127.0.0.1:46821/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:45663,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-bcl66rnn,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-bcl66rnn

0,1
Comm: tcp://127.0.0.1:46271,Total threads: 6
Dashboard: http://127.0.0.1:38259/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:37169,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-docji_gg,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-docji_gg

0,1
Comm: tcp://127.0.0.1:34601,Total threads: 6
Dashboard: http://127.0.0.1:33367/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:37345,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-tupio19u,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-tupio19u

0,1
Comm: tcp://127.0.0.1:43021,Total threads: 6
Dashboard: http://127.0.0.1:40319/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:45935,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-xph5odv7,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-xph5odv7

0,1
Comm: tcp://127.0.0.1:33161,Total threads: 6
Dashboard: http://127.0.0.1:36509/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:41533,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-vlk04gm9,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-vlk04gm9

0,1
Comm: tcp://127.0.0.1:36113,Total threads: 6
Dashboard: http://127.0.0.1:34733/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:40209,
Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-9crvquvm,Local directory: /mnt/cephfs/hadoop-compute/phoenix/ian.castillo/my-kaggle-tools/competitions/tabular-playground-series-oct-2022/notebooks/dask-worker-space/worker-9crvquvm


In [None]:
import dask.distributed
dask.config.config

{'temporary-directory': None,
 'tokenize': {'ensure-deterministic': False},
 'dataframe': {'shuffle-compression': None,
  'parquet': {'metadata-task-size-local': 512,
   'metadata-task-size-remote': 16}},
 'array': {'svg': {'size': 120},
  'slicing': {'split-large-chunks': None},
  'chunk-size': '128 MiB',
  'rechunk-threshold': 4},
 'optimization': {'fuse': {'active': None,
   'ave-width': 1,
   'max-width': None,
   'max-height': inf,
   'max-depth-new-edges': None,
   'subgraphs': None,
   'rename-keys': True}},
 'distributed': {'version': 2,
  'scheduler': {'allowed-failures': 3,
   'bandwidth': 100000000,
   'blocked-handlers': [],
   'default-data-size': '1kiB',
   'events-cleanup-delay': '1h',
   'idle-timeout': None,
   'transition-log-length': 100000,
   'events-log-length': 100000,
   'work-stealing': True,
   'work-stealing-interval': '100ms',
   'worker-ttl': None,
   'pickle': True,
   'preload': [],
   'preload-argv': [],
   'unknown-task-duration': '500ms',
   'default-t

In [33]:
# Dask array chunk size and worker-memory thresholds (fractions of each worker's
# memory limit). The memory keys live under `distributed.worker.memory` (singular
# "worker"); the previous `distributed.workers.*` spelling only created unused entries,
# so the thresholds were never applied.
# NOTE(review): workers presumably read these values when they start, so this cell
# likely has to run before `LocalCluster()` is created to affect the cluster — confirm.
dask.config.set({
    "array.chunk-size": "128 MiB",
    "distributed.worker.memory.target": 0.75,
    "distributed.worker.memory.spill": 0.85,
    "distributed.worker.memory.terminate": 0.98,
})

<dask.config.set at 0x7f1a2c8d3590>

In [27]:
# Distributed LightGBM classifier bound to the Dask client created above.
# min_child_samples / min_split_gain are explicitly None — presumably so that the
# aliases tuned below (min_data_in_leaf, min_gain_to_split) take effect; confirm.
dask_clf = lgb.DaskLGBMClassifier(
    client=client,
    objective="binary",
    boosting_type="goss",
    n_estimators=1000,
    learning_rate=0.01,
    max_bin=25,
    min_child_samples=None,
    min_split_gain=None,
    random_state=777,
    # n_jobs=cpu_count(),
)

In [28]:
# base_model.base_pipeline.steps.append((base_model.config["model"]["type"], dask_clf))

In [29]:
base_model.base_pipeline.named_steps.keys()

dict_keys(['preprocessor', 'ipca', 'lgbm'])

In [30]:
collect()

105

### 2.4. Hyperparameters - Bayesian Optimization

In [37]:
from skopt.space import Integer, Categorical, Real
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean as np_mean

# -----------------------------------------------------------------------------------
#                   Guide on which params to tune/ NOT to tune
#           source: https://github.com/Microsoft/LightGBM/issues/695
# -----------------------------------------------------------------------------------
# 
# For heavily unbalanced datasets such as 1:10000:
# 
# - max_bin: keep it only for memory pressure, not to tune (otherwise overfitting)
# - learning rate: keep it only for training speed, not to tune (otherwise overfitting)
# - n_estimators: must be infinite and use early stopping to auto-tune (otherwise overfitting)
# - num_leaves: [7, 4095]
# - max_depth: [2, 63] and infinite 
# - scale_pos_weight: [1, 10000] 
# - min_child_weight: [0.01, (sample size / 1000)] 
# - subsample: [0.4, 1]
# - bagging_fraction: only 1, keep as is (otherwise overfitting)
# - colsample_bytree: [0.4, 1]
# 
# Never tune following parameters unless you have an explicit requirement to tune them:
#
# - Learning rate (lower means longer to train but more accurate, higher means faster to train but less accurate)
# - Number of boosting iterations (automatically tuned with early stopping and learning rate)
# - Maximum number of bins (RAM dependent)

# Also: https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5
# And: https://neptune.ai/blog/lightgbm-parameters-guide
# Finally: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html#deal-with-over-fitting
# About optimization: https://medium.com/sitechassethealthcenter/gaussian-process-to-optimize-hyperparameters-of-an-algorithm-5b4810277527

# Hyperparameter search space. The order of the dimensions matters: the optimizer's
# result vector is mapped back to parameter names by position.
# Mean number of rows per fold group (rounded to the nearest 10,000); it bounds
# min_data_in_leaf at 1%-5% of a fold.
mean_fold_rows = groups.value_counts().mean().round(-4)

space = [
    # boosting iterations
    # TODO: 10000 when training with all data
    # Integer(100, 550, name="n_estimators"),
    # Real(0.10, 0.15, name="learning_rate"), # it is recommended to use smaller learning_rate with larger num_iterations.

    # model complexity
    Integer(20, 1500, name="num_leaves"),  # keep it relatively small to avoid overfitting
    Integer(5, 15, name="max_depth"),
    Real(5, 20, name="min_child_weight"),

    # penalization to reduce overfitting
    Real(0.0, 20.0, name="reg_alpha"),
    Real(0.0, 20.0, name="reg_lambda"),

    # model regularization
    Integer(
        int(mean_fold_rows * 0.01),
        int(mean_fold_rows * 0.05),
        name="min_data_in_leaf",
    ),
    Real(0.05, 10, name="min_gain_to_split"),

    # model train speed
    # Real(15, 250, name="max_bin"),
    Real(0.25, 0.85, name="subsample"),
    Real(0.25, 1.0, name="colsample_bytree"),

    # target class unbalance: +/- 5 around the observed negative/positive ratio
    Real(int(scale_pos_w) - 5, int(scale_pos_w) + 5, name="scale_pos_weight"),
]

from sklearn.model_selection import cross_val_score
from typing import Callable

import dask.dataframe as dd

# Dask copies of the training features/target: index dropped, one partition per CPU,
# persisted on the cluster.
# NOTE(review): in the visible cells only `dy_train_cv` is referenced afterwards.
dX_train_cv, dy_train_cv = (
    dd.from_pandas(pandas_obj.reset_index(drop=True), npartitions=cpu_count()).persist()
    for pandas_obj in (X_train_cv, y_train_cv)
)

@use_named_args(space)
def objective(**params):
    """Objective function minimised by ``gp_minimize``.

    Sets the sampled hyperparameters on the pipeline's "lgbm" step and returns the
    negated mean of the cross-validated ``neg_log_loss`` scores, i.e. the mean log
    loss across the group folds (lower is better).

    Parameters
    ----------
    **params
        One value per dimension of ``space``, keyed by dimension name
        (supplied by ``use_named_args``).

    Returns
    -------
    float
        Mean cross-validated log loss.

    Notes
    -----
    NOTE(review): the recorded run of the search failed with
    "TypeError: Cannot serialize socket object" right after joblib started 48 loky
    workers. Presumably something shipped to the worker processes (e.g. the
    Dask-backed estimator or its client) cannot be pickled — confirm, and consider
    running the folds in-process.
    """
    base_model.base_pipeline["lgbm"].set_params(**params)
    return -np_mean(
        cross_val_score(
            # NOTE(review): X is the in-memory transformed matrix while y is the Dask
            # series `dy_train_cv`; presumably both should be the same kind of
            # collection — confirm against the estimator in the "lgbm" step.
            base_model.base_pipeline["lgbm"], X_train_trans, dy_train_cv, 
            # Fold indices come from the pandas frames; they are applied positionally to X/y.
            cv=GroupKFold(n_splits=n_folds).split(X_train_cv, y_train_cv, groups=groups), 
            n_jobs=cpu_count(),
            verbose=1,
            scoring="neg_log_loss", 
            fit_params={
                # The held-out test fold drives early stopping in every CV fit.
                "eval_set": [(X_test_trans, y_test)],
                "eval_metric": "binary_logloss",
                "callbacks": [
                    early_stopping(50),
                    log_evaluation(period=25, show_stdv=True), # the rule of thumb is to have it at 10% of your num_estimators
                    ],
            }
            )
        )

In [38]:
# Gaussian-process search over `space`: at most 50 evaluations of `objective`, the
# first 15 at initial points. DeltaYStopper(delta=0.0005, n_best=5) ends the search
# early once the 5 best scores are within 0.0005 of each other.
reg_gp = gp_minimize(
    objective,
    space,
    n_calls=50,
    n_initial_points=15,
    random_state=777,
    n_jobs=cpu_count(),
    verbose=1,
    callback=[
        # CheckpointSaver("../models/optmization/checkpoints/lgbm.pkl", compress=9),
        DeltaYStopper(delta=0.0005, n_best=5),
    ],
)

Iteration No: 1 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


TypeError: Cannot serialize socket object

In [None]:
# Summarise the optimisation result: best objective value, then each tuned
# parameter together with the bounds it was searched in.
print('best score: {}'.format(reg_gp.fun))
print('best params:')
for dimension, best_value in zip(space, reg_gp.x):
    print(f"{dimension.name}: {best_value} from space: [{dimension.low}, {dimension.high}]")

In [None]:
# Team and model type from the config (used for fit-parameter prefixes and output paths).
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

# Map every search dimension's name to the best value found for it.
best_model_params = {
    f"{dimension.name}": reg_gp.x[position]
    for position, dimension in enumerate(space)
}

best_model_params

## 3. Train with All Data

In [None]:
# from json import load
# with open(f"../models/team{team}/{model}_ipca_10perc/{model}_ipca_10perc.json", "r") as f:
#     best_model_params = load(f)

In [None]:
# Rebuild a fresh BaseModel/pipeline for the final fit.
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()

# Instantiate the final estimator with the tuned hyperparameters.
model_type = base_model.config["model"]["type"]
if model_type == "xgb":
    best_model = XGBClassifier(**best_model_params, random_state=777)
elif model_type == "lgbm":
    best_model = LGBMClassifier(**best_model_params, min_child_samples=None, random_state=777)
else:
    # Fail here instead of hitting a NameError on `best_model` further down.
    raise ValueError(f"Unsupported model type in config: {model_type!r}")

base_model.base_pipeline.steps.append((
    "ipca",
    IncrementalPCA(
        n_components=base_model.config["model"]["ipca"]["n_components"],
        batch_size=ipca_batch,
        whiten=base_model.config["model"]["ipca"]["whiten"],
    ),
))

# Fit the preprocessing + IPCA steps on the TRAINING fold and only transform the test
# fold with them. Previously the transformers were fitted on X_test itself, so the
# early-stopping eval set was projected into a different feature space than the one
# the final pipeline (refitted on X_train_cv) trains in.
# The transformers are fitted again inside the final `pipeline.fit`; this assumes they
# are deterministic so both fits yield the same projection.
base_model.base_pipeline.fit(X_train_cv, y_train_cv)
X_test_trans = base_model.base_pipeline.transform(X_test)

# Append the estimator last, under the model-type name used for the fit-parameter prefixes.
base_model.base_pipeline.steps.append((model_type, best_model))

In [None]:
# Keyword arguments routed to the estimator step of the pipeline (prefix "<step>__").
# The eval metric must be a LightGBM metric name: "binary_logloss" (the same metric
# used during the hyperparameter search). "neg_log_loss" is a scikit-learn scorer
# name and is not a LightGBM metric.
fit_params = {
    f"{model}__eval_set": [(X_test_trans, y_test)],
    f"{model}__eval_metric": "binary_logloss",
    f"{model}__callbacks": [
        early_stopping(100),
        log_evaluation(period=50, show_stdv=True),
    ],
}

fit_params.keys()

In [None]:
# Fit the full pipeline on the training fold. Only the LightGBM variant receives the
# early-stopping fit parameters; after this cell `best_model` is the fitted pipeline.
model_type = base_model.config["model"]["type"]

if model_type == "lgbm":
    best_model = base_model.base_pipeline.fit(X=X_train_cv, y=y_train_cv, **fit_params)
elif model_type == "xgb":
    best_model = base_model.base_pipeline.fit(X_train_cv, y_train_cv)

## 4. Save Model

In [None]:
best_model_params

In [None]:
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

# Aliased imports: joblib and json both export `dump`, and the second import used to
# shadow the first.
from json import dump as json_dump
from os import makedirs

from joblib import dump as joblib_dump

artifact_dir = f"../models/team{team}/{model}_ipca_10perc"
artifact_stem = f"{artifact_dir}/{model}_ipca_10perc"
makedirs(artifact_dir, exist_ok=True)  # the directory may not exist on a fresh checkout

# Persist the fitted pipeline.
joblib_dump(best_model, f"{artifact_stem}.joblib")

# Persist the tuned hyperparameters as a JSON *object*, so that json.load returns a
# dict (as the commented-out loader in section 3 expects). Previously the dict was
# serialised to a string and that string was serialised again.
# Values json cannot encode are converted with `.item()` when available (numpy
# scalars become plain Python numbers — presumably what the optimizer returns;
# confirm) and fall back to str() otherwise.
with open(f"{artifact_stem}.json", "w") as f:
    json_dump(
        best_model_params,
        f,
        default=lambda value: value.item() if hasattr(value, "item") else str(value),
    )

## 5. Evaluate Model

In [None]:
preds = best_model.predict_proba(X_valid)[:,1]

In [None]:
from sklearn.metrics import log_loss
log_loss(y_valid, preds)

## 6. Calibration Report

In [None]:
import numpy as np
import seaborn as sns
from pandas.core.frame import DataFrame, Series
from pandas import concat
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve

# Kudos to: Mateus Coelho
# https://www.kaggle.com/code/mateuscco/how-to-evaluate-model-calibration/notebook

def ece(y_test, preds, strategy = 'uniform'):
    """Expected calibration error (ECE) over 10 probability bins.

    For every bin the absolute gap between the observed positive rate and the mean
    predicted probability is weighted by the share of samples in that bin; the
    weighted gaps are summed.

    Parameters
    ----------
    y_test : array-like
        Binary ground-truth labels.
    preds : array-like
        Predicted probabilities of the positive class, same length as ``y_test``.
        Inputs are paired by position (any pandas index is ignored).
    strategy : {'uniform', 'quantile'}, default 'uniform'
        'uniform' uses equal-width bins with lower edges 0.0, 0.1, ..., 0.9;
        'quantile' uses equal-frequency bins delimited by the deciles of ``preds``.

    Returns
    -------
    float
        The expected calibration error; 0.0 when no sample can be binned.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``strategy`` is unknown.
    """
    target = np.asarray(y_test, dtype=float).ravel()
    proba = np.asarray(preds, dtype=float).ravel()
    if target.shape[0] != proba.shape[0]:
        raise ValueError(
            f"y_test and preds must have the same length, got {target.shape[0]} and {proba.shape[0]}"
        )

    n_samples = proba.shape[0]
    if n_samples == 0:
        return 0.0

    if strategy == 'uniform':
        # Index of the last lower edge <= proba; probabilities below 0 get -1.
        lim_inf = np.linspace(0, 0.9, 10)
        bin_ids = np.searchsorted(lim_inf, proba, side='right') - 1
    elif strategy == 'quantile':
        # Inner decile edges split the predictions into (up to) 10 equal-frequency bins.
        inner_edges = np.nanquantile(proba, np.linspace(0, 1, 11))[1:-1]
        bin_ids = np.searchsorted(inner_edges, proba, side='right')
    else:
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'uniform' or 'quantile'")

    # Samples without a bin (NaN or negative probability) are left out of the per-bin
    # statistics but still count in the denominator.
    usable = ~np.isnan(proba) & (bin_ids >= 0)
    if not usable.any():
        return 0.0

    df = DataFrame({'target': target[usable], 'proba': proba[usable], 'bin': bin_ids[usable]})
    # Named aggregation gives stable column names, independent of how the installed
    # pandas version names the result of value_counts().
    per_bin = df.groupby('bin').agg(
        target=('target', 'mean'),
        proba=('proba', 'mean'),
        count=('proba', 'size'),
    )
    gap = (per_bin['target'] - per_bin['proba']).abs()
    return float((gap * per_bin['count'] / n_samples).sum())

def _plot_calibration_curve(y_true, y_prob, strategy):
    """Plot one 10-bin calibration curve (model vs. ideal diagonal) and show it."""
    fop, mpv = calibration_curve(y_true, y_prob, n_bins=10, strategy=strategy)
    plt.plot(mpv, fop, "s-", label='model')
    # The diagonal covers at least [0, 0.25] and is extended when the curve goes beyond it.
    upper = max(0.25, float(np.max(mpv)), float(np.max(fop)))
    plt.plot([0, upper], [0, upper], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()


def make_report(y_test, preds):
    """Display a discrimination/calibration report for binary probability predictions.

    Shows AUROC, the Gini coefficient (2*AUROC - 1), the positive rate, the mean
    prediction and the ECE, followed by a histogram of the predictions, per-class KDE
    plots and two calibration curves (equal-width and equal-size bins).

    Parameters
    ----------
    y_test : array-like
        Binary ground-truth labels.
    preds : array-like
        Predicted probabilities of the positive class, same length as ``y_test``.

    Returns
    -------
    None
        Everything is rendered through ``display`` and matplotlib.
    """
    # Work on positional arrays so boolean masks cannot be misaligned by pandas indexes.
    y_true = np.asarray(y_test).ravel()
    y_prob = np.asarray(preds, dtype=float).ravel()

    # Computing AUC
    auc = roc_auc_score(y_true, y_prob)
    display(f'AUROC: {auc}')
    display(f'Gini (2*AUROC - 1): {2*auc-1}')
    display(f'Fraction of positive cases in the test set: {y_true.mean()}')
    display(f'Mean predicted value in the test set:       {y_prob.mean()}')
    display(f'ECE (equal width bins):       {ece(y_true, y_prob)}')

    # Plotting probabilities
    display('#### Histogram of the probability distribution')
    Series(y_prob).hist(bins = 40)
    plt.show()

    # Plotting KDE by class; each class has its own y-axis because the class sizes differ.
    display('#### KDE plots of the probability distribution by class')
    fig, ax1 = plt.subplots()
    sns.kdeplot(y_prob[y_true == 0], label = 'No goal', ax = ax1)
    ax2 = ax1.twinx()
    sns.kdeplot(y_prob[y_true == 1], label = 'Goal within 10s', color = 'red', ax = ax2)
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()

    # Plotting calibration
    display('#### Calibration curve (equal width bins)')
    _plot_calibration_curve(y_true, y_prob, strategy='uniform')

    display('#### Calibration curve (equal size bins)')
    _plot_calibration_curve(y_true, y_prob, strategy='quantile')

In [None]:
make_report(y_valid, preds)