## 0. Libraries and Personal Tools

In [1]:
import sys
from os.path import abspath

from multiprocessing import cpu_count
from gc import collect

In [2]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set the default figure size and theme to display good looking matplotlib plots.
rcParams["figure.figsize"] = (10, 6)
plt.style.use("fivethirtyeight")

In [3]:
from pandas import set_option
set_option("display.max_rows", 200)
set_option("display.max_columns", 100)
set_option("display.max_colwidth", 200)

In [4]:
# add absolute path from root to sys.path to use custom modules
sys.path.insert(0, abspath('..'))

from src.models.train_model import BaseModel

## 1. Build Base Model

In [5]:
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()

In [6]:
base_model.base_pipeline

In [7]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(base_model.data)).isna().sum().sum()

## 2. Parameter Optimization

### 2.1. Split Data

In [8]:
from src.utils import create_kf_groups

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold

from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper, CheckpointSaver
from skopt.space import Real, Categorical, Integer

In [9]:
from sklearn.model_selection import GroupShuffleSplit

gsp = GroupShuffleSplit(n_splits=2, test_size=0.20, random_state=777)
train_index, test_index = next(gsp.split(base_model.data, groups=base_model.data.index.get_level_values("game_num")))

X_train = base_model.data[features + [target]].iloc[train_index]

X_valid = base_model.data[features].iloc[test_index]
y_valid = base_model.data[target].iloc[test_index]

In [10]:
train_index, test_index = next(gsp.split(X_train, groups=X_train.index.get_level_values("game_num")))

X_train_cv = X_train[features].iloc[train_index]
y_train_cv = X_train[target].iloc[train_index]

X_test = X_train[features].iloc[test_index]
y_test = X_train[target].iloc[test_index]

In [11]:
del X_train

In [12]:
print(f"X_train_cv.shape: {X_train_cv.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_valid.shape: {X_valid.shape}")

X_train_cv.shape: (1358639, 54)
X_test.shape: (338135, 54)
X_valid.shape: (422949, 54)


In [13]:
base_model.base_pipeline

In [14]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(X_train_cv)).describe().transpose()

In [15]:
scale_pos_w = y_train_cv.value_counts().round(-4)
scale_pos_w = scale_pos_w[0] / scale_pos_w[1]

In [16]:
base_model.base_pipeline.fit(X_train_cv, y_train_cv)

In [17]:
X_train_trans = base_model.base_pipeline.transform(X_train_cv)
X_test_trans = base_model.base_pipeline.transform(X_test)

### 2.3. Define K-Group-Folds

In [18]:
n_folds = 5

game_num = X_train_cv.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)

gkf = GroupKFold(n_splits=n_folds)

In [19]:
groups.value_counts()

a    275668
b    273026
c    269483
d    267850
e    272612
dtype: int64

In [20]:
if base_model.config["model"]["ipca"]["batch_size"] == "auto":
    TOTAL_IPCA_BATCHES = 50
    ipca_batch = int(round(groups.value_counts().mean() / TOTAL_IPCA_BATCHES, -3))
else:
    ipca_batch = base_model.config["model"]["ipca"]["batch_size"]
ipca_batch

5000

In [21]:
base_model.base_pipeline.steps.append((
    "ipca", 
    IncrementalPCA(
        n_components=base_model.config["model"]["ipca"]["n_components"], 
        batch_size=ipca_batch,
        whiten=base_model.config["model"]["ipca"]["whiten"]
        ),
    ))


In [22]:
if base_model.config["model"]["type"] == "xgb":
    clf = XGBClassifier(objective="binary:logistic", random_state=777)
    search_spaces = {
            "xgb__n_estimators": Integer(200, 400),
            "xgb__learning_rate": Real(0.05, 0.15, "uniform"),
            "xgb__max_depth": Integer(4, 6),
            "xgb__gamma": Real(0.05, 0.10, "uniform"),
            "xgb__subsample": Real(0.6, 0.8, "uniform"),
            "xgb__colsample_bytree": Real(0.8, 1.0, "uniform"),
        }

elif base_model.config["model"]["type"] == "lgbm":
    clf = LGBMClassifier(
        objective="binary", random_state=777, n_jobs=cpu_count(),
        n_estimators=1000,
        min_child_samples=None, min_split_gain=None
        )


In [23]:
base_model.base_pipeline.steps.append((base_model.config["model"]["type"], clf))

In [24]:
base_model.base_pipeline

In [25]:
collect()

256

### 2.4. Hyperparameters - Bayesian Optimization

In [26]:
from skopt.space import Integer, Categorical, Real
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean as np_mean

# -----------------------------------------------------------------------------------
#                   Guide on which params to tune/ NOT to tune
#           source: https://github.com/Microsoft/LightGBM/issues/695
# -----------------------------------------------------------------------------------
# 
# For heavily unbalanced datasets such as 1:10000:
# 
# - max_bin: keep it only for memory pressure, not to tune (otherwise overfitting)
# - learning rate: keep it only for training speed, not to tune (otherwise overfitting)
# - n_estimators: must be infinite and use early stopping to auto-tune (otherwise overfitting)
# - num_leaves: [7, 4095]
# - max_depth: [2, 63] and infinite 
# - scale_pos_weight: [1, 10000] 
# - min_child_weight: [0.01, (sample size / 1000)] 
# - subsample: [0.4, 1]
# - bagging_fraction: only 1, keep as is (otherwise overfitting)
# - colsample_bytree: [0.4, 1]
# 
# Never tune following parameters unless you have an explicit requirement to tune them:
#
# - Learning rate (lower means longer to train but more accurate, higher means smaller to train but less accurate)
# - Number of boosting iterations (automatically tuned with early stopping and learning rate)
# - Maximum number of bins (RAM dependent)

# set up hyperparameter space
space = [
    # model complexity
    Integer(1500, 3000, name="num_leaves"),
    Integer(2, 15, name="max_depth"),
    # Integer(50, 100, name="n_estimators"),
    Real(5, 20, name="min_child_weight"),
    
    # penalization to reduce overfitting
    Real(0.01, 0.10, name="learning_rate"),
    # Real(0.50, 20.0, name="reg_alpha"),
    
    # model train speed
    Integer(int(groups.value_counts().mean().round(-4)*0.01), int(groups.value_counts().mean().round(-4)*0.05), name="min_data_in_leaf"),
    Real(0.05, 0.15, name="min_gain_to_split"),
    Real(0.25, 0.85, name="subsample"),
    Real(0.25, 1.0, name="colsample_bytree"),
    
    # target class unbalance
    Real(int(scale_pos_w)-5, int(scale_pos_w)+5, name="scale_pos_weight"),
    
    # Integer(7500, 10000, name="n_estimators"),
    # Integer(500, 5000, name="num_leaves"),
    # Integer(100, 750, name="min_data_in_leaf"),
    ]

from sklearn.model_selection import cross_val_score
from typing import Callable

@use_named_args(space)
def objective(**params):
    base_model.base_pipeline["lgbm"].set_params(**params)
    return -np_mean(
        cross_val_score(
            base_model.base_pipeline["lgbm"], X_train_trans, y_train_cv, 
            cv=GroupKFold(n_splits=n_folds).split(X_train_cv, y_train_cv, groups=groups), 
            n_jobs=cpu_count(),
            scoring="neg_log_loss", 
            fit_params={
                "eval_set": [(X_test_trans, y_test)],
                "eval_metric": "neg_log_loss",
                "callbacks": [
                    early_stopping(300),
                    log_evaluation(period=100, show_stdv=True),
                    ],
            }
            )
        )

In [27]:
reg_gp = gp_minimize(
    objective, space,
    verbose=2, 
    random_state=777, n_calls=35,
    n_random_starts=3, 
    callback=[
        CheckpointSaver("../models/optmization/checkpoints/lgbm_low_memory.pkl", compress=9),
        DeltaYStopper(
            delta=0.0005,
            n_best=5,
            ),
        ]
    )

Iteration No: 1 started. Evaluating function at random point.
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.547385
[100]	valid_0's binary_logloss: 0.547523
[100]	valid_0's binary_logloss: 0.545029
[100]	valid_0's binary_logloss: 0.54815
[200]	valid_0's binary_logloss: 0.562081
[200]	valid_0's binary_logloss: 0.562028
[200]	valid_0's binary_logloss: 0.562845
[200]	valid_0's binary_logloss: 0.559453
[300]	valid_0's binary_logloss: 0.558408
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.219459
[300]	valid_0's binary_logloss: 0.557902
[300]	valid_0's binary_logloss: 0.559076
[300]	valid_0's binary_logloss: 0.555627
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.21947
Early stopping, best iteration is:
[1]	valid_0's bin



[300]	valid_0's binary_logloss: 0.461166
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.219338
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.455538
[200]	valid_0's binary_logloss: 0.464915
[300]	valid_0's binary_logloss: 0.458772
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.219221
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 770.9169
Function value obtained: 0.2177
Current minimum: 0.2177
Iteration No: 4 started. Searching for the next optimal point.
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.363375
[100]	valid_0's binary_logloss: 0.363764
[100]	valid_0's binary_logloss: 0.363345
[100]	valid_0's binary_logloss: 0.361867
[200]	valid_0's bi



[300]	valid_0's binary_logloss: 0.617856
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.235958
[300]	valid_0's binary_logloss: 0.615642
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.235813
[300]	valid_0's binary_logloss: 0.612477
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.23552
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.658956
[200]	valid_0's binary_logloss: 0.632213
[300]	valid_0's binary_logloss: 0.611893
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.235283
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 1227.9027
Function value obtained: 0.2341
Current minimum: 0.2177
Iteration No: 6 started. Searching for the next optimal point.
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Tra



[300]	valid_0's binary_logloss: 0.539978
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.221378
[300]	valid_0's binary_logloss: 0.540419
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.221313
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.572237
[200]	valid_0's binary_logloss: 0.551421
[300]	valid_0's binary_logloss: 0.535633
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.220912
Iteration No: 7 ended. Search finished for the next optimal point.
Time taken: 618.1135
Function value obtained: 0.2196
Current minimum: 0.2177
Iteration No: 8 started. Searching for the next optimal point.
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.475579
[100]	valid_0's



[300]	valid_0's binary_logloss: 0.514572
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.221604
[300]	valid_0's binary_logloss: 0.516469
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.22198
[300]	valid_0's binary_logloss: 0.51631
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.22195
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.534107
[200]	valid_0's binary_logloss: 0.52225
[300]	valid_0's binary_logloss: 0.513672
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.22157
Iteration No: 9 ended. Search finished for the next optimal point.
Time taken: 918.3988
Function value obtained: 0.2202
Current minimum: 0.2175
Iteration No: 10 started. Searching for the next optimal point.
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Training until validation scores don't improve for 300 rounds
Trainin

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
print('best score: {}'.format(reg_gp.fun))
print('best params:')
for i, param in enumerate(space):
    print(f"{param.name}: {reg_gp.x[i]} from space: [{param.low}, {param.high}]")

In [None]:
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

best_model_params = dict()
for i, param in enumerate(space):
    best_model_params[f"{param.name}"] = reg_gp.x[i]

best_model_params

### ~~2.4. Hyperparameters - Bayesian Optimization~~ (Deprecated)

In [None]:
# Kudos to: Luca Massaron
# Source: https://www.kaggle.com/code/lucamassaron/tutorial-bayesian-optimization-with-xgboost
# https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV
# https://github.com/fmfn/BayesianOptimization
# 

# NOTE: I was never able to pass fit parameters to the model with BayesSearchCV

# import warnings
# warnings.filterwarnings("ignore")
# 
# bayes_opt = BayesSearchCV(
#     estimator=base_model.base_pipeline["lgbm"],
#     search_spaces=search_spaces, 
#     n_iter=15,
#     pre_dispatch=30,
#     n_jobs=cpu_count(), 
#     iid=False,
#     verbose=2, 
#     scoring="neg_log_loss",
#     optimizer_kwargs={'base_estimator': 'GP'},
#     fit_params={
#         "early_stopping_rounds": 10, 
#         "verbose": 1,
#         # "eval_set": [(X_trans, y_test)],
#         # "eval_names": ["valid"],
#         # "eval_metric": "neg_log_loss",
#         "callbacks": [log_evaluation(period=25, show_stdv=True)],
#         },
#     cv=GroupKFold(n_splits=n_folds).split(X_train, y_train, groups=groups),
#     random_state=777,
#     )

In [None]:
# NOTE: Maybe this was the issue. By defining a custom function to fit the optimizer 

# import pprint
# from tabnanny import verbose
# from time import time
# 
# def report_perf(optimizer, X, y, title="model", callbacks=None):
#     """
#     A wrapper for measuring time and performances of different optmizers
#     
#     optimizer = a sklearn or a skopt optimizer
#     X = the training set 
#     y = our target
#     title = a string label for the experiment
#     """
#     start = time()
#     
#     if callbacks is not None:
#         optimizer.fit(X, y, callback=callbacks)
#     else:
#         optimizer.fit(X, y)
#         
#     d=DataFrame(optimizer.cv_results_)
#     best_score = optimizer.best_score_
#     best_score_std = d.iloc[optimizer.best_index_].std_test_score
#     best_params = optimizer.best_params_
#     
#     print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
#            + u"\u00B1"+" %.3f") % (time() - start, 
#                                    len(optimizer.cv_results_['params']),
#                                    best_score,
#                                    best_score_std))    
#     print('Best parameters:')
#     pprint.pprint(best_params)
#     print()
#     return best_params

In [None]:
# best_params = report_perf(
#     bayes_opt, 
#     X_train_trans, y_train, 
#     "LGBM", 
#     callbacks=[
#         DeltaYStopper(delta=0.01), 
#         # DeadlineStopper(120)
#         ],
#     )

## 3. Train with All Data

In [None]:
# from json import load
# with open(f"../models/team{team}/{model}_ipca_10perc/{model}_ipca_10perc.json", "r") as f:
#     best_model_params = load(f)

In [None]:
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()

if base_model.config["model"]["type"] == "xgb":
    best_model = XGBClassifier(**best_model_params, random_state=777)
elif base_model.config["model"]["type"] == "lgbm":
    best_model = LGBMClassifier(**best_model_params, min_child_samples=None, random_state=777)

base_model.base_pipeline.steps.append((
    "ipca", 
    IncrementalPCA(
        n_components=base_model.config["model"]["ipca"]["n_components"], 
        batch_size=ipca_batch,
        whiten=base_model.config["model"]["ipca"]["whiten"]
        ),
    ))

X_test_trans = base_model.base_pipeline.fit_transform(X_test)

base_model.base_pipeline.steps.append((base_model.config["model"]["type"], best_model))

In [None]:
fit_params = {
    f"{model}__eval_set": [(X_test_trans, y_test)],
    f"{model}__eval_metric": "neg_log_loss",
    f"{model}__callbacks": [
        early_stopping(200),
        log_evaluation(period=50, show_stdv=True), 
    ],
}

fit_params.keys()

In [None]:
if base_model.config["model"]["type"] == "xgb":
    best_model = base_model.base_pipeline.fit(X_train_cv, y_train_cv)

elif base_model.config["model"]["type"] == "lgbm":
    best_model = base_model.base_pipeline.fit(
        X=X_train_cv, y=y_train_cv, **fit_params)

## 4. Save Model

In [None]:
best_model_params

In [None]:
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

from joblib import dump
dump(best_model, f"../models/team{team}/latest_{model}_ipca_10perc/latest_{model}_ipca_10perc.joblib")

from json import dump, dumps
with open(f"../models/team{team}/latest_{model}_ipca_10perc/latest_{model}_ipca_10perc.json", "w") as f:
    dump(dumps(best_model_params, default=str), f)

## 5. Evaluate Model

In [None]:
preds = best_model.predict_proba(X_valid)[:,1]

In [None]:
from sklearn.metrics import log_loss
log_loss(y_valid, preds)

## 4. Save Model

In [None]:
import numpy as np
import seaborn as sns
from pandas.core.frame import DataFrame, Series
from pandas import concat
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve

# Kudos to: Mateus Coelho
# https://www.kaggle.com/code/mateuscco/how-to-evaluate-model-calibration/notebook

def ece(y_test, preds, strategy = 'uniform'):
    df = DataFrame({'target': y_test, 'proba': preds, 'bin': np.nan})
    
    if(strategy == 'uniform'):
        lim_inf = np.linspace(0, 0.9, 10)
        for idx, lim in enumerate(lim_inf):
            df.loc[df['proba'] >= lim, 'bin'] = idx

    elif(strategy == 'quantile'):
        pass
    
    df_bin_groups = concat([df.groupby('bin').mean(), df['bin'].value_counts()], axis = 1)
    df_bin_groups['ece'] = (df_bin_groups['target'] - df_bin_groups['proba']).abs() * (df_bin_groups['bin'] / df.shape[0])
    return df_bin_groups['ece'].sum()

def make_report(y_test, preds):
    # Computing AUC
    auc = roc_auc_score(y_test, preds)
    display(f'AUROC: {auc}')
    display(f'AUROC: {2*auc-1}')
    display(f'Fraction of positive cases in the test set: {y_test.mean()}')
    display(f'Mean predicted value in the test set:       {preds.mean()}')
    display(f'ECE (equal width bins):       {ece(y_test, preds)}')
    
    # Plotting probabilities
    display('#### Histogram of the probability distribution')
    Series(preds).hist(bins = 40)
    plt.show()
    
    # Plotting KDE by class
    display('#### KDE plots of the probability distribution by class')
    fig, ax1 = plt.subplots()
    sns.kdeplot(preds[y_test == 0], label = 'No goal', ax = ax1)
    ax2 = ax1.twinx()
    sns.kdeplot(preds[y_test == 1], label = 'Goal within 10s', color = 'red', ax = ax2)
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()
    
    # Plotting calibration
    display('#### Calibration curve (equal width bins)')
    fop, mpv = calibration_curve(y_test, preds, n_bins=10)
    plt.plot(mpv, fop, "s-", label='model')
    plt.plot([0,1.0],[0,1.0], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()
    
    display('#### Calibration curve (equal size bins)')
    fop, mpv = calibration_curve(y_test, preds, n_bins=10, strategy='quantile')
    plt.plot(mpv, fop, "s-", label='model')
    plt.plot([0,1.0],[0,1.0], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()

In [None]:
make_report(y_valid, preds)