## 0. Libraries and Personal Tools

In [1]:
import dask
import sys
from os.path import abspath

from multiprocessing import cpu_count
from gc import collect

In [2]:
# Display the CPU count reported by multiprocessing for this machine.
cpu_count()

48

In [3]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Notebook-wide matplotlib defaults: a 10x6 inch figure and the
# "fivethirtyeight" style sheet.
rcParams.update({"figure.figsize": (10, 6)})
plt.style.use("fivethirtyeight")

In [4]:
from pandas import set_option

# Raise the pandas display limits so wide/long frames are not truncated
# in the notebook output.
for option_name, option_value in (
    ("display.max_rows", 200),
    ("display.max_columns", 100),
    ("display.max_colwidth", 200),
):
    set_option(option_name, option_value)

In [5]:
# Put the parent of the current working directory first on sys.path so the
# project's `src` package can be imported from this notebook.
# NOTE: abspath('..') is resolved against the working directory, so the kernel
# must be started from the notebook's own folder.
sys.path.insert(0, abspath('..'))

from src.models.train_model import BaseModel

In [6]:
from src.utils import create_kf_groups

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold

from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper, CheckpointSaver
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import GroupShuffleSplit

In [7]:
# Hyper-parameters of the selected model.
# "max_bin" used to be listed twice (50, then 100). Python keeps only the last
# value of a repeated key, so the effective value was always 100; the dead
# first entry is removed and 100 is kept at the key's original position.
best_model_params = {
    "n_estimators": 5000,
    "learning_rate": 0.05,
    "max_bin": 100,
    "num_leaves": 1500,
    "max_depth": 10,
    "min_child_weight": 10,
    "reg_alpha": 10.0,
    "reg_lambda": 10.0,
    # "min_data_in_leaf": int(groups.value_counts().mean().round(-4)*0.01),
    "min_gain_to_split": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.9,
    # "scale_pos_weight": int(scale_pos_w),
}

In [8]:
import dask.dataframe as dd

# Load the project configuration and data through the project's BaseModel
# helper, then build its base preprocessing pipeline.
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()


# First split: hold out 20% as the validation set, grouping by the "game_num"
# index level so that all rows of one game stay on the same side.
# Only the first of the two configured splits is consumed (next()).
gsp = GroupShuffleSplit(n_splits=2, test_size=0.20, random_state=777)
train_index, test_index = next(gsp.split(base_model.data, groups=base_model.data.index.get_level_values("game_num")))

# The training part keeps the target column because it is split again below.
X_train = base_model.data[features + [target]].iloc[train_index]

X_valid = base_model.data[features].iloc[test_index]
y_valid = base_model.data[target].iloc[test_index]


# Second split (same splitter, same seed) on the remaining training rows:
# again 20% held out by game, this time as the test set.
train_index, test_index = next(gsp.split(X_train, groups=X_train.index.get_level_values("game_num")))

X_train_cv = X_train[features].iloc[train_index]
y_train_cv = X_train[target].iloc[train_index]

X_test = X_train[features].iloc[test_index]
y_test = X_train[target].iloc[test_index]


# Fold assignment for grouped cross-validation on the training rows.
# NOTE(review): create_kf_groups is a project helper; presumably it maps each
# game_num to one of n_folds groups — confirm against src.utils.
n_folds = 5
game_num = X_train_cv.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)
gkf = GroupKFold(n_splits=n_folds)


# IPCA batch size: with "auto" in the config, use the mean group size divided
# by TOTAL_IPCA_BATCHES, rounded to the nearest thousand; otherwise take the
# configured value as-is.
# NOTE(review): the rounding yields 0 when the mean group size is below
# roughly 5,000 rows — confirm this cannot happen with the real data.
if base_model.config["model"]["ipca"]["batch_size"] == "auto":
    TOTAL_IPCA_BATCHES = 10
    ipca_batch = int(round(groups.value_counts().mean() / TOTAL_IPCA_BATCHES, -3))
else:
    ipca_batch = base_model.config["model"]["ipca"]["batch_size"]


# Append an IncrementalPCA step (configured from the YAML file) to the end of
# the base pipeline. This mutates the pipeline in place.
base_model.base_pipeline.steps.append((
    "ipca", 
    IncrementalPCA(
        n_components=base_model.config["model"]["ipca"]["n_components"], 
        batch_size=ipca_batch,
        whiten=base_model.config["model"]["ipca"]["whiten"]
        ),
    ))


# Fit the whole preprocessing pipeline on the training rows only.
base_model.base_pipeline.fit(X_train_cv, y_train_cv)

KeyboardInterrupt: 

In [None]:
# Apply the fitted preprocessing pipeline to the train, test and validation
# feature sets.
# NOTE(review): later code calls .reset_index() on these results, so
# transform() must return pandas objects — confirm the pipeline's output type.
X_train_trans = base_model.base_pipeline.transform(X_train_cv)
X_test_trans = base_model.base_pipeline.transform(X_test)
X_valid_trans = base_model.base_pipeline.transform(X_valid)

In [None]:
def _as_persisted_dask(frame):
    """Drop the index of ``frame`` and return it as a persisted Dask collection.

    The collection is split into ``cpu_count()`` partitions.
    """
    return dd.from_pandas(frame.reset_index(drop=True), npartitions=cpu_count()).persist()


# Features and targets of every split, in the same order as before:
# train, test, validation.
dX_train_cv, dy_train_cv = _as_persisted_dask(X_train_trans), _as_persisted_dask(y_train_cv)
dX_test, dy_test = _as_persisted_dask(X_test_trans), _as_persisted_dask(y_test)
dX_valid, dy_valid = _as_persisted_dask(X_valid_trans), _as_persisted_dask(y_valid)

In [None]:
# Class balance of the training target: count of label 0 divided by count of
# label 1, both rounded to the nearest 10,000 first.
# NOTE(review): a class with fewer than ~5,000 rows rounds to 0 — confirm the
# rounding is intended.
rounded_class_counts = y_train_cv.value_counts().round(-4)
scale_pos_w = rounded_class_counts[0] / rounded_class_counts[1]

In [None]:
# Model type name as declared in the YAML configuration.
model_type = base_model.config["model"]["type"]
# Disabled: appending the estimator as a named step of the preprocessing pipeline.
# base_model.base_pipeline.steps.append((model_type, best_model))

In [None]:
# Extra keyword arguments for ``dask_clf.fit``.
# The estimator is fitted directly (it is not a named step of a scikit-learn
# Pipeline), so the keys must be the plain ``fit`` argument names. Keys of the
# form "<step>__<arg>" are only understood by Pipeline.fit and would be passed
# on as unknown keyword arguments.
fit_params = {
    # Held-out test split used for evaluation during training.
    "eval_set": [(dX_test, dy_test)],
    # LightGBM's metric name for binary log-loss ("neg_log_loss" is a
    # scikit-learn scorer name, not a LightGBM metric).
    "eval_metric": "binary_logloss",
    "callbacks": [
        # Patience: 10% of the n_estimators value in best_model_params.
        # NOTE(review): confirm the installed lightgbm.dask version supports
        # early stopping through callbacks.
        early_stopping(int(best_model_params["n_estimators"]*0.1)),
        log_evaluation(period=100, show_stdv=True),
    ],
}

fit_params.keys()

In [None]:
import lightgbm as lgb
from distributed import Client, LocalCluster

# Start a local Dask cluster with default settings and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Last expression of the cell: show the client summary in the notebook.
client

In [None]:
# Dask chunk size and worker memory thresholds.
# The memory settings live under "distributed.worker.memory.*" (singular
# "worker"). A misspelled key is stored without any error but is never read,
# so the thresholds would silently stay at their defaults.
# NOTE(review): workers pick these values up when they start, so this cell
# must run before the LocalCluster is created for the thresholds to apply.
dask.config.set({
    "array.chunk-size": "128 MiB",
    "distributed.worker.memory.target": 0.75,
    "distributed.worker.memory.spill": 0.85,
    "distributed.worker.memory.terminate": 0.98,
})

In [None]:
# Constructor arguments of the distributed LightGBM classifier.
# min_child_samples and min_split_gain are explicitly passed as None.
dask_clf_kwargs = dict(
    client=client,
    objective="binary",
    random_state=777,
    boosting_type="goss",
    n_estimators=1000,
    learning_rate=0.01,
    max_bin=25,
    min_child_samples=None,
    min_split_gain=None,
)
dask_clf = lgb.DaskLGBMClassifier(**dask_clf_kwargs)

In [None]:
# Train the Dask classifier on the transformed training split; fit_params
# supplies the evaluation set, the metric and the callbacks.
best_model = dask_clf.fit(X=dX_train_cv, y=dy_train_cv, **fit_params)

## 4. Save Model

In [None]:
# Display the hyper-parameter dictionary that is written to disk below.
# NOTE(review): dask_clf is constructed with its own literal arguments, not
# from this dictionary — confirm which set should be persisted.
best_model_params

In [None]:
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

# Import the serialisers as modules: both joblib and json export ``dump``, and
# ``from ... import dump`` let the second import shadow the first.
import json
import joblib

# Common path prefix of the two artefacts (fitted model + parameters).
artifact_stem = f"../models/team{team}/{model}_latest/{model}_latest"

joblib.dump(best_model, f"{artifact_stem}.joblib")

# Write the parameters as a real JSON object. The previous
# dump(dumps(...), f) serialised twice, so the file contained one JSON string
# holding escaped JSON instead of an object. default=str keeps the fallback
# for values that are not JSON-serialisable.
with open(f"{artifact_stem}.json", "w") as f:
    json.dump(best_model_params, f, default=str)

## 5. Evaluate Model

In [None]:
# The classifier was fitted on the pipeline-transformed features held in Dask
# collections, so score the transformed validation split (dX_valid) instead of
# the raw pandas X_valid frame. compute() materialises the lazy Dask result;
# column 1 holds the probability of the positive class.
preds = best_model.predict_proba(dX_valid).compute()[:, 1]

In [None]:
from sklearn.metrics import log_loss
# Log-loss of the predicted probabilities on the validation split.
log_loss(y_valid, preds)

## 6. Calibration Report

In [None]:
import numpy as np
import seaborn as sns
from pandas.core.frame import DataFrame, Series
from pandas import concat
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve

# Kudos to: Mateus Coelho
# https://www.kaggle.com/code/mateuscco/how-to-evaluate-model-calibration/notebook

def ece(y_test, preds, strategy = 'uniform'):
    """Expected calibration error (ECE) over 10 probability bins.

    For every non-empty bin the absolute gap between the mean target and the
    mean predicted probability is weighted by the share of samples in that
    bin; the weighted gaps are summed.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True binary labels (0/1).
    preds : array-like of shape (n_samples,)
        Predicted probabilities of the positive class.
    strategy : {'uniform', 'quantile'}, default='uniform'
        'uniform' uses equal-width bins with lower edges 0.0, 0.1, ..., 0.9.
        'quantile' uses equal-frequency bins whose edges are the deciles of
        ``preds``.

    Returns
    -------
    float
        The expected calibration error; 0.0 when no sample can be binned.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``strategy`` is unknown.
    """
    n_bins = 10
    target = np.asarray(y_test, dtype=float).ravel()
    proba = np.asarray(preds, dtype=float).ravel()
    if target.shape[0] != proba.shape[0]:
        raise ValueError(
            f"y_test and preds must have the same length, got {target.shape[0]} and {proba.shape[0]}"
        )
    n_samples = proba.shape[0]

    if strategy == 'uniform':
        lower_edges = np.linspace(0, 0.9, n_bins)
        # Samples below the first edge (or NaN) fall in no bin but still
        # count towards n_samples.
        binned = proba >= lower_edges[0]
    elif strategy == 'quantile':
        binned = ~np.isnan(proba)
    else:
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'uniform' or 'quantile'")

    if not binned.any():
        return 0.0

    if strategy == 'uniform':
        # Index of the last lower edge that is <= the probability.
        bin_ids = np.searchsorted(lower_edges, proba[binned], side='right') - 1
    else:
        # Decile edges of the predictions; only the interior edges separate bins.
        edges = np.percentile(proba[binned], np.linspace(0, 100, n_bins + 1))
        bin_ids = np.searchsorted(edges[1:-1], proba[binned])

    counts = np.bincount(bin_ids, minlength=n_bins)
    target_sums = np.bincount(bin_ids, weights=target[binned], minlength=n_bins)
    proba_sums = np.bincount(bin_ids, weights=proba[binned], minlength=n_bins)

    filled = counts > 0
    gaps = np.abs(target_sums[filled] / counts[filled] - proba_sums[filled] / counts[filled])
    return float(np.sum(gaps * (counts[filled] / n_samples)))

def _plot_calibration_curve(y_test, preds, strategy):
    """Plot the 10-bin calibration curve of ``preds`` next to the ideal diagonal."""
    fop, mpv = calibration_curve(y_test, preds, n_bins=10, strategy=strategy)
    plt.plot(mpv, fop, "s-", label='model')
    # The reference diagonal is only drawn up to 0.25.
    plt.plot([0,0.25],[0,0.25], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()


def make_report(y_test, preds):
    """Display discrimination and calibration diagnostics for a binary classifier.

    Shows AUROC, the Gini coefficient (2*AUROC - 1), the positive rate, the
    mean prediction and the ECE, followed by a histogram of the predictions,
    per-class KDE plots and two calibration curves (equal-width and
    equal-size bins).

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True binary labels (0/1).
    preds : array-like of shape (n_samples,)
        Predicted probabilities of the positive class, in the same row order
        as ``y_test``.

    Returns
    -------
    None
        Everything is rendered through ``display`` and matplotlib.
    """
    # Work on plain arrays so .mean() and boolean-mask indexing behave the
    # same for lists, Series and ndarrays, without any index alignment.
    y_test = np.asarray(y_test)
    preds = np.asarray(preds)

    # Computing AUC
    auc = roc_auc_score(y_test, preds)
    display(f'AUROC: {auc}')
    # 2*AUC - 1 is the Gini coefficient; it was previously labelled "AUROC" too.
    display(f'Gini: {2*auc-1}')
    display(f'Fraction of positive cases in the test set: {y_test.mean()}')
    display(f'Mean predicted value in the test set:       {preds.mean()}')
    display(f'ECE (equal width bins):       {ece(y_test, preds)}')

    # Plotting probabilities
    display('#### Histogram of the probability distribution')
    Series(preds).hist(bins = 40)
    plt.show()

    # Plotting KDE by class; each class gets its own y-axis so the rarer class
    # is not flattened by the scale of the other one.
    display('#### KDE plots of the probability distribution by class')
    fig, ax1 = plt.subplots()
    sns.kdeplot(preds[y_test == 0], label = 'No goal', ax = ax1)
    ax2 = ax1.twinx()
    sns.kdeplot(preds[y_test == 1], label = 'Goal within 10s', color = 'red', ax = ax2)
    # Merge the legend entries of both axes into a single legend.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()

    # Plotting calibration
    display('#### Calibration curve (equal width bins)')
    _plot_calibration_curve(y_test, preds, strategy='uniform')

    display('#### Calibration curve (equal size bins)')
    _plot_calibration_curve(y_test, preds, strategy='quantile')

In [None]:
# Render the full evaluation report for the validation predictions.
make_report(y_valid, preds)