## 0. Libraries and Personal Tools

In [1]:
import sys
from os.path import abspath

from multiprocessing import cpu_count
from gc import collect

In [2]:
# Logical CPU count of this machine; the same call is reused further down as the
# n_jobs value for LightGBM, cross_val_score and gp_minimize.
cpu_count()

48

In [3]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Notebook-wide plotting defaults: 10x6 inch figures first, then the
# "fivethirtyeight" style sheet on top.
rcParams.update({"figure.figsize": (10, 6)})
plt.style.use("fivethirtyeight")

In [4]:
from pandas import set_option
# Widen pandas' display limits so wide/long frames are shown with less truncation.
DISPLAY_LIMITS = {
    "display.max_rows": 200,
    "display.max_columns": 100,
    "display.max_colwidth": 200,
}
for option_name, option_value in DISPLAY_LIMITS.items():
    set_option(option_name, option_value)

In [5]:
# Put the parent of the current working directory first on sys.path so that the
# project's `src` package can be imported from this notebook.
project_root = abspath('..')
sys.path.insert(0, project_root)

from src.models.train_model import BaseModel

## 1. Build Base Model

In [6]:
# Instantiate the project's BaseModel helper, load its YAML configuration, then
# fetch the list of feature names plus the target name and assemble the
# preprocessing pipeline (exposed afterwards as base_model.base_pipeline).
# NOTE(review): get_data() presumably also populates base_model.data, which the
# split cells below read — confirm in src/models/train_model.py.
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()

In [7]:
# base_model.base_pipeline

In [8]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(base_model.data)).isna().sum().sum()

## 2. Parameter Optimization

### 2.1. Split Data

In [9]:
from src.utils import create_kf_groups

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold

from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper, CheckpointSaver
from skopt.space import Real, Categorical, Integer

In [10]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out 20% of the games as the final validation set. Splitting on the
# `game_num` index level keeps all rows of one game on the same side of the split.
# Only the first of the two generated splits is consumed.
gsp = GroupShuffleSplit(n_splits=2, test_size=0.20, random_state=777)
game_ids = base_model.data.index.get_level_values("game_num")
train_index, test_index = next(gsp.split(base_model.data, groups=game_ids))

# Validation features / target come from the held-out rows.
X_valid = base_model.data[features].iloc[test_index]
y_valid = base_model.data[target].iloc[test_index]

# The remaining rows keep the target column: they are split once more in the next cell.
train_columns = features + [target]
X_train = base_model.data[train_columns].iloc[train_index]

In [11]:
# Second group-wise split: carve a test set (used later as the early-stopping
# evaluation set) out of the remaining training games.
inner_game_ids = X_train.index.get_level_values("game_num")
train_index, test_index = next(gsp.split(X_train, groups=inner_game_ids))

X_train_cv, X_test = (X_train[features].iloc[rows] for rows in (train_index, test_index))
y_train_cv, y_test = (X_train[target].iloc[rows] for rows in (train_index, test_index))

In [12]:
# del X_train

In [13]:
# Report the size of each of the three partitions, one per line.
print(
    f"X_train_cv.shape: {X_train_cv.shape}",
    f"X_test.shape: {X_test.shape}",
    f"X_valid.shape: {X_valid.shape}",
    sep="\n",
)

X_train_cv.shape: (135831, 54)
X_test.shape: (34082, 54)
X_valid.shape: (42183, 54)


In [14]:
# Inspect which steps the pipeline currently contains.
base_model.base_pipeline.named_steps.keys()

dict_keys(['preprocessor'])

In [15]:
# from pandas.core.frame import DataFrame
# DataFrame(base_model.base_pipeline.fit_transform(X_train_cv)).describe().transpose()

In [16]:
# Class-imbalance ratio (negatives / positives), later used as the centre of the
# `scale_pos_weight` search range. Counts are rounded to the nearest 10,000 so the
# ratio stays stable across slightly different splits.
# Assumes the target is encoded as 0 (negative) / 1 (positive).
class_counts = y_train_cv.value_counts()
rounded_counts = class_counts.round(-4)
if rounded_counts[1] == 0:
    # With roughly 5,000 positives or fewer the rounded count collapses to 0 and
    # the ratio would become inf (which then breaks int(scale_pos_w) when the
    # search space is built); fall back to the exact counts in that case.
    rounded_counts = class_counts
scale_pos_w = rounded_counts[0] / rounded_counts[1]

In [17]:
# Fit the pipeline on the training fold only; at this point it holds just the
# 'preprocessor' step (see the named_steps output above).
base_model.base_pipeline.fit(X_train_cv, y_train_cv)

Pipeline(steps=[('preprocessor',
                 FeatureUnion(transformer_list=[('player',
                                                 ColumnTransformer(transformers=[('demolished',
                                                                                  PlayerDemolished(),
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6ba40e9090>),
                                                                                 ('position',
                                                                                  Pipeline(steps=[('imputer',
                                                                                                   SimpleImputer(fill_value=0.0,
                                                                                                                 strategy='constant')),
                                                                         

In [18]:
# Pre-compute the transformed matrices once; the optimisation objective below
# trains directly on X_train_trans and early-stops on X_test_trans.
# NOTE(review): the pipeline holds only the 'preprocessor' step here (the 'ipca'
# step is appended in a later cell), so these matrices are not PCA-reduced, whereas
# the pipeline trained in section 3 does include IPCA — confirm this is intended.
X_train_trans = base_model.base_pipeline.transform(X_train_cv)
X_test_trans = base_model.base_pipeline.transform(X_test)

### 2.3. Define K-Group-Folds

In [19]:
# Number of cross-validation folds used by the hyperparameter search.
n_folds = 5

# Group labels for cross-validation, derived from each row's game number by the
# project helper create_kf_groups.
# NOTE(review): presumably assigns every game to exactly one of n_folds groups —
# confirm in src/utils.
game_num = X_train_cv.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)

# NOTE(review): `gkf` is not referenced by the later cells visible in this
# notebook; the objective builds its own GroupKFold instance.
gkf = GroupKFold(n_splits=n_folds)

In [20]:
# Number of rows assigned to each cross-validation group.
groups.value_counts()

a    27711
b    27444
c    26380
d    27236
e    27060
dtype: int64

In [21]:
# Batch size for IncrementalPCA.
# With batch_size == "auto" in the config, aim for about TOTAL_IPCA_BATCHES
# batches per CV fold, rounded to the nearest 1,000 rows; otherwise use the
# configured value as-is.
TOTAL_IPCA_BATCHES = 50
MIN_IPCA_BATCH = 1000  # smallest non-zero value the rounding below can produce

if base_model.config["model"]["ipca"]["batch_size"] == "auto":
    rows_per_fold = groups.value_counts().mean()
    ipca_batch = int(round(rows_per_fold / TOTAL_IPCA_BATCHES, -3))
    # For folds below ~25,000 rows the rounding yields 0, which is not a valid
    # batch size; clamp to the smallest rounding unit instead.
    ipca_batch = max(MIN_IPCA_BATCH, ipca_batch)
else:
    ipca_batch = base_model.config["model"]["ipca"]["batch_size"]
ipca_batch

1000

In [22]:
# Add an IncrementalPCA step after the preprocessor, configured from the
# "ipca" section of the model config and the batch size computed above.
ipca_cfg = base_model.config["model"]["ipca"]
ipca_step = IncrementalPCA(
    n_components=ipca_cfg["n_components"],
    whiten=ipca_cfg["whiten"],
    batch_size=ipca_batch,
)
base_model.base_pipeline.steps.append(("ipca", ipca_step))


In [23]:
# Choose the classifier according to the configured model type.
model_type = base_model.config["model"]["type"]

if model_type == "xgb":
    clf = XGBClassifier(objective="binary:logistic", random_state=777)
    # Search space in "<step>__<param>" form.
    # NOTE(review): not referenced by the later cells visible in this notebook.
    search_spaces = {
            "xgb__n_estimators": Integer(200, 400),
            "xgb__learning_rate": Real(0.05, 0.15, "uniform"),
            "xgb__max_depth": Integer(4, 6),
            "xgb__gamma": Real(0.05, 0.10, "uniform"),
            "xgb__subsample": Real(0.6, 0.8, "uniform"),
            "xgb__colsample_bytree": Real(0.8, 1.0, "uniform"),
        }

elif model_type == "lgbm":
    # n_estimators is deliberately large: training relies on early stopping.
    # NOTE(review): min_child_samples / min_split_gain are set to None presumably
    # so that the LightGBM aliases tuned later (min_data_in_leaf,
    # min_gain_to_split) do not clash with the wrapper's defaults — confirm.
    clf = LGBMClassifier(
        objective="binary", random_state=777, n_jobs=cpu_count(),
        boosting_type="goss", n_estimators=5000, learning_rate=0.05, max_bin=50,
        min_child_samples=None, min_split_gain=None
        )

else:
    # Fail here with a clear message instead of a NameError on `clf` further down.
    raise ValueError(
        f"Unsupported model type in config: {model_type!r} (expected 'xgb' or 'lgbm')"
    )


In [24]:
# Append the classifier as the final pipeline step, named after the configured
# model type so it can be looked up by that name.
base_model.base_pipeline.steps.append((base_model.config["model"]["type"], clf))

In [25]:
# Confirm the pipeline now ends with the IPCA and classifier steps.
base_model.base_pipeline.named_steps.keys()

dict_keys(['preprocessor', 'ipca', 'lgbm'])

In [26]:
# Force a garbage-collection pass before the memory-heavy search; the displayed
# value is the number of unreachable objects gc found.
collect()

430

### 2.4. Hyperparameters - Bayesian Optimization

In [27]:
from skopt.space import Integer, Categorical, Real
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean as np_mean

# -----------------------------------------------------------------------------------
#                   Guide on which params to tune/ NOT to tune
#           source: https://github.com/Microsoft/LightGBM/issues/695
# -----------------------------------------------------------------------------------
# 
# For heavily unbalanced datasets such as 1:10000:
# 
# - max_bin: keep it only for memory pressure, not to tune (otherwise overfitting)
# - learning rate: keep it only for training speed, not to tune (otherwise overfitting)
# - n_estimators: must be infinite and use early stopping to auto-tune (otherwise overfitting)
# - num_leaves: [7, 4095]
# - max_depth: [2, 63] and infinite 
# - scale_pos_weight: [1, 10000] 
# - min_child_weight: [0.01, (sample size / 1000)] 
# - subsample: [0.4, 1]
# - bagging_fraction: only 1, keep as is (otherwise overfitting)
# - colsample_bytree: [0.4, 1]
# 
# Never tune following parameters unless you have an explicit requirement to tune them:
#
# - Learning rate (lower means longer to train but more accurate, higher means faster to train but less accurate)
# - Number of boosting iterations (automatically tuned with early stopping and learning rate)
# - Maximum number of bins (RAM dependent)

# Also: https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5
# And: https://neptune.ai/blog/lightgbm-parameters-guide
# Finally: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html#deal-with-over-fitting
# About optimization: https://medium.com/sitechassethealthcenter/gaussian-process-to-optimize-hyperparameters-of-an-algorithm-5b4810277527

# Hyperparameter search space for the Bayesian optimisation. Each dimension's
# `name` is the keyword that the objective forwards to the estimator's set_params().
#
# Deliberately NOT tuned (see the guide above): n_estimators, learning_rate, max_bin.
# TODO: 10000 boosting iterations when training with all data.

# Average fold size rounded to the nearest 10,000 rows; min_data_in_leaf is
# searched between 1% and 5% of it.
rows_per_fold = groups.value_counts().mean().round(-4)
# Integer part of the class-imbalance ratio; scale_pos_weight is searched +/- 5 around it.
pos_weight_center = int(scale_pos_w)

space = [
    # --- model complexity ---
    Integer(20, 3500, name="num_leaves"),  # keep it relatively small to avoid overfitting
    Integer(5, 15, name="max_depth"),
    Real(5, 20, name="min_child_weight"),

    # --- L1 / L2 penalties against overfitting ---
    Real(0.0, 20.0, name="reg_alpha"),
    Real(0.0, 20.0, name="reg_lambda"),

    # --- split regularisation ---
    Integer(int(rows_per_fold * 0.01), int(rows_per_fold * 0.05), name="min_data_in_leaf"),
    Real(0.05, 10, name="min_gain_to_split"),

    # --- row / column subsampling (training speed) ---
    Real(0.25, 0.85, name="subsample"),
    Real(0.25, 1.0, name="colsample_bytree"),

    # --- target class imbalance ---
    Real(pos_weight_center - 5, pos_weight_center + 5, name="scale_pos_weight"),
]

from sklearn.model_selection import cross_val_score
from typing import Callable

@use_named_args(space)
def objective(**params):
    """Objective for gp_minimize: mean cross-validated log loss of the LightGBM step.

    The candidate hyperparameters in ``params`` are applied to the pipeline's
    "lgbm" step, which is then cross-validated with GroupKFold on the
    pre-transformed training matrix. Every fold early-stops against the
    (X_test_trans, y_test) evaluation set.

    Returns the mean log loss across folds (lower is better). The scorer is
    "neg_log_loss", so its mean is negated to give gp_minimize a value to minimise.
    """
    estimator = base_model.base_pipeline["lgbm"]
    estimator.set_params(**params)

    # A fresh splitter per call: .split() returns a one-shot generator.
    fold_iter = GroupKFold(n_splits=n_folds).split(X_train_cv, y_train_cv, groups=groups)

    lgbm_fit_kwargs = {
        "eval_set": [(X_test_trans, y_test)],
        "eval_metric": "binary_logloss",
        "callbacks": [
            early_stopping(50),
            # the rule of thumb is to log every ~10% of the boosting rounds
            log_evaluation(period=25, show_stdv=True),
        ],
    }

    fold_scores = cross_val_score(
        estimator, X_train_trans, y_train_cv,
        cv=fold_iter,
        n_jobs=cpu_count(),
        verbose=1,
        scoring="neg_log_loss",
        fit_params=lgbm_fit_kwargs,
    )
    return -np_mean(fold_scores)

In [28]:
# Stop the search early once the 5 best objective values lie within 0.0005 of each other.
# (Checkpointing is currently disabled:
#  CheckpointSaver("../models/optmization/checkpoints/lgbm.pkl", compress=9))
search_callbacks = [DeltaYStopper(delta=0.0005, n_best=5)]

# Gaussian-process optimisation: 15 random initial points, at most 50 evaluations.
reg_gp = gp_minimize(
    func=objective,
    dimensions=space,
    n_calls=50,
    n_initial_points=15,
    random_state=777,
    n_jobs=cpu_count(),
    verbose=1,
    callback=search_callbacks,
)

Iteration No: 1 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.38819
[25]	valid_0's binary_logloss: 0.388498
[25]	valid_0's binary_logloss: 0.387551
[25]	valid_0's binary_logloss: 0.388714
[25]	valid_0's binary_logloss: 0.389352
[50]	valid_0's binary_logloss: 0.465187
[50]	valid_0's binary_logloss: 0.464799
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231626
[50]	valid_0's binary_logloss: 0.465569
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231422
[50]	valid_0's binary_logloss: 0.466461
[50]	valid_0's binary_logloss: 0.463681
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231669
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231571


[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 12.4min finished


Iteration No: 1 ended. Evaluation done at random point.
Time taken: 746.7118
Function value obtained: 0.2275
Current minimum: 0.2275
Iteration No: 2 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.462031
[25]	valid_0's binary_logloss: 0.461493
[25]	valid_0's binary_logloss: 0.458494
[25]	valid_0's binary_logloss: 0.460305
[25]	valid_0's binary_logloss: 0.459243
[50]	valid_0's binary_logloss: 0.567623
[50]	valid_0's binary_logloss: 0.567465
[50]	valid_0's binary_logloss: 0.563847
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.234098
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.234
[50]	valid_0's binary_logloss: 0.56617
[50]	valid_0's binary_logloss: 0.56429
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233761
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.234042
Earl

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 13.1min finished


Iteration No: 2 ended. Evaluation done at random point.
Time taken: 785.8021
Function value obtained: 0.2301
Current minimum: 0.2275
Iteration No: 3 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.332127
[25]	valid_0's binary_logloss: 0.331345
[25]	valid_0's binary_logloss: 0.331387
[25]	valid_0's binary_logloss: 0.330632
[25]	valid_0's binary_logloss: 0.33248
[50]	valid_0's binary_logloss: 0.382877
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230316
[50]	valid_0's binary_logloss: 0.380899
[50]	valid_0's binary_logloss: 0.383793
[50]	valid_0's binary_logloss: 0.382093
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230122
[50]	valid_0's binary_logloss: 0.381766
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230324
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230216


[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 20.0min finished


Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1197.9786
Function value obtained: 0.2262
Current minimum: 0.2262
Iteration No: 4 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.422741
[25]	valid_0's binary_logloss: 0.42182
[25]	valid_0's binary_logloss: 0.423756
[25]	valid_0's binary_logloss: 0.422474
[25]	valid_0's binary_logloss: 0.422442
[50]	valid_0's binary_logloss: 0.513472
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.232187
[50]	valid_0's binary_logloss: 0.512058
[50]	valid_0's binary_logloss: 0.515138
[50]	valid_0's binary_logloss: 0.513373
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.232024
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.23231
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.232144
[50]	valid_0's binary_logloss: 0.513267
E

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed:  9.0min finished


Iteration No: 4 ended. Evaluation done at random point.
Time taken: 539.2567
Function value obtained: 0.2283
Current minimum: 0.2262
Iteration No: 5 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.370063
[25]	valid_0's binary_logloss: 0.370172
[25]	valid_0's binary_logloss: 0.36851
[25]	valid_0's binary_logloss: 0.370503
[25]	valid_0's binary_logloss: 0.36943
[50]	valid_0's binary_logloss: 0.439265
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230727
[50]	valid_0's binary_logloss: 0.439213
[50]	valid_0's binary_logloss: 0.440416
[50]	valid_0's binary_logloss: 0.438807
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230804
[50]	valid_0's binary_logloss: 0.437469
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230746
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230581
E

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 14.8min finished


Iteration No: 5 ended. Evaluation done at random point.
Time taken: 891.1050
Function value obtained: 0.2266
Current minimum: 0.2262
Iteration No: 6 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.387682
[25]	valid_0's binary_logloss: 0.388881
[25]	valid_0's binary_logloss: 0.387428
[25]	valid_0's binary_logloss: 0.3882
[25]	valid_0's binary_logloss: 0.388608
[50]	valid_0's binary_logloss: 0.464849
[50]	valid_0's binary_logloss: 0.465366
[50]	valid_0's binary_logloss: 0.466844
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231271
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231392
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231449
[50]	valid_0's binary_logloss: 0.466748
[50]	valid_0's binary_logloss: 0.467021
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231647
E

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 12.2min finished


Iteration No: 6 ended. Evaluation done at random point.
Time taken: 734.9580
Function value obtained: 0.2274
Current minimum: 0.2262
Iteration No: 7 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.331841
[25]	valid_0's binary_logloss: 0.331642
[25]	valid_0's binary_logloss: 0.330739
[25]	valid_0's binary_logloss: 0.331581
[25]	valid_0's binary_logloss: 0.332044
[50]	valid_0's binary_logloss: 0.384661
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.23062
[50]	valid_0's binary_logloss: 0.384258
[50]	valid_0's binary_logloss: 0.384553
[50]	valid_0's binary_logloss: 0.383314
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230585
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230416
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230356
[50]	valid_0's binary_logloss: 0.385104


[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 15.9min finished


Iteration No: 7 ended. Evaluation done at random point.
Time taken: 952.3342
Function value obtained: 0.2264
Current minimum: 0.2262
Iteration No: 8 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.462096
[25]	valid_0's binary_logloss: 0.463246
[25]	valid_0's binary_logloss: 0.462455
[25]	valid_0's binary_logloss: 0.462015
[25]	valid_0's binary_logloss: 0.460447
[50]	valid_0's binary_logloss: 0.573793
[50]	valid_0's binary_logloss: 0.57408
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233132
[50]	valid_0's binary_logloss: 0.575457
[50]	valid_0's binary_logloss: 0.573763
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233412
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233156
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233326
[50]	valid_0's binary_logloss: 0.570887


[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed:  7.7min finished


Iteration No: 8 ended. Evaluation done at random point.
Time taken: 462.5295
Function value obtained: 0.2293
Current minimum: 0.2262
Iteration No: 9 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.448854
[25]	valid_0's binary_logloss: 0.446111
[25]	valid_0's binary_logloss: 0.445954
[25]	valid_0's binary_logloss: 0.446345
[25]	valid_0's binary_logloss: 0.445132
[50]	valid_0's binary_logloss: 0.553558
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233502
[50]	valid_0's binary_logloss: 0.550155
[50]	valid_0's binary_logloss: 0.551359
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.232976
[50]	valid_0's binary_logloss: 0.549963
[50]	valid_0's binary_logloss: 0.548359
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.233236
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.232951

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 29.2min finished


Iteration No: 9 ended. Evaluation done at random point.
Time taken: 1754.2332
Function value obtained: 0.2294
Current minimum: 0.2262
Iteration No: 10 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.357491
[25]	valid_0's binary_logloss: 0.359046
[25]	valid_0's binary_logloss: 0.359074
[25]	valid_0's binary_logloss: 0.358114
[25]	valid_0's binary_logloss: 0.359312
[50]	valid_0's binary_logloss: 0.420042
[50]	valid_0's binary_logloss: 0.421818
[50]	valid_0's binary_logloss: 0.420852
[50]	valid_0's binary_logloss: 0.421562
[50]	valid_0's binary_logloss: 0.4221
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231405
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.231479
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.23172
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.23152
Ear

[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 41.9min finished


Iteration No: 10 ended. Evaluation done at random point.
Time taken: 2513.4114
Function value obtained: 0.2275
Current minimum: 0.2262
Iteration No: 11 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
[25]	valid_0's binary_logloss: 0.347846
[25]	valid_0's binary_logloss: 0.348162
[25]	valid_0's binary_logloss: 0.348619
[25]	valid_0's binary_logloss: 0.346886
[25]	valid_0's binary_logloss: 0.34706
[50]	valid_0's binary_logloss: 0.406905
[50]	valid_0's binary_logloss: 0.407949
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230997
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230944
[50]	valid_0's binary_logloss: 0.408334
[50]	valid_0's binary_logloss: 0.406526
[50]	valid_0's binary_logloss: 0.405816
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230953
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.230654


[Parallel(n_jobs=48)]: Done   5 out of   5 | elapsed: 16.5min finished


Iteration No: 11 ended. Evaluation done at random point.
Time taken: 989.4646
Function value obtained: 0.2267
Current minimum: 0.2262
Iteration No: 12 started. Evaluating function at random point.


[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


KeyboardInterrupt: 

In [None]:
# Summarise the optimisation result: best objective value, then each tuned
# parameter next to the bounds it was searched in.
print('best score: {}'.format(reg_gp.fun))
print('best params:')
for dim, best_value in zip(space, reg_gp.x):
    print(f"{dim.name}: {best_value} from space: [{dim.low}, {dim.high}]")

In [None]:
# Identifiers used when naming the saved artefacts.
model_cfg = base_model.config["model"]
team, model = model_cfg["team"], model_cfg["type"]

# Map each search dimension's name to the best value found for it.
best_model_params = {f"{dim.name}": best_value for dim, best_value in zip(space, reg_gp.x)}

best_model_params

## 3. Train with All Data

In [None]:
# from json import load
# with open(f"../models/team{team}/{model}_ipca_10perc/{model}_ipca_10perc.json", "r") as f:
#     best_model_params = load(f)

In [None]:
# Rebuild a fresh (unfitted) pipeline so that nothing fitted during the search
# leaks into the final model.
base_model = BaseModel()
base_model.read_config("../models/config.yaml")
features, target = base_model.get_data()
base_model.build_base_pipeline()

model_type = base_model.config["model"]["type"]

if model_type == "xgb":
    best_model = XGBClassifier(**best_model_params, random_state=777)
elif model_type == "lgbm":
    # Keep the fixed settings the hyperparameters were tuned under (objective,
    # GOSS boosting, learning rate, bin count, large n_estimators for early
    # stopping); the tuned values are merged last and therefore take precedence.
    lgbm_params = {
        "objective": "binary",
        "boosting_type": "goss",
        "n_estimators": 5000,
        "learning_rate": 0.05,
        "max_bin": 50,
        "n_jobs": cpu_count(),
        "min_child_samples": None,
        "min_split_gain": None,
        "random_state": 777,
        **best_model_params,
    }
    best_model = LGBMClassifier(**lgbm_params)
else:
    raise ValueError(
        f"Unsupported model type in config: {model_type!r} (expected 'xgb' or 'lgbm')"
    )

base_model.base_pipeline.steps.append((
    "ipca",
    IncrementalPCA(
        n_components=base_model.config["model"]["ipca"]["n_components"],
        batch_size=ipca_batch,
        whiten=base_model.config["model"]["ipca"]["whiten"]
        ),
    ))

# Fit the transformers on the TRAINING rows and only *transform* the
# early-stopping set. Fitting them on X_test (as fit_transform(X_test) did) would
# project the evaluation set with a different preprocessing/PCA than the one the
# classifier is trained on once the whole pipeline is fitted on X_train_cv.
X_test_trans = base_model.base_pipeline.fit(X_train_cv, y_train_cv).transform(X_test)

base_model.base_pipeline.steps.append((model_type, best_model))

In [None]:
# Fit-time keyword arguments, routed to the final pipeline step through the
# "<step name>__<argument>" convention.
fit_params = {
    f"{model}__eval_set": [(X_test_trans, y_test)],
    # LightGBM metric name, matching the one used during the search.
    # "neg_log_loss" is a scikit-learn scorer name, not a LightGBM metric.
    f"{model}__eval_metric": "binary_logloss",
    f"{model}__callbacks": [
        early_stopping(100),
        log_evaluation(period=50, show_stdv=True),
    ],
}

fit_params.keys()

In [None]:
# Fit the complete pipeline on the training rows. From here on `best_model`
# refers to the fitted pipeline (fit() returns the pipeline itself).
# Only the LightGBM variant receives the early-stopping fit parameters.
final_model_type = base_model.config["model"]["type"]

if final_model_type == "lgbm":
    best_model = base_model.base_pipeline.fit(X=X_train_cv, y=y_train_cv, **fit_params)
elif final_model_type == "xgb":
    best_model = base_model.base_pipeline.fit(X_train_cv, y_train_cv)

## 4. Save Model

In [None]:
# Show the tuned hyperparameters that the next cell writes to disk.
best_model_params

In [None]:
team = base_model.config["model"]["team"]
model = base_model.config["model"]["type"]

# Import both serialisers under distinct names: importing json's `dump` after
# joblib's `dump` would silently shadow it.
from os import makedirs
from joblib import dump as joblib_dump
from json import dump as json_dump

artifact_dir = f"../models/team{team}/{model}_ipca_10perc"
makedirs(artifact_dir, exist_ok=True)  # make sure the target folder exists

# Fitted pipeline.
joblib_dump(best_model, f"{artifact_dir}/{model}_ipca_10perc.joblib")

# Tuned hyperparameters as a plain JSON object. Serialising once means
# json.load() returns a dict again (dump(dumps(...)) stored a JSON *string*).
# numpy scalars are converted with .item() so numbers stay numbers; anything
# else that JSON cannot represent falls back to its string form.
with open(f"{artifact_dir}/{model}_ipca_10perc.json", "w") as f:
    json_dump(
        best_model_params, f,
        default=lambda value: value.item() if hasattr(value, "item") else str(value),
    )

## 5. Evaluate Model

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba) for the
# held-out validation rows. `best_model` is the fitted pipeline at this point, so
# the raw validation features are passed in untransformed.
preds = best_model.predict_proba(X_valid)[:,1]

In [None]:
from sklearn.metrics import log_loss
# Log loss of the predicted probabilities on the validation set.
log_loss(y_valid, preds)

## 6. Calibration Report

In [None]:
import numpy as np
import seaborn as sns
from pandas.core.frame import DataFrame, Series
from pandas import concat
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve

# Kudos to: Mateus Coelho
# https://www.kaggle.com/code/mateuscco/how-to-evaluate-model-calibration/notebook

def ece(y_test, preds, strategy = 'uniform'):
    """Expected calibration error (ECE) of predicted probabilities over ten bins.

    For every bin the absolute gap between the observed positive rate and the
    mean predicted probability is weighted by the share of samples in that bin;
    the weighted gaps are summed.

    Parameters
    ----------
    y_test : array-like
        True binary labels (0/1).
    preds : array-like
        Predicted probabilities, matched to ``y_test`` by position.
    strategy : {'uniform', 'quantile'}
        'uniform' uses ten equal-width bins with lower edges 0.0, 0.1, ..., 0.9;
        'quantile' uses (up to) ten equal-frequency bins.

    Returns
    -------
    float
        The ECE; 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values.
    """
    # Positional arrays: avoids index alignment surprises when a Series with a
    # custom index is combined with a plain array.
    df = DataFrame({'target': np.asarray(y_test), 'proba': np.asarray(preds), 'bin': np.nan})
    if df.empty:
        return 0.0

    if strategy == 'uniform':
        lower_edges = np.linspace(0, 0.9, 10)
        # Walk the edges upwards: each row ends up in the last bin whose lower
        # edge it reaches. Rows below 0 (or NaN) keep bin = NaN and are ignored
        # by the groupby below.
        for idx, edge in enumerate(lower_edges):
            df.loc[df['proba'] >= edge, 'bin'] = idx

    elif strategy == 'quantile':
        from pandas import qcut
        # duplicates='drop' merges bins when many predictions share a value.
        df['bin'] = qcut(df['proba'], q=10, labels=False, duplicates='drop')

    else:
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'uniform' or 'quantile'")

    # Per-bin means and sizes are computed from the same groupby, so the result
    # does not depend on how value_counts() names its output (changed in pandas 2.0).
    binned = df.groupby('bin')
    bin_means = binned[['target', 'proba']].mean()
    bin_sizes = binned.size()

    gaps = (bin_means['target'] - bin_means['proba']).abs()
    return (gaps * bin_sizes / df.shape[0]).sum()

def _plot_calibration_curve(y_true, scores, strategy='uniform'):
    """Draw one 10-bin calibration curve next to the diagonal 'ideal' reference."""
    fop, mpv = calibration_curve(y_true, scores, n_bins=10, strategy=strategy)
    plt.plot(mpv, fop, "s-", label='model')
    # The reference diagonal is only drawn over [0, 0.25].
    plt.plot([0,0.25],[0,0.25], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()

def make_report(y_test, preds):
    """Display discrimination and calibration diagnostics for binary predictions.

    Shows AUROC, the Gini coefficient (2*AUROC - 1), the observed positive rate,
    the mean prediction and the equal-width ECE, followed by a histogram of the
    predictions, per-class KDE plots and two calibration curves (equal-width and
    equal-size bins).

    Parameters
    ----------
    y_test : array-like
        True binary labels (0/1).
    preds : array-like
        Predicted probabilities of the positive class, matched to ``y_test`` by position.

    Returns
    -------
    None
        Everything is rendered through ``display`` / matplotlib.
    """
    # Work on positional arrays so the boolean masks below cannot be affected by
    # pandas index alignment.
    y_true = np.asarray(y_test)
    scores = np.asarray(preds)

    # Computing AUC
    auc = roc_auc_score(y_true, scores)
    display(f'AUROC: {auc}')
    display(f'Gini (2*AUROC - 1): {2*auc-1}')
    display(f'Fraction of positive cases in the test set: {y_true.mean()}')
    display(f'Mean predicted value in the test set:       {scores.mean()}')
    display(f'ECE (equal width bins):       {ece(y_true, scores)}')

    # Plotting probabilities
    display('#### Histogram of the probability distribution')
    Series(scores).hist(bins = 40)
    plt.show()

    # Plotting KDE by class; a twin y-axis is used so the two densities are each
    # drawn on their own scale.
    display('#### KDE plots of the probability distribution by class')
    fig, ax1 = plt.subplots()
    sns.kdeplot(scores[y_true == 0], label = 'No goal', ax = ax1)
    ax2 = ax1.twinx()
    sns.kdeplot(scores[y_true == 1], label = 'Goal within 10s', color = 'red', ax = ax2)
    # Merge the handles of both axes into a single legend.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()

    # Plotting calibration
    display('#### Calibration curve (equal width bins)')
    _plot_calibration_curve(y_true, scores, strategy='uniform')

    display('#### Calibration curve (equal size bins)')
    _plot_calibration_curve(y_true, scores, strategy='quantile')

In [None]:
# Render the calibration / discrimination report for the validation predictions.
make_report(y_valid, preds)