## 0. Libraries and Personal Tools

In [1]:
import sys
from os.path import abspath

from multiprocessing import cpu_count

from gc import collect

In [2]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Notebook-wide plotting defaults: figure size first, then the theme, so
# matplotlib plots look good without per-figure tweaking.
rcParams.update({"figure.figsize": (10, 6)})
plt.style.use("fivethirtyeight")

In [3]:
# Put the absolute path of the parent directory at the front of sys.path so
# the custom modules under it can be imported.
project_root = abspath('..')
sys.path.insert(0, project_root)

from src.models.train_model import BaseModel

## 1. Build Base Model

In [4]:
CONFIG_PATH = "../models/config.yaml"

# Create the project model wrapper, load its configuration, fetch the
# feature/target selectors (used later to index base_model.data) and build
# the base pipeline.
base_model = BaseModel()
base_model.read_config(CONFIG_PATH)
features, target = base_model.get_data()
base_model.build_base_pipeline()

In [5]:
from pandas.core.frame import DataFrame
# Sanity check: fit/transform a reproducible 10% sample with the base pipeline
# and count the NaNs remaining in the transformed output.
sample_rows = base_model.data.sample(frac=0.1, random_state=777)
transformed_sample = base_model.base_pipeline.fit_transform(sample_rows)
DataFrame(transformed_sample).isna().sum().sum()

0

## 2. Dimensionality Reduction

### 2.1. Split Data

In [6]:
from src.utils import create_kf_groups

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold
from xgboost import XGBClassifier

In [7]:
# Extend the base pipeline with a placeholder reducer (filled in by the search)
# and the final classifier. Each step is appended only if it is not already
# present: re-running this cell would otherwise add duplicate step names,
# which scikit-learn's Pipeline rejects.
existing_step_names = {step_name for step_name, _ in base_model.base_pipeline.steps}

if "dim_reducer" not in existing_step_names:
    base_model.base_pipeline.steps.append(("dim_reducer", "passthrough"))
if "model" not in existing_step_names:
    base_model.base_pipeline.steps.append(("model", XGBClassifier(objective='binary:logistic')))

In [8]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out 20% of the rows, splitting on "game_num" so that no game ends up
# in both the training and the test set. Only the first generated split is used.
game_labels = base_model.data.index.get_level_values("game_num")
gsp = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=777)
train_index, test_index = next(gsp.split(base_model.data, groups=game_labels))

feature_frame = base_model.data[features]
target_frame = base_model.data[target]

X_train, y_train = feature_frame.iloc[train_index], target_frame.iloc[train_index]
X_test, y_test = feature_frame.iloc[test_index], target_frame.iloc[test_index]

In [9]:
# Sanity check on the training features: run a reproducible 10% sample through
# the first two pipeline steps only and count the NaNs left in the result.
train_sample = X_train.sample(frac=0.1, random_state=777)
leading_steps = base_model.base_pipeline[:2]
DataFrame(leading_steps.fit_transform(train_sample)).isna().sum().sum()

0

### 2.2. Define Parameter Grid

In [10]:
TOTAL_IPCA_BATCHES = 100
IPCA_BATCH_STEP = 1000  # granularity the batch size is rounded to (round(..., -3))

# Aim for roughly TOTAL_IPCA_BATCHES batches over the training set, rounded to
# the nearest thousand rows. The result is clamped to one rounding step so a
# small training set (about 50k rows or fewer) cannot round down to a batch
# size of 0.
ipca_batch = max(IPCA_BATCH_STEP, int(round(len(X_train) / TOTAL_IPCA_BATCHES, -3)))
ipca_batch

17000

In [11]:
from scipy import stats

# Single search space: swap the "dim_reducer" placeholder for an IncrementalPCA
# and draw its number of components from a Poisson distribution with mean 35.
ipca_search_space = {
    "dim_reducer": [IncrementalPCA(batch_size=ipca_batch)],
    "dim_reducer__n_components": stats.poisson(35),
}
param_dists = [ipca_search_space]

### 2.3. Define K-Group-Folds

In [12]:
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

# Derive one fold-group label per training row from its game number, using the
# project helper, so cross-validation can split on those groups.
game_num = X_train.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)

In [13]:
# Tally how many training rows carry each fold-group label.
groups.value_counts()

a    342325
b    341751
c    336117
d    335230
e    341351
dtype: int64

In [14]:
# Display the pipeline, which at this point also holds the "dim_reducer" and "model" steps appended earlier.
base_model.base_pipeline

### 2.4. Run Grid Search

In [15]:
# Group-aware fold indices: rows sharing a fold-group label stay in the same fold.
fold_splits = GroupKFold(n_splits=n_folds).split(X_train, y_train, groups=groups)

# Randomized search over the reducer settings, scored by negative log loss.
pca_grid_search = RandomizedSearchCV(
    base_model.base_pipeline,
    param_dists,
    n_iter=1,
    scoring="neg_log_loss",
    cv=fold_splits,
    n_jobs=cpu_count(),
    random_state=777,
    verbose=2,
)

In [16]:
# Run a garbage-collection pass ahead of the long cross-validated search below.
collect()

168

In [17]:
# Optional, currently disabled: presumably redirects joblib's temporary files
# for the parallel workers to a custom directory — TODO confirm against the
# joblib documentation and replace the machine-specific path before enabling.
# %env JOBLIB_TEMP_FOLDER=/home/ian/Desktop/tmp
# Fit every sampled candidate on each CV fold configured on the search object.
pca_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END dim_reducer=IncrementalPCA(batch_size=17000), dim_reducer__n_components=32; total time=65.0min
[CV] END dim_reducer=IncrementalPCA(batch_size=17000), dim_reducer__n_components=32; total time=65.0min
[CV] END dim_reducer=IncrementalPCA(batch_size=17000), dim_reducer__n_components=32; total time=65.1min
[CV] END dim_reducer=IncrementalPCA(batch_size=17000), dim_reducer__n_components=32; total time=65.2min
[CV] END dim_reducer=IncrementalPCA(batch_size=17000), dim_reducer__n_components=32; total time=28.9min


### 2.5. Best PCA Configuration

In [18]:
# Hyper-parameters of the best-scoring candidate found by the search.
pca_grid_search.best_params_

{'dim_reducer': IncrementalPCA(batch_size=17000, n_components=32),
 'dim_reducer__n_components': 32}

In [19]:
# Cross-validated score of the best candidate; scoring is "neg_log_loss", so values closer to 0 are better.
pca_grid_search.best_score_

-0.2027685317229809

In [None]:
# Per-component explained-variance ratios of the best estimator's reducer,
# plus their running total.
fitted_reducer = pca_grid_search.best_estimator_["dim_reducer"]
var = fitted_reducer.explained_variance_ratio_
var_explained = var.cumsum()

In [None]:
# Plot the explained variance per component and its cumulative sum.
# The title is built from the fitted reducer, so it always describes the model
# actually plotted (the previous hard-coded "50 Components, 2k BS" no longer
# matched the search result).
best_reducer = pca_grid_search.best_estimator_["dim_reducer"]
component_numbers = range(1, len(var_explained) + 1)

plt.figure()

plt.plot(component_numbers, var, label="per Component", marker="o", markersize=7)
plt.plot(component_numbers, var_explained, label="Cumulated", marker="o", markersize=7)

plt.xlabel("Number of Components")
plt.ylabel("Variance Explained")
plt.ylim(-0.1, 1.1)  # ratios live in [0, 1]; leave a small margin around them

plt.title(
    f"Best Estimator (IPCA, {len(var_explained)} Components, "
    f"batch size {best_reducer.batch_size})"
)
plt.legend()
plt.show()

In [None]:
# Mean CV score per sampled candidate, indexed by its number of components.
# The table is built from cv_results_ itself instead of reshaping to a fixed
# (4, 2) grid of n_components x whiten: that layout only fit an earlier
# 8-candidate grid and raises ValueError for the candidates sampled here.
sampled_components = [
    candidate["dim_reducer__n_components"]
    for candidate in pca_grid_search.cv_results_["params"]
]
df_cv_results = DataFrame(
        data={"mean_test_score": pca_grid_search.cv_results_["mean_test_score"]},
        index=sampled_components,
        ).rename_axis("n_components")

In [None]:
# Bar chart of the log loss (absolute value of the negated CV score).
# The y-range is derived from the data with a 2% margin: the previous fixed
# window (0.225-0.235) lies above the current best score (~0.203), which put
# the bars outside the visible area.
log_loss_scores = df_cv_results.abs()
lowest_loss = log_loss_scores.min().min()
highest_loss = log_loss_scores.max().max()
(
    log_loss_scores
    .plot.bar(
        rot=0,
        ylim=(0.98 * lowest_loss, 1.02 * highest_loss),
        title="Log Loss (IPCA, Components)",
    )
);

In [None]:
# Bar chart of the mean fit time per sampled candidate, in minutes.
# Built from cv_results_ directly: the former reshape(4, 2) assumed an
# 8-candidate n_components x whiten grid and fails for the candidates sampled
# here. The title now names the plotted quantity instead of "Neg Log Loss".
fit_time_components = [
    candidate["dim_reducer__n_components"]
    for candidate in pca_grid_search.cv_results_["params"]
]
(
    DataFrame(
        data={"mean_fit_time": pca_grid_search.cv_results_["mean_fit_time"]},
        index=fit_time_components,
        )
    .rename_axis("n_components")
    .div(60)  # scikit-learn reports fit times in seconds
    .plot.bar(rot=0, title="Mean Fit Time in Minutes (IPCA, Components)")
);