## 0. Libraries and Personal Tools

In [29]:
import sys

from os.path import abspath
from yaml import safe_load

from pandas import read_feather, read_csv
from gc import collect
from sklearn.model_selection import GroupShuffleSplit

In [30]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Notebook-wide matplotlib defaults: every figure starts at 10x6 (figsize is
# expressed in inches) and is drawn with the "fivethirtyeight" style sheet.
rcParams["figure.figsize"] = (10, 6)
plt.style.use("fivethirtyeight")

In [31]:
# Put the parent directory (resolved to an absolute path) at the front of
# sys.path so the project's `src` package can be imported from this notebook.
sys.path.insert(0, abspath('..'))

In [32]:
from src.features.build_features import *
from src.utils import *

In [33]:
# Parse the model configuration. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open("../models/config.yaml", "r") as config_file:
    model_config = safe_load(config_file)

# Last expression of the cell: display the parsed configuration.
model_config

{'paths': {'dtypes': './data/raw/test_dtypes.csv',
  'data': './data/interim/sample_5perc_train_compressed.ftr'},
 'data': {'index': ['game_num', 'event_id', 'event_time'],
  'goal_post1': [0.0, -100.0, 6.8],
  'goal_post2': [0.0, 100.0, 6.8]},
 'model': {'team': 'A',
  'type': 'xgb',
  'params': {'objective': 'binary', 'metric': 'neg_log_loss'}}}

## 1. Get Train Data

In [34]:
# Read the feather file named in the config (the path is prefixed with "../")
# and index the frame by the configured key columns in a single chained call.
df = read_feather("../" + model_config["paths"]["data"]).set_index(
    model_config["data"]["index"]
)

## 2. Get Features and Targets

In [35]:
# The target column name is derived from the team selected in the config.
TEAM = model_config["model"]["team"]
TARGET = f"team_{TEAM}_scoring_within_10sec"

# Feature names are the entries of the dtypes file's "column" field, minus "id".
model_features = read_csv("../" + model_config["paths"]["dtypes"])
FEATURES = [name for name in model_features["column"].tolist() if name != "id"]

## 5. Pipeline Building

In [36]:
# Kudos to: https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f
# Kudos to: https://towardsdatascience.com/4-scikit-learn-tools-every-data-scientist-should-use-4ee942958d9e

from multiprocessing import cpu_count

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

### 5.1. Player Pipelines

In [37]:
# Positions: fill missing values with 0.0, then min-max scale each column.
player_position_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", MinMaxScaler()),
    ]
)

# Velocities: fill missing values with 0.0, then divide each column by its
# largest absolute value (the sign is preserved).
player_velocity_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", MaxAbsScaler()),
    ]
)

# Speed: the project-defined PlayerSpeed transformer runs first on the raw
# velocity columns; its output is then imputed with 0.0 and standardised.
player_speed_pipe = Pipeline(
    [
        ("speed", PlayerSpeed()),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", StandardScaler()),
    ]
)

In [38]:
# Route player columns (selected by regex on the column name) to their
# transformers. The velocity columns are consumed twice: once scaled as-is and
# once through the speed pipeline.
player_transformations = ColumnTransformer(
    [
        ("player_demolished", PlayerDemolished(), selector(pattern="p[0-5]")),
        ("player_velocity_pipe", player_velocity_pipe, selector(pattern="p[0-5]_vel_")),
        ("player_speed", player_speed_pipe, selector(pattern="p[0-5]_vel_")),
        ("player_position", player_position_pipe, selector(pattern="p[0-5]_pos_")),
    ]
)


### 5.2. Ball Pipelines

In [39]:
# Ball velocity: fill missing values with 0.0, then max-abs scale.
ball_velocity_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", MaxAbsScaler()),
    ]
)

# Ball speed: project-defined BallSpeed transformer, then standardise.
# No imputation step is applied in this pipeline.
ball_speed_pipe = Pipeline(
    [
        ("speed", BallSpeed()),
        ("scaler", StandardScaler()),
    ]
)

# Ball position: fill missing values with 0.0, then min-max scale.
ball_position_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", MinMaxScaler()),
    ]
)

# Distance features: the project-defined DistanceBallGoalPosts transformer is
# given the two goal-post coordinates from the config; its output is min-max scaled.
goal_posts = [model_config["data"]["goal_post1"], model_config["data"]["goal_post2"]]
distance_ball_goal_posts_pipe = Pipeline(
    [
        ("distance", DistanceBallGoalPosts(posts=goal_posts)),
        ("scaler", MinMaxScaler()),
    ]
)

In [40]:
# Route ball columns to their pipelines. Velocity and position columns are each
# consumed twice: once scaled directly and once through a derived-feature pipeline.
ball_transformations = ColumnTransformer(
    [
        ("ball_velocity", ball_velocity_pipe, selector(pattern="ball_vel_")),
        ("ball_speed", ball_speed_pipe, selector(pattern="ball_vel_")),
        ("ball_position", ball_position_pipe, selector(pattern="ball_pos_")),
        (
            "distance_ball_goal_posts",
            distance_ball_goal_posts_pipe,
            selector(pattern="ball_pos_"),
        ),
    ]
)


### 5.3. Booster Pipelines

In [41]:
# Booster timers: take the absolute value of every entry, then max-abs scale.
# The builtin `abs` is passed directly instead of wrapping it in a lambda: the
# result is identical, but a lambda cannot be serialised by the standard pickle
# module, which would prevent saving the fitted pipeline with pickle/joblib.dump.
booster_timer_pipe = Pipeline(
    steps=[
        ("timer", FunctionTransformer(abs, feature_names_out="one-to-one")),
        ("scaler", MaxAbsScaler()),
        ]
)

# Player boost amounts: fill missing values with 0.0, then max-abs scale.
player_boosters_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", MaxAbsScaler()),
        ]
)
        

In [42]:
# Route the booster-timer columns and the per-player boost columns to their
# respective pipelines.
booster_transformations = ColumnTransformer(
    [
        ("booster_timer", booster_timer_pipe, selector(pattern="boost[0-5]_timer")),
        ("player_boosters", player_boosters_pipe, selector(pattern="p[0-5]_boost")),
    ]
)

### 5.4. Team Pipelines

In [43]:
# Team features: project-defined TeamCentroid transformer, then standardise.
team_centroid_pipe = Pipeline(
    [
        ("team_centroid", TeamCentroid()),
        ("scaler", StandardScaler()),
    ]
)

In [44]:
# The centroid pipeline receives every player position column.
team_transformations = ColumnTransformer(
    [("team_centroid", team_centroid_pipe, selector(pattern="p[0-5]_pos_"))]
)

### 5.5. Main Pipeline

In [45]:
# Concatenate the outputs of the four column transformers side by side into a
# single feature matrix.
preprocessor = FeatureUnion(
    [
        ("player_transformations", player_transformations),
        ("ball_transformations", ball_transformations),
        ("booster_transformations", booster_transformations),
        ("team_transformations", team_transformations),
    ]
)

## 6. Dimensionality Reduction (Non-linear approach)

In [46]:
# https://scikit-learn.org/stable/auto_examples/decomposition/plot_kernel_pca.html#sphx-glr-auto-examples-decomposition-plot-kernel-pca-py

In [47]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.model_selection import GridSearchCV, GroupKFold
from xgboost import XGBClassifier

In [48]:
# Classifier used as the final pipeline step; trained with the binary logistic objective.
base_model = XGBClassifier(objective="binary:logistic")


In [49]:
# End-to-end pipeline: preprocessing -> variance filter -> dimensionality
# reduction -> classifier.
main_pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("variance_threshold", VarianceThreshold()),
        # Placeholder slot: "passthrough" means no reduction; the grid search
        # replaces this step with each candidate reducer.
        ("dim_reducer", "passthrough"),
        ("model", base_model),
    ]
)

In [50]:
from sklearn.model_selection import GroupShuffleSplit

# Split by game so that all rows of a game land on the same side of the split.
# Only the first of the generated splits is used; 70% of the groups go to test.
gsp = GroupShuffleSplit(n_splits=2, test_size=0.7, random_state=777)
train_index, test_index = next(
    gsp.split(df, groups=df.index.get_level_values("game_num"))
)

X_train, y_train = df[FEATURES].iloc[train_index], df[TARGET].iloc[train_index]
X_test, y_test = df[FEATURES].iloc[test_index], df[TARGET].iloc[test_index]

In [51]:
# Drop the notebook's reference to the full frame now that the train/test
# subsets have been taken from it, so it becomes eligible for garbage collection.
del df

In [52]:
# df_sample = X_train.sample(frac=0.1, random_state=777)
# X_trans = DataFrame(
#     data=main_pipe.fit_transform(df_sample[FEATURES], df_sample[TARGET])
#     )

#TODO: Fix get_feature_names_out with custom transformers
# main_pipe.get_feature_names_out()

In [53]:
# Search space for the "dim_reducer" pipeline step. Each dict is an independent
# sub-grid: it fixes which reducer is plugged into the step and which of that
# reducer's parameters are varied ("dim_reducer__<param>" addresses a parameter
# of the step).
# Active sub-grids: KernelPCA with linear, RBF and polynomial kernels.
# The commented-out dicts are disabled alternatives (PCA / IncrementalPCA).
# NOTE(review): KernelPCA works on an n_samples x n_samples kernel matrix, so
# memory grows quadratically with the number of training rows. That is
# presumably what the overcommit/dask notes further down refer to; confirm.
param_grid = [
#     {
#         "dim_reducer": [PCA(svd_solver = 'full')],
#         "dim_reducer__n_components": [0.80, 0.85, 0.90, 0.95],
# 
#     },
#     {
#         "dim_reducer": [PCA(svd_solver = 'auto', random_state=777)],
#         "dim_reducer__n_components": [26, 28, 30, 32, 34],
# 
#     },
#     {
#         "dim_reducer": [IncrementalPCA()],
#         "dim_reducer__n_components": [20, 30, 40, 50],
#         "dim_reducer__batch_size": [1000, 2000, 3000],
#     },
    # Linear kernel: 4 candidates (one per n_components value).
    {
        "dim_reducer": [KernelPCA(random_state=777)],
        "dim_reducer__n_components": [20, 30, 40, 50],
        "dim_reducer__kernel": ["linear"],
    },
    # RBF kernel: 4 n_components x 4 gamma values = 16 candidates.
    {
        "dim_reducer": [KernelPCA(random_state=777)],
        "dim_reducer__n_components": [20, 30, 40, 50],
        "dim_reducer__kernel": ["rbf"],
        "dim_reducer__gamma": [0.05, 0.1, 0.15, 0.2],  
        
    },
    # Author's notes on possible memory workarounds (not executed):
    # !bash echo 1 | sudo tee /proc/sys/vm/overcommit_memory
    # https://docs.dask.org/en/stable/
    # Polynomial kernel: 4 n_components x 3 degrees = 12 candidates.
    {
        "dim_reducer": [KernelPCA(random_state=777)],
        "dim_reducer__n_components": [20, 30, 40, 50],
        "dim_reducer__kernel": ["poly"],
        "dim_reducer__degree": [2, 3, 4],
    },
]

# One display label per active sub-grid, in the same order as param_grid.
# Not referenced by any of the cells visible here.
reducer_labels = ["LinearKernelPCA", "RBFKernelPCA", "PolyKernelPCA"]
# Labels matching the disabled PCA / IncrementalPCA variant of the grid:
# reducer_labels = ["Full_PCA", "Auto_PCA", "IncrementalPCA", "KernelPCA"]

In [54]:
# Number of cross-validation folds, and the group-aware splitter built from it.
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

# One group label per training row, produced by the project helper
# `create_kf_groups` from the rows' game numbers.
game_num = X_train.index.get_level_values("game_num")
groups = create_kf_groups(game_num, n_folds=n_folds)

In [55]:
# Number of training rows assigned to each fold group (sanity check that the
# folds are of comparable size).
groups.value_counts()

a    64155
b    66311
c    62812
d    58245
e    66057
dtype: int64

In [56]:
# Materialise the fold indices once. Passing the bare generator returned by
# `.split(...)` works for a single `fit()` only: the generator is consumed on
# first use, so re-running `fit()` (e.g. after a crashed run) would see zero
# splits. A list of (train_idx, test_idx) pairs can be reused any number of times.
cv_splits = list(gkf.split(X_train, y_train, groups=groups))

# Exhaustive search over `param_grid`, scored by negative log loss (closer to
# zero is better) and evaluated on the group-aware folds computed above.
pca_grid_search = GridSearchCV(
    estimator=main_pipe,
    param_grid=param_grid,
    n_jobs=cpu_count(),  # one worker process per CPU core
    verbose=2,
    scoring="neg_log_loss",
    cv=cv_splits,
)

In [57]:
# Force a garbage-collection pass before the memory-heavy fit. The number shown
# as cell output is collect()'s return value (unreachable objects found).
collect()

224

In [58]:
# Disabled: would point joblib's temporary folder at a machine-specific path.
# %env JOBLIB_TEMP_FOLDER=/home/ian/Desktop/tmp
# Run the full grid search. No `groups` argument is passed here because the
# cross-validation splits are supplied to GridSearchCV through its `cv` argument.
# NOTE(review): the recorded run of this cell ended in TerminatedWorkerError with
# the workers SIGKILLed; the error message itself lists excessive memory usage
# as a possible cause. Confirm before re-running at this data size.
pca_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
pca_grid_search.best_params_

In [None]:
# Mean cross-validated score of the best candidate. The search is scored with
# "neg_log_loss", so this value is the negated log loss (closer to 0 is better).
pca_grid_search.best_score_


In [None]:
# List the fields recorded for every candidate in the search results.
pca_grid_search.cv_results_.keys()

In [None]:
# Per-component and cumulative variance share of the winning reducer.
best_reducer = pca_grid_search.best_estimator_["dim_reducer"]

if hasattr(best_reducer, "explained_variance_ratio_"):
    # PCA / IncrementalPCA expose the ratio directly.
    var = best_reducer.explained_variance_ratio_
else:
    # KernelPCA (the only reducer in the active grid) has no
    # `explained_variance_ratio_`; it exposes the eigenvalues of the retained
    # components instead. Normalising them gives each component's share of the
    # variance captured by the retained components (not of the total variance).
    var = best_reducer.eigenvalues_ / best_reducer.eigenvalues_.sum()

var_explained = var.cumsum()

In [None]:
# Plot the per-component and cumulative variance curves against the component
# number (1-based), using the explicit Axes interface.
component_numbers = range(1, len(var_explained) + 1)
marker_style = dict(marker="o", markersize=7)

fig, ax = plt.subplots()

ax.plot(component_numbers, var, label="per Component", **marker_style)
ax.plot(component_numbers, var_explained, label="Cumulated", **marker_style)

ax.set_xlabel("Number of Components")
ax.set_ylabel("Variance Explained")
ax.set_ylim(-0.1, 1.1)

ax.set_title("Principal Component Analysis")
ax.legend()
plt.show()

In [None]:
# Bar chart of the cross-validated log loss per reducer configuration.
#
# The labels are built from the parameters that were actually searched, so they
# always line up with the scores. Several candidates can share the same
# (kernel, n_components) pair (e.g. different gamma/degree values), so only the
# best (lowest) loss is kept for each pair, giving one bar per label.
cv_results = pca_grid_search.cv_results_

best_log_loss = {}
for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    # Fall back to the reducer's class name when the grid has no kernel parameter.
    kernel = params.get("dim_reducer__kernel", type(params["dim_reducer"]).__name__)
    n_components = params.get("dim_reducer__n_components", "n/a")
    bar_label = f"{kernel}\n{n_components}"

    # Scoring is "neg_log_loss": flip the sign to get the (positive) log loss.
    log_loss = -float(mean_score)
    # NaN scores (failed fits) never compare as smaller and are therefore skipped.
    if log_loss < best_log_loss.get(bar_label, float("inf")):
        best_log_loss[bar_label] = log_loss

plt.figure()
plt.title("PCA Grid Search")
plt.bar(list(best_log_loss.keys()), list(best_log_loss.values()))
plt.xlabel("kernel / n_components")
plt.ylabel("Log Loss")
if best_log_loss:
    # Zoom the y-axis onto the observed range so small differences stay visible.
    lowest, highest = min(best_log_loss.values()), max(best_log_loss.values())
    margin = 0.1 * (highest - lowest) or 0.01
    plt.ylim(lowest - margin, highest + margin)
plt.show()