# Optimization of XGBoost model

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

import sys
import os

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models




# The stylesheet is downloaded from GitHub. Styling is purely cosmetic, so an
# unreachable URL (offline run, moved file) must not abort the notebook:
# report the problem and continue with matplotlib's current defaults.
try:
    plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
except OSError as err:
    print(f"could not load plot stylesheet, using matplotlib defaults: {err}")

In [None]:
# seed value for reproducible runs
random_state = 42

In [None]:
# location of the cached feature table (CSV)
path_df = os.path.join("..", "data", "df_deepgaze2e.csv")

# set to True to rebuild the features even if a cached CSV already exists
recalculate_df = False

# rebuild (and re-cache) the features when forced or when no cache file exists;
# otherwise load the cached table
if recalculate_df or not os.path.isfile(path_df):
    df = ft.get_features()
    df.to_csv(path_df)
else:
    # the first CSV column is used as the index
    df = pd.read_csv(path_df, index_col=0)

print(f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} features")

In [None]:
# Split the feature columns by dtype for the preprocessing step below:
# non-object columns are scaled, object (string) columns are one-hot encoded.
#
# "img", "sp_idx" and the target "asd" are excluded on purpose: they are removed
# from X before the train/test split, and a ColumnTransformer raises an error
# when it is asked to select a column that the input frame does not contain.
#
# Author's note: the categorical columns could alternatively be added manually
# later, without an encoder.
feature_dtypes = df.drop(columns=["img", "sp_idx", "asd"]).dtypes
num_cols = feature_dtypes.index[feature_dtypes != "object"]
cat_cols = feature_dtypes.index[feature_dtypes == "object"]

In [None]:
# feature matrix / target:
# remove the columns that are not used as features, then split off the label
X = df.drop(columns=["img", "sp_idx"])
y = X.pop("asd")

# the project helper performs the actual train/test split
X_train, X_test, y_train, y_test = pp.split(X, y)

# report the size of the held-out set, absolute and relative to the full data
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

In [None]:
# preprocessing: min-max scale the numeric columns and one-hot encode the
# categorical columns (dropping the first level of each); every column that is
# not listed is passed through unchanged
transformer = [
    ("scaler", MinMaxScaler(), num_cols),
    ("ohe", OneHotEncoder(drop="first"), cat_cols),
]
pre_processing = ColumnTransformer(transformers=transformer, remainder="passthrough")

### xgb pipeline

TODO: should the model evaluation (`evaluate_models`) be integrated into the pipeline?

In [None]:
# chain the preprocessing step and an XGBoost classifier (default
# hyperparameters) into a single estimator
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", pre_processing),
        ("classifier", XGBClassifier()),
    ]
)

In [None]:
# train on the training split, then predict labels for the held-out test split
# (Pipeline.fit returns the fitted pipeline itself, so the calls can be chained)
predicted_labels = xgb_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# evaluate model: hand the test-set predictions and the true test labels to the
# project's evaluation helper
# NOTE(review): `evaluate_models` is imported via `from scripts import evaluate_models`,
# the same way as the modules `features` and `preprocessing`. If it is a module
# rather than a function, this call raises TypeError and should call the
# function inside that module instead — confirm against scripts/.
# NOTE(review): arguments are passed as (predictions, true labels) — verify that
# this matches the helper's parameter order.
evaluate_models(predicted_labels, y_test)

### Beyond Grid Search: Hyperparameter Tuning for XGBoost

1. Bayesian optimization 

### Feature importance 

1. Permutation importance

Step 1: Train a baseline model and record its score on the validation set.  
Step 2: Shuffle the values of one feature, predict again with the same model, and compute the score on the validation set. The importance of that feature is the difference between the baseline score from step 1 and the permuted score from step 2.  
Step 3: Repeat step 2 for every feature.

In [None]:
from sklearn.inspection import permutation_importance


def _permutation_importance_frame(model, features, target):
    """Compute permutation importances of `model` on one data split.

    Parameters
    ----------
    model : fitted estimator
        Estimator whose score is measured while single features are shuffled.
    features : pandas.DataFrame
        Input features of the split; its column names label the result.
    target : array-like
        True labels belonging to `features`.

    Returns
    -------
    result
        The raw object returned by `permutation_importance`.
    frame : pandas.DataFrame
        One column per feature (ordered by ascending mean importance) and one
        row per repeat, ready for a box plot.
    """
    result = permutation_importance(
        model, features, target, n_repeats=20, random_state=random_state, n_jobs=2
    )
    order = result.importances_mean.argsort()
    frame = pd.DataFrame(
        result.importances[order].T,
        columns=features.columns[order],
    )
    return result, frame


# The whole fitted pipeline is passed (not a bare classifier) so that the raw
# feature columns are permuted and preprocessing is applied on every re-scoring.
# calculate permutation importance for test data
result_test, importances_test = _permutation_importance_frame(xgb_pipeline, X_test, y_test)

# calculate permutation importance for training data
result_train, importances_train = _permutation_importance_frame(xgb_pipeline, X_train, y_train)

# box plots of both splits side by side: test on the left, train on the right
f, axs = plt.subplots(1, 2, figsize=(15, 5))

for ax, frame, split_name in zip(
    axs, (importances_test, importances_train), ("test", "train")
):
    frame.plot.box(vert=False, whis=10, ax=ax)
    ax.set_title(f"Permutation Importances ({split_name} set)")
    # reference line: shuffling the feature does not change the score
    ax.axvline(x=0, color="k", linestyle="--")
    ax.set_xlabel("Decrease in accuracy score")

f.tight_layout()

2. SHAP-based importance 

In [None]:
# shap is used by this cell but is not imported with the other libraries above
import shap

# TreeExplainer works on the fitted tree model itself, not on the surrounding
# pipeline, so take the classifier step and feed it the preprocessed features.
classifier = xgb_pipeline.named_steps["classifier"]
preprocessor = xgb_pipeline.named_steps["preprocessor"]

X_test_transformed = preprocessor.transform(X_test)
# the column transformer may return a sparse matrix once one-hot columns exist
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()

# keep readable feature names so the summary plot is labelled
X_test_transformed = pd.DataFrame(
    X_test_transformed,
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

explainer = shap.TreeExplainer(classifier)
shap_values = explainer.shap_values(X_test_transformed)
shap.summary_plot(shap_values, X_test_transformed, plot_type="bar")

3. Filter methods  
rank features independently of any prediction model 

4. Wrapper methods  
high accuracy, but high computational cost  
* methods to choose from: hill-climbing, particle swarm optimization, whale optimization 