### Import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# apply the light "pitayasmoothie" matplotlib stylesheet, loaded from GitHub
# (dark variant: same URL with pitayasmoothie-dark.mplstyle)
stylesheet_url = (
    "https://github.com/dhaitz/matplotlib-stylesheets/raw/master/"
    "pitayasmoothie-light.mplstyle"
)
plt.style.use(stylesheet_url)


---
## Preparations

### Load the CSV into a Dataframe

- load csv (or calculate again if not in data folder)
- update index=id
- drop useless columns
- find numerical & object columns

In [None]:
# location of the cached feature table
path_df = os.path.join("..", "data", "df_deepgaze2e.csv")

# read the cached CSV when it exists; rebuild (and re-cache) the features
# when the file is missing or a recalculation is forced
recalculate_df = False
if recalculate_df or not os.path.isfile(path_df):
    df = ft.get_features()
    df.to_csv(path_df, index=False)
else:
    df = pd.read_csv(path_df)

# use the "id" column as index
df = df.set_index("id", drop=True)

# drop first batch of unused variables: "img", "sp_idx" and afterwards
# every column whose name contains "_obj"
df = df.drop(columns=["img", "sp_idx"])
obj_named_cols = [name for name in df.columns if "_obj" in name]
df = df.drop(columns=obj_named_cols)

# split the column names by dtype: object dtype -> categorical, rest -> numerical
is_object_dtype = df.dtypes == "object"
num_cols = df.columns[~is_object_dtype]
cat_cols = df.columns[is_object_dtype]

# print info
print(f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

### First, drop instances & features

- unusually long fixation durations are identified as outliers and dropped
  - drop if "dur > 5000 ms"


In [None]:
# processing: keep only instances with a total fixation duration of at most 5000 ms
keep_rows = df["sp_fix_duration_ms_total"].le(5000)
df = df.loc[keep_rows]

# refresh the dtype-based column lists on the filtered frame
is_object_dtype = df.dtypes == "object"
num_cols = df.columns[~is_object_dtype]
cat_cols = df.columns[is_object_dtype]

# print info
print(f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

### Split into train & test sets

As soon as the dataset is in its final form, perform the train-test split with our own split function, so that our 30-image set is always used as the test set.

In [None]:
# prepare features and target
# Build X and y as new objects instead of aliasing `df` and popping the target
# out of it: `df` stays intact, so this cell can be re-run without a KeyError.
y = df["asd"]
X = df.drop(columns="asd")

# train-test-split (project helper from scripts.preprocessing)
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

### Set variables

- define `metric`
- behavior for saving models as pickles
- defaults for model-objects

In [None]:
# metric: F-beta score with beta=2, wrapped as a scorer object for model selection
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# models: folder name handed to em.fit_or_load(..., folder=...) in the cells below
folder_name = "RF_RFE_CV"

# defaults
RSEED = 42  # random_state of every RandomForestClassifier in this notebook
cv = 10  # NOTE(review): not referenced in the visible cells; the grid searches pass cv=5 literally
n_jobs = -1  # handed to the forests and grid searches as their n_jobs argument
verbose = 0  # NOTE(review): not referenced in the visible cells; verbose is always passed as a literal

###  Pipelines for each Model

In [None]:
# Random Forest: no scaling / no encoding, so the pipeline consists of the
# single "classifier" step (the name the "classifier__..." grid keys refer to)
rf_classifier_step = RandomForestClassifier(
    verbose=0,
    n_jobs=n_jobs,
    random_state=RSEED,
)
rf_pipeline = Pipeline(steps=[("classifier", rf_classifier_step)])

---
## Modeling

### Random Forest

defaults: _no restrictions whatsoever_

In [None]:
# Random forest with default hyper-parameters: only the seed, the parallelism
# and the verbosity are set here (no grid search involved)
rf_defaults = RandomForestClassifier(
    random_state=RSEED,
    n_jobs=n_jobs,
    verbose=0,
)

In [None]:
# set file & folder name (the folder comes from `folder_name`)
model_name = "RF_defaults.pickle"

# fit or load: the project helper receives the estimator, the full training
# data and the pickle name/folder, and returns the model used from here on
# (presumably it loads an existing pickle instead of refitting — confirm in
# scripts/evaluate_models)
rf_defaults = em.fit_or_load(
    rf_defaults, X_train, y_train, model_name, folder=folder_name
)

In [None]:
# predict & proba: class predictions and class probabilities on both splits
pred_train = rf_defaults.predict(X_train)
proba_train = rf_defaults.predict_proba(X_train)

pred_test = rf_defaults.predict(X_test)
proba_test = rf_defaults.predict_proba(X_test)

# evaluate model: true labels, predictions and probabilities of both splits
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted default forest via the project helper
# (rf_defaults is a plain estimator, not a grid search)
em.model_info(rf_defaults)

# # feature importances (disabled)
# em.feat_importance(rf_defaults, X_train, y_train, X_test, y_test, n_reps=100)

# # learning curves - for one model - f2 score (disabled)
# em.learning(rf_defaults, X_train, y_train)

----
----

### feature selection

#### RFE

In [None]:
# base estimator used inside the recursive feature elimination
estimator = RandomForestClassifier(verbose=0, n_jobs=n_jobs, random_state=RSEED)

# fit RFE on the training split only (no n_features_to_select given, so the
# RFE default applies) and read back the names of the selected columns
selector_RFE = RFE(estimator, verbose=0).fit(X_train, y_train)
feat_RFE = selector_RFE.get_feature_names_out()
print(f"selected {selector_RFE.n_features_} features: {feat_RFE}")

In [None]:
# Random forest with default hyper-parameters, trained on the RFE-selected
# features only (no grid search involved)
rf_RFE_defaults = RandomForestClassifier(
    random_state=RSEED,
    n_jobs=n_jobs,  # notebook-wide setting, consistent with the other forests
    verbose=0,
)

# set file & folder name
model_name = "RF_RFE_defaults.pickle"

# fit or load (project helper; receives only the RFE-selected training columns)
rf_RFE_defaults = em.fit_or_load(
    rf_RFE_defaults, X_train[feat_RFE], y_train, model_name, folder=folder_name
)

In [None]:
# restrict both splits to the RFE-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFE], X_test[feat_RFE]

# predict & proba
pred_train = rf_RFE_defaults.predict(X_train_sel)
proba_train = rf_RFE_defaults.predict_proba(X_train_sel)

pred_test = rf_RFE_defaults.predict(X_test_sel)
proba_test = rf_RFE_defaults.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the default forest fitted on the RFE-selected features via the
# project helper (rf_RFE_defaults is a plain estimator, not a grid search)
em.model_info(rf_RFE_defaults)

-----

#### grid I 

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step)
param_grid_rf = dict(
    classifier__max_depth=[7, 9, 11],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[20, 30, 40],
    classifier__min_samples_split=[40, 50, 60],
    classifier__n_estimators=[100, 200, 300],
)

# set file & folder name
model_name = "RF_RFE_grid_v1.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFE-selected training columns
gs_rf_RFE_v1 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFE],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFE-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFE], X_test[feat_RFE]

# predict & proba
pred_train = gs_rf_RFE_v1.predict(X_train_sel)
proba_train = gs_rf_RFE_v1.predict_proba(X_train_sel)

pred_test = gs_rf_RFE_v1.predict(X_test_sel)
proba_test = gs_rf_RFE_v1.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFE_v1)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFE_v1, X_train[feat_RFE], y_train, X_test[feat_RFE], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFE_v1.best_estimator_, X_train[feat_RFE], y_train)

-----

#### grid II 

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step)
param_grid_rf = dict(
    classifier__max_depth=[7, 8],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[25, 30, 35],
    classifier__min_samples_split=[55, 60, 65],
    classifier__n_estimators=[50, 100, 150, 200],
)

# set file & folder name
model_name = "RF_RFE_grid_v2.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFE-selected training columns
gs_rf_RFE_v2 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFE],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFE-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFE], X_test[feat_RFE]

# predict & proba
pred_train = gs_rf_RFE_v2.predict(X_train_sel)
proba_train = gs_rf_RFE_v2.predict_proba(X_train_sel)

pred_test = gs_rf_RFE_v2.predict(X_test_sel)
proba_test = gs_rf_RFE_v2.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFE_v2)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFE_v2, X_train[feat_RFE], y_train, X_test[feat_RFE], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFE_v2.best_estimator_, X_train[feat_RFE], y_train)

-----

#### grid III

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step)
param_grid_rf = dict(
    classifier__max_depth=[7],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[30, 35],
    classifier__min_samples_split=[50, 55, 60],
    classifier__n_estimators=[100, 150, 200],
)

# set file & folder name
model_name = "RF_RFE_grid_v3.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFE-selected training columns
gs_rf_RFE_v3 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFE],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFE-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFE], X_test[feat_RFE]

# predict & proba
pred_train = gs_rf_RFE_v3.predict(X_train_sel)
proba_train = gs_rf_RFE_v3.predict_proba(X_train_sel)

pred_test = gs_rf_RFE_v3.predict(X_test_sel)
proba_test = gs_rf_RFE_v3.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFE_v3)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFE_v3, X_train[feat_RFE], y_train, X_test[feat_RFE], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFE_v3.best_estimator_, X_train[feat_RFE], y_train)

-----

#### grid IV

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step);
# depth is fixed to 1 and no min_samples_* limits are searched here
param_grid_rf = dict(
    classifier__max_depth=[1],
    classifier__max_features=["sqrt", "log2"],
    classifier__n_estimators=[50, 100, 150, 200, 250, 300],
)

# set file & folder name
model_name = "RF_RFE_grid_v4.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFE-selected training columns
gs_rf_RFE_v4 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFE],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFE-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFE], X_test[feat_RFE]

# predict & proba
pred_train = gs_rf_RFE_v4.predict(X_train_sel)
proba_train = gs_rf_RFE_v4.predict_proba(X_train_sel)

pred_test = gs_rf_RFE_v4.predict(X_test_sel)
proba_test = gs_rf_RFE_v4.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFE_v4)

# feature importances (active for this model), on the RFE-selected columns
em.feat_importance(gs_rf_RFE_v4, X_train[feat_RFE], y_train, X_test[feat_RFE], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFE_v4.best_estimator_, X_train[feat_RFE], y_train)

----
----

#### RFECV


In [None]:
# base estimator used inside the cross-validated feature elimination
estimator = RandomForestClassifier(verbose=0, n_jobs=n_jobs, random_state=RSEED)

# fit RFECV on the training split only, scored with the F2 scorer, and read
# back the names of the selected columns
selector_RFECV = RFECV(estimator, scoring=ftwo_scorer).fit(X_train, y_train)
feat_RFECV = selector_RFECV.get_feature_names_out()
print(f"selected {selector_RFECV.n_features_} features: {feat_RFECV}")

In [None]:
# set file & folder name
model_name = "RF_RFECV_defaults.pickle"

# random forest with default hyper-parameters (no grid search), fitted or
# loaded via the project helper on the RFECV-selected training columns
rf_RFECV_defaults = em.fit_or_load(
    RandomForestClassifier(verbose=0, n_jobs=n_jobs, random_state=RSEED),
    X_train[feat_RFECV],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFECV-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFECV], X_test[feat_RFECV]

# predict & proba
pred_train = rf_RFECV_defaults.predict(X_train_sel)
proba_train = rf_RFECV_defaults.predict_proba(X_train_sel)

pred_test = rf_RFECV_defaults.predict(X_test_sel)
proba_test = rf_RFECV_defaults.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

-----

#### grid I 

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step)
param_grid_rf = dict(
    classifier__max_depth=[7, 9, 11],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[20, 30, 40],
    classifier__min_samples_split=[40, 50, 60],
    classifier__n_estimators=[100, 200, 300],
)

# set file & folder name
model_name = "RF_RFECV_grid_v1.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFECV-selected training columns
gs_rf_RFECV_v1 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFECV],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFECV-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFECV], X_test[feat_RFECV]

# predict & proba
pred_train = gs_rf_RFECV_v1.predict(X_train_sel)
proba_train = gs_rf_RFECV_v1.predict_proba(X_train_sel)

pred_test = gs_rf_RFECV_v1.predict(X_test_sel)
proba_test = gs_rf_RFECV_v1.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFECV_v1)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFECV_v1, X_train[feat_RFECV], y_train, X_test[feat_RFECV], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFECV_v1.best_estimator_, X_train[feat_RFECV], y_train)

-----

#### grid II

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step)
param_grid_rf = dict(
    classifier__max_depth=[7, 9],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[30, 40],
    classifier__min_samples_split=[50, 60, 70],
    classifier__n_estimators=[50, 100, 200, 300],
)

# set file & folder name
model_name = "RF_RFECV_grid_v2.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFECV-selected training columns
gs_rf_RFECV_v2 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFECV],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFECV-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFECV], X_test[feat_RFECV]

# predict & proba
pred_train = gs_rf_RFECV_v2.predict(X_train_sel)
proba_train = gs_rf_RFECV_v2.predict_proba(X_train_sel)

pred_test = gs_rf_RFECV_v2.predict(X_test_sel)
proba_test = gs_rf_RFECV_v2.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFECV_v2)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFECV_v2, X_train[feat_RFECV], y_train, X_test[feat_RFECV], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFECV_v2.best_estimator_, X_train[feat_RFECV], y_train)

-----

#### grid III

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step);
# only max_features and n_estimators vary in this round
param_grid_rf = dict(
    classifier__max_depth=[7],
    classifier__max_features=["sqrt", "log2"],
    classifier__min_samples_leaf=[40],
    classifier__min_samples_split=[50],
    classifier__n_estimators=[50, 100, 150, 200],
)

# set file & folder name
model_name = "RF_RFECV_grid_v3.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFECV-selected training columns
gs_rf_RFECV_v3 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFECV],
    y_train,
    model_name,
    folder=folder_name,
)

In [None]:
# restrict both splits to the RFECV-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFECV], X_test[feat_RFECV]

# predict & proba
pred_train = gs_rf_RFECV_v3.predict(X_train_sel)
proba_train = gs_rf_RFECV_v3.predict_proba(X_train_sel)

pred_test = gs_rf_RFECV_v3.predict(X_test_sel)
proba_test = gs_rf_RFECV_v3.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFECV_v3)

# # feature importances (disabled)
# em.feat_importance(gs_rf_RFECV_v3, X_train[feat_RFECV], y_train, X_test[feat_RFECV], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFECV_v3.best_estimator_, X_train[feat_RFECV], y_train)

-----

#### grid IV

In [None]:
# grid (keys address parameters of the pipeline's "classifier" step);
# depth is fixed to 2 and no min_samples_* limits are searched here
param_grid_rf = dict(
    classifier__max_depth=[2],
    classifier__max_features=["sqrt", "log2"],
    classifier__n_estimators=[50, 100, 150, 200, 250, 300],
)

# set file & folder name
model_name = "RF_RFECV_grid_v4.pickle"

# create the GridSearchCV object (5-fold CV, F2 scorer), then fit or load it
# via the project helper on the RFECV-selected training columns
# NOTE(review): this is the only fit_or_load call that passes overwrite=True;
# presumably it forces a refit on every run — confirm whether that is intended
gs_rf_RFECV_v4 = em.fit_or_load(
    GridSearchCV(
        estimator=rf_pipeline,
        param_grid=param_grid_rf,
        scoring=ftwo_scorer,
        cv=5,
        n_jobs=n_jobs,
        verbose=1,
    ),
    X_train[feat_RFECV],
    y_train,
    model_name,
    folder=folder_name,
    overwrite=True,
)

In [None]:
# restrict both splits to the RFECV-selected columns once
X_train_sel, X_test_sel = X_train[feat_RFECV], X_test[feat_RFECV]

# predict & proba
pred_train = gs_rf_RFECV_v4.predict(X_train_sel)
proba_train = gs_rf_RFECV_v4.predict_proba(X_train_sel)

pred_test = gs_rf_RFECV_v4.predict(X_test_sel)
proba_test = gs_rf_RFECV_v4.predict_proba(X_test_sel)

# evaluate model
report_inputs = {
    "y_train": y_train,
    "y_train_pred": pred_train,
    "y_train_proba": proba_train,
    "y_test": y_test,
    "y_test_pred": pred_test,
    "y_test_proba": proba_test,
}
em.report(**report_inputs)

In [None]:
# Summarise the fitted grid search via the project helper
# (presumably best parameters / best estimator — see scripts/evaluate_models)
em.model_info(gs_rf_RFECV_v4)

# feature importances (active for this model), on the RFECV-selected columns
em.feat_importance(gs_rf_RFECV_v4, X_train[feat_RFECV], y_train, X_test[feat_RFECV], y_test)

# # learning curves - for one model - f2 score (disabled)
# em.learning(gs_rf_RFECV_v4.best_estimator_, X_train[feat_RFECV], y_train)