### Import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from xgboost import XGBClassifier

# import own modules
# NOTE: the relative path only resolves when the notebook is run from a
# directory whose parent contains the `scripts` package.
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# plotting style, loaded from a remote stylesheet (needs network access);
# the commented-out line below is the dark variant, the active one the light variant
#plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')

import pickle

---
## Preparations

### Load the CSV into a Dataframe

- load csv (or calculate again if not in data folder)
- update index=id
- drop useless columns
- find numerical & object columns

In [None]:
# location of the cached feature table
path_df = os.path.join("..", "data", "df_deepgaze2e.csv")

# set to True to force a fresh feature calculation even if the CSV exists
recalculate_df = False

# rebuild the features (and refresh the cached CSV) when requested or when no
# cached file is available; otherwise read the cached file
if recalculate_df or not os.path.isfile(path_df):
    df = ft.get_features()
    df.to_csv(path_df, index=False)
else:
    df = pd.read_csv(path_df)

# use the id column as index and remove the first batch of unneeded columns
df = df.set_index("id", drop=True).drop(columns=["img", "sp_idx"])

# drop every remaining column whose name contains "_obj"
df = df.drop(columns=[name for name in df.columns if "_obj" in name])

# split the column labels by dtype: object columns count as categorical,
# everything else as numerical
num_cols = df.columns[~(df.dtypes == "object")]
cat_cols = df.columns[df.dtypes == "object"]

# print info
print(
    f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns",
    f" -> there are {len(num_cols)} numerical columns",
    f" -> there are {len(cat_cols)} categoricals columns",
    sep="\n",
)

### First, drop instances & features

- unusually long fixation durations are identified as outliers and dropped
  - drop if "dur > 5000 ms"
- drop `sal_first_above_0.75*max_rank` & `sal_first_above_0.9*max_rank` since these features consist mainly of outlier values (33% / 61%, respectively)
  - outliers are coded as "20" 


In [None]:
# keep only instances whose total fixation duration is at most 5000 ms
df = df.loc[df["sp_fix_duration_ms_total"].le(5000)]

# remove the two saliency-rank features
df = df.drop(
    ["sal_first_above_0.75*max_rank", "sal_first_above_0.9*max_rank"],
    axis="columns",
)

# refresh the dtype-based column lists after the drop
num_cols = df.columns[~(df.dtypes == "object")]
cat_cols = df.columns[df.dtypes == "object"]

# print info
print(
    f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns",
    f" -> there are {len(num_cols)} numerical columns",
    f" -> there are {len(cat_cols)} categoricals columns",
    sep="\n",
)

### Checking for highly correlated columns
After running these lines, decide which columns should additionally be dropped.

In [None]:
# run the project's correlation check on the numerical columns only
# NOTE(review): how `thresh` is applied is defined in scripts/preprocessing.py
pp.check_correlations(df.loc[:, num_cols], thresh=0.8)

### further processing of correlating features
- `sp_fix_duration_ms_total` & `sp_fix_duration_ms_mean`
  - keep the `saliency weighted` version of both fixation duration measures
- `obj_n_fix_*`
  - keep the time measures of object recognition features

In [None]:
# drop the unweighted fixation durations and the per-category fixation counts
df = df.drop(
    [
        "sp_fix_duration_ms_total",
        "sp_fix_duration_ms_mean",
        "obj_n_fix_face",
        "obj_n_fix_animate",
        "obj_n_fix_inanimate",
        "obj_n_fix_background",
    ],
    axis="columns",
)

# refresh the dtype-based column lists after the drop
num_cols = df.columns[~(df.dtypes == "object")]
cat_cols = df.columns[df.dtypes == "object"]

# check for correlations, again
pp.check_correlations(df.loc[:, num_cols], thresh=0.8)

# print info
print(
    f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns",
    f" -> there are {len(num_cols)} numerical columns",
    f" -> there are {len(cat_cols)} categoricals columns",
    sep="\n",
)

### Split into train & test sets

As soon as the dataset is in its final form, perform the train-test split with our own split function so that our 30-image set is always used as the test set.

In [None]:
# separate the target from the features
# NOTE: pop() removes "asd" from df itself and X refers to the same object as
# df, so re-running this cell without rebuilding df raises a KeyError.
y = df.pop("asd")
X = df

# numerical columns of the feature matrix (target no longer included)
num_cols = X.columns[~(X.dtypes == "object")]

# train-test-split with the project's own split function
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(
    f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features",
    f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'",
    f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset",
    sep="\n",
)

### Set variables

- define `metric`
- behavior for saving models as pickles
- defaults for model-objects

In [None]:
# metric: F-beta score with beta=2, wrapped as a scikit-learn scorer object
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# defaults for the model objects
# NOTE(review): presumably forwarded as random_state / cv / n_jobs / verbose
# further down in the notebook - they are not used in the cells shown here
RSEED, cv, n_jobs, verbose = 42, 10, -1, 1

---
## Modelling

### Example

just to demonstrate what functions are available in the `evaluate_models.py` file...

In [None]:
# decision tree
# Depth-limited tree that only serves to demonstrate the helper functions below.
# random_state is pinned to the notebook-wide seed (RSEED, set in the
# "Set variables" cell) so that repeated fits produce the same tree;
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs.
_dt = DecisionTreeClassifier(max_depth=10, random_state=RSEED)


In [None]:
# set file & folder name (both are handed to em.fit_or_load below)
folder_name = "test_DT"
model_name = "test_DT_v3.pickle"

# fit or load
# NOTE(review): judging by its name, em.fit_or_load presumably loads the pickle
# if it already exists and fits the model otherwise - confirm in
# scripts/evaluate_models.py. The returned estimator replaces `_dt`.
_dt = em.fit_or_load(
    _dt, X_train, y_train, model_name, folder=folder_name
)

In [None]:
# test set: class predictions and predicted class probabilities
_pred_test, _proba_test = _dt.predict(X_test), _dt.predict_proba(X_test)

# train set: the same two outputs
_pred_train, _proba_train = _dt.predict(X_train), _dt.predict_proba(X_train)

#### Report classifications

In [None]:
# evaluate model
# hands true labels, predicted labels and predicted probabilities for both the
# train and the test split to the project's report helper
# NOTE(review): what exactly is reported is defined in scripts/evaluate_models.py
em.report(
    y_train=y_train,
    y_train_pred=_pred_train,
    y_train_proba=_proba_train,
    y_test=y_test,
    y_test_pred=_pred_test,
    y_test_proba=_proba_test,
)

#### Learning curves

In [None]:
# learning curves - for one model - default score = f2
em.learning(_dt, X_train, y_train)

# learning curves - for one model - accuracy score
# (score_name is presumably the label used for the score in the output -
#  confirm in scripts/evaluate_models.py)
em.learning(_dt, X_train, y_train, score='accuracy', score_name="Accuracy")

# learning curves - for a list of models
# (the same tree is passed three times purely for demonstration; cv is set
#  explicitly here, the calls above rely on the helper's default)
em.learning([_dt, _dt, _dt], X_train, y_train, cv=5)

#### Feature Importances

In [None]:
# feature importances
# receives the fitted model together with both the train and the test split
# NOTE(review): how the importances are computed is defined in scripts/evaluate_models.py
em.feat_importance(_dt, X_train, y_train, X_test, y_test)

#### Some model infos

In [None]:
# print some model infos
# TODO: decide which further details em.model_info should report
em.model_info(_dt)

#### Saving the model

In [None]:
# save model
# (no folder given - the target location is whatever em.save_model uses by default)
model_name = "DT_test.pickle"
em.save_model(_dt, model_name)

# save into specific folder
model_folder = "testing"
em.save_model(_dt, model_name, folder=model_folder)

# save again -> suffix will be added
# (same name and folder as the call above, so the file already exists at this point)
em.save_model(_dt, model_name, folder=model_folder)

# save again & OVERWRITE
# (overwrite=True replaces the existing file instead of adding a suffix)
em.save_model(_dt, model_name, folder=model_folder, overwrite=True)

#### Error

In [None]:
# inspect the errors of the example model on the test set; takes the true
# labels, the predicted labels and the predicted probabilities
# NOTE(review): the output of em.error_images is defined in scripts/evaluate_models.py
em.error_images(y_test, _pred_test, proba_test=_proba_test)

In [None]:
# compare the test-set errors of several fitted models; the mapping goes from
# a name to a model (here the same tree twice, purely for demonstration)
inp = {label: _dt for label in ("test_name_1", "test_name_2")}
em.error_compare_models(inp, X_test, y_test)
