# Pipeline Notebook for Base Models

### Contains Pipelines for Random Forest, SVC and XGBoost Models


### Import the Libraries

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from xgboost import XGBClassifier

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
# project-local helper modules; presumably found via the path entry added above — verify
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# alternative dark variant of the same style sheet, kept for quick switching
# plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
# NOTE(review): the style sheet is referenced by URL, so this line presumably needs network access when run — confirm
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')

# NOTE(review): imported after the other imports; consider moving it up to the standard-library imports
import pickle

### Import the Dataframe with Custom Functions

In [2]:
# cached feature table on disk
path_df = os.path.join("..", "data", "df_deep_sam.csv")

# set `recalculate_df = True` to rebuild the features even when the cache exists
recalculate_df = False
if recalculate_df or not os.path.isfile(path_df):
    # no usable cache: compute the features and write them to the cache file
    df = ft.get_features()
    df.to_csv(path_df, index=False)
else:
    # cache hit: read the stored table
    df = pd.read_csv(path_df)

# use "id" as the index and discard the first batch of unused columns
df = df.set_index("id", drop=True).drop(columns=["img", "sp_idx"])

# discard every column whose name contains "_obj"
df = df.drop(columns=df.filter(like="_obj").columns)

# split the remaining column names by dtype: object vs. everything else
num_cols = df.columns[df.dtypes.ne("object")]
cat_cols = df.columns[df.dtypes.eq("object")]

# summary of the resulting table
print(
    f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns",
    f" -> there are {len(num_cols)} numerical columns",
    f" -> there are {len(cat_cols)} categoricals columns",
    sep="\n",
)

 -> dataframe has 7598 instances and 45 columns
 -> there are 45 numerical columns
 -> there are 0 categoricals columns


### Examine the Columns We Need and Produce Feature Lists for Each Model

In [3]:
# preview the first rows of the loaded feature table
df.head()

Unnamed: 0_level_0,asd,sp_fix_count,sp_fix_duration_ms_total,sp_fix_duration_ms_mean,sp_fix_duration_ms_var,sp_len_px_total,sp_saccade_amplitude_px_mean,sp_saccade_amplitude_px_var,sp_distance_to_centre_px_mean,sp_distance_to_centre_px_var,...,sam_sal_first_fixation,sam_sal_first_above_0.75*max_rank,sam_sal_first_above_0.9*max_rank,sam_sal_mean,sam_sal_sum,sam_sal_max,sam_sal_weighted_duration_sum,sam_sal_weighted_duration_mean,sam_sal_KLD,sam_sal_NSS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
asd_001_00,1,11,748,68.0,5386.727273,2016.66291,201.666291,41837.807088,223.323476,5350.784055,...,119.0,9,20,83.545455,919.0,218.0,48441.0,4403.727273,4.54992,4.081538
asd_001_01,1,14,1272,90.857143,15406.408163,2036.608219,156.662171,23915.733507,206.22922,12117.973303,...,6.0,20,20,39.571429,554.0,157.0,82387.0,5884.785714,1.496699,1.795978
asd_001_02,1,10,4605,460.5,50657.45,2512.203305,279.133701,21119.599452,247.888536,3495.723783,...,3.0,20,20,39.8,398.0,189.0,220431.0,22043.1,10.209363,1.807858
asd_001_03,1,16,4288,268.0,24633.625,3594.951757,239.66345,33871.363837,281.053706,6743.001659,...,4.0,12,12,48.5625,777.0,232.0,299288.0,18705.5,3.742902,2.263291
asd_001_04,1,16,3724,232.75,12623.5625,3184.717712,212.314514,22004.347075,250.037923,10092.036872,...,0.0,20,20,15.75,252.0,105.0,77999.0,4874.9375,12.593081,0.557854


### Defining the Lists for the Features each Model uses

### Note: the target column `asd` is deliberately included in each feature list. Declare `X` and `y` exactly as shown below, so that the target is split off from the features.

In [4]:
# list all column names available for building the per-model feature lists
df.columns

Index(['asd', 'sp_fix_count', 'sp_fix_duration_ms_total',
       'sp_fix_duration_ms_mean', 'sp_fix_duration_ms_var', 'sp_len_px_total',
       'sp_saccade_amplitude_px_mean', 'sp_saccade_amplitude_px_var',
       'sp_distance_to_centre_px_mean', 'sp_distance_to_centre_px_var',
       'sp_distance_to_sp_mean_px_mean', 'sp_distance_to_sp_mean_px_var',
       'dg_sal_first_fixation', 'dg_sal_first_above_0.75*max_rank',
       'dg_sal_first_above_0.9*max_rank', 'dg_sal_mean', 'dg_sal_sum',
       'dg_sal_max', 'dg_sal_weighted_duration_sum',
       'dg_sal_weighted_duration_mean', 'dg_sal_KLD', 'dg_sal_NSS',
       'obj_n_fix_face', 'obj_t_abs_on_face', 'obj_t_rel_on_face',
       'obj_n_fix_animate', 'obj_n_fix_inanimate', 'obj_n_fix_background',
       'obj_t_abs_on_animate', 'obj_t_abs_on_inanimate',
       'obj_t_abs_on_background', 'obj_t_rel_on_animate',
       'obj_t_rel_on_inanimate', 'obj_t_rel_on_background', 'Unnamed: 0',
       'sam_sal_first_fixation', 'sam_sal_first_above_0.

In [5]:
# Best SVC model: 11 predictors, followed by the target column "asd" as last entry
svc_feature_list = [
    "sp_fix_duration_ms_total",
    "sp_fix_duration_ms_mean",
    "sp_fix_duration_ms_var",
    "sam_sal_first_fixation",
    "sam_sal_sum",
    "sam_sal_KLD",
    "obj_t_abs_on_background",
    "obj_t_abs_on_animate",
    "obj_n_fix_background",
    "obj_n_fix_inanimate",
    "obj_n_fix_animate",
    "asd",
]

# Best XGB model: 24 predictors, followed by the target column "asd" as last entry
xgb_feature_list = [
    "sp_fix_count",
    "sp_fix_duration_ms_var",
    "sp_len_px_total",
    "sp_saccade_amplitude_px_mean",
    "sp_saccade_amplitude_px_var",
    "sp_distance_to_centre_px_mean",
    "sp_distance_to_centre_px_var",
    "sp_distance_to_sp_mean_px_mean",
    "sp_distance_to_sp_mean_px_var",
    "dg_sal_first_fixation",
    "dg_sal_sum",
    "dg_sal_max",
    "dg_sal_weighted_duration_sum",
    "dg_sal_weighted_duration_mean",
    "dg_sal_KLD",
    "dg_sal_NSS",
    "obj_t_abs_on_face",
    "obj_t_rel_on_face",
    "obj_t_abs_on_animate",
    "obj_t_abs_on_inanimate",
    "obj_t_abs_on_background",
    "obj_t_rel_on_animate",
    "obj_t_rel_on_inanimate",
    "obj_t_rel_on_background",
    "asd",
]

### Function for Selecting Columns and Returning X and y

In [None]:
# Prepare the X and y for each model based on its features

def xy_feature_selector(df, features_to_keep, target="asd"):
    """Split a dataframe into a feature matrix ``X`` and a target vector ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It must contain ``target`` and every requested feature
        column. It is not modified.
    features_to_keep : list of str
        Names of the columns to select. The list may contain ``target``
        (as the notebook's feature lists do); it is excluded from ``X``
        either way.
    target : str, default "asd"
        Name of the target column.

    Returns
    -------
    X : pandas.DataFrame
        Copy of the selected feature columns, in the order given, without
        ``target``.
    y : pandas.Series
        Copy of the ``target`` column.

    Raises
    ------
    KeyError
        If ``target`` or any requested feature is not a column of ``df``.
    """
    # Drop the target from the requested names instead of popping it from the
    # selection afterwards; this also works when the caller's list omits it.
    feature_cols = [col for col in features_to_keep if col != target]

    # Explicit copies, so later changes to X or y can never touch `df`.
    X = df[feature_cols].copy()
    y = df[target].copy()

    return X, y

### Example Usage of the Function

In [None]:
# --- SVC model ---
X, y = xy_feature_selector(df, svc_feature_list)
# from here you can continue with: X_train, X_test, y_train, y_test = pp.split(X, y)

# --- XGB model ---
# note: this reassigns X and y, so after this cell they hold the XGB selection
X, y = xy_feature_selector(df, xgb_feature_list)

# --- Random Forest model: TBD ---

### Declare X and y for Different Models
- SVC
- XGBoost
- RF

### For SVC Model


In [None]:
# prepare features and target through the shared helper, so the handling of
# the "asd" target column lives in one place instead of being repeated here
X, y = xy_feature_selector(df, svc_feature_list)

# train-test-split (project helper from scripts.preprocessing)
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info: set sizes and the share of the full dataset held out for testing
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

### For XGBoost Model

In [None]:
# prepare features and target through the shared helper, so the handling of
# the "asd" target column lives in one place instead of being repeated here
X, y = xy_feature_selector(df, xgb_feature_list)

# train-test-split (project helper from scripts.preprocessing)
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info: set sizes and the share of the full dataset held out for testing
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")