# Pipeline Notebook for Base Models

### Contains Pipelines for the Random Forest, SVC and XGBoost Models


### Import the Libraries

In [14]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from xgboost import XGBClassifier

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')

import pickle

### Import the Dataframe with Custom Functions

In [2]:
# Location of the cached feature table and the switch that forces a rebuild.
path_df = os.path.join("..", "data", "df_deep_sam.csv")
recalculate_df = False

# Rebuild the features (and refresh the cache) when the csv is missing or a
# recalculation was requested; otherwise read the cached csv.
if not os.path.isfile(path_df) or recalculate_df:
    df = ft.get_features()
    df.to_csv(path_df, index=False)
else:
    df = pd.read_csv(path_df)

# Use the "id" column as index and discard the first batch of unused columns.
df = df.set_index("id", drop=True).drop(columns=['img', 'sp_idx'])

# Discard every column whose name contains "_obj" ('object' columns).
obj_columns = [name for name in df.columns if "_obj" in name]
df = df.drop(columns=obj_columns)

# Split the column names by dtype: "object" dtype counts as categorical,
# everything else as numerical.
is_object_dtype = df.dtypes == "object"
num_cols = df.columns[~is_object_dtype]
cat_cols = df.columns[is_object_dtype]

# Short summary of what was loaded.
n_rows, n_columns = df.shape
print(f" -> dataframe has {n_rows} instances and {n_columns} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

 -> dataframe has 7598 instances and 45 columns
 -> there are 45 numerical columns
 -> there are 0 categoricals columns


### Examine the Columns We Need and Build a Feature List for Each Model

### Defining the Lists for the Features each Model uses


In [1]:
# Column names used by the best SVC model (11 features).
svc_feature_list = [
    # fixation durations
    "sp_fix_duration_ms_total",
    "sp_fix_duration_ms_mean",
    "sp_fix_duration_ms_var",
    # "sam_sal_*" columns
    "sam_sal_first_fixation",
    "sam_sal_sum",
    "sam_sal_KLD",
    # "obj_*" columns
    "obj_t_abs_on_background",
    "obj_t_abs_on_animate",
    "obj_n_fix_background",
    "obj_n_fix_inanimate",
    "obj_n_fix_animate",
]

# Column names used by the best XGBoost model (24 features).
xgb_feature_list = [
    # "sp_*" columns
    "sp_fix_count",
    "sp_fix_duration_ms_var",
    "sp_len_px_total",
    "sp_saccade_amplitude_px_mean",
    "sp_saccade_amplitude_px_var",
    "sp_distance_to_centre_px_mean",
    "sp_distance_to_centre_px_var",
    "sp_distance_to_sp_mean_px_mean",
    "sp_distance_to_sp_mean_px_var",
    # "dg_sal_*" columns
    "dg_sal_first_fixation",
    "dg_sal_sum",
    "dg_sal_max",
    "dg_sal_weighted_duration_sum",
    "dg_sal_weighted_duration_mean",
    "dg_sal_KLD",
    "dg_sal_NSS",
    # "obj_t_*" columns
    "obj_t_abs_on_face",
    "obj_t_rel_on_face",
    "obj_t_abs_on_animate",
    "obj_t_abs_on_inanimate",
    "obj_t_abs_on_background",
    "obj_t_rel_on_animate",
    "obj_t_rel_on_inanimate",
    "obj_t_rel_on_background",
]

# Column names used by the best Random Forest model (21 features).
rf_feature_list = [
    # "sp_*" columns
    "sp_fix_count",
    "sp_fix_duration_ms_total",
    "sp_fix_duration_ms_mean",
    "sp_fix_duration_ms_var",
    "sp_len_px_total",
    "sp_saccade_amplitude_px_mean",
    "sp_saccade_amplitude_px_var",
    "sp_distance_to_centre_px_mean",
    "sp_distance_to_centre_px_var",
    "sp_distance_to_sp_mean_px_mean",
    "sp_distance_to_sp_mean_px_var",
    # "dg_sal_*" columns
    "dg_sal_first_fixation",
    "dg_sal_mean",
    "dg_sal_sum",
    "dg_sal_max",
    "dg_sal_weighted_duration_sum",
    "dg_sal_weighted_duration_mean",
    "dg_sal_KLD",
    "dg_sal_NSS",
    # "obj_t_*" columns
    "obj_t_abs_on_animate",
    "obj_t_abs_on_background",
]

### Function for Selecting Columns and Returning X

In [3]:
# Prepare the X for each model based on its features

def feature_selector(df, features_to_keep=None):
    """Return the part of ``df`` selected by ``features_to_keep``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to select columns from.
    features_to_keep : list of str, optional
        Column labels to keep, passed straight to ``df[...]``. When omitted
        (``None``) ``df`` is returned unchanged. The default exists because
        ``FunctionTransformer(feature_selector)`` calls the function with the
        data as its only argument; without a default that call raises a
        ``TypeError``.

    Returns
    -------
    pandas.DataFrame
        ``df[features_to_keep]``, or ``df`` itself when no selection is given.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if features_to_keep is None:
        # Nothing to select: the caller already handed over the right columns.
        return df

    return df[features_to_keep]

# Transformers, Preprocessing and Pipelines for our 3 Base Models

### SVC Model

In [10]:
# Preprocessing for the SVC: keep only the SVC features and standardise them.
# ColumnTransformer runs its entries side by side on the *input* columns (it
# does not chain them), so "select, then scale" is written as one entry:
# StandardScaler applied to the columns in svc_feature_list. The previous
# second entry addressed a column literally named 'X'.
transformer_svc = [("scaler", StandardScaler(), svc_feature_list)]

# Wrap a ColumnTransformer around transformer_svc; every column that is not
# listed in svc_feature_list is dropped.
preprocessing_svc = ColumnTransformer(transformer_svc, remainder="drop")

# Pack preprocessing and classifier into one Pipeline.
# probability=True is needed so the pipeline exposes predict_proba.
svc_pipeline = Pipeline([
    ("preprocessor", preprocessing_svc),
    ("classifier", SVC(C=0.1, degree=4, kernel='poly', gamma='scale', probability=True))
])


### XGBoost Model

In [8]:
# Preprocessing for XGBoost: keep only the XGBoost features and rescale them
# with MinMaxScaler. ColumnTransformer runs its entries side by side on the
# *input* columns (it does not chain them), so "select, then scale" is written
# as one entry on xgb_feature_list. The previous second entry addressed a
# column literally named 'X'.
transformer_xgb = [("scaler", MinMaxScaler(), xgb_feature_list)]

# Wrap a ColumnTransformer around transformer_xgb; every column that is not
# listed in xgb_feature_list is dropped.
preprocessing_xgb = ColumnTransformer(transformer_xgb, remainder="drop")

# Pack preprocessing and classifier into one Pipeline.
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessing_xgb),
    ("classifier", XGBClassifier(learning_rate = 0.01, max_depth=5, n_estimators=100))
])

### Random Forest Pipeline

In [11]:
# Preprocessing for the Random Forest: keep only its features, no scaling.
# "passthrough" forwards the listed columns unchanged. The previous
# FunctionTransformer(feature_selector) entry called feature_selector with the
# data as its only argument although the function also expects the feature list.
transformer_rf = [("feature_selector", "passthrough", rf_feature_list)]

# Wrap a ColumnTransformer around transformer_rf; every column that is not
# listed in rf_feature_list is dropped.
preprocessing_rf = ColumnTransformer(transformer_rf, remainder="drop")

# Pack preprocessing and classifier into one Pipeline.
rf_pipeline = Pipeline([
    ("preprocessor", preprocessing_rf),
    ("classifier", RandomForestClassifier(max_depth=7, max_features="sqrt", min_samples_leaf=40, min_samples_split=50,n_estimators=50, verbose=0))
])

### Declare X and y for Different Models
- SVC
- XGBoost
- RF

### For SVC Model


In [None]:
# prepare features and target
X = df[svc_feature_list]
# Read the target without removing it from df: df.pop("asd") deleted the
# column, so re-running this cell (or running the XGBoost cell afterwards)
# failed with a KeyError.
y = df["asd"]
# NOTE(review): X holds the SVC features only, while the stacking classifier
# defined further down passes one X to all three base pipelines, each of which
# selects its own columns - confirm which X is meant for fitting the stack.

# train-test-split
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

### For XGBoost Model

In [None]:
# prepare features and target
X = df[xgb_feature_list]
# Read the target without removing it from df: df.pop("asd") deleted the
# column, so running this cell a second time failed with a KeyError.
y = df["asd"]
# NOTE(review): X holds the XGBoost features only, while the stacking
# classifier defined further down passes one X to all three base pipelines,
# each of which selects its own columns - confirm which X is meant for fitting
# the stack.

# train-test-split
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

# Stacking Pipeline
### 2 Versions of the Stacking Pipeline
- Version 1 runs everything from scratch and computes all steps (long runtime)
- Version 2 loads the pickle files of the base models and performs a grid search on the Logistic Regression model

### Version 1: full run! Only do this if you have lots of time!

- We have 2 pipelines here: the first one gives us the probabilities of our master model for each class

- The other pipeline is for predicting 0 or 1

I added the probability pipeline because it allows a better error analysis

In [None]:
# Hyperparameter grid for the LogisticRegression meta-learner (the
# `final_estimator` of the stacking classifier).
#
# It is a list of grids so that only valid solver/penalty pairs are searched:
# 'lbfgs' supports the 'l2' penalty only, whereas 'liblinear' and 'saga'
# support both 'l1' and 'l2'. A single flat grid also produced 'lbfgs' + 'l1'
# candidates, each of which fails when fitted.
#
# `l1_ratio` is left out on purpose: it is only used with
# penalty='elasticnet', which is not part of this search, so listing it merely
# multiplied the number of candidates by four without changing any model.
_shared_grid = {
    'final_estimator__C': np.logspace(-3, 3, 10),  # regularisation strength
    'final_estimator__class_weight': [None, 'balanced'],  # class weights
    'final_estimator__max_iter': [50, 100, 200],  # maximum number of iterations
    'final_estimator__tol': [1e-4, 1e-3],  # tolerance for the stopping criterion
    'final_estimator__fit_intercept': [True, False],  # whether to fit an intercept
}

param_grid = [
    {
        **_shared_grid,
        'final_estimator__solver': ['liblinear', 'saga'],
        'final_estimator__penalty': ['l1', 'l2'],
    },
    {
        **_shared_grid,
        'final_estimator__solver': ['lbfgs'],
        'final_estimator__penalty': ['l2'],
    },
]

In [12]:
# Meta-learner (master model): an untuned LogisticRegression with defaults.
master_model = LogisticRegression()

# Build the two stacking classifiers in one go. They share the base pipelines,
# the meta-learner, 5-fold cross-validation and passthrough=True; the only
# difference is which base-model method is stacked:
#   - stacking_pipeline_proba   -> stack_method='predict_proba'
#   - stacking_pipeline_predict -> stack_method='predict'
stacking_pipeline_proba, stacking_pipeline_predict = (
    StackingClassifier(
        estimators=[
            ('svc', svc_pipeline),
            ('xgb', xgb_pipeline),
            ('rf', rf_pipeline),
        ],
        final_estimator=master_model,
        cv=5,
        stack_method=method,
        passthrough=True,
    )
    for method in ('predict_proba', 'predict')
)

NameError: name 'StackingClassifier' is not defined

In [None]:
# Hyperparameter search for the master model: every candidate in param_grid is
# evaluated on the 'predict' stacking classifier with 5-fold cross-validation.
# No `scoring` is given, so the estimator's own score method is used.
# verbose=2 prints progress while fitting.
grid_search_pipeline = GridSearchCV(stacking_pipeline_predict, param_grid, cv=5, verbose=2)

### Version 2 - Easy Version with loaded Pickle Files!!!

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code contained in the file -
    # only load pickle files that you created yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the base models from pickle files.
# NOTE(review): the file names look like placeholders - point them at the real
# pickle files before running this cell.
svc_pipeline = _load_pickle('path_to_svc_pipeline.pkl')
xgb_pipeline = _load_pickle('path_to_xgb_pipeline.pkl')
rf_pipeline = _load_pickle('path_to_rf_pipeline.pkl')

# Meta-learner (master model): an untuned LogisticRegression with defaults.
master_model = LogisticRegression()

# Settings shared by both stacking classifiers.
# NOTE(review): with an integer `cv`, StackingClassifier clones and refits the
# base estimators when it is fitted, so the fitted state coming from the
# pickles is presumably not reused - confirm this is intended.
stacking_options = dict(final_estimator=master_model, cv=5, passthrough=True)

# Variant that stacks the base models' predict_proba output.
stacking_pipeline_proba = StackingClassifier(
    estimators=[('svc', svc_pipeline), ('xgb', xgb_pipeline), ('rf', rf_pipeline)],
    stack_method='predict_proba',
    **stacking_options,
)

# Variant that stacks the base models' predict output.
stacking_pipeline_predict = StackingClassifier(
    estimators=[('svc', svc_pipeline), ('xgb', xgb_pipeline), ('rf', rf_pipeline)],
    stack_method='predict',
    **stacking_options,
)

In [None]:
# Hyperparameter grid for the LogisticRegression meta-learner (the
# `final_estimator` of the stacking classifier).
#
# It is a list of grids so that only valid solver/penalty pairs are searched:
# 'lbfgs' supports the 'l2' penalty only, whereas 'liblinear' and 'saga'
# support both 'l1' and 'l2'. A single flat grid also produced 'lbfgs' + 'l1'
# candidates, each of which fails when fitted.
#
# `l1_ratio` is left out on purpose: it is only used with
# penalty='elasticnet', which is not part of this search, so listing it merely
# multiplied the number of candidates by four without changing any model.
_shared_grid = {
    'final_estimator__C': np.logspace(-3, 3, 10),  # regularisation strength
    'final_estimator__class_weight': [None, 'balanced'],  # class weights
    'final_estimator__max_iter': [50, 100, 200],  # maximum number of iterations
    'final_estimator__tol': [1e-4, 1e-3],  # tolerance for the stopping criterion
    'final_estimator__fit_intercept': [True, False],  # whether to fit an intercept
}

param_grid = [
    {
        **_shared_grid,
        'final_estimator__solver': ['liblinear', 'saga'],
        'final_estimator__penalty': ['l1', 'l2'],
    },
    {
        **_shared_grid,
        'final_estimator__solver': ['lbfgs'],
        'final_estimator__penalty': ['l2'],
    },
]

In [None]:
# Hyperparameter search for the master model: every candidate in param_grid is
# evaluated on the 'predict' stacking classifier with 5-fold cross-validation.
# No `scoring` is given, so the estimator's own score method is used.
# verbose=2 prints progress while fitting.
grid_search_pipeline = GridSearchCV(stacking_pipeline_predict, param_grid, cv=5, verbose=2)