# **Best Model Search - *TEMPLATE***

##### Imports + Specific Data Preparation 

In [None]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# For ML 
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif, mutual_info_classif
from model_utils import * 
# Pandas display settings: remove the cap on both displayed columns and rows
# (a value of None means "no limit" for these two options).
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

Estimators and Hyper-parameters

In [None]:
# Candidate classifiers and their hyper-parameter search spaces.
# Layout: display name -> [unfitted estimator, parameter grid]. The dict is handed
# to the model_utils search helpers through their `estimators=` argument; how the
# helpers consume each grid is not visible here (presumably GridSearchCV — confirm).
n_neighbors = list(range(1, 200, 3))  # k = 1, 4, 7, ..., 199
estimators_full = {
    'NAIVE BAYES': [GaussianNB(), {'var_smoothing': np.logspace(0, -12, num=120)}],
    'KNN': [KNeighborsClassifier(), {'n_neighbors': n_neighbors}],
    # 'sag' and 'saga' are stochastic solvers, so the seed is pinned (same value
    # as the tree-based grids) to make repeated searches reproducible.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
    # (1.5+) — confirm against the installed version before relying on it.
    'LOGISTIC REGRESSION': [LogisticRegression(), {
        'solver': ['saga', 'lbfgs', 'sag'],
        'C': [0.01, 0.1, 1, 10],
        'multi_class': ['multinomial'],
        'max_iter': [5000],
        'random_state': [14],
    }],
    'DECISION TREE': [DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': list(range(1, 12)),
        'random_state': [14],
    }],
    'RANDOM FORESTS': [RandomForestClassifier(), {
        'max_depth': [5, 10, 20],
        'min_samples_leaf': [4, 10, 15],
        'min_samples_split': [5, 10, 20],
        'n_estimators': [100, 500],
        'random_state': [14],
    }],
    # AdaBoost is seeded as well, for consistency with the other randomised
    # estimators in this search space.
    'ADABOOST': [AdaBoostClassifier(), {
        'n_estimators': [100, 500],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'random_state': [14],
    }],
}

Feature Selection Range

In [None]:
# Candidate values swept by the feature selection techniques.
# The grid gets coarser as the values grow: step 3, then 5, then 10, then 15.
the_range = [
    *range(4, 20, 3),      # 4 .. 19
    *range(25, 46, 5),     # 25 .. 45
    *range(50, 91, 10),    # 50 .. 90
    *range(100, 150, 15),  # 100 .. 145
]

**Datasets Import**

In [None]:
# Load the dataset and use its 'id' column as the index.
# TEMPLATE: the CSV path is left empty and must be filled in before running.
df = (
    pd.read_csv('')
    .set_index('id')
)
# TEMPLATE: list the numerical and categorical feature columns here; both lists
# are consumed by the ColumnTransformer pipelines later in the notebook.
# TODO: no numeric downcasting is performed in this cell yet.
numerical_colums, categorical_columns = [], []

**Split Data & Pre-processing methods**

In [None]:
# TEMPLATE: set `first_col` to the label of the first feature column.
first_col = ''
# Target vector.
y = df['result']
# Feature matrix: every column from `first_col` (inclusive) through the last one.
# NOTE(review): if 'result' sits at or after `first_col` it also ends up in X — confirm.
X = df.loc[:, first_col:]
# Hold out 15% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=14,
)
print(
    'X train shape: ', X_train.shape,
    '. Y train shape: ', y_train.shape,
    '\nX test shape: ', X_test.shape,
    '. Y test shape: ', y_test.shape,
)

In [None]:
def _fit_preprocessor(scaler, encoder, encoder_step, title, train_name):
    """Build one scaler/encoder ColumnTransformer, fit it on the training split
    and apply it to both splits.

    The transformer is fitted on ``X_train`` only and then applied to ``X_test``.
    Columns listed in neither ``numerical_colums`` nor ``categorical_columns``
    are passed through untouched.

    Parameters
    ----------
    scaler : transformer applied to ``numerical_colums``.
    encoder : transformer applied to ``categorical_columns``.
    encoder_step : str
        Step name given to the encoder inside the ColumnTransformer.
    title : str
        Banner text printed above the shapes.
    train_name : str
        Name of the variable that receives the transformed training set; it is
        only used to label the printed shape, so the label always matches the
        matrix actually reported.

    Returns
    -------
    tuple
        ``(fitted ColumnTransformer, transformed X_train, transformed X_test)``.
    """
    pipeline = ColumnTransformer([
        ('num', scaler, numerical_colums),
        (encoder_step, encoder, categorical_columns),
    ], remainder='passthrough')
    X_tr = pipeline.fit_transform(X_train)
    X_te = pipeline.transform(X_test)
    print('***** ' + title + ' *****', '\nX_train Shape: ', X_train.shape, '\n' + train_name + ' Shape: ', X_tr.shape)
    return pipeline, X_tr, X_te


# Standard scaler + One-Hot encoding
full_pipeline_std, X_tr_std, X_te_std = _fit_preprocessor(
    StandardScaler(), OneHotEncoder(handle_unknown='ignore'), 'cat_hot',
    'Standard Scaler + One-Hot Encoding', 'X_tr_std')

# MinMax scaler + One-Hot encoding
full_pipeline_mms, X_tr_mms, X_te_mms = _fit_preprocessor(
    MinMaxScaler(), OneHotEncoder(handle_unknown='ignore'), 'cat_hot',
    'MinMax Scaler + One-Hot Encoding', 'X_tr_mms')

# NOTE(review): unlike the One-Hot encoders above, the ordinal encoders are left
# at their default handling of categories unseen during fit — confirm that is intended.

# Standard scaler + Ordinal encoding
full_pipeline_ord, X_tr_ord, X_te_ord = _fit_preprocessor(
    StandardScaler(), OrdinalEncoder(), 'cat_ord',
    'Standard Scaler + Ordinal Encoding', 'X_tr_ord')

# MinMax scaler + Ordinal encoding
full_pipeline_ord_mms, X_tr_ord_mms, X_te_ord_mms = _fit_preprocessor(
    MinMaxScaler(), OrdinalEncoder(), 'cat_ord',
    'MinMax Scaler + Ordinal Encoding', 'X_tr_ord_mms')

## ***Modelling Search***

#### **Models using Complete Feature Set**

In [None]:
# Complete feature set, standard-scaled + one-hot encoded matrices.
# models_complete_feat is not defined in this notebook (presumably it comes from
# the model_utils wildcard import).
print('STD + HOT: Complete')
models_complete_feat(
    X_training=X_tr_std,
    y_training=y_train,
    X_testing=X_te_std,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# Complete feature set, MinMax-scaled + one-hot encoded matrices.
print('MMS + HOT: Complete')
models_complete_feat(
    X_training=X_tr_mms,
    y_training=y_train,
    X_testing=X_te_mms,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# Complete feature set, standard-scaled + ordinal encoded matrices.
print('STD + ORD: Complete')
models_complete_feat(
    X_training=X_tr_ord,
    y_training=y_train,
    X_testing=X_te_ord,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# Complete feature set, MinMax-scaled + ordinal encoded matrices.
print('MMS + ORD: Complete')
models_complete_feat(
    X_training=X_tr_ord_mms,
    y_training=y_train,
    X_testing=X_te_ord_mms,
    y_testing=y_test,
    estimators=estimators_full,
)

#### **Models using PCA for Dimensionality Reduction**

In [None]:
# PCA search on the standard-scaled + one-hot encoded matrices.
# The banner names the technique actually run here (PCA), so the printed output
# cannot be mistaken for the random-forest feature-selection run.
print('STD + ONE-HOT: PCA')
models_pca(
    X_training=X_tr_std,
    y_training=y_train,
    X_testing=X_te_std,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# PCA search on the MinMax-scaled + one-hot encoded matrices.
print('MMS + ONE-HOT: PCA')
models_pca(
    X_training=X_tr_mms,
    y_training=y_train,
    X_testing=X_te_mms,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# PCA search on the standard-scaled + ordinal encoded matrices.
# The banner matches the data passed below (X_tr_ord / X_te_ord come from the
# StandardScaler + OrdinalEncoder pipeline).
print('STD + ORD: PCA')
models_pca(
    X_training=X_tr_ord,
    y_training=y_train,
    X_testing=X_te_ord,
    y_testing=y_test,
    estimators=estimators_full,
)

In [None]:
# PCA search on the MinMax-scaled + ordinal encoded matrices.
# The banner matches the data passed below (X_tr_ord_mms / X_te_ord_mms come
# from the MinMaxScaler + OrdinalEncoder pipeline).
print('MMS + ORD: PCA')
models_pca(
    X_training=X_tr_ord_mms,
    y_training=y_train,
    X_testing=X_te_ord_mms,
    y_testing=y_test,
    estimators=estimators_full,
)

#### **Models using Feature Selection Techniques**

###### **Random Forest Importance**

In [None]:
# Feature selection with feat_sel='rf' over the values in `the_range`,
# on the standard-scaled + one-hot encoded matrices.
print('STD + ONE-HOT: rf')
models_feature_selection(
    X_train=X_tr_std,
    y_train=y_train,
    X_test=X_te_std,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='rf',
)

In [None]:
# Feature selection with feat_sel='rf' over the values in `the_range`,
# on the standard-scaled + ordinal encoded matrices.
print('STD + ORD: rf')
models_feature_selection(
    X_train=X_tr_ord,
    y_train=y_train,
    X_test=X_te_ord,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='rf',
)

In [None]:
# Feature selection with feat_sel='rf' over the values in `the_range`,
# on the MinMax-scaled + one-hot encoded matrices.
print('MMS + ONE-HOT: rf')
models_feature_selection(
    X_train=X_tr_mms,
    y_train=y_train,
    X_test=X_te_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='rf',
)

In [None]:
# Feature selection with feat_sel='rf' over the values in `the_range`,
# on the MinMax-scaled + ordinal encoded matrices.
print('MMS + ORD: rf')
models_feature_selection(
    X_train=X_tr_ord_mms,
    y_train=y_train,
    X_test=X_te_ord_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='rf',
)

###### **Mutual Information**

In [None]:
# Feature selection with feat_sel='mi' (mutual information, per the banner)
# on the standard-scaled + one-hot encoded matrices.
print('STD + ONE-HOT: mutual info')
models_feature_selection(
    X_train=X_tr_std,
    y_train=y_train,
    X_test=X_te_std,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='mi',
)

In [None]:
# Feature selection with feat_sel='mi' (mutual information, per the banner)
# on the standard-scaled + ordinal encoded matrices.
print('\nSTD + ORD: mutual info')
models_feature_selection(
    X_train=X_tr_ord,
    y_train=y_train,
    X_test=X_te_ord,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='mi',
)

In [None]:
# Feature selection with feat_sel='mi' (mutual information, per the banner)
# on the MinMax-scaled + one-hot encoded matrices.
print('\nMMS + ONE-HOT: mutual info')
models_feature_selection(
    X_train=X_tr_mms,
    y_train=y_train,
    X_test=X_te_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='mi',
)

In [None]:
# Feature selection with feat_sel='mi' (mutual information, per the banner)
# on the MinMax-scaled + ordinal encoded matrices.
print('\nMMS + ORD: mutual info')
models_feature_selection(
    X_train=X_tr_ord_mms,
    y_train=y_train,
    X_test=X_te_ord_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='mi',
)

###### **F Statistic**

In [None]:
# Feature selection with feat_sel='f' (F statistic, per the banner)
# on the standard-scaled + one-hot encoded matrices.
print('STD + ONE-HOT: F stat')
models_feature_selection(
    X_train=X_tr_std,
    y_train=y_train,
    X_test=X_te_std,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='f',
)

In [None]:
# Feature selection with feat_sel='f' (F statistic, per the banner)
# on the standard-scaled + ordinal encoded matrices.
print('\nSTD + ORD: F stat')
models_feature_selection(
    X_train=X_tr_ord,
    y_train=y_train,
    X_test=X_te_ord,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='f',
)

In [None]:
# Feature selection with feat_sel='f' (F statistic, per the banner)
# on the MinMax-scaled + one-hot encoded matrices.
print('\nMMS + ONE-HOT: F stat')
models_feature_selection(
    X_train=X_tr_mms,
    y_train=y_train,
    X_test=X_te_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='f',
)

In [None]:
# Feature selection with feat_sel='f' (F statistic, per the banner)
# on the MinMax-scaled + ordinal encoded matrices.
print('\nMMS + ORD: F stat')
models_feature_selection(
    X_train=X_tr_ord_mms,
    y_train=y_train,
    X_test=X_te_ord_mms,
    y_test=y_test,
    estimators=estimators_full,
    sel_range=the_range,
    feat_sel='f',
)