# **Setup**

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

#from sklearn.metrics import plot_roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

In [3]:
df['label'].value_counts()

label
1    50199
0    25220
2     1288
Name: count, dtype: int64

**GET ONLY PHISHING/HAM EMAILS**

In [4]:
# Remove spam emails, only consider phishing: (ham = 0, spam = 1, phishing = 2)
df_phish = df[(df['label'] == 2)]
print(df_phish.shape)

(1288, 95)


In [5]:
# Ham data
df_ham = df[(df['label'] == 0)]
df_ham.shape

(25220, 95)

In [6]:
# Split ham data into 50/50 so some can be used for validation
df_split = df_ham[:int(len(df_ham)/2)]
df_split = df_split.reset_index()
df_split = df_split.drop('index', axis=1)

df_val_ham = df_ham[int(len(df_ham)/2):]
df_val_ham = df_val_ham.reset_index()
df_val_ham = df_val_ham.drop('index', axis=1)

df = pd.concat([df_phish, df_split])
print(df.shape)

(13898, 95)


In [7]:
df['label'].value_counts()

label
0    12610
2     1288
Name: count, dtype: int64

In [8]:
# Randomly Sample 1288 Ham emails to create a balanced dataset (match the number of phishing emails)
df_ham = df[df['label'] == 0].sample(1288)
df_phish = df[df['label'] == 2]

df_phish = df_phish.assign(label=1)

df_new = df_ham._append(df_phish, ignore_index=True)
df_new = df_new.sample(frac=1)
df = df_new.reset_index(drop=True)
df

Unnamed: 0,hops,missing_subject,missing_to,missing_content-type,missing_mime-version,missing_x-mailer,missing_content-transfer-encoding,missing_x-mimeole,missing_x-priority,missing_list-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,1,0,0,0,1,1,0,1,1,0,...,1,1,0,0,0,1,1,1,0,0
1,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,1
2,2,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,1,0,0,0,0,0,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
2572,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2573,2,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2574,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,1


In [9]:
df['label'].value_counts()

label
0    1288
1    1288
Name: count, dtype: int64

**FEATURE SELECTION**

In [10]:
# Reduce feature set
# The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue.

feature_list = [
'domain_match_from_return-path',
'domain_match_message-id_from',
'domain_match_message-id_return-path',
'domain_match_to_from',
'domain_match_errors-to_from',
'domain_match_message-id_reply-to',
'domain_match_errors-to_message-id',
'domain_match_sender_from',
'domain_match_to_received',
'domain_match_errors-to_reply-to',
'domain_match_to_message-id',
'label']

feature_list = ['domain_val_message-id',
       'domain_match_message-id_from', 'domain_match_from_return-path',
       'domain_match_message-id_return-path', 'domain_match_message-id_sender',
       'domain_match_message-id_reply-to', 'domain_match_return-path_reply-to',
       'domain_match_reply-to_to', 'domain_match_to_in-reply-to',
       'domain_match_errors-to_message-id', 'domain_match_errors-to_from',
       'domain_match_errors-to_sender', 'domain_match_errors-to_reply-to',
       'domain_match_sender_from', 'domain_match_references_reply-to',
       'domain_match_references_in-reply-to', 'domain_match_references_to',
       'domain_match_from_reply-to', 'domain_match_to_from',
       'domain_match_to_message-id', 'domain_match_to_received', 'label']

df = df[feature_list]

In [11]:
df_Y = df['label']
df_X = df.drop('label', axis=1)

features_list = df_X.columns

In [12]:
# Apply a standard scaler to the full data set
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(df_X)
df_X = scaler.transform(df_X)
df_X = pd.DataFrame(df_X, columns=features_list)

In [13]:
# Breaking the data into a test and training set (20% test, 80% train)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.20, random_state=42)

In [14]:
df.shape

(2576, 22)

In [15]:
df.head(5)

Unnamed: 0,domain_val_message-id,domain_match_message-id_from,domain_match_from_return-path,domain_match_message-id_return-path,domain_match_message-id_sender,domain_match_message-id_reply-to,domain_match_return-path_reply-to,domain_match_reply-to_to,domain_match_to_in-reply-to,domain_match_errors-to_message-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,0,1,1,1,1,1,1,1,0,1,...,1,1,0,0,0,1,1,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


**NOVELTY: GET VALIDATION DATA**

In [16]:
# This is needed in order to have ham data
df_val = pd.read_csv('preprocessed_phishing_2022.csv')

In [17]:
# The solution to get ham data is to transfer from other data...

df_val = pd.concat([df_val, df_val_ham])

In [18]:
df_val['label'].value_counts()

label
0    12610
2      245
Name: count, dtype: int64

In [19]:
# Randomly Sample 245 Ham emails to create a balanced dataset
df_ham_val = df_val[df_val['label'] == 0].sample(245)

df_phish_val = df_val[df_val['label'] == 2]

df_phish_val = df_phish_val.assign(label=1)

df_new_val = df_ham_val._append(df_phish_val, ignore_index=True)

df_new_val = df_new_val.sample(frac=1)
df_val = df_new_val.reset_index(drop=True)

In [20]:
# Reduce feature set
# The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue.

feature_list = [
'domain_match_from_return-path',
'domain_match_message-id_from',
'domain_match_message-id_return-path',
'domain_match_to_from',
'domain_match_errors-to_from',
'domain_match_message-id_reply-to',
'domain_match_errors-to_message-id',
'domain_match_sender_from',
'domain_match_to_received',
'domain_match_errors-to_reply-to',
'domain_match_to_message-id',
'label']

feature_list = ['domain_val_message-id',
       'domain_match_message-id_from', 'domain_match_from_return-path',
       'domain_match_message-id_return-path', 'domain_match_message-id_sender',
       'domain_match_message-id_reply-to', 'domain_match_return-path_reply-to',
       'domain_match_reply-to_to', 'domain_match_to_in-reply-to',
       'domain_match_errors-to_message-id', 'domain_match_errors-to_from',
       'domain_match_errors-to_sender', 'domain_match_errors-to_reply-to',
       'domain_match_sender_from', 'domain_match_references_reply-to',
       'domain_match_references_in-reply-to', 'domain_match_references_to',
       'domain_match_from_reply-to', 'domain_match_to_from',
       'domain_match_to_message-id', 'domain_match_to_received', 'label']

df_val = df_val[feature_list]

X_val = df_val.drop('label', axis=1)
y_val = df_val['label']

In [21]:
df_val.shape

(490, 22)

In [22]:
df_val.head(5)

Unnamed: 0,domain_val_message-id,domain_match_message-id_from,domain_match_from_return-path,domain_match_message-id_return-path,domain_match_message-id_sender,domain_match_message-id_reply-to,domain_match_return-path_reply-to,domain_match_reply-to_to,domain_match_to_in-reply-to,domain_match_errors-to_message-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,1,1,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# **Random Forest**

In [23]:
%%time
# Original: Hyperparameter Tuning with GRID SEARCH
pipe = Pipeline([("scale", StandardScaler()),
                ("rf", RandomForestClassifier())
                ])

param_grid_list = {'rf__n_estimators': [100, 150],
                  'rf__criterion': ['entropy', 'gini'],
                  'rf__min_samples_split': [2, 3],
                  'rf__min_samples_leaf': [1, 2],
                  'rf__max_features': ['sqrt', 'log2']} # removed 'auto' 
         
grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
rf_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Test the best performing model on the test set
original_predict = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, original_predict)*100)
print('F1 Score:', f1_score(y_test, original_predict)*100)
print('Recall:', recall_score(y_test, original_predict)*100)
print('Precision:', precision_score(y_test, original_predict)*100)
print('ROC AUC:', roc_auc_score(y_test, original_predict)*100)
print('Confusion Matrix:', confusion_matrix(y_test, original_predict))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

{'rf__criterion': 'entropy', 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 3, 'rf__n_estimators': 150} 

Pipeline(steps=[('scale', StandardScaler()),
                ('rf',
                 RandomForestClassifier(criterion='entropy', min_samples_leaf=2,
                                        min_samples_split=3,
                                        n_estimators=150))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 76.53061224489795
F1 Score: 80.99173553719008
Recall: 100.0
Precision: 68.05555555555556
ROC AUC: 76.53061224489795
Confusion Matrix: [[130 115]
 [  0 245]]
CPU times: user 645 ms, sys: 164 ms, total: 809 ms
Wall time: 5.19 s


**Novelty: Trying hyperparameter tuning with RANDOM SEARCH**

In [24]:
%%time
# Hyperparameter Tuning with RANDOM SEARCH
from scipy.stats import randint

pipe = Pipeline([("scale", StandardScaler()),
                ("rf", RandomForestClassifier())
                ])

rs_space={'rf__n_estimators': [100, 150],
               'rf__criterion': ['entropy', 'gini'],
               'rf__min_samples_split': [2, 3],
               'rf__min_samples_leaf': [1, 2],
               'rf__max_features': ['sqrt', 'log2']
         }

from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(pipe, rs_space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=10)
model_random = rf_random.fit(X_train, y_train)

rf_df = pd.DataFrame(model_random.cv_results_)
print('Best hyperparameters are: '+str(model_random.best_params_))
print('Best score is: '+str(model_random.best_score_))
print('Best model is: '+str(model_random.best_estimator_))

# Get the best performing model
best_model = model_random.best_estimator_

# Test the best performing model on the test set
original_predict = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('\nORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, original_predict)*100)
print('F1 Score:', f1_score(y_test, original_predict)*100)
print('Recall:', recall_score(y_test, original_predict)*100)
print('Precision:', precision_score(y_test, original_predict)*100)
print('ROC AUC:', roc_auc_score(y_test, original_predict)*100)
print('Confusion Matrix:', confusion_matrix(y_test, original_predict))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#rf_df[rf_df['rank_test_score'] <= 5].head(5)



Best hyperparameters are: {'rf__n_estimators': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2, 'rf__max_features': 'sqrt', 'rf__criterion': 'gini'}
Best score is: 0.9810679611650486
Best model is: Pipeline(steps=[('scale', StandardScaler()),
                ('rf', RandomForestClassifier(min_samples_leaf=2))])

ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 76.53061224489795
F1 Score: 80.99173553719008
Recall: 100.0
Precision: 68.05555555555556
ROC AUC: 76.53061224489795
Confusion Matrix: [[130 115]
 [  0 245]]
CPU times: user 380 ms, sys: 61.5 ms, total: 442 ms
Wall time: 4.05 s


# **MLP**

In [25]:
%%time

# Original: Hyperparameter Tuning with GRID SEARCH
pipe = Pipeline([("scale", StandardScaler()),
                ("mlp", MLPClassifier(max_iter=500))
                ])

param_grid_list = {'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
                   'mlp__activation': ['tanh', 'relu'],
                   'mlp__learning_rate': ['constant', 'adaptive'],
                   'mlp__solver': ['adam', 'sgd'],
                   'mlp__alpha': [0.0001, 0.001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
mlp_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#mlp_df[mlp_df['rank_test_score'] <= 5].head(5)

{'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (40, 40), 'mlp__learning_rate': 'constant', 'mlp__solver': 'sgd'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('mlp',
                 MLPClassifier(hidden_layer_sizes=(40, 40), max_iter=500,
                               solver='sgd'))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.48062015503875
F1 Score: 97.56097560975608
Recall: 99.61685823754789
Precision: 95.58823529411765
ROC AUC: 97.45548794230334
Confusion Matrix: [[243  12]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 91.63265306122449
F1 Score: 91.91321499013806
Recall: 95.10204081632652
Precision: 88.93129770992367
ROC AUC: 91.63265306122447
Confusion Matrix: [[216  29]
 [ 12 233]]
CPU times: user 24.5 s, sys: 1.53 s, total: 26.1 s
Wall time: 40.5 s


**Novelty: Trying hyperparameter tuning with RANDOM SEARCH**

In [26]:
%%time
# Hyperparameter Tuning with RANDOM SEARCH
from scipy.stats import randint

pipe = Pipeline([("scale", StandardScaler()),
                ("mlp", MLPClassifier(max_iter=500))
                ])

rs_space={'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
            'mlp__activation': ['tanh', 'relu'],
            'mlp__learning_rate': ['constant', 'adaptive'],
            'mlp__solver': ['adam', 'sgd'],
            'mlp__alpha': [0.0001, 0.001, 0.01]}

from sklearn.model_selection import RandomizedSearchCV

mlp_random = RandomizedSearchCV(pipe, rs_space, scoring='accuracy', n_jobs=-1, cv=3)
model_random = mlp_random.fit(X_train, y_train)

mlp_df = pd.DataFrame(model_random.cv_results_)
print('Best hyperparameters are: '+str(model_random.best_params_))
print('Best score is: '+str(model_random.best_score_))
print('Best model is: '+str(model_random.best_estimator_))

# Get the best performing model
best_model = model_random.best_estimator_

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#mlp_df[mlp_df['rank_test_score'] <= 5].head(5)

Best hyperparameters are: {'mlp__solver': 'adam', 'mlp__learning_rate': 'constant', 'mlp__hidden_layer_sizes': (20,), 'mlp__alpha': 0.0001, 'mlp__activation': 'tanh'}
Best score is: 0.9805820435889058
Best model is: Pipeline(steps=[('scale', StandardScaler()),
                ('mlp',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(20,),
                               max_iter=500))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 91.42857142857143
F1 Score: 90.78947368421053
Recall: 84.48979591836735
Precision: 98.10426540284361
ROC AUC: 91.42857142857143
Confusion Matrix: [[241   4]
 [ 38 207]]
CPU times: user 769 ms, sys: 151 ms, total: 921 ms
Wall time: 1.47 s


# **Logistic Regression**

In [46]:
%%time

# Original: Hyperparameter Tuning with GRID SEARCH
pipe = Pipeline([("scale", StandardScaler()),
                ("lr", LogisticRegression(max_iter=1000))
                ])

param_grid_list = {'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'lr__fit_intercept': [True, False],
                  'lr__tol': [0.0001, 0.001],
                  'lr__penalty': ['l2'],
                  'lr__C': [0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
lr_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#lr_df[lr_df['rank_test_score'] <= 5].head(5)

{'lr__C': 10, 'lr__fit_intercept': True, 'lr__penalty': 'l2', 'lr__solver': 'newton-cg', 'lr__tol': 0.0001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('lr',
                 LogisticRegression(C=10, max_iter=1000, solver='newton-cg'))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 80.81632653061224
F1 Score: 76.73267326732675
Recall: 63.26530612244898
Precision: 97.48427672955975
ROC AUC: 80.81632653061224
Confusion Matrix: [[241   4]
 [ 90 155]]
CPU times: user 482 ms, sys: 63.1 ms, total: 545 ms
Wall time: 1.96 s


# **SVM**

In [28]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("svc", SVC())
                ])

param_grid_list = {'svc__C': [0.1, 1, 10],
                  'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'svc__degree': [3, 4, 5],
                  'svc__tol': [0.001, 0.0001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
svm_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#svm_df[svm_df['rank_test_score'] <= 5].head(5)

{'svc__C': 1, 'svc__degree': 3, 'svc__kernel': 'rbf', 'svc__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()), ('svc', SVC(C=1))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 80.61224489795919
F1 Score: 83.06595365418895
Recall: 95.10204081632652
Precision: 73.73417721518987
ROC AUC: 80.61224489795917
Confusion Matrix: [[162  83]
 [ 12 233]]
CPU times: user 1.27 s, sys: 485 ms, total: 1.75 s
Wall time: 2.58 s


# **Decision Tree**

In [29]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("dt", DecisionTreeClassifier())
                ])

param_grid_list = {'dt__criterion': ['entropy', 'gini'],
                  'dt__min_samples_split': [2, 3, 4],
                  'dt__min_samples_leaf': [1, 2, 3],
                  'dt__ccp_alpha': [0, 0.005, 0.01, 0.025, 0.05, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
dt_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#dt_df[dt_df['rank_test_score'] <= 5].head(5)

{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy', 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2} 

Pipeline(steps=[('scale', StandardScaler()),
                ('dt',
                 DecisionTreeClassifier(ccp_alpha=0, criterion='entropy'))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 76.53061224489795
F1 Score: 80.99173553719008
Recall: 100.0
Precision: 68.05555555555556
ROC AUC: 76.53061224489795
Confusion Matrix: [[130 115]
 [  0 245]]
CPU times: user 319 ms, sys: 66.1 ms, total: 385 ms
Wall time: 550 ms


# **Naive Bayes (Gaussian)**

In [30]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gnb", GaussianNB())
                ])

param_grid_list = {'gnb__var_smoothing': [1E-9, 1E-10, 1E-8]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
nb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#nb_df[nb_df['rank_test_score'] <= 5].head(5)

{'gnb__var_smoothing': 1e-08} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gnb', GaussianNB(var_smoothing=1e-08))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 89.72868217054264
F1 Score: 90.68541300527241
Recall: 98.85057471264368
Precision: 83.76623376623377
ROC AUC: 89.62136578769439
Confusion Matrix: [[205  50]
 [  3 258]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 50.0
F1 Score: 0.0
Recall: 0.0
Precision: 0.0
ROC AUC: 50.0
Confusion Matrix: [[245   0]
 [245   0]]
CPU times: user 31.2 ms, sys: 8.53 ms, total: 39.7 ms
Wall time: 55.4 ms


  _warn_prf(average, modifier, msg_start, len(result))


# **AdaBoost**

In [31]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("ab", AdaBoostClassifier())
                ])

param_grid_list = {'ab__n_estimators': [50, 100, 150, 200],
                  'ab__learning_rate': [0.95, 1, 1.05, 1.25, 1.5, 1.75, 2],
                  'ab__algorithm': ['SAMME', 'SAMME.R']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
ab_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#ab_df[ab_df['rank_test_score'] <= 5].head(5)

{'ab__algorithm': 'SAMME.R', 'ab__learning_rate': 0.95, 'ab__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('ab',
                 AdaBoostClassifier(learning_rate=0.95, n_estimators=100))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.28682170542635
F1 Score: 97.35849056603773
Recall: 98.85057471264368
Precision: 95.91078066914498
ROC AUC: 97.26842461122381
Confusion Matrix: [[244  11]
 [  3 258]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 81.83673469387756
F1 Score: 84.6286701208981
Recall: 100.0
Precision: 73.35329341317365
ROC AUC: 81.83673469387756
Confusion Matrix: [[156  89]
 [  0 245]]
CPU times: user 676 ms, sys: 110 ms, total: 786 ms
Wall time: 8.22 s


# **GradientBoostClassifier**

In [32]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gbc", GradientBoostingClassifier())
                ])

param_grid_list = {'gbc__max_features': ['sqrt', 'log2'], # Removed 'auto'
                   'gbc__learning_rate': [0.05, 0.1, 0.2, 0.25, 0.30, 0.35, 0.40, 0.5, 0.6, 0.7, 0.9],
                   'gbc__n_estimators': [100, 200]
                  }

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
gb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#gb_df[gb_df['rank_test_score'] <= 5].head(5)

{'gbc__learning_rate': 0.1, 'gbc__max_features': 'log2', 'gbc__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gbc', GradientBoostingClassifier(max_features='log2'))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 76.53061224489795
F1 Score: 80.99173553719008
Recall: 100.0
Precision: 68.05555555555556
ROC AUC: 76.53061224489795
Confusion Matrix: [[130 115]
 [  0 245]]
CPU times: user 441 ms, sys: 60.2 ms, total: 501 ms
Wall time: 3.98 s


# **KNN**

In [33]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("knn", KNeighborsClassifier())
                ])

param_grid_list = {'knn__n_neighbors': [1, 10, 20],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__p': [1, 2],
                  'knn__algorithm': ['auto'],
                  'knn__leaf_size': [15, 30, 45]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=0)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
knn_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

### Validation
val_pred = best_model.predict(X_val)

# Get the evaluation metrics
print('ORIGINAL EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

# Get the validation evaluation metrics
print('\nVALIDATION EVALUATION METRICS')
print('-----------------------------')
print('Accuracy:', accuracy_score(y_val, val_pred)*100)
print('F1 Score:', f1_score(y_val, val_pred)*100)
print('Recall:', recall_score(y_val, val_pred)*100)
print('Precision:', precision_score(y_val, val_pred)*100)
print('ROC AUC:', roc_auc_score(y_val, val_pred)*100)
print('Confusion Matrix:', confusion_matrix(y_val, val_pred))

#knn_df[knn_df['rank_test_score'] <= 5].head(5)

{'knn__algorithm': 'auto', 'knn__leaf_size': 15, 'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'distance'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(leaf_size=15, n_neighbors=20,
                                      weights='distance'))])
ORIGINAL EVALUATION METRICS
-----------------------------
Accuracy: 97.67441860465115
F1 Score: 97.74436090225565
Recall: 99.61685823754789
Precision: 95.9409594095941
ROC AUC: 97.65156637367592
Confusion Matrix: [[244  11]
 [  1 260]]

VALIDATION EVALUATION METRICS
-----------------------------
Accuracy: 75.71428571428571
F1 Score: 79.65811965811966
Recall: 95.10204081632652
Precision: 68.52941176470588
ROC AUC: 75.71428571428571
Confusion Matrix: [[138 107]
 [ 12 233]]
CPU times: user 2.73 s, sys: 403 ms, total: 3.14 s
Wall time: 731 ms
