## MMTHE01 - Masters Thesis

### D. Thesis - Create a working model - with SMOTE
* Applying SMOTE to the data
* When applying SMOTE, it is best not to use one-hot encoding for categorical data. Therefore the data is imported before encoding, and all categorical features are label-encoded instead.

#### Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

In [None]:
### Import libraries to save models
import pickle
from tensorflow.keras.models import Model, load_model

In [None]:
# Move the notebook into the analysis folder, presumably so that the relative
# "saved_data/..." and "saved_models/..." paths used further down resolve.
cwd = os.getcwd()
relative_path = r"6. Analysis"  # adjust this relative to cwd
full_path = os.path.join(cwd, relative_path)

# Only change directory when the target is really there; otherwise report it
# and stay in the current directory.
if not os.path.exists(full_path):
    print("Folder does not exist:", full_path)
else:
    os.chdir(full_path)
    print("Changed directory to:", full_path)

#### Loading the test and train datasets

In [None]:
# Restore the unscaled train/test features and labels, stored together in one pickle.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_data/features_label_unscaled_SMOTE.pkl", "rb") as data_file:
    unscaled_splits = pickle.load(data_file)
X_train, X_test, y_train, y_test = unscaled_splits

In [None]:
# Restore the scaled train/test feature sets, stored together in one pickle.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_data/features_scaled_SMOTE.pkl", "rb") as data_file:
    scaled_splits = pickle.load(data_file)
X_train_scaled, X_test_scaled = scaled_splits

In [None]:
# Display the shape of the unscaled training features as a quick check after loading.
X_train.shape

#### Loading the models

In [None]:
# Restore the pickled Isolation Forest model.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_models/iso_forest_model_SMOTE.pkl", "rb") as model_file:
    iso_forest = pickle.load(model_file)

In [None]:
# Restore the saved autoencoder from its .keras file.
autoencoder_path = "saved_models/autoencoder_model_SMOTE.keras"
autoencoder = load_model(autoencoder_path)

In [None]:
# Restore the pickled Random Forest model.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_models/random_forest_model_SMOTE.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

In [None]:
# load XGBoost model
# Build the estimator from the directly imported XGBClassifier instead of
# `xgb.XGBClassifier()`: the result is bound to the name `xgb`, which replaces the
# `xgboost` module alias, so going through the alias fails when this cell is re-run.
xgb = XGBClassifier()
xgb.load_model("saved_models/xgb_model_SMOTE.json")

In [None]:
# Restore the saved ANN from its .keras file.
ann_model_path = "saved_models/ann_model_SMOTE.keras"
model = load_model(ann_model_path)

In [None]:
# Restore the pickled KNN model.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_models/knn_model_SMOTE.pkl", "rb") as model_file:
    knn = pickle.load(model_file)

In [None]:
# load SVM model
# NOTE(review): this is the only artifact here named "nonSMOTE" while every other
# model file ends in "_SMOTE" — confirm that the intended pipeline file is loaded.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_models/svm_pipeline_nonSMOTE.pkl", "rb") as f:
    pipeline = pickle.load(f)

In [None]:
# load GBM model
# Bind the result to `gbm`, not `knn`: the cross-validation cell in section 4.6.5
# passes `gbm` as its estimator, and reusing `knn` would silently replace the KNN
# model loaded earlier.
# NOTE: pickle can execute code while loading; only open files produced by this project.
with open("saved_models/gbm_model_SMOTE.pkl", "rb") as f:
    gbm = pickle.load(f)

### 4.6 Cross Validation Checks

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# make_scorer and f1_score are not imported by the cells above, so bring them in
# here; without this the dictionary below raises NameError.
from sklearn.metrics import f1_score, make_scorer

# Define multiple scoring metrics.
# cross_validate reports each entry under the key 'test_<name>', so the names here
# ('auc', 'recall', 'precision', 'F1') must match the keys read in the print cells.
scoring = {
    'auc': 'roc_auc',
    'recall': 'recall',
    'precision': 'precision',
    # F1 is macro-averaged over the classes, unlike the string scorers above.
    'F1': make_scorer(f1_score, average='macro'),
}

#### 4.6.1 Cross Validation for the Random Forest Model

In [None]:
# 10-fold cross-validation of the Random Forest on the unscaled training set,
# collecting every metric in `scoring` in a single run.
# NOTE(review): if X_train was already oversampled with SMOTE, synthetic samples can
# end up in the validation folds and inflate the scores — confirm how it was built.
cv_results_rf = cross_validate(rf, X_train, y_train, scoring=scoring, cv=10)

In [None]:
# Report the mean (as a percentage) and standard deviation of the Random Forest AUC scores.
rf_auc_scores = cv_results_rf['test_auc']
print("AUC Random Forest: {:.3f} %".format(rf_auc_scores.mean() * 100))
print("AUC Standard Deviation Random Forest: {:.5f}".format(rf_auc_scores.std()))

In [None]:
# Report the mean and standard deviation of the Random Forest recall scores.
rf_recall_scores = cv_results_rf['test_recall']
print("Recall Random Forest: {:.4f}".format(rf_recall_scores.mean()))
print("Recall Standard Deviation Random Forest: {:.5f}".format(rf_recall_scores.std()))

In [None]:
# Report the mean and standard deviation of the Random Forest precision scores.
rf_precision_scores = cv_results_rf['test_precision']
print("Precision Random Forest: {:.4f}".format(rf_precision_scores.mean()))
print("Precision Standard Deviation Random Forest: {:.5f}".format(rf_precision_scores.std()))

In [None]:
# Print results - Random Forest F1 Score
# Mean and standard deviation of the macro-F1 scores from the Random Forest run.
# The second line previously called the non-existent `str.format_rf` on the undefined
# `cv_results`; it now formats the value from `cv_results_rf` like the other metrics.
rf_f1_scores = cv_results_rf['test_F1']
print("F1-score Random Forest: {:.4f}".format(rf_f1_scores.mean()))
print("F1-score Standard Deviation Random Forest: {:.5f}".format(rf_f1_scores.std()))

#### 4.6.2 Cross Validation for the XGBoost Model

In [None]:
# 10-fold cross-validation of the XGBoost classifier on the unscaled training set,
# collecting every metric in `scoring` in a single run.
cv_results_xgb = cross_validate(xgb, X_train, y_train, scoring=scoring, cv=10)

In [None]:
# Report the mean (as a percentage) and standard deviation of the XGBoost AUC scores.
xgb_auc_scores = cv_results_xgb['test_auc']
print("AUC XGBoost: {:.3f} %".format(xgb_auc_scores.mean() * 100))
print("AUC Standard Deviation XGBoost: {:.5f}".format(xgb_auc_scores.std()))

In [None]:
# Report the mean and standard deviation of the XGBoost recall scores.
xgb_recall_scores = cv_results_xgb['test_recall']
print("Recall XGBoost: {:.4f}".format(xgb_recall_scores.mean()))
print("Recall Standard Deviation XGBoost: {:.5f}".format(xgb_recall_scores.std()))

In [None]:
# Report the mean and standard deviation of the XGBoost precision scores.
xgb_precision_scores = cv_results_xgb['test_precision']
print("Precision XGBoost: {:.4f}".format(xgb_precision_scores.mean()))
print("Precision Standard Deviation XGBoost: {:.5f}".format(xgb_precision_scores.std()))

In [None]:
# Report the mean and standard deviation of the XGBoost macro-F1 scores.
xgb_f1_scores = cv_results_xgb['test_F1']
print("F1-score XGBoost: {:.4f}".format(xgb_f1_scores.mean()))
print("F1-score Standard Deviation XGBoost: {:.5f}".format(xgb_f1_scores.std()))

#### 4.6.3 Cross Validation for the KNN Model

In [None]:
# 10-fold cross-validation of the KNN model on the scaled training features,
# collecting every metric in `scoring` in a single run.
cv_results_knn = cross_validate(knn, X_train_scaled, y_train, scoring=scoring, cv=10)

In [None]:
# Report the mean (as a percentage) and standard deviation of the KNN AUC scores.
knn_auc_scores = cv_results_knn['test_auc']
print("AUC KNN: {:.3f} %".format(knn_auc_scores.mean() * 100))
print("AUC Standard Deviation KNN: {:.5f}".format(knn_auc_scores.std()))

In [None]:
# Report the mean and standard deviation of the KNN recall scores.
knn_recall_scores = cv_results_knn['test_recall']
print("Recall KNN: {:.4f}".format(knn_recall_scores.mean()))
print("Recall Standard Deviation KNN: {:.5f}".format(knn_recall_scores.std()))

In [None]:
# Report the mean and standard deviation of the KNN precision scores.
knn_precision_scores = cv_results_knn['test_precision']
print("Precision KNN: {:.4f}".format(knn_precision_scores.mean()))
print("Precision Standard Deviation KNN: {:.5f}".format(knn_precision_scores.std()))

In [None]:
# Print results - KNN F1 Score
# Mean and standard deviation of the macro-F1 scores from the KNN run.
# This cell previously read the undefined name `cv_results`; the KNN results are
# stored in `cv_results_knn`.
knn_f1_scores = cv_results_knn['test_F1']
print("F1-score KNN: {:.4f}".format(knn_f1_scores.mean()))
print("F1-score Standard Deviation KNN: {:.5f}".format(knn_f1_scores.std()))

#### 4.6.4 Cross Validation for the SVM Model

In [None]:
# 10-fold cross-validation of the SVM pipeline on the scaled training features,
# collecting every metric in `scoring` in a single run.
# NOTE(review): `pipeline` receives pre-scaled features; if the pipeline contains its
# own scaler the data is scaled twice — confirm against how the pipeline was built.
cv_results_svm = cross_validate(pipeline, X_train_scaled, y_train, scoring=scoring, cv=10)

In [None]:
# Print results - SVM AUC
# Mean (as a percentage) and standard deviation of the SVM AUC scores.
# The scorer is registered as 'auc' in `scoring`, so the result key is 'test_auc';
# the previous 'test_AUC' lookup raised KeyError.
svm_auc_scores = cv_results_svm['test_auc']
print("AUC SVM: {:.3f} %".format(svm_auc_scores.mean() * 100))
print("AUC Standard Deviation SVM: {:.5f}".format(svm_auc_scores.std()))

In [None]:
# Print results - SVM Recall
# Mean and standard deviation of the SVM recall scores.
# The scorer is registered as 'recall' in `scoring`, so the result key is
# 'test_recall'; the previous 'test_Recall' lookup raised KeyError.
svm_recall_scores = cv_results_svm['test_recall']
print("Recall SVM: {:.4f}".format(svm_recall_scores.mean()))
print("Recall Standard Deviation SVM: {:.5f}".format(svm_recall_scores.std()))

In [None]:
# Print results - SVM Precision
# Mean and standard deviation of the SVM precision scores.
# The scorer is registered as 'precision' in `scoring`, so the result key is
# 'test_precision'; the previous 'test_Precision' lookup raised KeyError.
svm_precision_scores = cv_results_svm['test_precision']
print("Precision SVM: {:.4f}".format(svm_precision_scores.mean()))
print("Precision Standard Deviation SVM: {:.5f}".format(svm_precision_scores.std()))

In [None]:
# Report the mean and standard deviation of the SVM macro-F1 scores.
svm_f1_scores = cv_results_svm['test_F1']
print("F1-score SVM: {:.4f}".format(svm_f1_scores.mean()))
print("F1-score Standard Deviation SVM: {:.5f}".format(svm_f1_scores.std()))

#### 4.6.5 Cross Validation for the GBM Model

In [None]:
# 10-fold cross-validation of the GBM model on the unscaled training set,
# collecting every metric in `scoring` in a single run.
cv_results_gbm = cross_validate(gbm, X_train, y_train, scoring=scoring, cv=10)

In [None]:
# Print results - GBM AUC
# Mean (as a percentage) and standard deviation of the GBM AUC scores.
# The scorer is registered as 'auc' in `scoring`, so the result key is 'test_auc';
# the previous 'test_AUC' lookup raised KeyError.
gbm_auc_scores = cv_results_gbm['test_auc']
print("AUC GBM: {:.3f} %".format(gbm_auc_scores.mean() * 100))
print("AUC Standard Deviation GBM: {:.5f}".format(gbm_auc_scores.std()))

In [None]:
# Print results - GBM Recall
# Mean and standard deviation of the GBM recall scores.
# The scorer is registered as 'recall' in `scoring`, so the result key is
# 'test_recall'; the previous 'test_Recall' lookup raised KeyError.
gbm_recall_scores = cv_results_gbm['test_recall']
print("Recall GBM: {:.4f}".format(gbm_recall_scores.mean()))
print("Recall Standard Deviation GBM: {:.5f}".format(gbm_recall_scores.std()))

In [None]:
# Print results - GBM Precision
# Mean and standard deviation of the GBM precision scores.
# The scorer is registered as 'precision' in `scoring`, so the result key is
# 'test_precision'; the previous 'test_Precision' lookup raised KeyError.
gbm_precision_scores = cv_results_gbm['test_precision']
print("Precision GBM: {:.4f}".format(gbm_precision_scores.mean()))
print("Precision Standard Deviation GBM: {:.5f}".format(gbm_precision_scores.std()))

In [None]:
# Report the mean and standard deviation of the GBM macro-F1 scores.
gbm_f1_scores = cv_results_gbm['test_F1']
print("F1-score GBM: {:.4f}".format(gbm_f1_scores.mean()))
print("F1-score Standard Deviation GBM: {:.5f}".format(gbm_f1_scores.std()))