## MMTHE01 - Master's Thesis

### C. Thesis - Create a working model - without SMOTE
* Splitting the data into train and test data
* Feature scaling
* Model training

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

In [2]:
### Import libraries to save models
import pickle
from tensorflow.keras.models import Model, load_model

In [3]:
# Move the kernel into the analysis folder so the relative "saved_data/..." and
# "saved_models/..." paths used by later cells resolve.
relative_path = r"6. Analysis"  # adjust this relative to cwd

cwd = os.getcwd()
full_path = os.path.join(cwd, relative_path)

if os.path.basename(os.path.normpath(cwd)) == relative_path:
    # The cell was already run in this kernel: without this guard a re-run would
    # look for a nested "6. Analysis/6. Analysis" and report a failure.
    print("Already in directory:", cwd)
elif os.path.isdir(full_path):
    # isdir (not exists): os.chdir raises if the path is a regular file.
    os.chdir(full_path)
    print("Changed directory to:", full_path)
else:
    print("Folder does not exist:", full_path)

Changed directory to: C:\Users\eaber\Documents\11. Masters Thesis Final\6. Analysis


#### Load Dataset

In [4]:
# Read the encoded training frame back from its pickle.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open("saved_data/train_dataset_final_encoded.pkl", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [5]:
# Preview the first rows to sanity-check the loaded frame.
dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,card1,C3,C9,C12,C13,C14,TransactionID,...,card4_discover,card4_mastercard,card4_visa,card6_charge card,card6_credit,card6_debit,card6_debit or credit,M4_M0,M4_M1,M4_M2
0,0,86400,68.5,13926,0.0,1.0,0.0,1.0,1.0,2987000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,86401,29.0,2755,0.0,0.0,0.0,1.0,1.0,2987001,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,86469,59.0,4663,0.0,1.0,0.0,1.0,1.0,2987002,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,86499,50.0,18132,0.0,1.0,0.0,25.0,1.0,2987003,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,86506,50.0,4497,0.0,0.0,0.0,1.0,1.0,2987004,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# (rows, columns) of the loaded frame.
dataset.shape

(590540, 201)

In [7]:
# Inspect column dtypes before any conversion (object columns are handled in the next cell).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 201 entries, isFraud to M4_M2
dtypes: float64(179), int64(8), object(14)
memory usage: 905.6+ MB


In [8]:
# Convert object-typed columns to a numeric dtype wherever every value parses.
# Columns that cannot be converted are left untouched (best effort, as before),
# but they are now reported instead of being skipped silently: any remaining
# object column would otherwise only surface later, when the frame is turned
# into a NumPy array and scaled.
non_numeric_cols = []
for col in dataset.columns:
    if dataset[col].dtype == "object":
        try:
            dataset[col] = pd.to_numeric(dataset[col])
        except (ValueError, TypeError):
            # Only parsing failures are expected here; a bare `except` would also
            # swallow KeyboardInterrupt and genuine programming errors.
            non_numeric_cols.append(col)

if non_numeric_cols:
    print("Columns left as object (not convertible to numeric):", non_numeric_cols)

In [9]:
# Re-inspect dtypes after the numeric conversion above.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 201 entries, isFraud to M4_M2
dtypes: float64(193), int64(8)
memory usage: 905.6 MB


#### Split the data into Train-Test

In [10]:
# Remove the TransactionID column so it is not used as a model feature.
dataset_final = dataset.drop(columns=['TransactionID'])

In [11]:
# Separate features and target by column NAME rather than by position, so a
# change in column order cannot silently turn a feature into the label.
# With the current frame (isFraud is the first column) this yields the same
# arrays as slicing with iloc[:, 1:] / iloc[:, 0].
TARGET_COL = "isFraud"
X = dataset_final.drop(columns=[TARGET_COL]).to_numpy()
y = dataset_final[TARGET_COL].to_numpy()

In [12]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same class ratio;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

#### Feature Scaling

In [13]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits so no test-set statistics enter the scaling.
sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

#### Loading the models

In [14]:
# Restore the pickled Isolation Forest model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("saved_models/iso_forest_model_nonSMOTE.pkl", "rb") as model_file:
    iso_forest = pickle.load(model_file)

In [15]:
# Restore the saved Autoencoder from its .keras file.
autoencoder_path = "saved_models/autoencoder_model_nonSMOTE.keras"
autoencoder = load_model(autoencoder_path)

In [16]:
# Restore the pickled Random Forest model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("saved_models/random_forest_model_nonSMOTE.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

In [17]:
# load XGBoost model
# Build the estimator from the directly imported XGBClassifier class instead of
# `xgb.XGBClassifier()`. The estimator is stored under the name `xgb`, which
# rebinds the `import xgboost as xgb` module alias; with the old form a second
# run of this cell raised AttributeError because `xgb` was no longer the module.
# The variable name is kept because later cells pass `xgb` as the estimator.
xgb = XGBClassifier()
xgb.load_model("saved_models/xgb_model_nonSMOTE.json")

In [18]:
# Restore the saved ANN from its .keras file.
ann_path = "saved_models/ann_model_nonSMOTE.keras"
model = load_model(ann_path)

In [19]:
# Restore the pickled KNN model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("saved_models/knn_model_nonSMOTE.pkl", "rb") as model_file:
    knn = pickle.load(model_file)

In [20]:
# Restore the pickled SVM pipeline.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("saved_models/svm_pipeline_nonSMOTE.pkl", "rb") as model_file:
    pipeline = pickle.load(model_file)

In [21]:
# Restore the pickled GBM model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("saved_models/gbm_model_nonSMOTE.pkl", "rb") as model_file:
    gbm = pickle.load(model_file)

### 3.6 Cross Validation Checks for the Supervised Learning Models

In [22]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score

In [23]:
# Metrics collected by every cross_validate call below. cross_validate exposes
# each entry under the key "test_<name>", e.g. 'auc' -> 'test_auc'.
# F1 is macro-averaged (average='macro'), so it is built as an explicit scorer.
macro_f1_scorer = make_scorer(f1_score, average='macro')

scoring = {
    'auc': 'roc_auc',
    'recall': 'recall',
    'precision': 'precision',
    'F1': macro_f1_scorer,
}

#### 3.6.1 Cross Validation for the Random Forest Model

In [24]:
# 10-fold cross-validation of the loaded Random Forest on the unscaled training
# split; all metrics in `scoring` are computed in this single run.
cv_results_rf = cross_validate(rf, X_train, y_train, scoring=scoring, cv=10)

In [25]:
# Random Forest: mean (as a percentage) and standard deviation of the per-fold AUC.
rf_auc_scores = cv_results_rf['test_auc']
print("AUC Random Forest: {:.3f} %".format(rf_auc_scores.mean() * 100))
print("AUC Standard Deviation Random Forest: {:.5f}".format(rf_auc_scores.std()))

AUC Random Forest: 88.752 %
AUC Standard Deviation Random Forest: 0.00339


In [26]:
# Random Forest: mean and standard deviation of the per-fold recall.
rf_recall_scores = cv_results_rf['test_recall']
print("Recall Random Forest: {:.4f}".format(rf_recall_scores.mean()))
print("Recall Standard Deviation Random Forest: {:.5f}".format(rf_recall_scores.std()))

Recall Random Forest: 0.7406
Recall Standard Deviation Random Forest: 0.00608


In [27]:
# Random Forest: mean and standard deviation of the per-fold precision.
rf_precision_scores = cv_results_rf['test_precision']
print("Precision Random Forest: {:.4f}".format(rf_precision_scores.mean()))
print("Precision Standard Deviation Random Forest: {:.5f}".format(rf_precision_scores.std()))

Precision Random Forest: 0.1833
Precision Standard Deviation Random Forest: 0.00301


In [28]:
# Random Forest: mean and standard deviation of the per-fold macro F1.
rf_f1_scores = cv_results_rf['test_F1']
print("F1-score Random Forest: {:.4f}".format(rf_f1_scores.mean()))
print("F1-score Standard Deviation Random Forest: {:.5f}".format(rf_f1_scores.std()))

F1-score Random Forest: 0.6128
F1-score Standard Deviation Random Forest: 0.00261


#### 3.6.2 Cross Validation for the XGBoost Model

In [29]:
# 10-fold cross-validation of the loaded XGBoost classifier on the unscaled
# training split; all metrics in `scoring` are computed in this single run.
cv_results_xgb = cross_validate(xgb, X_train, y_train, scoring=scoring, cv=10)

In [30]:
# XGBoost: mean (as a percentage) and standard deviation of the per-fold AUC.
xgb_auc_scores = cv_results_xgb['test_auc']
print("AUC XGBoost: {:.3f} %".format(xgb_auc_scores.mean() * 100))
print("AUC Standard Deviation XGBoost: {:.5f}".format(xgb_auc_scores.std()))

AUC XGBoost: 93.726 %
AUC Standard Deviation XGBoost: 0.00374


In [31]:
# XGBoost: mean and standard deviation of the per-fold recall.
xgb_recall_scores = cv_results_xgb['test_recall']
print("Recall XGBoost: {:.4f}".format(xgb_recall_scores.mean()))
print("Recall Standard Deviation XGBoost: {:.5f}".format(xgb_recall_scores.std()))

Recall XGBoost: 0.4678
Recall Standard Deviation XGBoost: 0.00648


In [32]:
# XGBoost: mean and standard deviation of the per-fold precision.
xgb_precision_scores = cv_results_xgb['test_precision']
print("Precision XGBoost: {:.4f}".format(xgb_precision_scores.mean()))
print("Precision Standard Deviation XGBoost: {:.5f}".format(xgb_precision_scores.std()))

Precision XGBoost: 0.9062
Precision Standard Deviation XGBoost: 0.00879


In [33]:
# XGBoost: mean and standard deviation of the per-fold macro F1.
xgb_f1_scores = cv_results_xgb['test_F1']
print("F1-score XGBoost: {:.4f}".format(xgb_f1_scores.mean()))
print("F1-score Standard Deviation XGBoost: {:.5f}".format(xgb_f1_scores.std()))

F1-score XGBoost: 0.8033
F1-score Standard Deviation XGBoost: 0.00361


#### 3.6.3 Cross Validation for the KNN Model

In [34]:
# 10-fold cross-validation of the loaded KNN model on the standardized training
# split; all metrics in `scoring` are computed in this single run.
# NOTE(review): X_train_scaled was standardized with statistics from the whole
# training split before the folds are made, so each validation fold contributed
# to the scaler. Consider cross-validating a Pipeline(StandardScaler, knn) on
# X_train instead - confirm whether this is acceptable for the thesis.
cv_results_knn = cross_validate(knn, X_train_scaled, y_train, scoring=scoring, cv=10)

In [35]:
# KNN: mean (as a percentage) and standard deviation of the per-fold AUC.
knn_auc_scores = cv_results_knn['test_auc']
print("AUC KNN: {:.3f} %".format(knn_auc_scores.mean() * 100))
print("AUC Standard Deviation KNN: {:.5f}".format(knn_auc_scores.std()))

AUC KNN: 84.526 %
AUC Standard Deviation KNN: 0.00533


In [36]:
# KNN: mean and standard deviation of the per-fold recall.
knn_recall_scores = cv_results_knn['test_recall']
print("Recall KNN: {:.4f}".format(knn_recall_scores.mean()))
print("Recall Standard Deviation KNN: {:.5f}".format(knn_recall_scores.std()))

Recall KNN: 0.4858
Recall Standard Deviation KNN: 0.01140


In [37]:
# KNN: mean and standard deviation of the per-fold precision.
knn_precision_scores = cv_results_knn['test_precision']
print("Precision KNN: {:.4f}".format(knn_precision_scores.mean()))
print("Precision Standard Deviation KNN: {:.5f}".format(knn_precision_scores.std()))

Precision KNN: 0.8658
Precision Standard Deviation KNN: 0.00796


In [38]:
# KNN: mean and standard deviation of the per-fold macro F1.
knn_f1_scores = cv_results_knn['test_F1']
print("F1-score KNN: {:.4f}".format(knn_f1_scores.mean()))
print("F1-score Standard Deviation KNN: {:.5f}".format(knn_f1_scores.std()))

F1-score KNN: 0.8059
F1-score Standard Deviation KNN: 0.00551


#### 3.6.4 Cross Validation for the SVM Model

In [39]:
# 10-fold cross-validation of the loaded SVM pipeline on the standardized
# training split; all metrics in `scoring` are computed in this single run.
# NOTE(review): `pipeline` receives data that is already standardized. If the
# pickled pipeline contains its own scaler step, the features are scaled twice
# and X_train should be passed instead - confirm against the training notebook.
cv_results_svm = cross_validate(pipeline, X_train_scaled, y_train, scoring=scoring, cv=10)

In [40]:
# SVM: mean (as a percentage) and standard deviation of the per-fold AUC.
svm_auc_scores = cv_results_svm['test_auc']
print("AUC SVM: {:.3f} %".format(svm_auc_scores.mean() * 100))
print("AUC Standard Deviation SVM: {:.5f}".format(svm_auc_scores.std()))

AUC SVM: 84.729 %
AUC Standard Deviation SVM: 0.00535


In [41]:
# SVM: mean and standard deviation of the per-fold recall.
svm_recall_scores = cv_results_svm['test_recall']
print("Recall SVM: {:.4f}".format(svm_recall_scores.mean()))
print("Recall Standard Deviation SVM: {:.5f}".format(svm_recall_scores.std()))

Recall SVM: 0.7122
Recall Standard Deviation SVM: 0.01103


In [42]:
# SVM: mean and standard deviation of the per-fold precision.
svm_precision_scores = cv_results_svm['test_precision']
print("Precision SVM: {:.4f}".format(svm_precision_scores.mean()))
print("Precision Standard Deviation SVM: {:.5f}".format(svm_precision_scores.std()))

Precision SVM: 0.1239
Precision Standard Deviation SVM: 0.00196


In [43]:
# SVM: mean and standard deviation of the per-fold macro F1.
svm_f1_scores = cv_results_svm['test_F1']
print("F1-score SVM: {:.4f}".format(svm_f1_scores.mean()))
print("F1-score Standard Deviation SVM: {:.5f}".format(svm_f1_scores.std()))

F1-score SVM: 0.5528
F1-score Standard Deviation SVM: 0.00205


#### 3.6.5 Cross Validation for the GBM Model

In [None]:
# 10-fold cross-validation of the loaded GBM model on the unscaled training
# split; all metrics in `scoring` are computed in this single run.
cv_results_gbm = cross_validate(gbm, X_train, y_train, scoring=scoring, cv=10)

In [None]:
# Print results - GBM AUC
# cross_validate names its result keys "test_<scoring key>" and the scoring dict
# uses the lowercase key 'auc', so the key is 'test_auc'. The previous
# 'test_AUC' does not exist and raised KeyError.
print("AUC GBM: {:.3f} %".format(cv_results_gbm['test_auc'].mean() * 100))
print("AUC Standard Deviation GBM: {:.5f}".format(cv_results_gbm['test_auc'].std()))

In [None]:
# Print results - GBM Recall
# cross_validate names its result keys "test_<scoring key>" and the scoring dict
# uses the lowercase key 'recall', so the key is 'test_recall'. The previous
# 'test_Recall' does not exist and raised KeyError.
print("Recall GBM: {:.4f}".format(cv_results_gbm['test_recall'].mean()))
print("Recall Standard Deviation GBM: {:.5f}".format(cv_results_gbm['test_recall'].std()))

In [None]:
# Print results - GBM Precision
# cross_validate names its result keys "test_<scoring key>" and the scoring dict
# uses the lowercase key 'precision', so the key is 'test_precision'. The
# previous 'test_Precision' does not exist and raised KeyError.
print("Precision GBM: {:.4f}".format(cv_results_gbm['test_precision'].mean()))
print("Precision Standard Deviation GBM: {:.5f}".format(cv_results_gbm['test_precision'].std()))

In [None]:
# GBM: mean and standard deviation of the per-fold macro F1.
gbm_f1_scores = cv_results_gbm['test_F1']
print("F1-score GBM: {:.4f}".format(gbm_f1_scores.mean()))
print("F1-score Standard Deviation GBM: {:.5f}".format(gbm_f1_scores.std()))