## MMTHE01 - Masters Thesis

### C. Thesis - Create a working model - without SMOTE
* Splitting the data into train and test data
* Feature scaling
* Model training

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import LabelEncoder

In [None]:
### Import libraries to save models
import pickle
from tensorflow.keras.models import Model, load_model

In [None]:
# Move into the analysis folder; later cells read and write relative paths
# such as "saved_data/..." and "saved_models/...".
cwd = os.getcwd()

# Folder name relative to the current working directory; adjust as needed.
relative_path = r"6. Analysis"

# Build the full path
full_path = os.path.join(cwd, relative_path)

# Use isdir rather than exists: a regular file with the same name would pass
# an existence check and then make os.chdir raise NotADirectoryError.
if os.path.isdir(full_path):
    os.chdir(full_path)
    print("Changed directory to:", full_path)
else:
    print("Folder does not exist:", full_path)

#### Importing the dataset

In [None]:
# Load the encoded training dataset from a pickle file under "saved_data/".
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted step of this project.
with open("saved_data/train_dataset_final_encoded.pkl","rb") as f:
    dataset = pickle.load(f)
# Alternative CSV source, currently disabled:
#dataset = pd.read_csv('train_dataset_final_encoded.csv')

In [None]:
dataset.head()

In [None]:
dataset.shape

### 3.1 Split the data into Train-Test

#### 3.1.1 Separate the features and the label

In [None]:
dataset_final = dataset.drop('TransactionID', axis=1)

In [None]:
# Positional split into NumPy arrays: column 0 of dataset_final becomes the
# label vector y, every remaining column becomes the feature matrix X.
# NOTE(review): assumes the target is the first column once TransactionID has
# been dropped — confirm against the dataset's column order.
X = dataset_final.iloc[:, 1:].values
y = dataset_final.iloc[:,0].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state = 1)

In [None]:
with open("saved_data/features_label_unscaled_nonSMOTE.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

### 3.2 Feature Scaling

#### 3.2.1 Checking if the data has outliers

In [None]:
# Detect outliers using IQR method for each column
def detect_iqr_outliers(df):
    """Flag IQR outliers in every numeric column of ``df``.

    A value is flagged when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its own column, where ``Q1`` and ``Q3`` are the
    25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as ``df``; ``True``
        marks an outlier. Non-numeric columns stay entirely ``False``.
    """
    outlier_flags = pd.DataFrame(False, index=df.index, columns=df.columns)

    # "number" matches every numeric dtype. The builtin int/float selectors
    # only match the 32- and 64-bit variants, so narrower columns (int8,
    # int16, unsigned, float16) would be skipped and reported as outlier-free.
    for col in df.select_dtypes(include="number").columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Mark outliers
        outlier_flags[col] = (df[col] < lower_bound) | (df[col] > upper_bound)

    return outlier_flags

In [None]:
# Call the function
outliers_df = detect_iqr_outliers(dataset)

In [None]:
outlier_count_df = outliers_df.apply(lambda col: col.value_counts()).T

In [None]:
outlier_count_df.to_csv('outlier_count_df.csv')

#### 3.2.2 Feature Scaling

In [None]:
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
with open("saved_data/features_scaled_nonSMOTE.pkl", "wb") as f:
    pickle.dump((X_train_scaled, X_test_scaled), f)

### 3.3 Baseline Model
* Logistic regression is a weighted-sum (linear) model, so it is trained on the scaled features

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Build a LogisticRegression Base model
logR = LogisticRegression(random_state=1)

In [None]:
start_time = time.time()

In [None]:
# Train the LogisticRegression Base model
logR.fit(X_train_scaled, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"LR Base Model Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
y_pred = logR.predict(X_test_scaled)
y_pred_proba = logR.predict_proba(X_test_scaled)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("LR Base Model AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('LR Base Model ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('LR Base Model ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the baseline model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"LR Base Model Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the baseline model

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"LR Base Model Precision Score: {precision:.4f}")

### 3.4 Unsupervised Learning Models

#### 3.4.1 Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Train Isolation Forest
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.035,  # Approximate fraud ratio in dataset
    max_samples='auto',
    random_state=1,
    n_jobs=-1
)

In [None]:
start_time = time.time()

In [None]:
iso_forest.fit(X_train_scaled)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Isolation Forest Training Time: {training_time:.2f} seconds")

In [None]:
# Predict: -1 for outliers (fraud), 1 for inliers (non-fraud)
y_pred = iso_forest.predict(X_test_scaled)

In [None]:
# Convert to 1 for fraud, 0 for non-fraud to match label
y_pred_binary = np.where(y_pred == -1, 1, 0)

In [None]:
# Use anomaly scores for ROC AUC
y_pred_proba = iso_forest.decision_function(X_test_scaled)*-1  # Higher score = more anomalous

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("Isolation Forest AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every sample with the same constant value 0.
# The reference is built from y_test, the same split that fpr/tpr were
# computed on, rather than from the full label vector y.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)
plt.plot(fpr, tpr, linestyle='--', color='orange')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('Isolation Forest ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.savefig('Isolation Forest ROC curve',dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Isolation Forest to determine its sensitivity

sensitivity = recall_score(y_test, y_pred_binary, pos_label=1)
print(f"Isolation Forest Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Isolation Forest

precision = precision_score(y_test, y_pred_binary, pos_label=1)
print(f"Isolation Forest Precision Score: {precision:.4f}")

In [None]:
# save model
with open("saved_models/iso_forest_model_nonSMOTE.pkl", "wb") as f:
    pickle.dump(iso_forest, f)

#### 3.4.2 Autoencoder

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# NOTE: Training is done only on non-fraudulent samples
X_train_ae = X_train_scaled[y_train == 0] 

In [None]:
# Build the autoencoder
input_dim = X_train_ae.shape[1]
encoding_dim = 32  # compressed representation size

input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)
encoded = layers.Dense(16, activation='relu')(encoded)

decoded = layers.Dense(encoding_dim, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)

autoencoder.compile(optimizer='adam', loss='mse')

In [None]:
start_time = time.time()

In [None]:
# Train the autoencoder
history = autoencoder.fit(
    X_train_ae, X_train_ae,
    epochs=50,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=0
)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Autoencoder Training Time: {training_time:.2f} seconds")

In [None]:
# Compute reconstruction errors on test data
X_test_pred = autoencoder.predict(X_test_scaled)
mse = np.mean(np.power(X_test_scaled - X_test_pred, 2), axis=1)

In [None]:
# Evaluate anomaly detection performance
# Threshold = 95th percentile of the reconstruction error on the non-fraud
# training samples. It is derived from training data only, so the test labels
# play no part in choosing the decision rule that is then scored on the test
# set (avoids test-set leakage).
X_train_ae_pred = autoencoder.predict(X_train_ae, verbose=0)
train_mse = np.mean(np.power(X_train_ae - X_train_ae_pred, 2), axis=1)
threshold = np.percentile(train_mse, 95)

In [None]:
# Predict fraud if reconstruction error > threshold
y_pred = (mse > threshold).astype(int)

In [None]:
y_pred_proba = mse

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("Autoencoder AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('Autoencoder ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('Autoencoder ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Autoencoder to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"Autoencoder Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Autoencoder

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"Autoencoder Precision Score: {precision:.4f}")

In [None]:
autoencoder.save("saved_models/autoencoder_model_nonSMOTE.keras")

### 3.5 Supervised Learning Models

#### 3.5.1 Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Build a Random Forest Model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1, class_weight='balanced')

In [None]:
start_time = time.time()

In [None]:
# Train the Random Forest Model
rf.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Random Forest Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("Random Forest AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('Random Forest ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('Random Forest ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Random Forest to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"Random Forest Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Random Forest

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"Random Forest Precision Score: {precision:.4f}")

In [None]:
# Save model
with open("saved_models/random_forest_model_nonSMOTE.pkl", "wb") as f:
    pickle.dump(rf, f)

#### 3.5.2 XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Build an XGBoost Model
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,  # Adjust based on imbalance
    eval_metric='auc',
    random_state=1
)

In [None]:
start_time = time.time()

In [None]:
# Train the XGBoost model
xgb.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"XGBoost Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("XGBoost AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('XGBoost ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('XGBoost ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the XGBoost model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"XGBoost Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the XGBoost Model

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"XGBoost Precision Score: {precision:.4f}")

In [None]:
# Save model (JSON is preferred, supports portability)
xgb.save_model("saved_models/xgb_model_nonSMOTE.json")

#### 3.5.3 Artificial Neural Network (ANN)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Build an ANN model: two hidden ReLU layers with dropout and a single
# sigmoid output unit for binary classification.
from tensorflow.keras.metrics import AUC

model = Sequential([
    # Input width taken from the scaled matrix the network is trained on
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Name the metric explicitly. The log key generated for the string shortcut
# 'AUC' is not the same in every Keras version, and EarlyStopping cannot
# trigger when the key it monitors is missing from the logs.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])

# Early stopping to prevent overfitting: stop after 3 epochs without an
# improvement in validation AUC and restore the best weights seen.
early_stop = EarlyStopping(monitor='val_auc', patience=3, restore_best_weights=True, mode='max')

In [None]:
start_time = time.time()

In [None]:
# Train the ANN model with timing
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=256,
    callbacks=[early_stop],
    verbose=0
)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"ANN Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
# The network ends in a single output unit, so predict() returns one column
# per sample; flatten it so scores and labels are 1-D like y_test.
y_pred_proba = model.predict(X_test_scaled).ravel()
# Classify as fraud when the predicted probability exceeds 0.5
y_pred = (y_pred_proba > 0.5).astype(int)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("ANN AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('ANN ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('ANN ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the ANN model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"ANN Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the ANN model

precision = precision_score(y_test, y_pred, pos_label=1)
# Colon added so the label has the same format as the recall output.
print(f"ANN Precision Score: {precision:.4f}")

In [None]:
# Save ANN
model.save("saved_models/ann_model_nonSMOTE.keras")

#### 3.5.4 K Nearest Neighbour (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Build a KNN model
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)

In [None]:
start_time = time.time()

In [None]:
# Train the KNN model
knn.fit(X_train_scaled, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"KNN Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
y_pred = knn.predict(X_test_scaled)
y_pred_proba = knn.predict_proba(X_test_scaled)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("KNN AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('KNN ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('KNN ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the KNN model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"KNN Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the KNN model

precision = precision_score(y_test, y_pred, pos_label=1)
# This value is precision, not recall, so the "(Sensitivity)" tag is dropped.
print(f"KNN Precision Score: {precision:.4f}")

In [None]:
# Save model
with open("saved_models/knn_model_nonSMOTE.pkl", "wb") as f:
    pickle.dump(knn, f)

#### 3.5.5 Support Vector Machine (SVM) - Linear SVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [None]:
# Define pipeline with scaling + LinearSVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', LinearSVC(class_weight='balanced', max_iter=10000, random_state=42))
])

In [None]:
start_time = time.time()

In [None]:
# Fit the model
pipeline.fit(X_train_scaled, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"SVM Base Model Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
y_pred = pipeline.predict(X_test_scaled)
y_pred_proba = pipeline.decision_function(X_test_scaled)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("SVM AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('SVM ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('SVM ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the SVM model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"SVM Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the SVM model

precision = precision_score(y_test, y_pred, pos_label=1)
# Colon added so the label has the same format as the recall output.
print(f"SVM Precision Score: {precision:.4f}")

In [None]:
# Save model
# The fitted estimator in this section is `pipeline`; the name `svm` is never
# assigned, so dumping it would raise NameError.
with open("saved_models/svm_model_nonSMOTE.pkl", "wb") as f:
    pickle.dump(pipeline, f)

#### 3.5.6 Gradient Boosting Machine (GBM)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Define GBM model
gbm = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    random_state=1
)

In [None]:
start_time = time.time()

In [None]:
gbm.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"GBM Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = gbm.predict(X_test)
y_pred_proba = gbm.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("GBM AUC (in %):", auc_score*100)

In [None]:
# ROC plot: model curve (orange) against a reference curve (blue) obtained
# by scoring every test sample with the same constant value 0.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

plt.plot(fpr, tpr, color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')

# Title and axis labels
plt.title('GBM ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Write the figure to disk before displaying it
plt.savefig('GBM ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the GBM model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"GBM Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the GBM model

precision = precision_score(y_test, y_pred, pos_label=1)
# Colon added so the label has the same format as the recall output.
print(f"GBM Precision Score: {precision:.4f}")

In [None]:
# Save model
with open("saved_models/gbm_model_nonSMOTE.pkl", "wb") as f:
    pickle.dump(gbm, f)