## MMTHE01 - Masters Thesis

### D. Thesis - Create a working model - with SMOTE
* Applying SMOTE to the data
* When applying SMOTE, it is best not to use one-hot encoding for categorical data. Therefore the data is imported before encoding, and all categorical features are label-encoded instead.

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC

from sklearn.preprocessing import LabelEncoder

In [2]:
# NOTE(review): hardcoded, machine-specific working directory. Every relative path used
# later in this notebook (the CSV that is read, the figures that are saved) resolves
# against it, so the notebook only runs where this folder exists.
os.chdir(r'S:\Semester 4\Masters Thesis Report\6. Analysis')

#### Importing the dataset

In [3]:
dataset = pd.read_csv('train_dataset_final_unencoded.csv')

In [4]:
dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,V304,V305,V309,V310,V311,V312,V314,V315,V318,V321
0,0,86400,68.5,W,13926,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,86401,29.0,W,2755,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,86469,59.0,W,4663,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,86499,50.0,W,18132,0.0,1.0,0.0,25.0,1.0,...,0.0,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0
4,0,86506,50.0,H,4497,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
dataset.shape

(590540, 189)

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 189 entries, isFraud to V321
dtypes: float64(175), int64(4), object(10)
memory usage: 851.5+ MB


### 4.1 Encode data using label encoding

In [7]:
# Categorical features which require encoding.
# Each column is listed exactly once: this list is later turned into column positions
# for SMOTENC, and repeated names would produce repeated positions.
categorical_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M3', 'M4', 'M5', 'M6', 'M9', 'DeviceInfo']

In [8]:
def label_encode_keep_nan(series):
    """
    Integer-encode the non-missing entries of a pandas Series, preserving NaNs.

    Parameters
    ----------
    series : pd.Series
        Column to encode; missing entries are detected with ``notna()``.

    Returns
    -------
    tuple
        ``(encoded, encoder)`` where ``encoded`` is a Series on the same index
        holding the label codes (NaN wherever the input was missing) and
        ``encoder`` is the ``LabelEncoder`` fitted on the non-missing values.
    """
    present = series.notna()

    # Fit on the observed values only, so NaN never becomes a category of its own
    encoder = LabelEncoder()
    codes = encoder.fit_transform(series[present])

    # Start from an all-NaN column and fill in codes only where a value existed
    result = pd.Series(np.nan, index=series.index)
    result[present] = codes

    return result, encoder

In [9]:
# Label-encode the multi-valued categorical columns (NaNs are preserved).
# Every column keeps its OWN fitted encoder, so the integer codes can be mapped back
# to the original categories later with <encoder>.inverse_transform(...).
dataset['P_emaildomain'], email_encoder = label_encode_keep_nan(dataset['P_emaildomain'])
dataset['DeviceInfo'], device_infoencoder = label_encode_keep_nan(dataset['DeviceInfo'])
dataset['ProductCD'], productcd_encoder = label_encode_keep_nan(dataset['ProductCD'])
dataset['card4'], card4_encoder = label_encode_keep_nan(dataset['card4'])
dataset['card6'], card6_encoder = label_encode_keep_nan(dataset['card6'])
dataset['M4'], m4_encoder = label_encode_keep_nan(dataset['M4'])

# Binary flag columns: "T" -> 1, "F" -> 0; any other value (including NaN) becomes NaN.
# NOTE: this cell mutates `dataset` in place and must run once on freshly loaded data;
# running it again would map the already converted 1/0 values to NaN.
for flag_col in ("M3", "M5", "M6", "M9"):
    dataset[flag_col] = dataset[flag_col].map({"T": 1, "F": 0})

### 4.2 Split the data into Train-Test

#### 4.2.1 Separate the features and the label

In [10]:
# The transaction identifier carries no predictive signal, so it is removed before modelling
dataset_final = dataset.drop(columns=['TransactionID'])

In [11]:
# Separate the label from the features by column name rather than by position, so the
# split does not silently depend on 'isFraud' being the first column of dataset_final.
y = dataset_final['isFraud']
X = dataset_final.drop(columns=['isFraud'])

In [12]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split (stratify=y keeps the class ratio equal in both parts).
# X_train_im / y_train_im are the training parts that are later passed to SMOTE-NC;
# X_test / y_test themselves are never resampled.
X_train_im, X_test, y_train_im, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state = 1)

In [13]:
dataset_final.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,V304,V305,V309,V310,V311,V312,V314,V315,V318,V321
0,0,86400,68.5,4.0,13926,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,86401,29.0,4.0,2755,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,86469,59.0,4.0,4663,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,86499,50.0,4.0,18132,0.0,1.0,0.0,25.0,1.0,...,0.0,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0
4,0,86506,50.0,1.0,4497,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.3 Applying SMOTE_NC

In [None]:
# Chunked SMOTE-NC over dataset_final, processed in consecutive row blocks.
# NOTE(review): this resamples every row of dataset_final (test rows included) and its
# outputs (X_final, y_final, resampled_df) are not used by the modelling cells below —
# confirm whether this cell is still needed.
# NOTE(review): the label-encoded columns still contain NaN; confirm SMOTENC accepts
# missing values for this data or impute beforehand.

# Column names may be listed more than once in categorical_cols; keep the first occurrence
unique_categorical_cols = list(dict.fromkeys(categorical_cols))

# Positions must refer to the feature matrix handed to SMOTENC, i.e. WITHOUT the label
# column. Looking them up in dataset_final.columns would shift every index.
feature_columns = dataset_final.columns.drop('isFraud')
categorical_features = [feature_columns.get_loc(col) for col in unique_categorical_cols]

smote_nc = SMOTENC(categorical_features=categorical_features, random_state=1)

X_chunks_resampled = []
y_chunks_resampled = []

chunk_size = 50000  # adjust as needed

for start in range(0, len(dataset_final), chunk_size):
    chunk = dataset_final.iloc[start:start + chunk_size]

    X_chunk = chunk[feature_columns]
    y_chunk = chunk['isFraud']

    # Fit-resample
    X_res, y_res = smote_nc.fit_resample(X_chunk, y_chunk)

    # SMOTENC returns samples in the ORIGINAL feature space (same columns, same order);
    # its one-hot encoding is internal only, so the original column names apply.
    X_chunks_resampled.append(pd.DataFrame(X_res, columns=feature_columns))
    y_chunks_resampled.append(pd.Series(y_res, name='isFraud'))

# Combine all resampled chunks into one DataFrame
X_final = pd.concat(X_chunks_resampled, ignore_index=True)
y_final = pd.concat(y_chunks_resampled, ignore_index=True)

# Get one full dataframe (features followed by the label)
resampled_df = pd.concat([X_final, y_final], axis=1)

In [None]:
# Chunked SMOTE-NC on the TRAINING split only.
# The majority class is split into chunks; each chunk is combined with ALL minority rows
# and oversampled, so the original minority rows appear once per chunk in the result.
# NOTE(review): the label-encoded columns still contain NaN; confirm SMOTENC accepts
# missing values for this data or impute beforehand.

# Column names may be listed more than once in categorical_cols; keep the first occurrence
unique_categorical_cols = list(dict.fromkeys(categorical_cols))

# Positions must refer to the feature matrix handed to SMOTENC (label column excluded)
feature_columns = X_train_im.columns
categorical_features = [feature_columns.get_loc(col) for col in unique_categorical_cols]

# Instantiate SMOTE-NC
smote_nc = SMOTENC(categorical_features=categorical_features,  k_neighbors=3, sampling_strategy=0.5, random_state=1)

# ====== Chunking parameters ======
chunk_size = 50000  # upper bound on majority rows per chunk; adjust to fit your memory budget
X_chunks = []
y_chunks = []

# ====== Oversample each chunk ======
minority_label = y_train_im.value_counts().idxmin()

# Training frame with the label attached. A dedicated name is used so that the full
# dataset_final is not overwritten and this cell can be re-run safely.
train_df = X_train_im.copy()
train_df['isFraud'] = y_train_im

# Separate classes
df_minority = train_df[train_df['isFraud'] == minority_label]
df_majority = train_df[train_df['isFraud'] != minority_label]

print("Minority class shape:", df_minority.shape)
print("Majority class shape:", df_majority.shape)

# Split the majority rows into evenly sized chunks. A short trailing chunk could hold
# fewer majority rows than df_minority, which would flip the class that SMOTENC treats
# as the minority for that chunk.
n_chunks = max(1, int(np.ceil(len(df_majority) / chunk_size)))
for positions in np.array_split(np.arange(len(df_majority)), n_chunks):
    maj_chunk = df_majority.iloc[positions]
    combined_chunk = pd.concat([maj_chunk, df_minority], axis=0)

    X_chunk = combined_chunk.drop('isFraud', axis=1).values
    y_chunk = combined_chunk['isFraud'].values

    # Oversample this chunk
    X_res, y_res = smote_nc.fit_resample(X_chunk, y_chunk)

    X_chunks.append(X_res)
    y_chunks.append(y_res)

# ====== Combine all chunks ======
# Restore the feature names so X_train has the same columns as X_test
X_train = pd.concat([pd.DataFrame(x, columns=feature_columns) for x in X_chunks], ignore_index=True)
y_train = pd.concat([pd.Series(y) for y in y_chunks], ignore_index=True)

print("Final resampled shape:", X_train.shape, y_train.shape)
print("Class counts:", y_train.value_counts())





### 4.4 Feature Scaling

In [None]:
# Standardise the features: the scaler statistics are learned from the resampled
# training set only and are then applied unchanged to the test set.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

### 4.5 Unsupervised Learning Models

#### 4.5.1 Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Configure the Isolation Forest (fitting happens in a later cell)
# NOTE(review): contamination is set to the fraud share of the original data, but the
# model is fitted on the SMOTE-resampled training set (sampling_strategy=0.5), where
# the fraud share is higher — confirm this is the intended value.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.035,  # Approximate fraud ratio in dataset
    max_samples='auto',
    random_state=1,
    n_jobs=-1
)

In [None]:
start_time = time.time()

In [None]:
iso_forest.fit(X_train_scaled)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Isolation Forest Training Time: {training_time:.2f} seconds")

In [None]:
# Predict: -1 for outliers (fraud), 1 for inliers (non-fraud)
y_pred = iso_forest.predict(X_test_scaled)

In [None]:
# Convert to 1 for fraud, 0 for non-fraud to match label
y_pred_binary = np.where(y_pred == -1, 1, 0)

In [None]:
# Use anomaly scores for ROC AUC
y_pred_proba = iso_forest.decision_function(X_test_scaled)*-1  # Higher score = more anomalous

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("Isolation Forest - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC-AUC Curve
# plot roc curves
# Chance-level reference: a constant score for every sample gives the diagonal.
# It is built on y_test, the same labels the model curve (fpr, tpr) was computed on.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)
plt.plot(fpr, tpr, linestyle='--', color='orange')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('Isolation Forest - with SMOTE ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.savefig('Isolation Forest - with SMOTE ROC curve',dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Isolation Forest model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred_binary, pos_label=1)
print(f"Isolation Forest Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Isolation Forest model (share of transactions flagged as fraud that really are fraud)

precision = precision_score(y_test, y_pred_binary, pos_label=1)
print(f"Isolation Forest Precision Score: {precision:.4f}")

#### 4.5.2 Autoencoder

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# NOTE: Training is done only on non-fraudulent samples (rows whose label is 0)
X_train_ae = X_train_scaled[y_train == 0] 

In [None]:
# Dense autoencoder: input -> 32 -> 16 -> 32 -> input
input_dim = X_train_ae.shape[1]
encoding_dim = 32  # width of the hidden layers on either side of the bottleneck

input_layer = layers.Input(shape=(input_dim,))

# Encoder: two ReLU layers narrowing down to the 16-unit bottleneck
encoded = input_layer
for units in (encoding_dim, 16):
    encoded = layers.Dense(units, activation='relu')(encoded)

# Decoder: widen again, then reconstruct every input feature with a linear output
decoded = layers.Dense(encoding_dim, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

# Reconstruction is trained with mean squared error
autoencoder = models.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

In [None]:
start_time = time.time()

In [None]:
# Train the autoencoder
history = autoencoder.fit(
    X_train_ae, X_train_ae,
    epochs=50,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=2
)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Autoencoder Training Time: {training_time:.2f} seconds")

In [None]:
# Compute reconstruction errors on test data
X_test_pred = autoencoder.predict(X_test_scaled)
mse = np.mean(np.power(X_test_scaled - X_test_pred, 2), axis=1)

In [None]:
# Evaluate anomaly detection performance
# The decision threshold is derived from TRAINING data only: the 95th percentile of the
# reconstruction error on the non-fraud training samples. Selecting it with y_test
# would leak test labels into the decision rule that is evaluated on that same test set.
X_train_ae_pred = autoencoder.predict(X_train_ae)
train_mse = np.mean(np.power(X_train_ae - X_train_ae_pred, 2), axis=1)
threshold = np.percentile(train_mse, 95)

In [None]:
# Predict fraud if reconstruction error > threshold
y_pred = (mse > threshold).astype(int)

In [None]:
y_pred_proba = mse

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("Autoencoder - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of the Autoencoder against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('Autoencoder - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('Autoencoder - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Autoencoder model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"Autoencoder Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Autoencoder

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"Autoencoder Precision Score: {precision:.4f}")

### 4.6 Supervised Learning Models

#### 4.6.1 Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Build a Random Forest Model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1, class_weight='balanced')

In [None]:
start_time = time.time()

In [None]:
# Train the Random Forest Model
rf.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"Random Forest Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("Random Forest - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of the Random Forest against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('Random Forest - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('Random Forest - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the Random Forest model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"Random Forest Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the Random Forest

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"Random Forest Precision Score: {precision:.4f}")

#### 4.6.2 XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Build an XGBoost Model
# NOTE(review): scale_pos_weight=5 up-weights the positive class on top of training data
# that has already been oversampled with SMOTE — confirm the extra weighting is intended.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,  # Adjust based on imbalance
    eval_metric='auc',
    random_state=1
)

In [None]:
start_time = time.time()

In [None]:
# Train the XGBoost model
xgb.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"XGBoost Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("XGBoost - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of XGBoost against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('XGBoost - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('XGBoost - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the XGBoost model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"XGBoost Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the XGBoost Model

precision = precision_score(y_test, y_pred, pos_label=1)
print(f"XGBoost Precision Score: {precision:.4f}")

#### 4.6.3 Artificial Neural Network (ANN)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Build an ANN model: two hidden ReLU layers with dropout and a sigmoid output
# that produces the fraud probability.
from tensorflow.keras.metrics import AUC

model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Explicit Input layer instead of input_dim in Dense
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# The metric is given an explicit name so that the key logged for the validation data
# ('val_auc') is known and matches what EarlyStopping monitors. With the plain string
# 'AUC' the logged name is chosen by Keras and 'val_AUC' may not exist.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])

# Early stopping to prevent overfitting: stop after 3 epochs without a better
# validation AUC and restore the best weights seen.
early_stop = EarlyStopping(monitor='val_auc', patience=3, restore_best_weights=True, mode='max')

In [None]:
start_time = time.time()

In [None]:
# Train the ANN model with timing
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"ANN Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
# model.predict returns shape (n_samples, 1); flatten it so the scores and the
# thresholded labels are 1-D like y_test when passed to the metric functions.
y_pred_proba = model.predict(X_test_scaled).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("ANN - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of the ANN against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('ANN - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('ANN - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the ANN model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"ANN Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the ANN model

precision = precision_score(y_test, y_pred, pos_label=1)
# Label formatted like the other models' precision output ("<model> Precision Score: ...")
print(f"ANN Precision Score: {precision:.4f}")

#### 4.6.4 K Nearest Neighbour (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Build a KNN model
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)

In [None]:
start_time = time.time()

In [None]:
# Train the KNN model
knn.fit(X_train_scaled, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"KNN Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
y_pred = knn.predict(X_test_scaled)
y_pred_proba = knn.predict_proba(X_test_scaled)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("KNN - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of KNN against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('KNN - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('KNN - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the KNN model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"KNN Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the KNN model

precision = precision_score(y_test, y_pred, pos_label=1)
# Precision is not sensitivity (that is recall), so the label must not say "(Sensitivity)"
print(f"KNN Precision Score: {precision:.4f}")

#### 4.6.5 Support Vector Machine (SVM) - Linear SVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [None]:
# Define pipeline with scaling + LinearSVC
# NOTE(review): the pipeline standardises its input itself, yet it is fitted and
# evaluated on X_train_scaled / X_test_scaled, so the data is standardised twice.
# Consider passing the unscaled X_train / X_test to the pipeline instead.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', LinearSVC(class_weight='balanced', max_iter=10000, random_state=42))
])

In [None]:
start_time = time.time()

In [None]:
# Fit the model
pipeline.fit(X_train_scaled, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"SVM Base Model Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test_scaled
y_pred = pipeline.predict(X_test_scaled)
y_pred_proba = pipeline.decision_function(X_test_scaled)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba,pos_label=1)
auc_score = auc(fpr, tpr)
print("SVM - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of the SVM against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('SVM - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('SVM - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the SVM model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"SVM Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the SVM model

precision = precision_score(y_test, y_pred, pos_label=1)
# Label formatted like the other models' precision output ("<model> Precision Score: ...")
print(f"SVM Precision Score: {precision:.4f}")

#### 4.6.6 Gradient Boosting Machine (GBM) 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Define GBM model
gbm = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5, 
    subsample=0.8,
    random_state=1
)

In [None]:
start_time = time.time()

In [None]:
gbm.fit(X_train, y_train)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"GBM Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = gbm.predict(X_test)
y_pred_proba = gbm.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("GBM - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC curve of the GBM against a chance-level reference.
# A constant score for every test sample yields the diagonal baseline.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linestyle='--', color='orange')    # model
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')  # baseline
ax.set_title('GBM - with SMOTE ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

fig.savefig('GBM - with SMOTE ROC curve', dpi=300)
plt.show()

In [None]:
### Calculate recall_score of the GBM model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"GBM Recall Score (Sensitivity): {sensitivity:.4f}")

In [None]:
### Calculate precision_score of the GBM model

precision = precision_score(y_test, y_pred, pos_label=1)
# Label formatted like the other models' precision output ("<model> Precision Score: ...")
print(f"GBM Precision Score: {precision:.4f}")

### 4.7 Cross Validation Checks for the Supervised Learning Models

#### 4.7.1 Cross Validation for the Random Forest Model

In [None]:
from sklearn.model_selection import cross_val_score
aucs = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10, scoring="roc_auc")
print("AUC Random Forest: {:.3f} %".format(aucs.mean()*100))
print("AUC Standard Deviation Random Forest: {:.5f}".format(aucs.std()))

In [None]:
recalls = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10, scoring="recall")
print("Recall Random Forest: {:.4f}".format(recalls.mean()))
print("Recall Standard Deviation Random Forest: {:.5f}".format(recalls.std()))

In [None]:
precisions = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10, scoring="precision")
print("Precision Random Forest: {:.4f}".format(precisions.mean()))
print("Precision Standard Deviation Random Forest: {:.5f}".format(precisions.std()))

#### 4.7.2 Cross Validation for the XGBoost Model

In [None]:
aucs = cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 10, scoring="roc_auc")
print("AUC XGBoost: {:.3f} %".format(aucs.mean()*100))
print("AUC Standard Deviation XGBoost: {:.5f}".format(aucs.std()))

In [None]:
recalls = cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 10, scoring="recall")
print("Recall XGBoost: {:.4f}".format(recalls.mean()))
print("Recall Standard Deviation XGBoost: {:.5f}".format(recalls.std()))

In [None]:
precisions = cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 10, scoring="precision")
print("Precision XGBoost: {:.4f}".format(precisions.mean()))
print("Precision Standard Deviation XGBoost: {:.5f}".format(precisions.std()))

#### 4.7.3 Cross Validation for the KNN Model

In [None]:
aucs = cross_val_score(estimator = knn, X = X_train_scaled, y = y_train, cv = 10, scoring="roc_auc")
print("AUC KNN: {:.3f} %".format(aucs.mean()*100))
print("AUC Standard Deviation KNN: {:.5f}".format(aucs.std()))

In [None]:
recalls = cross_val_score(estimator = knn, X = X_train_scaled, y = y_train, cv = 10, scoring="recall")
print("Recall KNN: {:.4f}".format(recalls.mean()))
print("Recall Standard Deviation KNN: {:.5f}".format(recalls.std()))

In [None]:
# KNN is distance-based and was fitted on the standardised features, so precision is
# cross-validated on X_train_scaled exactly like the KNN AUC and recall.
precisions = cross_val_score(estimator = knn, X = X_train_scaled, y = y_train, cv = 10, scoring="precision")
print("Precision KNN: {:.4f}".format(precisions.mean()))
print("Precision Standard Deviation KNN: {:.5f}".format(precisions.std()))

#### 4.7.4 Cross Validation for the SVM Model

In [None]:
aucs = cross_val_score(estimator = pipeline, X = X_train_scaled, y = y_train, cv = 10, scoring="roc_auc")
print("AUC SVM: {:.3f} %".format(aucs.mean()*100))
print("AUC Standard Deviation SVM: {:.5f}".format(aucs.std()))

In [None]:
recalls = cross_val_score(estimator = pipeline, X = X_train_scaled, y = y_train, cv = 10, scoring="recall")
print("Recall SVM: {:.4f}".format(recalls.mean()))
print("Recall Standard Deviation SVM: {:.5f}".format(recalls.std()))

In [None]:
# Use the same input as the SVM fit and the SVM AUC / recall cross-validation
# (X_train_scaled), so all three SVM scores are computed on identical data.
precisions = cross_val_score(estimator = pipeline, X = X_train_scaled, y = y_train, cv = 10, scoring="precision")
print("Precision SVM: {:.4f}".format(precisions.mean()))
print("Precision Standard Deviation SVM: {:.5f}".format(precisions.std()))

#### 4.7.5 Cross Validation for the GBM Model

In [None]:
aucs = cross_val_score(estimator = gbm, X = X_train, y = y_train, cv = 10, scoring="roc_auc")
print("AUC GBM: {:.3f} %".format(aucs.mean()*100))
print("AUC Standard Deviation GBM: {:.5f}".format(aucs.std()))

In [None]:
from sklearn.model_selection import cross_val_score
recalls = cross_val_score(estimator = gbm, X = X_train, y = y_train, cv = 10, scoring="recall")
print("Recall GBM: {:.4f}".format(recalls.mean()))
print("Recall Standard Deviation GBM: {:.5f}".format(recalls.std()))

In [None]:
precisions = cross_val_score(estimator = gbm, X = X_train, y = y_train, cv = 10, scoring="precision")
print("Precision GBM: {:.4f}".format(precisions.mean()))
print("Precision Standard Deviation GBM: {:.5f}".format(precisions.std()))