## MMTHE01 - Masters Thesis

### C. Thesis - Create a working model
* Splitting the data into train and test data
* Feature scaling by training on the 
* 

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import LabelEncoder

In [2]:
os.chdir(r'S:\Semester 4\Masters Thesis Report\6. Analysis')

#### Importing the dataset

In [3]:
dataset = pd.read_csv('train_dataset_final.csv')

In [4]:
dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,card1,C3,C9,C12,C13,C14,TransactionID,...,card4_discover,card4_mastercard,card4_visa,card6_charge card,card6_credit,card6_debit,card6_debit or credit,M4_M0,M4_M1,M4_M2
0,0,86400,68.5,13926,0.0,1.0,0.0,1.0,1.0,2987000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,86401,29.0,2755,0.0,0.0,0.0,1.0,1.0,2987001,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,86469,59.0,4663,0.0,1.0,0.0,1.0,1.0,2987002,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,86499,50.0,18132,0.0,1.0,0.0,25.0,1.0,2987003,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,86506,50.0,4497,0.0,0.0,0.0,1.0,1.0,2987004,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [5]:
dataset.shape

(590540, 201)

### 3.1 Split the data into Train-Test

#### 3.1.1 Separate the features and the label

In [6]:
dataset_final = dataset.drop('TransactionID', axis=1)

In [7]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:,0].values

In [8]:
from sklearn.model_selection import train_test_split
X_train_im, X_test, y_train_im, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state = 1)

### 3.2 Fine tuning the XGBoost model with SMOTE

In [9]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline

#### 3.2.1 Setting up parameters

In [10]:
smote = SMOTE(sampling_strategy=0.3, k_neighbors=5, random_state=1)

In [11]:
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,  # Adjust based on imbalance
    eval_metric='auc',
    random_state=1
)

In [12]:
pipeline = Pipeline([
    ('smote', smote),
    ('xgb', xgb)
])

#### 3.2.2 Tuning the SMOTE Parameters

In [13]:
param_grid_smote_ss = {
    'smote__sampling_strategy': np.arange(0.1, 0.6, 0.1)
}

In [14]:
grid_search_smote_ss = GridSearchCV(
    pipeline,
    param_grid=param_grid_smote_ss,
    scoring='recall',  # or 'average_precision'
    cv=5,
    n_jobs=2,
    verbose=1
)

grid_search_smote_ss.fit(X_train_im, y_train_im)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [15]:
print("Best SMOTE params:", grid_search_smote_ss.best_params_)
print("Best CV Recall:", grid_search_smote_ss.best_score_)

Best SMOTE params: {'smote__sampling_strategy': 0.5}
Best CV Recall: 0.5882032667876589


In [16]:
param_grid_smote_kn = {
    'smote__k_neighbors': [3, 5, 7]
}

In [17]:
grid_search_smote_kn = GridSearchCV(
    pipeline,
    param_grid=param_grid_smote_kn,
    scoring='recall',  # or 'average_precision'
    cv=5,
    n_jobs=2,
    verbose=1
)

grid_search_smote_kn.fit(X_train_im, y_train_im)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [18]:
print("Best SMOTE params:", grid_search_smote_kn.best_params_)
print("Best CV Recall:", grid_search_smote_kn.best_score_)

Best SMOTE params: {'smote__k_neighbors': 3}
Best CV Recall: 0.579794313369631


In [19]:
# Applying SMOTE to the training data with the optimum parameters
smote = SMOTE(sampling_strategy=0.3, k_neighbors=3, random_state=1)
X_train_xgb, y_train_xgb = smote.fit_resample(X_train_im, y_train_im)

In [20]:
# Build an XGBoost Model
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,  # Adjust based on imbalance
    eval_metric='auc',
    random_state=1
)

In [21]:
# Train the XGBoost model
xgb.fit(X_train_xgb, y_train_xgb)

In [22]:
# Predict y given X_test
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

In [23]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("XGBoost - with SMOTE AUC (in %):", auc_score*100)

XGBoost - with SMOTE AUC (in %): 90.60391595590505


In [24]:
### Calculate recall_score of the model to determine its sensitivity

sensitivity = recall_score(y_test, y_pred, pos_label=1)
print(f"XGBoost Recall Score (Sensitivity): {sensitivity:.4f}")

XGBoost Recall Score (Sensitivity): 0.5996


#### 3.2.3 Step by step tuning of the XGBoost parametters

In [None]:
param_grid = {
    'smote__sampling_strategy': np.arange(0.2,0.6,0.1),
    'smote__k_neighbors': range(5,8,1),
    'xgb__n_estimators': range(100,500,100),
    'xgb__max_depth': range(3,20,1),
    'xgb__min_child_weight': range(1,20,1),
    'xgb__gamma': [i/10.0 for i in range(0,20)],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__subsample': [i/10.0 for i in range(0,11)],
    'xgb__colsample_bytree': [i/10.0 for i in range(0,10)],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}

In [None]:
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall', 
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [None]:
grid_search.fit(X_train_im, y_train_im)

#### 3.2.1 Final XGBoost Model

In [None]:
# Build an XGBoost Model
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,  # Adjust based on imbalance
    eval_metric='auc',
    random_state=1
)

In [None]:
xgb = XGBClassifier(learning_rate =0.01, n_estimators=1000, max_depth=9, min_child_weight=9, gamma=1.6, subsample=0.85, colsample_bytree=0.75, 
                           objective= 'binary:logistic', reg_alpha=0.5, nthread=4, scale_pos_weight=1, seed=27, random_state=30)
xgb.fit(X_train, y_train)

In [None]:
start_time = time.time()

In [None]:
# Train the XGBoost model
xgb.fit(X_train_im, y_train_im)

In [None]:
end_time = time.time()
training_time = end_time - start_time
print(f"XGBoost Training Time: {training_time:.2f} seconds")

In [None]:
# Predict y given X_test
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1],pos_label=1)
auc_score = auc(fpr, tpr)
print("XGBoost - with SMOTE AUC (in %):", auc_score*100)

In [None]:
# ROC-AUC Curve
# plot roc curves
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, thresholds = roc_curve(y_test, random_probs, pos_label=1)
plt.plot(fpr, tpr, linestyle='--', color='orange')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('XGBoost - with SMOTE ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.savefig('XGBoost - with SMOTE ROC curve',dpi=300)
plt.show()

#### 3.8.2 Fine tuning the Isolation Forest model

#### 3.8.3 Fine tuning the ANN model

In [None]:
#### 3.2.1 Feature Scaling

In [None]:
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

### Fine Tuning with SMOTE

In [None]:
# Applying SMOTE only to the training data
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train_im, y_train_im)