In [17]:
# General Libraries
import re
import time
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
warnings.filterwarnings("ignore")

# Visualizations
import seaborn as sns
from termcolor import colored
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedKFold)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,  
                              GradientBoostingClassifier)
from sklearn.model_selection import cross_validate
from sklearn.metrics import (ConfusionMatrixDisplay, precision_score, recall_score)

# Imblearn
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import (SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE)
from imblearn.under_sampling import (TomekLinks, NearMiss, AllKNN,
                                     EditedNearestNeighbours, 
                                     RepeatedEditedNearestNeighbours) 
from imblearn.combine import SMOTETomek, SMOTEENN

# SHAP
import shap

In [22]:
cc_df = pd.read_csv("cc_df_group1_version2.csv")

cc_df = cc_df.drop(['trans_date'], axis=1)
cc_df.head(10)

Unnamed: 0,lat,long,city_pop,amt,is_fraud,merch_lat,merch_long,trans_day,city_Angeles City,city_Antipolo,...,part_of_day_lunch,region_CALABARZON,region_NCR,region_Region 3,quarantine_status_ECQ,quarantine_status_GCQ,quarantine_status_MECQ,quarantine_status_Normal,QL_Extreme QL,QL_Moderate-Low QL
0,14.5958,120.9772,23088000,966.24,1,14.13343,121.223118,13,0,0,...,1,0,1,0,0,1,0,0,0,1
1,14.5958,120.9772,23088000,275.37,1,15.290354,120.246084,13,0,0,...,0,0,1,0,0,1,0,0,0,1
2,14.5958,120.9772,23088000,326.96,1,15.402101,121.332516,14,0,0,...,0,0,1,0,0,1,0,0,0,1
3,14.5958,120.9772,23088000,1047.59,1,15.166276,121.955649,14,0,0,...,0,0,1,0,0,1,0,0,0,1
4,14.5958,120.9772,23088000,286.34,1,15.149144,121.696662,14,0,0,...,0,0,1,0,0,1,0,0,0,1
5,14.5958,120.9772,23088000,1004.66,1,14.824212,120.504185,13,0,0,...,0,0,1,0,0,1,0,0,0,1
6,14.5958,120.9772,23088000,14.54,0,15.011848,119.983598,4,0,0,...,0,0,1,0,0,1,0,0,0,1
7,14.5958,120.9772,23088000,35.84,0,14.108576,121.278285,23,0,0,...,0,0,1,0,0,1,0,0,0,1
8,14.5958,120.9772,23088000,46.69,0,15.235022,121.319209,14,0,0,...,0,0,1,0,0,1,0,0,0,1
9,14.5958,120.9772,23088000,83.26,0,15.275801,121.480398,6,0,0,...,0,0,1,0,1,0,0,0,1,0


In [23]:
X = cc_df.drop(['is_fraud'], axis=1)
y = cc_df['is_fraud']
(X_trainval, X_holdout, y_trainval, y_holdout) = train_test_split(X, y, 
                                                                  random_state=11, test_size=0.25,
                                                                  stratify=y)

In [24]:
def auto_ml(X, y, models_dict, scaler=None, cv=None, res_t=None):
    """Apply resampling for imbalanced data"""
    results = {}
    results2 = {}

    # log start time
    total_start = time.time()

    for model_name, model in tqdm(models_dict.items()):
        
        ################# this portion can be edited ###############
        train_scores = []
        val_scores = []

        train_recall = []
        val_recall = []

        ###########################################################

        for train_index, val_index in tqdm(cv.split(X, y)):

            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Apply scaling
            if scaler is not None:
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)
            else:
                pass

            # Apply resampling
            if res_t is not None:
                s = time.time()
                X_train, y_train = res_t.fit_resample(X_train, y_train)
                print(f'Resampling done in {time.time() - s}')
            else:
                pass

            start_time = time.time()

            # fit
            model.fit(X_train, y_train)

            # predict
            train_preds = model.predict(X_train)
            val_preds = model.predict(X_val)
            
            ################# this portion can be edited ###############

            # classification accuracy
            train_scores.append(model.score(X_train, y_train))
            val_scores.append(model.score(X_val, y_val))

            # recall
            train_recall.append(recall_score(y_train, train_preds))
            val_recall.append(recall_score(y_val, val_preds))

            end_time = time.time()

            results[model_name] = {
                'Train Accuracy': np.round(np.mean(train_scores) * 100, 2),
                'Val Accuracy': np.round(np.mean(val_scores) * 100, 2),
                'Train Recall': np.round(np.mean(train_recall) * 100, 2),
                'Val Recall': np.round(np.mean(val_recall) * 100, 2),
                'Run Time': end_time - start_time
            }

            results2[model_name] = {
                'Train Accuracy': '{:.2f}%'.format(np.round(np.mean(train_scores)*100, 2)),
                'Val Accuracy': '{:.2f}%'.format(np.round(np.mean(val_scores)*100, 2)),
                'Train Recall': '{:.2f}%'.format(np.round(np.mean(train_recall)*100, 2)),
                'Val Recall': '{:.2f}%'.format(np.round(np.mean(val_recall)*100, 2)),
                'Run Time': end_time - start_time
            }
            
            ###########################################################

    results = pd.DataFrame(results).T
    results2 = pd.DataFrame(results2).T
    return [results, results2]

In [25]:
models_dict = {'GradientBoostingClassifier': GradientBoostingClassifier(random_state=143)}

auto_run = auto_ml(X_trainval, y_trainval, models_dict, scaler=MinMaxScaler(),
               cv=StratifiedKFold(n_splits=5))
auto = auto_run[0]
auto_run[1]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,6.288205,99.98%,99.11%,99.79%,91.11%


In [26]:
res_list = [('SMOTE', SMOTE()),
            ('ADASYN', ADASYN()),
           ('BorderlineSMOTE', BorderlineSMOTE()),
           ('SVMSMOTE', SVMSMOTE())]

outputs = []
for title, res in res_list:
    print(colored(title, 'red', attrs=['bold']).center(120, "-"))
    var = auto_ml(X_trainval, y_trainval, models_dict, scaler=MinMaxScaler(),
                    cv=StratifiedKFold(n_splits=5), res_t=res)
    outputs.append(var[0])
    display(var[1])

---------------------------------------------------[1m[31mSMOTE[0m---------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 0.10846328735351562
Resampling done in 0.08298230171203613
Resampling done in 0.08604049682617188
Resampling done in 0.09255719184875488
Resampling done in 0.0778956413269043


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,29.309688,99.71%,99.93%,99.31%,95.56%


--------------------------------------------------[1m[31mADASYN[0m---------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 0.16542363166809082
Resampling done in 0.2021174430847168
Resampling done in 0.1912860870361328
Resampling done in 0.16155648231506348
Resampling done in 0.1742856502532959


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,28.4282,99.68%,99.89%,99.33%,95.11%


----------------------------------------------[1m[31mBorderlineSMOTE[0m----------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 0.16459155082702637
Resampling done in 0.16720938682556152
Resampling done in 0.1532135009765625
Resampling done in 0.1988077163696289
Resampling done in 0.20103859901428223


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,30.237097,99.76%,99.96%,99.41%,95.56%


-------------------------------------------------[1m[31mSVMSMOTE[0m--------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 2.7291440963745117
Resampling done in 2.9176416397094727
Resampling done in 2.400744676589966
Resampling done in 2.309436082839966
Resampling done in 2.2955875396728516


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,31.567078,99.71%,99.90%,99.36%,95.56%


In [31]:
# Undersampling


res_list = [('TomekLinks', TomekLinks()),
            ('NearMiss', NearMiss()),
           ('EditedNearestNeighbours', EditedNearestNeighbours()),
           ('AllKNN', AllKNN()),
           ('RepeatedEditedNearestNeighbours', RepeatedEditedNearestNeighbours())]

outputs_u = []
for title, res in res_list:
    print(colored(title, 'red', attrs=['bold']).center(120, "-"))
    var = auto_ml(X_trainval, y_trainval, models_dict, scaler=MinMaxScaler(),
                  cv=StratifiedKFold(n_splits=5), res_t=res)
    outputs_u.append(var[0])
    display(var[1])

------------------------------------------------[1m[31mTomekLinks[0m-------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 5.615335941314697
Resampling done in 4.25487494468689
Resampling done in 5.0258989334106445
Resampling done in 3.927220582962036
Resampling done in 4.770754814147949


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,16.039583,99.98%,99.33%,99.78%,92.00%


-------------------------------------------------[1m[31mNearMiss[0m--------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 0.11785316467285156
Resampling done in 0.1071479320526123
Resampling done in 0.12683677673339844
Resampling done in 0.11449790000915527
Resampling done in 0.10682368278503418


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,0.168575,100.00%,100.00%,93.97%,96.00%


------------------------------------------[1m[31mEditedNearestNeighbours[0m------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 6.9539220333099365
Resampling done in 6.492815732955933
Resampling done in 6.112780809402466
Resampling done in 6.32309627532959
Resampling done in 6.290825128555298


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,13.747526,99.99%,99.44%,99.79%,92.00%


--------------------------------------------------[1m[31mAllKNN[0m---------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 15.970970630645752
Resampling done in 15.109300136566162
Resampling done in 15.219838380813599
Resampling done in 13.710755109786987
Resampling done in 15.814285039901733


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,12.591095,99.99%,99.44%,99.80%,92.00%


--------------------------------------[1m[31mRepeatedEditedNearestNeighbours[0m--------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 22.978089094161987
Resampling done in 29.636507749557495
Resampling done in 27.01902484893799
Resampling done in 21.4951434135437
Resampling done in 27.8022141456604


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,11.405863,99.99%,99.56%,99.81%,92.44%


In [32]:
# Combined Resampling


res_list = [('SMOTETomek', SMOTETomek()),
           ('SMOTEENN', SMOTEENN())]

outputs_u = []
for title, res in res_list:
    print(colored(title, 'red', attrs=['bold']).center(120, "-"))
    var = auto_ml(X_trainval, y_trainval, models_dict, scaler=MinMaxScaler(),
                  cv=StratifiedKFold(n_splits=5), res_t=res)
    outputs_u.append(var[0])
    display(var[1])

------------------------------------------------[1m[31mSMOTETomek[0m-------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 15.827311038970947
Resampling done in 15.023146390914917
Resampling done in 15.030317544937134
Resampling done in 15.62339448928833
Resampling done in 16.001643180847168


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,28.607187,99.69%,99.92%,99.29%,95.11%


-------------------------------------------------[1m[31mSMOTEENN[0m--------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Resampling done in 21.62845778465271
Resampling done in 22.807477951049805
Resampling done in 20.844133853912354
Resampling done in 22.298982858657837
Resampling done in 22.134552240371704


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
GradientBoostingClassifier,27.329381,99.70%,99.91%,99.27%,95.56%
