In [1]:
import time
import warnings

import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import psycopg2
import awswrangler as wr
import boto3
from config import db_password
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced

warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'config'

In [None]:
# Location of the raw credit-card CSV in S3.
s3_bucket = "aws.nw.bootcamp.0805"
s3_bucket_path = "creditcard.csv"
raw_s3_path = "s3://" + s3_bucket + "/" + s3_bucket_path

# Read the CSV object straight into a DataFrame.
creditcard_df = wr.s3.read_csv(path=raw_s3_path)

In [None]:
# write data to postgres
# mode="overwrite" replaces the table, so re-running this cell does not append
# a second copy of every row (the awswrangler default mode is "append").
con = wr.postgresql.connect("my-glue-connection")
try:
    wr.postgresql.to_sql(
        df=creditcard_df,
        table="CreditCard_Transactions",
        schema="public",
        con=con,
        mode="overwrite",
    )
finally:
    # Release the connection even when the upload fails.
    con.close()


In [None]:
# connect to AWS PgAdmin to read data
# The connection used by the write cell has already been closed, so open a fresh one here.
con = wr.postgresql.connect("my-glue-connection")
try:
    creditcard_df = wr.postgresql.read_sql_table(table="CreditCard_Transactions", schema="public", con=con)
finally:
    # Release the connection even when the read fails.
    con.close()

In [None]:
# Build the SQLAlchemy URL for the local Postgres database; the password comes from config.py.
db_string = "postgresql://postgresql:{}@127.0.0.1:5432/Machine_Learning_Projects".format(db_password)
engine = create_engine(db_string)

In [None]:
# Pull the full transactions table from the local Postgres database.
transactions_query = 'select * from "CreditCard_Transactions"'
creditcard_df = pd.read_sql_query(transactions_query, con=engine)
# Discard the stored "index" column (presumably the saved DataFrame index — verify against the writer).
creditcard_df = creditcard_df.drop(columns=['index'])
creditcard_df.head()

In [None]:
# checking size of data to determine if we need to import in chunks
# .shape is the (row count, column count) of the loaded transactions table.
creditcard_df.shape

In [None]:
# Keep only the three columns needed for the visualization table.
viz_columns = ['Amount', 'Time', 'Class']
viz_df = DataFrame(creditcard_df[viz_columns], columns=viz_columns)
viz_df.head()

In [None]:
# write the visualization data to PgAdmin
# if_exists='replace' drops and recreates "Visualization_Table", so re-running this cell is safe.
viz_df.to_sql(name='Visualization_Table', con=engine, if_exists='replace')

## Initial EDA

### Data Description
Now we read the data and try to understand the meaning of each of the features. The Python module pandas provides us with the functions to read data. In the next step, we will read the data from our directory, and then we look at the first five and last five rows of the data using the head() and tail() methods.

In [None]:
# First and last five rows in one table. DataFrame.append was removed in pandas 2.0,
# so the two slices are stacked with pd.concat instead.
pd.concat([creditcard_df.head(), creditcard_df.tail()])

The time is recorded as the number of seconds since the first transaction in the data set. Therefore, we can conclude that this data set includes all transactions recorded over the course of two days. The features were prepared using PCA, so the physical interpretation of individual features does not make sense. The only features which have not been transformed with PCA are ‘Time’ and ‘Amount’. Feature ‘Class’ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [None]:
# Data type of every column in the transactions table.
creditcard_df.dtypes

## Exploration and Visualization
Determine the relative proportion of valid and fraudulent credit card transactions

In [None]:
print("Fraudulent Transactions: " + str(len(creditcard_df[creditcard_df["Class"] == 1])))
print("Valid Transactions: " + str(len(creditcard_df[creditcard_df["Class"] == 0])))
print("Proportion of Fraudulent Transactions: " + str(len(creditcard_df[creditcard_df["Class"] == 1])/ creditcard_df.shape[0]))

# Determine the number of Fraudulent transactions
fraud_proportion = creditcard_df.copy()
fraud_proportion[" "] = np.where(fraud_proportion["Class"] == 1 ,  "Fraud", "Genuine")

%matplotlib inline
# plot chart
plt.figure(figsize=(16,8))
ax1 = plt.subplot(121, aspect='equal')
fraud_proportion[" "].value_counts().plot(kind='pie',  ax=ax1, startangle=0, legend = False, fontsize=14)

The pie chart shows an imbalance in the data, with only 0.17% of the total cases being fraudulent. Next, we check if there is any difference between the number of valid transactions and fraudulent transactions.

In [None]:
# Describe the data: summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
creditcard_df.describe()

It does not make sense to evaluate the results of the description of the data since most of the variables are principal components. Next, we focus on the Time and Amount columns.

In [None]:
# Summary statistics restricted to the two columns that were not PCA-transformed.
creditcard_df[['Time','Amount']].describe()

The Amount variable is highly skewed, with 75% of all transactions below $77

In [None]:
# Amount and Time are measured on unrelated scales, so each panel keeps its own
# x-axis (sharex=True would force both densities onto a single x-range).
f, axes = plt.subplots(1, 2, figsize=(18,4), sharex=False)

amount_val = creditcard_df['Amount'].values
time_val = creditcard_df['Time'].values

# Density estimate only (no histogram bars) for each variable.
sns.distplot(amount_val, hist=False, color="c", kde_kws={"shade": True}, ax=axes[0]).set_title('Distribution of Transaction Amount')
sns.distplot(time_val, hist=False, color="c", kde_kws={"shade": True}, ax=axes[1]).set_title('Distribution of Transaction Time')

plt.show()

In [None]:
# Mean transaction amount within each class.
is_fraud = creditcard_df["Class"] == 1
is_valid = creditcard_df["Class"] == 0
fraud_mean_amount = creditcard_df.loc[is_fraud, "Amount"].mean()
valid_mean_amount = creditcard_df.loc[is_valid, "Amount"].mean()
print("Average Amount in a Fraudulent Transaction: " + str(fraud_mean_amount))
print("Average Amount in a Valid Transaction: " + str(valid_mean_amount))

The average Amount for fraudulent transactions is higher than the average for valid transactions. Next, we will try to understand the distribution of values in each of the features.

In [None]:
# Print a header line followed by the summary statistics of Amount.
header = "Summary of the feature - Amount"
rule = "\n-------------------------------"
print(header + rule)
amount_summary = creditcard_df["Amount"].describe()
print(amount_summary)

 Next we look at the distribution of each feature [grouped by Class]

In [None]:
def draw_distplots(dataframe, features, rows, cols):
    """Plot, for each feature, the density of its values split by Class.

    Args:
        dataframe: DataFrame holding the feature columns and a ``Class`` column
            (1 = fraud, 0 = valid).
        features: iterable of column names to plot, one subplot per name.
        rows: number of subplot rows in the grid.
        cols: number of subplot columns in the grid; ``rows * cols`` must be at
            least ``len(features)``.
    """
    fig = plt.figure(figsize=(20, 20))
    fraud_mask = dataframe.Class == 1
    valid_mask = dataframe.Class == 0
    for i, feature in enumerate(features):
        ax = fig.add_subplot(rows, cols, i + 1)
        # Draw both class densities on this subplot explicitly rather than on
        # whichever axes happens to be current.
        sns.distplot(dataframe.loc[fraud_mask, feature], hist=False, kde_kws={"shade": True}, label="Fraud", ax=ax)
        sns.distplot(dataframe.loc[valid_mask, feature], hist=False, kde_kws={"shade": True}, label="Valid", ax=ax)
        ax.set_xlabel("")
        ax.set_title("Distribution of Column: " + str(feature))
        ax.legend()
    fig.tight_layout()
    plt.show()


# Plot the first 30 columns on an 8x4 grid.
# NOTE(review): assumes the first 30 columns are the predictors and "Class" comes after them — confirm.
draw_distplots(creditcard_df, creditcard_df.columns[:30], 8, 4)

The Bivariate plots show that most of the features are normally distributed for valid transaction class. Conversely, the Fraud Class shows a wider spread as expected. Next, we move to data preparation, where we would handle missing data.

Data Preparation

Since we have a small number of features which are created using PCA,feature selection is not a necessary step. Next, we move on to handling missing data

In [None]:
# isnull() returns a frame of the same shape as the data, so its row count says nothing
# about missing values. Count the rows that contain at least one null instead.
rows_with_missing = int(creditcard_df.isnull().any(axis=1).sum())
rows_complete = creditcard_df.shape[0] - rows_with_missing
print("Number of cases with non-missing values: " + str(rows_complete))
print("Number of cases with missing values: " + str(rows_with_missing))

Since we do not have any missing data, the next step is to standardize the Time and Amount features using the RobustScaler. The reason for choosing the RobustScaler over the StandardScaler and the MinMaxScaler is that the RobustScaler reduces the effects of outliers. It is important to note that many machine learning algorithms perform better or converge faster when features are on a relatively similar scale and/or close to normally distributed. That is why we are taking this scaling step.

In [None]:
from sklearn.preprocessing import RobustScaler

# Rescale Time and Amount with RobustScaler.
# NOTE(review): the scaler is fitted on the full dataset (before the train/test split made
# later in the notebook) and overwrites the raw columns in place, so re-running this cell
# rescales values that are already scaled.
scaler = RobustScaler().fit(creditcard_df[["Time", "Amount"]])
creditcard_df[["Time", "Amount"]] = scaler.transform(creditcard_df[["Time", "Amount"]])

# DataFrame.append was removed in pandas 2.0; pd.concat shows the same first/last five rows.
pd.concat([creditcard_df.head(), creditcard_df.tail()])

Offline Database

In [None]:
# Persist the scaled transactions to the local Postgres database, replacing any earlier copy.
db_string = "postgresql://postgresql:{}@127.0.0.1:5432/Machine_Learning_Projects".format(db_password)
engine = create_engine(db_string)
creditcard_df.to_sql('CreditCard_Transactions_Scaled', engine, if_exists='replace')

In [None]:
# Read the scaled table back from the local Postgres database.
scaled_query = 'select * from "CreditCard_Transactions_Scaled"'
creditcard_scaled_df = pd.read_sql_query(scaled_query, con=engine)
# to_sql stored the DataFrame index as an "index" column; it is not a feature, so drop it.
creditcard_scaled_df = creditcard_scaled_df.drop(columns=['index'])
creditcard_scaled_df.head()

In [None]:
# write data to postgres
# mode="overwrite" replaces the table, so re-running this cell does not append
# a second copy of every row (the awswrangler default mode is "append").
con = wr.postgresql.connect("my-glue-connection")
try:
    wr.postgresql.to_sql(
        df=creditcard_df,
        table="CreditCard_Transactions_Scaled",
        schema="public",
        con=con,
        mode="overwrite",
    )
finally:
    # Release the connection even when the upload fails.
    con.close()

In [None]:

# connect to AWS PgAdmin to read data
# The connection used by the write cell has already been closed, so open a fresh one here.
con = wr.postgresql.connect("my-glue-connection")
try:
    creditcard_scaled_df = wr.postgresql.read_sql_table(table="CreditCard_Transactions_Scaled", schema="public", con=con)
finally:
    # Release the connection even when the read fails.
    con.close()

Outlier Detection

Outlier handling depends on the type of problem we are trying to solve. In a balanced dataset, it makes sense to remove outliers since they could potentially affect our model. In this classification problem, our dataset is highly imbalanced and we are trying to detect the outlier transactions, hence it makes sense that we do not remove the outliers found in the dataset.

Modelling
First we divide the data into response and features. And also make the train-test split of the data for further modelling and validation.

In [None]:
# Features are the first 30 columns by position; the response is the "Class" column.
X = creditcard_scaled_df.iloc[:, :30]
y = creditcard_scaled_df["Class"]

# Stratified split with a fixed seed; the test fraction is left at the library default.
from sklearn.model_selection import train_test_split
split_options = dict(random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


As we identified earlier, the dataset is highly imbalanced. Fitting a model on this dataset will result in overfitting towards the majority class. To illustrate, we run one model (Random Forest or logistic regression) on the imbalanced data and see the performance.

In [None]:
# Define the logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:

# Train a plain logistic regression on the imbalanced training data (no resampling).
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)
log_classifier.fit(X_train,y_train)

# Evaluate on the held-out test set; precision and recall are computed for the fraud class (label 1).
y_pred_lrc = log_classifier.predict(X_test)
print(f"The accuracy of the model is: {accuracy_score(y_test,y_pred_lrc):.4f}")
print(f"The precision of the model is: {precision_score(y_test,y_pred_lrc):.4f}")
print(f"The recall of the model is: {recall_score(y_test,y_pred_lrc):.4f}")

In [None]:
# Keep the naive logistic-regression test metrics for the comparison table built later.
logistic_accuracy = accuracy_score(y_test, y_pred_lrc)
logistic_precision = precision_score(y_test, y_pred_lrc)
logistic_recall = recall_score(y_test, y_pred_lrc)

In [None]:
# Confusion matrix of the naive logistic regression on the test set.
from sklearn.metrics import confusion_matrix
cm_lrc = confusion_matrix(y_test, y_pred_lrc)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_lrc_df = pd.DataFrame(cm_lrc, index=actual_labels, columns=predicted_labels)
cm_lrc_df

While model accuracy is 100%, our classifier did not do an excellent job at predicting fraudulent transactions. With precision and recall of 0.84 and 0.62, we would need a better understanding of the dataset to determine the best preprocessing steps to take

In [None]:
# Time the fit; `time` is imported in the first (imports) cell.
start_time = time.time()
# Define the random forest model
from sklearn.ensemble import RandomForestClassifier

# Fit the model on the imbalanced training data. A fixed random_state makes the
# forest, and therefore the reported metrics, reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
print(f'Done. {time.time() - start_time} total seconds elapsed')

In [None]:
# Evaluate the naive random forest on the held-out test set.
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred_rfc = rfc.predict(X_test)
print(f"The accuracy of the random forest model is: {accuracy_score(y_test,y_pred_rfc):.4f}")
print(f"The precision of the random forest model is: {precision_score(y_test,y_pred_rfc):.4f}")
print(f"The recall of the random forest model is: {recall_score(y_test,y_pred_rfc):.4f}")

In [None]:

# Keep the naive random-forest test metrics for the comparison table built later.
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
rfc_precision = precision_score(y_test, y_pred_rfc)
rfc_recall = recall_score(y_test, y_pred_rfc)

In [None]:
# Confusion matrix of the naive random forest on the test set.
from sklearn.metrics import confusion_matrix
cm_rfc = confusion_matrix(y_test, y_pred_rfc)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_rfc_df = pd.DataFrame(cm_rfc, index=actual_labels, columns=predicted_labels)
cm_rfc_df

# Naive Model Results

In [None]:
# One row per naive classifier: name followed by its test-set metrics.
score_columns = ['Classifier', 'Accuracy', 'Recall Score', 'Precision Score']
naive_data_score = [
    ['Logistic Regression', logistic_accuracy, logistic_recall, logistic_precision],
    ['Random Forest', rfc_accuracy, rfc_recall, rfc_precision],
]

# Assemble the comparison table and display it.
naive_data_table = pd.DataFrame(naive_data_score, columns=score_columns)
naive_data_table

While model accuracy is 100%, and precision is 95%, our random forest classifier only achieved a 77% recall. We would need a better understanding of the dataset to determine the best preprocessing steps to take.
One thing to notice here is that only 0.17% of the cases are fraudulent transactions, so a model predicting all transactions to be valid would have similar accuracy. We therefore need to train our model in a way that is not overfitted to either of the classes. For this, we introduce oversampling and undersampling methods. Oversampling resamples from the minority class to balance the class proportions, and undersampling merges or removes similar observations from the majority class to achieve the same.

Undersampling
In this section we first describe the structure of the modelling and validation. One point to note is that we will not undersample the test data, as we want our model to eventually perform well with skewed class distributions. The steps are as follows (the whole set-up is structured using the imbalanced-learn module):

Use a stratified k-fold cross validation on the training set (described here as 5-fold; note that the code below currently sets `n_splits=2`)
On each of the folds use undersampling
Fit the model on the training folds and validate on the validation fold

In [None]:
# Cross-validation tooling shared by every resampling experiment below.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV

# Two stratified, shuffled folds with a fixed seed.
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

In [None]:
# Import the imbalance Learn module
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Undersampling - Logistic Regression

In [None]:
start_time = time.time()
# Pipeline: NearMiss undersampling followed by a logistic regression.
imba_pipeline = make_pipeline(NearMiss(), LogisticRegression())

# Hyper-parameter grid, re-keyed with the pipeline step name so GridSearchCV can route it.
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.01, 0.1, 1, 100], 'solver': ['liblinear']}
new_params = {f'logisticregression__{name}': values for name, values in log_reg_params.items()}

grid_imba_log_reg = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, n_jobs=-1, return_train_score=True)
grid_imba_log_reg.fit(X_train, y_train);
elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
start_time = time.time()
# Nested cross-validation: the whole grid search is re-run inside each outer fold, scored on recall.
logistic_cv_score_us = cross_val_score(grid_imba_log_reg, X_train, y_train, scoring='recall', cv=kf, n_jobs=-1)

# Score the tuned classifier (the final step of the best pipeline) on the untouched test set.
log_reg_us = grid_imba_log_reg.best_estimator_
tuned_log_reg = log_reg_us.named_steps['logisticregression']
pred_log_reg_us = tuned_log_reg.predict(X_test)
logistic_accuracy_us = accuracy_score(y_test, pred_log_reg_us)
logistic_precision_us = precision_score(y_test, pred_log_reg_us)
logistic_recall_us = recall_score(y_test, pred_log_reg_us)

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
# Confusion matrix of the undersampled logistic regression on the test set.
from sklearn.metrics import confusion_matrix
cm_pred_log_reg_us = confusion_matrix(y_test, pred_log_reg_us)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_pred_log_reg_us_df = pd.DataFrame(cm_pred_log_reg_us, index=actual_labels, columns=predicted_labels)
cm_pred_log_reg_us_df

In [None]:
# Report the test-set metrics of the undersampled logistic regression.
print(f"The accuracy of the logistic regression-us model is: {logistic_accuracy_us:.4f}")
print(f"The precision of the logistic regression-us model is: {logistic_precision_us:.4f}")
print(f"The recall of the logistic regression-us model is: {logistic_recall_us:.4f}")

In [None]:
# Show the tuned pipeline together with its per-fold recall scores from the nested cross-validation.
log_reg_us, logistic_cv_score_us

In [None]:
# Start the table that accumulates one ROC curve per classifier
from sklearn.metrics import roc_curve, roc_auc_score

# Probability of the fraud class (column 1) for every test row.
yproba = grid_imba_log_reg.best_estimator_.named_steps['logisticregression'].predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0, so the table is built directly from its
# first row. Each fpr/tpr cell holds the whole array for that classifier.
result_table = pd.DataFrame(
    [{'classifiers': "Logistic Regression", 'fpr': fpr, 'tpr': tpr, 'auc': auc}],
    columns=['classifiers', 'fpr', 'tpr', 'auc'],
)

Undersampling - Random Forest

In [None]:
start_time = time.time()
# Pipeline: NearMiss undersampling followed by a random forest.
imba_pipeline = make_pipeline(NearMiss(), RandomForestClassifier())

# Hyper-parameter grid, re-keyed with the pipeline step name so GridSearchCV can route it.
params = {'n_estimators': [50, 100, 200], 'max_depth': [4, 6, 10, 12], 'random_state': [13]}
new_params = {f'randomforestclassifier__{name}': values for name, values in params.items()}

grid_imba_rf = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, n_jobs=-1, return_train_score=True)
grid_imba_rf.fit(X_train, y_train);
elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
start_time = time.time()
# Nested cross-validation: the whole grid search is re-run inside each outer fold, scored on recall.
rfc_cv_score_us = cross_val_score(grid_imba_rf, X_train, y_train, scoring='recall', cv=kf, n_jobs=-1)

# Score the tuned forest (the final step of the best pipeline) on the untouched test set.
rfc_us = grid_imba_rf.best_estimator_
tuned_forest = rfc_us.named_steps['randomforestclassifier']
pred_rfc_us = tuned_forest.predict(X_test)
rfc_accuracy_us = accuracy_score(y_test, pred_rfc_us)
rfc_precision_us = precision_score(y_test, pred_rfc_us)
rfc_recall_us = recall_score(y_test, pred_rfc_us)

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
# Confusion matrix of the undersampled random forest on the test set.
from sklearn.metrics import confusion_matrix
cm_pred_rfc_us = confusion_matrix(y_test, pred_rfc_us)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_pred_rfc_us_df = pd.DataFrame(cm_pred_rfc_us, index=actual_labels, columns=predicted_labels)
cm_pred_rfc_us_df

In [None]:
# Report the test-set metrics of the undersampled random forest.
print(f"The accuracy of the random forest-us classifier is: {rfc_accuracy_us:.4f}")
print(f"The precision of the random forest-us classifier is: {rfc_precision_us:.4f}")
print(f"The recall of the random forest-us classifier is: {rfc_recall_us:.4f}")

In [None]:

# Show the tuned pipeline together with its per-fold recall scores from the nested cross-validation.
rfc_us, rfc_cv_score_us

In [None]:
# Add the random forest's ROC curve to the cumulative table
# Probability of the fraud class (column 1) for every test row.
yproba = grid_imba_rf.best_estimator_.named_steps['randomforestclassifier'].predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; stack a one-row frame with pd.concat instead.
roc_row = pd.DataFrame([{'classifiers': "Random Forest", 'fpr': fpr, 'tpr': tpr, 'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

Undersampling - Support Vector Classifier

In [None]:
start_time = time.time()
# Pipeline: NearMiss undersampling followed by an SVC; probability=True enables predict_proba.
imba_pipeline = make_pipeline(NearMiss(), SVC(probability=True))

# Hyper-parameter grid, re-keyed with the pipeline step name so GridSearchCV can route it.
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
new_params = {f'svc__{name}': values for name, values in svc_params.items()}

grid_imba_svc = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, n_jobs=-1, return_train_score=True)
grid_imba_svc.fit(X_train, y_train);

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
start_time = time.time()
# Nested cross-validation: the whole grid search is re-run inside each outer fold, scored on recall.
svc_cv_score_us = cross_val_score(grid_imba_svc, X_train, y_train, scoring='recall', cv=kf, n_jobs=-1)

# Score the tuned SVC (the final step of the best pipeline) on the untouched test set.
svc_us = grid_imba_svc.best_estimator_
tuned_svc = svc_us.named_steps['svc']
pred_svc_us = tuned_svc.predict(X_test)
svc_accuracy_us = accuracy_score(y_test, pred_svc_us)
svc_precision_us = precision_score(y_test, pred_svc_us)
svc_recall_us = recall_score(y_test, pred_svc_us)

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
# Confusion matrix of the undersampled support vector classifier on the test set.
from sklearn.metrics import confusion_matrix
cm_pred_svc_us = confusion_matrix(y_test, pred_svc_us)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_pred_svc_us_df = pd.DataFrame(cm_pred_svc_us, index=actual_labels, columns=predicted_labels)
cm_pred_svc_us_df

In [None]:
# Report the test-set metrics of the undersampled support vector classifier.
print(f"The accuracy of the support vector - undersampling model is: {svc_accuracy_us:.4f}")
print(f"The precision of the support vector - undersampling model is: {svc_precision_us:.4f}")
print(f"The recall of the support vector - undersampling model is: {svc_recall_us:.4f}")

In [None]:
# Show the tuned pipeline together with its per-fold recall scores from the nested cross-validation.
svc_us, svc_cv_score_us

In [None]:
# Add the support vector classifier's ROC curve to the cumulative table
# Probability of the fraud class (column 1) for every test row.
yproba = grid_imba_svc.best_estimator_.named_steps['svc'].predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; stack a one-row frame with pd.concat instead.
roc_row = pd.DataFrame([{'classifiers': "Support Vector Classifier", 'fpr': fpr, 'tpr': tpr, 'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

Undersampling - Decision Tree Classifier

In [None]:
start_time = time.time()
# Pipeline: NearMiss undersampling followed by a decision tree.
imba_pipeline = make_pipeline(NearMiss(), DecisionTreeClassifier())

# Hyper-parameter grid, re-keyed with the pipeline step name so GridSearchCV can route it.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2, 4, 1)), "min_samples_leaf": list(range(5, 7, 1))}
new_params = {f'decisiontreeclassifier__{name}': values for name, values in tree_params.items()}

grid_imba_tree = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, n_jobs=-1, return_train_score=True)
grid_imba_tree.fit(X_train, y_train);
elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:

start_time = time.time()
# Nested cross-validation: the whole grid search is re-run inside each outer fold, scored on recall.
dtree_cv_score_us = cross_val_score(grid_imba_tree, X_train, y_train, scoring='recall', cv=kf, n_jobs=-1)

# Score the tuned tree (the final step of the best pipeline) on the untouched test set.
dtc_us = grid_imba_tree.best_estimator_
tuned_tree = dtc_us.named_steps['decisiontreeclassifier']
pred_dtc_us = tuned_tree.predict(X_test)
dtree_accuracy_us = accuracy_score(y_test, pred_dtc_us)
dtree_precision_us = precision_score(y_test, pred_dtc_us)
dtree_recall_us = recall_score(y_test, pred_dtc_us)

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
# Confusion matrix of the undersampled decision tree on the test set.
from sklearn.metrics import confusion_matrix
cm_pred_dtc_us = confusion_matrix(y_test, pred_dtc_us)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_pred_dtc_us_df = pd.DataFrame(cm_pred_dtc_us, index=actual_labels, columns=predicted_labels)
cm_pred_dtc_us_df

In [None]:
# Report the test-set metrics of the undersampled decision tree.
print(f"The accuracy of the decision tree-us classifier is: {dtree_accuracy_us:.4f}")
print(f"The precision of the decision tree-us classifier is: {dtree_precision_us:.4f}")
print(f"The recall of the decision tree-us classifier is: {dtree_recall_us:.4f}")

In [None]:
# Show the tuned pipeline together with its per-fold recall scores from the nested cross-validation.
dtc_us, dtree_cv_score_us

In [None]:
# Add the decision tree's ROC curve to the cumulative table
# Probability of the fraud class (column 1) for every test row.
yproba = grid_imba_tree.best_estimator_.named_steps['decisiontreeclassifier'].predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; stack a one-row frame with pd.concat instead.
roc_row = pd.DataFrame([{'classifiers': "Decision Tree", 'fpr': fpr, 'tpr': tpr, 'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

Undersampling - k-Nearest Neighbour Classifier

In [None]:
start_time = time.time()
# Pipeline: NearMiss undersampling followed by a k-nearest-neighbours classifier.
imba_pipeline = make_pipeline(NearMiss(), KNeighborsClassifier())

# Hyper-parameter grid, re-keyed with the pipeline step name so GridSearchCV can route it.
knears_params = {"n_neighbors": list(range(2, 5, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
new_params = {f'kneighborsclassifier__{name}': values for name, values in knears_params.items()}

grid_imba_knn = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, n_jobs=-1, return_train_score=True)
grid_imba_knn.fit(X_train, y_train);
elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
start_time = time.time()
# Nested cross-validation: the whole grid search is re-run inside each outer fold, scored on recall.
knear_cv_score_us = cross_val_score(grid_imba_knn, X_train, y_train, scoring='recall', cv=kf, n_jobs=-1)

# Score the tuned KNN (the final step of the best pipeline) on the untouched test set.
knn_us = grid_imba_knn.best_estimator_
tuned_knn = knn_us.named_steps['kneighborsclassifier']
pred_knn_us = tuned_knn.predict(X_test)
knear_accuracy_us = accuracy_score(y_test, pred_knn_us)
knear_precision_us = precision_score(y_test, pred_knn_us)
knear_recall_us = recall_score(y_test, pred_knn_us)

elapsed = time.time() - start_time
print(f'Done. {elapsed} total seconds elapsed')

In [None]:
# Confusion matrix of the undersampled k-nearest-neighbours classifier on the test set.
from sklearn.metrics import confusion_matrix
cm_pred_knn_us = confusion_matrix(y_test, pred_knn_us)

# Rows are the true classes, columns the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_pred_knn_us_df = pd.DataFrame(cm_pred_knn_us, index=actual_labels, columns=predicted_labels)
cm_pred_knn_us_df

In [None]:
# Report the test-set metrics of the undersampled k-nearest-neighbours classifier.
print(f"The accuracy of the KNN - undersampling model is: {knear_accuracy_us:.4f}")
print(f"The precision of the KNN - undersampling model is: {knear_precision_us:.4f}")
print(f"The recall of the KNN - undersampling model is: {knear_recall_us:.4f}")

In [None]:
# Show the tuned pipeline together with its per-fold recall scores from the nested cross-validation.
knn_us, knear_cv_score_us

In [None]:
# Add the k-nearest-neighbour classifier's ROC curve to the cumulative table
# Probability of the fraud class (column 1) for every test row.
yproba = grid_imba_knn.best_estimator_.named_steps['kneighborsclassifier'].predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; stack a one-row frame with pd.concat instead.
roc_row = pd.DataFrame([{'classifiers': "k-Nearest Neighbour", 'fpr': fpr, 'tpr': tpr, 'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

Summarize the undersampling model performances

In [None]:
# One row per undersampled classifier: name, mean nested-CV recall, then the test-set metrics.
score_columns = ['Classifier', 'CV Score', 'Accuracy', 'Recall Score', 'Precision Score']
data_score = [
    ['Logistic Regression', logistic_cv_score_us.mean(), logistic_accuracy_us, logistic_recall_us, logistic_precision_us],
    ['Random Forest', rfc_cv_score_us.mean(), rfc_accuracy_us, rfc_recall_us, rfc_precision_us],
    ['Support Vector', svc_cv_score_us.mean(), svc_accuracy_us, svc_recall_us, svc_precision_us],
    ['Decision Tree', dtree_cv_score_us.mean(), dtree_accuracy_us, dtree_recall_us, dtree_precision_us],
    ['k-Nearest Neighbour', knear_cv_score_us.mean(), knear_accuracy_us, knear_recall_us, knear_precision_us],
]

# Assemble the comparison table and display it.
data_table = pd.DataFrame(data_score, columns=score_columns)
data_table

Now we plot the ROC curve for the above classifiers.

In [None]:
# Plot the ROC curve for undersampling
# Index a copy by classifier name instead of mutating result_table in place: the in-place
# set_index consumed the 'classifiers' column, so re-running the cell raised a KeyError.
roc_table = result_table.set_index('classifiers')
fig = plt.figure(figsize=(17,7))

# One curve per classifier, labelled with its AUC.
for name, row in roc_table.iterrows():
    plt.plot(row['fpr'],
             row['tpr'],
             label="{}, AUC={:.3f}".format(name, row['auc']))

# Diagonal reference line: a classifier that guesses at random.
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis for Undersampling', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

#The Learning Curve

Here we choose 4 models and look at the trend of training and cross-validation scores over varying training sizes. A cross-validation generator splits the whole dataset k times into training and test data. Subsets of the training set with varying sizes are used to train the estimator, and a score for each training subset size and the test set is computed. Afterwards, the scores are averaged over all k runs for each training subset size.

In [None]:
# Work on a copy so the balanced-sample experiment below does not modify creditcard_scaled_df.
data = creditcard_scaled_df.copy()

In [None]:
# Let's Plot LogisticRegression Learning Curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

def _plot_single_learning_curve(ax, estimator, title, X, y, cv, n_jobs, train_sizes):
    """Draw the recall learning curve of one estimator on the given axes."""
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring="recall")
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Shaded bands: mean +/- one standard deviation across the CV splits.
    ax.fill_between(sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="r")
    ax.fill_between(sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color="g")
    ax.plot(sizes, train_mean, 'o-', color="r", label="Training score")
    ax.plot(sizes, test_mean, 'o-', color="g", label="Cross-validation score")
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Training size (m)')
    ax.set_ylabel('Score')
    ax.grid(True)
    ax.legend(loc="best")


def plot_learning_curve(estimator1, estimator2, estimator3, estimator4, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot recall learning curves for four estimators on a 2x2 grid.

    The panel titles are fixed, so the estimators are expected in this order:
    logistic regression, k-nearest neighbours, decision tree, random forest.

    Args:
        estimator1..estimator4: estimators passed to ``learning_curve``.
        X, y: features and labels used for every curve.
        ylim: optional ``(low, high)`` y-axis limits; the panels share the y-axis.
        cv: cross-validation splitter or fold count forwarded to ``learning_curve``.
        n_jobs: parallel jobs forwarded to ``learning_curve``.
        train_sizes: training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, as the original implementation did.
    """
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(20,14), sharey=True)
    if ylim is not None:
        plt.ylim(*ylim)
    panels = [
        (ax1, estimator1, "Logistic Regression Learning Curve"),
        (ax2, estimator2, "Knears Neighbors Learning Curve"),
        (ax3, estimator3, "Decision Tree Classifier \n Learning Curve"),
        (ax4, estimator4, "Random Forest Classifier \n Learning Curve"),
    ]
    # Every panel receives the caller's train_sizes unchanged; the sizes returned by
    # learning_curve stay local to the helper instead of overwriting the argument.
    for ax, estimator, title in panels:
        _plot_single_learning_curve(ax, estimator, title, X, y, cv, n_jobs, train_sizes)
    return plt
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)

df = data.sample(frac=1)

# amount of fraud classes 492 rows.
fraud_df = df.loc[df['Class'] == 1]
genuine_df = df.loc[df['Class'] == 0][:492]

normal_distributed_df = pd.concat([fraud_df, genuine_df])

# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)
y = new_df["Class"]
X = new_df.iloc[:,0:30]
%matplotlib inline
plot_learning_curve(log_reg_us, knn_us, dtc_us, rfc_us, X, y, (0.8, 1));

# Oversampling

In this section we first describe the structure of the modelling and validation. Note that we do not oversample the test data, because we ultimately want the model to perform well on the skewed class distribution. The steps are as follows (the whole set-up is built with the imbalanced-learn package):

Use a 5-fold cross validation on the training set
On each of the folds use oversampling
Fit the model on the training folds and validate on the validation fold
Note that we will use the best model parameters as obtained from grid-search algorithm in Undersampling.

Oversampling - Logistic Regression

In [None]:
start_time = time.time()
# Pipeline: SMOTE resampling followed by logistic regression.
imba_pipeline = make_pipeline(SMOTE(random_state=42), LogisticRegression())

# The C grid previously listed 100 twice; the missing decade (10) is restored.
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}
# Prefix every key with the pipeline step name so the search can route it.
new_params = {'logisticregression__' + key: log_reg_params[key] for key in log_reg_params}

# Seeded so the sampled candidates are identical between runs.
ran_imba_log_reg = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, n_jobs=-1,
                                      scoring='recall', return_train_score=True, random_state=42)
ran_imba_log_reg.fit(X_train, y_train);
print(f'Done. {time.time() - start_time} total seconds elapsed')

In [None]:
start_time = time.time()

# Cross-validated recall of the whole search procedure on the training data.
logistic_cv_score_os = cross_val_score(
    ran_imba_log_reg,
    X_train,
    y_train,
    cv=kf,
    scoring='recall',
    n_jobs=-1,
)

# Best pipeline found by the search; its final step predicts the test set.
log_reg_os = ran_imba_log_reg.best_estimator_
pred_log_reg_os = log_reg_os.named_steps['logisticregression'].predict(X_test)

# Test-set metrics.
logistic_accuracy_os = accuracy_score(y_test, pred_log_reg_os)
logistic_precision_os = precision_score(y_test, pred_log_reg_os)
logistic_recall_os = recall_score(y_test, pred_log_reg_os)
print(f'Done. {time.time() - start_time} total seconds elapsed')

In [None]:
# Confusion matrix of the logistic-regression test predictions.
from sklearn.metrics import confusion_matrix

cm_log_reg_os = confusion_matrix(y_test, pred_log_reg_os)

# Wrap the matrix in a labelled DataFrame: actual class on rows, predicted on columns.
cm_log_reg_os_df = pd.DataFrame(
    data=cm_log_reg_os,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_log_reg_os_df

In [None]:
# Report the test-set metrics of the oversampled logistic regression
# (spelling of "logistic" and "precision" corrected in the messages).
print(f"The accuracy of the logistic regression-os classifier is: {logistic_accuracy_os:.4f}")
print(f"The precision of the logistic regression-os classifier is: {logistic_precision_os:.4f}")
print(f"The recall of the logistic regression-os classifier is: {logistic_recall_os:.4f}")

In [None]:
# Display the best logistic-regression pipeline together with its cross-validated recall scores.
log_reg_os, logistic_cv_score_os

Oversampling - Random Forest

In [None]:
start_time = time.time()
# Pipeline: SMOTE resampling followed by a random forest.
imba_pipeline = make_pipeline(SMOTE(random_state=42), RandomForestClassifier())

params = {'n_estimators': [50, 100, 200], 'max_depth': [4, 6, 10, 12], 'random_state': [13] }
# Prefix every key with the pipeline step name so the search can route it.
new_params = {'randomforestclassifier__' + key: params[key] for key in params}

# The grid holds 12 combinations but the search samples fewer by default, so
# it is seeded to evaluate the same candidates on every run.
ran_imba_rf = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, n_jobs=-1,
                                 scoring='recall', return_train_score=True, random_state=42)
ran_imba_rf.fit(X_train, y_train);
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
# Hyper-parameter grid explored for the XGBoost classifier.
param_grid = {
    "max_depth": [3, 4, 7, 10, 25],
    "gamma": [0.5, 1, 5, 10, 25],
    "min_child_weight": [1, 3, 5, 10, 25],
    "reg_lambda": [5, 10, 50, 100, 300],
    "scale_pos_weight": [1, 3, 5, 10, 25]
}
# Successive-halving grid search scored by ROC AUC; seeded so the sample
# subsets drawn in each halving round are reproducible.
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")
halving_cv = HalvingGridSearchCV(xgb_cl, param_grid, scoring="roc_auc", n_jobs=-1,
                                 min_resources="exhaust", factor=3, random_state=42)
halving_cv.fit(X_train, y_train)
# Only the last expression of a cell is rendered, so return the best
# parameters and the best score together instead of discarding the first.
halving_cv.best_params_, halving_cv.best_score_

In [None]:
# Cross-validated recall of the whole random-forest search on the training data.
start_time = time.time()
rf_cv_score_os = cross_val_score(
    ran_imba_rf,
    X_train,
    y_train,
    cv=kf,
    scoring='recall',
    n_jobs=-1,
)

print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
start_time = time.time()

# Best pipeline found by the search; its final step predicts the test set.
rfc_os = ran_imba_rf.best_estimator_
pred_rfc_os = rfc_os.named_steps['randomforestclassifier'].predict(X_test)

# Test-set metrics.
rfc_accuracy_os = accuracy_score(y_test, pred_rfc_os)
rfc_precision_os = precision_score(y_test, pred_rfc_os)
rfc_recall_os = recall_score(y_test, pred_rfc_os)
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
# Confusion matrix of the random-forest test predictions.
from sklearn.metrics import confusion_matrix

cm_pred_rfc_os = confusion_matrix(y_test, pred_rfc_os)

# Wrap the matrix in a labelled DataFrame: actual class on rows, predicted on columns.
cm_pred_rfc_os_df = pd.DataFrame(
    data=cm_pred_rfc_os,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_pred_rfc_os_df

In [None]:
# Report the test-set metrics of the oversampled random forest
# (spelling of "precision" corrected in the message).
print(f"The accuracy of the random forest - oversampling model is: {rfc_accuracy_os:.4f}")
print(f"The precision of the random forest - oversampling model is: {rfc_precision_os:.4f}")
print(f"The recall of the random forest - oversampling model is: {rfc_recall_os:.4f}")

In [None]:
# Display the best random-forest pipeline together with its cross-validated recall scores.
rfc_os, rf_cv_score_os

In [None]:
# Cumulatively create a table for the ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of class 1 from the fitted random forest.
yproba = ran_imba_rf.best_estimator_.named_steps['randomforestclassifier'].predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
roc_row = pd.DataFrame([{'classifiers': "Random Forest",
                         'fpr': fpr,
                         'tpr': tpr,
                         'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

# Oversampling - Support Vector Classifier

In [None]:
start_time = time.time()
# Pipeline: SMOTE resampling followed by a support vector classifier.
imba_pipeline = make_pipeline(SMOTE(random_state=42), SVC())

svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
# Prefix every key with the pipeline step name so the search can route it.
new_params = {'svc__' + key: svc_params[key] for key in svc_params}
# The grid holds 16 combinations but the search samples fewer by default, so
# it is seeded to evaluate the same candidates on every run.
ran_imba_svc = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, n_jobs=-1,
                                  scoring='recall', return_train_score=True, random_state=42)
ran_imba_svc.fit(X_train, y_train);
print(f'Done. {(time.time() - start_time)} total seconds elapsed')

In [None]:

start_time = time.time()
# The cell that builds ran_imba_svc already fits it. Fit here only when that
# has not happened, rather than paying for a second full search.
if not hasattr(ran_imba_svc, 'best_estimator_'):
    ran_imba_svc.fit(X_train, y_train);
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:

start_time = time.time()

# Cross-validated recall of the whole SVC search on the training data.
svc_cv_score_os = cross_val_score(
    ran_imba_svc,
    X_train,
    y_train,
    cv=kf,
    scoring='recall',
    n_jobs=-1,
)

# Best pipeline found by the search; its final step predicts the test set.
svc_os = ran_imba_svc.best_estimator_
pred_svc_os = svc_os.named_steps['svc'].predict(X_test)

# Test-set metrics.
svc_accuracy_os = accuracy_score(y_test, pred_svc_os)
svc_precision_os = precision_score(y_test, pred_svc_os)
svc_recall_os = recall_score(y_test, pred_svc_os)
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
# Confusion matrix of the support-vector test predictions.
from sklearn.metrics import confusion_matrix

cm_pred_svc_os = confusion_matrix(y_test, pred_svc_os)

# Wrap the matrix in a labelled DataFrame: actual class on rows, predicted on columns.
cm_pred_svc_os_df = pd.DataFrame(
    data=cm_pred_svc_os,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_pred_svc_os_df

In [None]:
# Report the test-set metrics of the oversampled support vector classifier
# (spelling of "precision" corrected in the message).
print(f"The accuracy of the support vector-os classifier is: {svc_accuracy_os:.4f}")
print(f"The precision of the support vector-os classifier is: {svc_precision_os:.4f}")
print(f"The recall of the support vector-os classifier is: {svc_recall_os:.4f}")

In [None]:
# Display the best SVC pipeline together with its cross-validated recall scores.
svc_os, svc_cv_score_os

# Oversampling - Decision Tree Classifier

In [None]:
start_time = time.time()
# DecisionTree Classifier
# This is the oversampling section, so the resampler is SMOTE like the sibling
# pipelines; NearMiss, used here before, is an undersampling method.
imba_pipeline = make_pipeline(SMOTE(random_state=42), DecisionTreeClassifier())

tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), "min_samples_leaf": list(range(5,7,1))}
# Prefix every key with the pipeline step name so the search can route it.
new_params = {'decisiontreeclassifier__' + key: tree_params[key] for key in tree_params}

# Select on recall, matching the other oversampling searches in this notebook.
grid_imba_tree = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, scoring='recall', return_train_score=True)

grid_imba_tree.fit(X_train, y_train);
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
start_time = time.time()

# Cross-validated recall of the whole decision-tree search on the training data.
# NOTE(review): the "_us" suffix looks like a leftover from the undersampling
# section; renaming it would also require updating the cell that displays it.
dtree_cv_score_us = cross_val_score(
    grid_imba_tree,
    X_train,
    y_train,
    cv=kf,
    scoring='recall',
)

# Best pipeline found by the search; its final step predicts the test set.
tree_clf_os = grid_imba_tree.best_estimator_
pred_dtc_os = tree_clf_os.named_steps['decisiontreeclassifier'].predict(X_test)

# Test-set metrics.
dtree_accuracy_os = accuracy_score(y_test, pred_dtc_os)
dtree_precision_os = precision_score(y_test, pred_dtc_os)
dtree_recall_os = recall_score(y_test, pred_dtc_os)
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
# Confusion matrix of the decision-tree test predictions.
from sklearn.metrics import confusion_matrix

cm_pred_dtc_os = confusion_matrix(y_test, pred_dtc_os)

# Wrap the matrix in a labelled DataFrame: actual class on rows, predicted on columns.
cm_pred_dtc_os_df = pd.DataFrame(
    data=cm_pred_dtc_os,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_pred_dtc_os_df

In [None]:
# Report the test-set metrics of the oversampled decision tree
# (spelling of "precision" corrected in the message).
print(f"The accuracy of the decision tree - oversampling model is: {dtree_accuracy_os:.4f}")
print(f"The precision of the decision tree - oversampling model is: {dtree_precision_os:.4f}")
print(f"The recall of the decision tree - oversampling model is: {dtree_recall_os:.4f}")

In [None]:
# Display the best decision-tree pipeline together with its cross-validated recall scores.
# NOTE(review): the score variable carries a "_us" suffix although it is computed
# in this oversampling section - consider renaming it consistently.
tree_clf_os, dtree_cv_score_us

In [None]:
# Cumulatively create a table for the ROC curve
# Predicted probability of class 1 from the fitted decision tree.
yproba = grid_imba_tree.best_estimator_.named_steps['decisiontreeclassifier'].predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
roc_row = pd.DataFrame([{'classifiers': "Decision Tree Classifier",
                         'fpr': fpr,
                         'tpr': tpr,
                         'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)

Oversampling - K-Nearest Neighbour Classifier

In [None]:
start_time = time.time()
# Pipeline: SMOTE resampling followed by a k-nearest-neighbours classifier.
imba_pipeline = make_pipeline(SMOTE(random_state=42), KNeighborsClassifier())

knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
# Prefix every key with the pipeline step name so the search can route it.
new_params = {'kneighborsclassifier__' + key: knears_params[key] for key in knears_params}

# The grid holds 12 combinations but the search samples fewer by default, so
# it is seeded to evaluate the same candidates on every run.
ran_imba_knn = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, n_jobs=-1,
                                  scoring='recall', return_train_score=True, random_state=42)
ran_imba_knn.fit(X_train, y_train);
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
start_time = time.time()

# Cross-validated recall of the whole k-NN search on the training data.
knear_cv_score_os = cross_val_score(
    ran_imba_knn,
    X_train,
    y_train,
    cv=kf,
    scoring='recall',
    n_jobs=-1,
)

# Best pipeline found by the search; its final step predicts the test set.
knn_os = ran_imba_knn.best_estimator_
pred_knn_os = knn_os.named_steps['kneighborsclassifier'].predict(X_test)

# Test-set metrics.
knn_accuracy_os = accuracy_score(y_test, pred_knn_os)
knn_precision_os = precision_score(y_test, pred_knn_os)
knn_recall_os = recall_score(y_test, pred_knn_os)
print(f'Done. {(time.time() - start_time)/60.0} total minutes elapsed')

In [None]:
# Confusion matrix of the k-NN test predictions.
from sklearn.metrics import confusion_matrix

cm_pred_knn_os = confusion_matrix(y_test, pred_knn_os)

# Wrap the matrix in a labelled DataFrame: actual class on rows, predicted on columns.
cm_pred_knn_os_df = pd.DataFrame(
    data=cm_pred_knn_os,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_pred_knn_os_df

In [None]:
# Report the test-set metrics of the oversampled k-NN model
# (spelling of "precision" corrected in the message).
print(f"The accuracy of the knn - oversampling model is: {knn_accuracy_os:.4f}")
print(f"The precision of the knn - oversampling model is: {knn_precision_os:.4f}")
print(f"The recall of the knn - oversampling model is: {knn_recall_os:.4f}")

In [None]:
# Display the best k-NN pipeline together with its cross-validated recall scores.
knn_os, knear_cv_score_os

In [None]:
# Cumulatively create a table for the ROC curve
# Predicted probability of class 1 from the fitted k-NN pipeline. This must come
# from the k-NN search: the decision-tree search used here before has no
# 'kneighborsclassifier' step, so the lookup raised a KeyError.
yproba = ran_imba_knn.best_estimator_.named_steps['kneighborsclassifier'].predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, yproba)
auc = roc_auc_score(y_test, yproba)

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
roc_row = pd.DataFrame([{'classifiers': "k-Nearest Neighbour",
                         'fpr': fpr,
                         'tpr': tpr,
                         'auc': auc}])
result_table = pd.concat([result_table, roc_row], ignore_index=True)


Now we summarize the accuracy and recall scores in a table for comparison.

In [None]:
# Gather the scores
# One row per classifier: accuracy (undersampling, oversampling) followed by
# recall (undersampling, oversampling), matching the column order below.
# The k-NN row previously repeated its recall values in the accuracy columns
# and used a "knear_" prefix that the oversampling cells do not define.
# NOTE(review): assumes the undersampling section names its k-NN metrics
# knn_accuracy_us / knn_recall_us, mirroring the "_os" names - TODO confirm.
final_scores = [['Logistic Regression', logistic_accuracy_us, logistic_accuracy_os, logistic_recall_us, logistic_recall_os],
        ['Random Forest', rfc_accuracy_us, rfc_accuracy_os, rfc_recall_us, rfc_recall_os],
        ['Support Vector', svc_accuracy_us, svc_accuracy_os, svc_recall_us, svc_recall_os],
        ['Decision Tree', dtree_accuracy_us, dtree_accuracy_os, dtree_recall_us, dtree_recall_os],
        ['k-Nearest Neighbour', knn_accuracy_us, knn_accuracy_os, knn_recall_us, knn_recall_os]
             ]

# Create the dataframe
final_df = pd.DataFrame(final_scores, columns = ['Classifier', 'Accuracy - Random UnderSampling', 'Accuracy - Oversampling (SMOTE)',
                                                'Recall - Random UnderSampling', 'Recall - Oversampling (SMOTE)'])
final_df

In [None]:
# Column headers: the "Naive" scores first, then accuracy and recall for
# each resampling strategy.
summary_columns = [
    'Classifier',
    'Naive - Accuracy',
    'Naive - Recall',
    'Accuracy - Random UnderSampling',
    'Accuracy - Oversampling (SMOTE)',
    'Recall - Random UnderSampling',
    'Recall - Oversampling (SMOTE)',
]

# One row per classifier, in the same order as the headers above.
final_scores = [
    ['Logistic Regression', logistic_accuracy, logistic_recall,
     logistic_accuracy_us, logistic_accuracy_os, logistic_recall_us, logistic_recall_os],
    ['Random Forest', rfc_accuracy, rfc_recall,
     rfc_accuracy_us, rfc_accuracy_os, rfc_recall_us, rfc_recall_os],
    ['Decision Tree', 'Not Applicable', 'Not Applicable',
     dtree_accuracy_us, dtree_accuracy_os, dtree_recall_us, dtree_recall_os],
]

# Build and display the comparison table.
final_df = pd.DataFrame(final_scores, columns=summary_columns)
final_df

Plot the ROC curve for Oversampling

In [None]:
# Plot the ROC curve for oversampling
# Index a new frame by classifier name instead of mutating result_table in
# place, so this cell can be re-run without failing on a missing column.
roc_table = result_table.set_index('classifiers')
fig = plt.figure(figsize=(17,7))

# One curve per stored classifier; iterating rows stays correct even if a
# classifier name appears more than once in the table.
for name, row in roc_table.iterrows():
    plt.plot(row['fpr'],
             row['tpr'],
             label="{}, AUC={:.3f}".format(name, row['auc']))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis for Oversampling', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

# The closing parenthesis was missing, which made the whole cell a SyntaxError.
plt.show()

In [None]:
# Learning curves for the oversampled models. The earlier call in this notebook
# passes four estimators before X and y (logistic regression, k-NN, decision
# tree, random forest); with only three, X and y shifted into the wrong
# positional slots. The k-NN pipeline is restored as the second estimator.
plot_learning_curve(log_reg_os, knn_os, tree_clf_os, rfc_os, X, y, (0.8, 1));


Please note that we did not apply outlier removal, because sometimes we want the features to retain some extreme values so that the model can learn from them. Also, this problem is an example of anomaly detection, and hence we did not want to get rid of the extreme values in the features.