# Introduction

In this project, we will be predicting whether a transaction is fraud or not fraud by using predictive models. The dataset given contains features V1 to V28 which are a result of PCA dimensionality reduction to protect user identities and sensitive features (according to the description).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

import warnings
# Suppress every warning for the rest of the notebook (this also hides
# library deprecation and convergence warnings raised by later cells).
warnings.filterwarnings("ignore")

In [2]:
# Load the transactions CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')
df.head()

In [3]:
# Summary statistics for the columns of the raw frame.
df.describe()

In [4]:
# Largest per-column count of missing values; a result of 0 means that no
# column contains nulls.
df.isnull().sum().max()

In [5]:
# Column names of the raw frame.
df.columns

In [6]:
# Share of each class as a percentage of all rows (0 = no fraud, 1 = fraud).
class_counts = df.Class.value_counts()
total_rows = len(df)
for caption, class_label in (('No Frauds:', 0), ('Frauds:', 1)):
    print(caption, round(class_counts[class_label] / total_rows * 100, 2), '%')

Note that the dataset is highly imbalanced. If we fit models on this dataset directly, they will probably overfit to the majority class, since the models will assume that most transactions are not fraud.

In [7]:
# Bar chart of the number of rows in each class.
# The column is passed as `x=` because recent seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='Class', data=df)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)
plt.show()

In [8]:
# Side-by-side distributions of the raw Amount and Time columns.
fig, ax = plt.subplots(1, 2, figsize=(18, 4))

amount_val = df.Amount.values
time_val = df.Time.values

panels = (
    (ax[0], amount_val, 'r', 'Distribution of Transaction Amount'),
    (ax[1], time_val, 'b', 'Distribution of Transaction Time'),
)
for axis, values, colour, heading in panels:
    sns.distplot(values, ax=axis, color=colour)
    axis.set_title(heading, fontsize=14)
    # Clamp the x-axis to the observed range of the plotted values.
    axis.set_xlim([min(values), max(values)])
plt.show()

# Data Pre-processing

## Scaling and Undersampling

Since the features V1 to V28 have been scaled beforehand, we will also scale the columns Time and Amount:

In [9]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is used rather than StandardScaler because it is more robust
# to outliers.
rob_scaler = RobustScaler()

# Each column is reshaped to (n_rows, 1) and the same scaler object is
# re-fitted for it; the result is stored as a new column on df.
for source_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[source_col].values.reshape(-1, 1)
    df[scaled_col] = rob_scaler.fit_transform(column_2d)

# Working frame without the raw Time/Amount columns (the scaled ones stay).
df_scaled = df.drop(columns=['Time', 'Amount'])

In [10]:
# Move the two scaled columns to the front of df_scaled.
scaled_amount = df.scaled_amount
scaled_time = df.scaled_time

leading_columns = ['scaled_amount', 'scaled_time']
df_scaled = df_scaled.drop(columns=leading_columns)
for position, (col_name, col_values) in enumerate(zip(leading_columns, (scaled_amount, scaled_time))):
    df_scaled.insert(position, col_name, col_values)

In [11]:
# Preview the scaled frame after the column reordering.
df_scaled.head()

We will be splitting the original DataFrame first before proceeding to undersample the dataset. This is because we want to evaluate our models based on the original test dataset.

In [12]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Features and target taken from the scaled frame.
X = df_scaled.drop(columns='Class')
y = df_scaled['Class']

skf = StratifiedKFold(n_splits=5, shuffle=False)

# Every pass overwrites the original_* arrays, so the split produced by the
# final fold is the one that remains available after the loop.
for train_index, test_index in skf.split(X, y):
    original_X_train = X.iloc[train_index].values
    original_X_test = X.iloc[test_index].values
    original_y_train = y.iloc[train_index].values
    original_y_test = y.iloc[test_index].values

    # Compare the class proportions of the two partitions of this fold.
    train_label_count = np.unique(original_y_train, return_counts=True)[1]
    test_label_count = np.unique(original_y_test, return_counts=True)[1]

    print('Label Distributions:')
    print(train_label_count / len(original_y_train))
    print(test_label_count / len(original_y_test))

In [13]:
# see if the train and test set have similar distributed labels
# _, train_label_count = np.unique(original_y_train, return_counts=True)
# _, test_label_count = np.unique(original_y_test, return_counts=True)

# print('Label Distributions:')
# print(train_label_count / len(original_y_train))
# print(test_label_count / len(original_y_test))

In [14]:
# Number of rows per class in the target column.
y.value_counts()

Note that there are only 492 fraud transactions, so we will include only 492 non-fraud transactions to achieve a 50/50 ratio.

In [15]:
# Shuffle first so that the non-fraud rows kept below are a random draw.
df_scaled = df_scaled.sample(frac=1, random_state=42)

df_fraud = df_scaled.loc[df_scaled.Class == 1]
# Keep exactly as many non-fraud rows as there are fraud rows, so the 50/50
# ratio does not depend on a hard-coded row count.
df_non_fraud = df_scaled.loc[df_scaled.Class == 0].iloc[:len(df_fraud)]

# Combine both classes and shuffle again so they are interleaved.
df_norm = pd.concat([df_fraud, df_non_fraud]).sample(frac=1, random_state=42)

# NOTE(review): df_norm is drawn from all rows of df_scaled, including rows
# that the StratifiedKFold cell placed in original_X_test. Models trained on
# df_norm and later scored on original_X_test have therefore seen some of
# those rows - confirm whether this overlap is acceptable.

In [16]:
# Preview the balanced (undersampled) frame.
df_norm.head()

In [17]:
# Fraction of rows in each class of the undersampled frame.
print('Label Distribution:')
print(df_norm.Class.value_counts() / len(df_norm))

In [18]:
# Bar chart of the number of rows in each class of the undersampled frame.
# The column is passed as `x=` because recent seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='Class', data=df_norm)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)
plt.show()

## Detecting Anomalies

We will first plot out the correlation matrix to determine the most correlated features with respect to the target variable:

In [19]:
# Pairwise correlations of the undersampled frame, drawn as a heatmap.
corr = df_norm.corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20})
heatmap_ax.set_title('Correlation Matrix', fontsize=14)
plt.show()

In [20]:
# Display the full correlation table computed for the heatmap.
corr

In [21]:
# The three features most strongly correlated with Class, by absolute value.
# Position 0 is skipped because it is Class itself.
class_corr_strength = corr['Class'].abs().sort_values(ascending=False)
class_corr_strength.iloc[1:4]

In [22]:
# One box plot per selected feature, split by class.
fig, ax = plt.subplots(ncols=3, figsize=(15, 5))
for axis, feature in zip(ax, ('V14', 'V4', 'V11')):
    sns.boxplot(x='Class', y=feature, data=df_norm, ax=axis)
    axis.set_title('{} vs Class'.format(feature))
plt.show()

In [23]:
from scipy.stats import norm

fig, ax = plt.subplots(1, 3, figsize=(20, 6))

# Values of each selected feature for the fraud rows only.
is_fraud_row = df_norm.Class == 1
v14_fraud_dist = df_norm.V14.loc[is_fraud_row].values
v4_fraud_dist = df_norm.V4.loc[is_fraud_row].values
v11_fraud_dist = df_norm.V11.loc[is_fraud_row].values

panels = (
    (ax[0], v14_fraud_dist, 'red', 'V14 Distribution \n (Fraud Transactions)'),
    (ax[1], v4_fraud_dist, 'green', 'V4 Distribution \n (Fraud Transactions)'),
    (ax[2], v11_fraud_dist, 'blue', 'V11 Distribution \n (Fraud Transactions)'),
)
for axis, values, colour, heading in panels:
    # fit=norm overlays a fitted normal curve on the distribution.
    sns.distplot(values, ax=axis, fit=norm, color=colour)
    axis.set_title(heading, fontsize=14)

plt.show()

Comparing the distributions of the 3 most positive/negative correlated features to the target variable (i.e., V14, V4, V11), only V4 has a slightly skewed distribution.

In [24]:
# Row/column count before outlier removal.
df_norm.shape

Next we will be removing the outliers with respect to V14, V4 and V11, starting with the most positive/negative correlated feature (i.e., V14). We will be setting our thresholds to 1.5 times the Interquartile Range (75th percentile - 25th percentile). Note that we have to be careful not to remove too many data points, because the model might then underfit the data.

In [25]:
def drop_iqr_outliers(frame, feature, whisker=1.5):
    """Drop rows whose `feature` value lies outside the fraud-class IQR fences.

    The quartiles are computed from the rows of `frame` with Class == 1 only,
    but every row (of either class) outside the fences is removed. The
    thresholds and the fraud-class outliers are printed for inspection.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a `Class` column and the column named by `feature`.
    feature : str
        Name of the column to filter on.
    whisker : float, default 1.5
        Multiple of the interquartile range added beyond each quartile.

    Returns
    -------
    pandas.DataFrame
        A new frame without the out-of-range rows; `frame` is not modified.
    """
    fraud_values = frame.loc[frame.Class == 1, feature].values
    q25, q75 = np.percentile(fraud_values, 25), np.percentile(fraud_values, 75)
    cut_off = (q75 - q25) * whisker
    lower, upper = q25 - cut_off, q75 + cut_off
    print('{} Lower Threshold: {}'.format(feature, lower))
    print('{} Upper Threshold: {}'.format(feature, upper))
    outliers = [x for x in fraud_values if x < lower or x > upper]
    print('{} Outliers: {}'.format(feature, outliers))
    print('Number of {} Outliers: {}'.format(feature, len(outliers)))
    out_of_range = (frame[feature] < lower) | (frame[feature] > upper)
    return frame.drop(frame[out_of_range].index)


# Filter sequentially (V14, then V4, then V11): each step computes its
# thresholds on the rows that survived the previous step.
df_final = df_norm
for position, feature in enumerate(('V14', 'V4', 'V11')):
    if position:
        print('---' * 50)
    df_final = drop_iqr_outliers(df_final, feature)

In [26]:
# Row/column count after outlier removal.
df_final.shape

## Dimensionality Reduction and Clustering

Let's see if the t-SNE algorithm can separate the fraud and non-fraud transactions into distinct clusters:

In [27]:
from sklearn.manifold import TSNE

# Features and target of the outlier-filtered undersampled frame.
# This rebinds X and y, which earlier referred to the full scaled dataset.
X = df_final.drop(columns='Class')
y = df_final.Class

# Two-dimensional embedding of the features, used for plotting.
tsne = TSNE(n_components=2, random_state=42)
X_reduced_tsne = tsne.fit_transform(X.values)

In [28]:
import matplotlib.patches as mpatches

fig, ax = plt.subplots(1, 1, figsize=(8, 6))

no_fraud_colour, fraud_colour = '#0A0AFF', '#AF0000'
blue_patch = mpatches.Patch(color=no_fraud_colour, label='No Fraud')
red_patch = mpatches.Patch(color=fraud_colour, label='Fraud')

# Draw each class exactly once, in the same colour as its legend patch.
# (Previously all points were plotted twice and the second call fully
# covered the first.)
is_fraud = (y == 1).values
ax.scatter(X_reduced_tsne[~is_fraud, 0], X_reduced_tsne[~is_fraud, 1],
           color=no_fraud_colour, label='No Fraud')
ax.scatter(X_reduced_tsne[is_fraud, 0], X_reduced_tsne[is_fraud, 1],
           color=fraud_colour, label='Fraud')
ax.set_title('t_SNE', fontsize=14)
ax.grid(True)
ax.legend(handles=[blue_patch, red_patch])
plt.show()

# Model Building

## Undersampling before cross validation

A common mistake when undersampling or oversampling is to do it before cross validation. The resampling then directly influences the validation set, so the model shows good precision and recall scores while in reality it has overfitted the data. More info in this kernel: https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets/notebook

In [29]:
# This split is taken from the undersampled frame, i.e. the undersampling
# happened before any cross validation, which makes the scores computed from
# it prone to overfitting.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [30]:
# Shapes of the undersampled train/test arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline estimators with default hyperparameters; each key is the display
# name printed next to that estimator's score.
classifiers = {'LogisticRegression': LogisticRegression(), 
               'KNearest': KNeighborsClassifier(), 
               'Support Vector Classifer': SVC(), 
               'DecisionTreeClassifier': DecisionTreeClassifier()}

In [32]:
from sklearn.model_selection import cross_val_score

# cross_val_score clones the estimator and fits the clone on every fold, so
# no separate fit() call is needed before scoring.
for key, classifier in classifiers.items():
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print('{} Cross Validation Score: {}%'.format(key, round(training_score.mean() * 100, 2)))

Logistic Regression has the highest accuracy score among the classifiers. Let's further tune the hyperparameters of each classifier by using GridSearchCV:

In [33]:
from sklearn.model_selection import GridSearchCV

# Logistic regression: the grid searches both the l1 and the l2 penalty, so
# the estimator uses the liblinear solver, which supports both. With the
# default solver the l1 candidates cannot be fitted and would silently drop
# out of the search.
log_reg_params = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
grid_log_reg.fit(X_train, y_train)
log_reg = grid_log_reg.best_estimator_

# K-nearest neighbours: number of neighbours and search algorithm.
knears_params = {'n_neighbors': list(range(2, 5, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
knears_neighbors = grid_knears.best_estimator_

# Support vector classifier: regularisation strength and kernel.
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
svc = grid_svc.best_estimator_

# Decision tree: split criterion, depth and minimum leaf size.
tree_params = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(2, 4, 1)), 'min_samples_leaf': list(range(5, 7, 1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)
tree_clf = grid_tree.best_estimator_

In [34]:
# 5-fold cross-validation scores of the tuned estimators on the undersampled
# training split.
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
knears_score = cross_val_score(knears_neighbors, X_train, y_train, cv=5)
svc_score = cross_val_score(svc, X_train, y_train, cv=5)
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)

tuned_scores = (
    ('Logistic Regression', log_reg_score),
    ('Knears Neighbors', knears_score),
    ('Support Vector Classifier', svc_score),
    ('DecisionTree Classifier', tree_score),
)
for display_name, fold_scores in tuned_scores:
    mean_percent = round(fold_scores.mean() * 100, 2)
    print('{} Cross Validation Score: {}%'.format(display_name, mean_percent))

## Undersampling during cross validation

In [35]:
# Preview df_scaled (shuffled by the undersampling cell).
df_scaled.head()

In [98]:
# Data for undersampling *during* cross validation: start from the full
# (not yet undersampled) scaled frame.
undersample_X = df_scaled.drop(columns='Class')
undersample_y = df_scaled.Class

# Only the split produced by the final fold is kept.
train_index, test_index = list(skf.split(undersample_X, undersample_y))[-1]

undersample_X_train = undersample_X.iloc[train_index].values
undersample_X_test = undersample_X.iloc[test_index].values
undersample_y_train = undersample_y.iloc[train_index].values
undersample_y_test = undersample_y.iloc[test_index].values

# Per-fold metrics, appended to by the cross-validation loop.
undersample_accuracy, undersample_precision = [], []
undersample_recall, undersample_f1 = [], []
undersample_auc = []

The code below shows how to implement the NearMiss method for undersampling:

In [37]:
from imblearn.under_sampling import NearMiss
from collections import Counter

# Resample the full feature matrix with a default NearMiss sampler and
# report how many rows of each class remain.
near_miss = NearMiss()
X_nearmiss, y_nearmiss = near_miss.fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))

In [None]:
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.base import clone
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Undersample inside each fold: NearMiss is a pipeline step, so it is applied
# to the training part of the fold only and the held-out part stays untouched.
for train, test in skf.split(undersample_X_train, undersample_y_train):
    fold_X_train, fold_y_train = undersample_X_train[train], undersample_y_train[train]
    fold_X_test, fold_y_test = undersample_X_train[test], undersample_y_train[test]

    # A pipeline fits its final step in place. Cloning keeps the tuned
    # `log_reg` from the grid search unchanged for the cells that use it later.
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), clone(log_reg))
    undersample_model = undersample_pipeline.fit(fold_X_train, fold_y_train)
    undersample_prediction = undersample_model.predict(fold_X_test)

    undersample_accuracy.append(undersample_pipeline.score(fold_X_test, fold_y_test))
    undersample_precision.append(precision_score(fold_y_test, undersample_prediction))
    undersample_recall.append(recall_score(fold_y_test, undersample_prediction))
    undersample_f1.append(f1_score(fold_y_test, undersample_prediction))
    # ROC AUC is computed from continuous decision scores rather than from
    # the thresholded 0/1 labels.
    undersample_auc.append(roc_auc_score(fold_y_test, undersample_model.decision_function(fold_X_test)))

In [40]:
from sklearn.model_selection import ShuffleSplit, learning_curve


def plot_learning_curve(estimator1, estimator2, estimator3, estimator4, X, y,
                        ylim=None, cv=None, n_jobs=1, train_sizes=None):
    """Draw training and cross-validation learning curves for four estimators.

    One panel is drawn per estimator on a 2x2 grid. Each panel shows the mean
    score over the cross-validation splits with a band of one standard
    deviation around it.

    Parameters
    ----------
    estimator1, estimator2, estimator3, estimator4 : estimator objects
        Estimators to evaluate, one per panel.
    X, y : array-like
        Training features and labels.
    ylim : tuple of (low, high), optional
        Limits applied to the y-axis of every panel.
    cv : int or cross-validation splitter, optional
        Passed to `sklearn.model_selection.learning_curve`.
    n_jobs : int, default 1
        Number of parallel jobs used by `learning_curve`.
    train_sizes : array-like, optional
        Training-set fractions to evaluate; defaults to five values evenly
        spaced between 0.1 and 1.0.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the four panels.
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 5)

    fig, axes = plt.subplots(2, 2, figsize=(20, 14), sharey=True)
    estimators = (estimator1, estimator2, estimator3, estimator4)
    for axis, estimator in zip(axes.ravel(), estimators):
        sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
        test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

        axis.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                          alpha=0.1, color='#ff9124')
        axis.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                          alpha=0.1, color='#2492ff')
        axis.plot(sizes, train_mean, 'o-', color='#ff9124', label='Training score')
        axis.plot(sizes, test_mean, 'o-', color='#2492ff', label='Cross-validation score')
        axis.set_title('{} Learning Curve'.format(type(estimator).__name__), fontsize=14)
        axis.set_xlabel('Training size (m)')
        axis.set_ylabel('Score')
        if ylim is not None:
            axis.set_ylim(*ylim)
        axis.grid(True)
        axis.legend(loc='best')
    return fig


# 100 random 80/20 splits of the undersampled training data.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)
plot_learning_curve(log_reg, knears_neighbors, svc, tree_clf, X_train, y_train, ylim=(0.87, 1.01), cv=cv, n_jobs=4)
plt.show()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict

# Out-of-fold scores for the ROC analysis. ROC curves need continuous scores:
# logistic regression and the SVC provide decision_function values, while
# k-nearest neighbours and the decision tree have no decision_function, so
# their predicted probability of class 1 is used instead of hard 0/1 labels.
log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5, method='decision_function')
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5, method='predict_proba')[:, 1]
svc_pred = cross_val_predict(svc, X_train, y_train, cv=5, method='decision_function')
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5, method='predict_proba')[:, 1]

In [42]:
# ROC AUC of each model, from its cross-validated predictions.
cv_predictions = (
    ('Logistic Regression:', log_reg_pred),
    ('K-Nearest Neighbors:', knears_pred),
    ('Support Vector Classifier:', svc_pred),
    ('Decision Tree Classifier:', tree_pred),
)
for caption, model_scores in cv_predictions:
    print(caption, roc_auc_score(y_train, model_scores))

In [43]:
from sklearn.metrics import auc

# ROC curve points of each model, from its cross-validated predictions.
log_fpr, log_tpr, log_threshold = roc_curve(y_train, log_reg_pred)
knears_fpr, knears_tpr, knears_threshold = roc_curve(y_train, knears_pred)
svc_fpr, svc_tpr, svc_threshold = roc_curve(y_train, svc_pred)
tree_fpr, tree_tpr, tree_threshold = roc_curve(y_train, tree_pred)

def graph_roc_curve_multiple(log_fpr, log_tpr, knears_fpr, knears_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr):
    """Plot the ROC curves of the four classifiers on one figure.

    Each argument pair holds the false-positive and true-positive rates of one
    model (logistic regression, k-nearest neighbours, support vector
    classifier, decision tree). The legend shows the area under each curve,
    computed from the points passed in, so the function does not read any
    notebook-level variables.
    """
    curves = (
        ('Logistic Regression Classifier Score: {:.4f}', log_fpr, log_tpr),
        ('K-Nearest Neighbors Classifier Score: {:.4f}', knears_fpr, knears_tpr),
        ('Support Vector Classifier Score: {:.4f}', svc_fpr, svc_tpr),
        ('Decision Tree Classifier Score: {:.4f}', tree_fpr, tree_tpr),
    )
    plt.figure(figsize=(16,8))
    plt.title('ROC Curve \n Top 4 Classifiers', fontsize=18)
    for label_template, fpr, tpr in curves:
        plt.plot(fpr, tpr, label=label_template.format(auc(fpr, tpr)))
    # Diagonal reference line: a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.legend()
    
graph_roc_curve_multiple(log_fpr, log_tpr, knears_fpr, knears_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr)
plt.show()

Let's look into Logistic Regression specifically:

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall pairs over all thresholds of the cross-validated
# logistic-regression scores.
precision, recall, threshold = precision_recall_curve(y_train, log_reg_pred)

In [45]:
from sklearn.metrics import accuracy_score

y_pred = log_reg.predict(X_train)

# Overfitting case (undersampling before cross validation): metrics of
# log_reg on the data it is predicting for, X_train.
print('Overfitting case:')
training_metrics = (
    ('Recall', recall_score),
    ('Precision', precision_score),
    ('F1', f1_score),
    ('Accuracy', accuracy_score),
)
for metric_name, metric_fn in training_metrics:
    print('{} Score: {:.2f}'.format(metric_name, metric_fn(y_train, y_pred)))
print('---' * 50)

# Normal case (undersampling during cross validation): mean of the per-fold
# metrics collected in the NearMiss loop.
print('Normal case:')
fold_metrics = (
    ('Recall', undersample_recall),
    ('Precision', undersample_precision),
    ('F1', undersample_f1),
    ('Accuracy', undersample_accuracy),
)
for metric_name, fold_values in fold_metrics:
    print('{} Score: {:.2f}'.format(metric_name, np.mean(fold_values)))

In [46]:
from sklearn.metrics import average_precision_score

# Decision scores of log_reg on the held-out fold of the full dataset, and the
# average precision computed from them.
undersample_y_score = log_reg.decision_function(undersample_X_test)
undersample_average_precision = average_precision_score(undersample_y_test, undersample_y_score)
print('Average precision-recall score: {0:0.2f}'.format(undersample_average_precision))

In [47]:
# Precision-recall curve of log_reg on the held-out fold of the full dataset.
precision, recall, _ = precision_recall_curve(undersample_y_test, undersample_y_score)

fig = plt.figure(figsize=(12, 6))
plt.step(recall, precision)
plt.fill_between(recall, precision, alpha=0.2)

plt.title('Precision-Recall Curve: \n Average Precision-Recall Score ={0:0.2f}'.format(undersample_average_precision), fontsize=16)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()

## SMOTE (Oversampling)

In [48]:
# Number of rows in the original (non-resampled) train and test folds.
len(original_X_train), len(original_X_test)

In [49]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

# Per-fold metrics of the SMOTE + logistic regression pipeline.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

log_reg_params = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# random_state makes the sampled hyperparameter candidates repeatable.
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4, random_state=42)

for train, test in skf.split(original_X_train, original_y_train):
    fold_X_test, fold_y_test = original_X_train[test], original_y_train[test]

    # SMOTE is a pipeline step, so synthetic samples are generated from the
    # training part of the fold only. It is seeded for reproducibility.
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority', random_state=42), rand_log_reg)
    model = pipeline.fit(original_X_train[train], original_y_train[train])
    # After the loop, best_est refers to the estimator selected on the final
    # fold; later cells evaluate that estimator.
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(fold_X_test)
    
    accuracy_lst.append(pipeline.score(fold_X_test, fold_y_test))
    precision_lst.append(precision_score(fold_y_test, prediction))
    recall_lst.append(recall_score(fold_y_test, prediction))
    f1_lst.append(f1_score(fold_y_test, prediction))
    # ROC AUC is computed from continuous decision scores rather than from
    # the thresholded 0/1 labels.
    auc_lst.append(roc_auc_score(fold_y_test, best_est.decision_function(fold_X_test)))
    
print("Accuracy: {}".format(np.mean(accuracy_lst)))
print("Precision: {}".format(np.mean(precision_lst)))
print("Recall: {}".format(np.mean(recall_lst)))
print("F1: {}".format(np.mean(f1_lst)))

In [50]:
from sklearn.metrics import classification_report

# Display names for classes 0 and 1, in that order.
labels = ['No Fraud', 'Fraud']
# Predictions of the SMOTE-trained estimator on the original test fold.
smote_prediction = best_est.predict(original_X_test)

print(classification_report(original_y_test, smote_prediction, target_names=labels))

In [77]:
# Decision scores of the SMOTE-trained estimator on the original test fold.
y_score = best_est.decision_function(original_X_test)

In [52]:
# Average precision of the SMOTE-trained estimator on the original test fold.
average_precision = average_precision_score(original_y_test, y_score)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

In [53]:
# Precision-recall curve of the SMOTE-trained estimator on the original test
# fold.
precision, recall, _ = precision_recall_curve(original_y_test, y_score)

fig = plt.figure(figsize=(12, 6))
plt.step(recall, precision)
plt.fill_between(recall, precision, alpha=0.2)

plt.title('Precision-Recall Curve: \n Average Precision-Recall Score ={0:0.2f}'.format(average_precision), fontsize=16)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()

Now we will take a look at the confusion matrix of predicting the original test dataset (i.e., original_X_test). <br>
Note that: <br>
True Negatives (Top-left): Correctly classifying 0 Class (No Fraud) <br>
False Positives (Top-right): Incorrectly classifying as 1 Class (Fraud). Actual is 0 Class (No Fraud) <br>
False Negatives (Bottom-left): Incorrectly classifying as 0 Class (No Fraud). Actual is 1 Class (Fraud) <br>
True Positives (Bottom-right): Correctly classifying 1 Class (Fraud)

In [54]:
from sklearn.metrics import confusion_matrix

# Predictions of each model on the original (non-resampled) test fold.
y_pred_log_reg = best_est.predict(original_X_test)
y_pred_knears = knears_neighbors.predict(original_X_test)
y_pred_svc = svc.predict(original_X_test)
y_pred_tree = tree_clf.predict(original_X_test)

log_reg_cf = confusion_matrix(original_y_test, y_pred_log_reg)
kneighbors_cf = confusion_matrix(original_y_test, y_pred_knears)
svc_cf = confusion_matrix(original_y_test, y_pred_svc)
tree_cf = confusion_matrix(original_y_test, y_pred_tree)

fig, ax = plt.subplots(2, 2, figsize=(22, 12))

panels = (
    (ax[0][0], log_reg_cf, "Logistic Regression \n Confusion Matrix"),
    (ax[0][1], kneighbors_cf, "K-Nearest Neighbors \n Confusion Matrix"),
    (ax[1][0], svc_cf, "Support Vector Classifier \n Confusion Matrix"),
    (ax[1][1], tree_cf, "Decision Tree Classifier \n Confusion Matrix"),
)
for axis, matrix, heading in panels:
    # fmt='d' prints the cell counts as plain integers; the default format
    # switches to scientific notation for large counts.
    sns.heatmap(matrix, ax=axis, annot=True, fmt='d', cmap=plt.cm.copper)
    axis.set_title(heading, fontsize=14)
    # Tick labels are blanked; the cell layout is explained in the text above.
    axis.set_xticklabels(['', ''], fontsize=14, rotation=90)
    axis.set_yticklabels(['', ''], fontsize=14, rotation=360)

plt.show()

In [55]:
# Per-class precision/recall/F1 of each model on the original test fold.
model_predictions = (
    ('Logistic Regression:', y_pred_log_reg),
    ('K-Nearest Neighbors:', y_pred_knears),
    ('Support Vector Classifier:', y_pred_svc),
    ('Decision Tree Classifier:', y_pred_tree),
)
for heading, predicted_labels in model_predictions:
    print(heading)
    print(classification_report(original_y_test, predicted_labels))

In [56]:
# Accuracy on the original test fold of the two logistic-regression models:
# `log_reg` (undersampled data) versus `best_est` (SMOTE-oversampled data).
y_pred = log_reg.predict(original_X_test)
y_pred_sm = best_est.predict(original_X_test)

undersample_score = accuracy_score(original_y_test, y_pred)
oversample_score = accuracy_score(original_y_test, y_pred_sm)

df_result = pd.DataFrame({
    'Technique': ['Random Undersampling', 'Oversampling (SMOTE)'],
    'Score': [undersample_score, oversample_score],
})
df_result

## Neural Networks

We will see if implementing neural networks can increase our performance for both undersampled and oversampled datasets.

In [57]:
# Small fully connected network for the undersampled data: an input-sized
# hidden layer, a 32-unit hidden layer and a single sigmoid output.
n_inputs = X_train.shape[1]

undersample_model = keras.models.Sequential()
undersample_model.add(keras.layers.Dense(n_inputs, input_shape=(n_inputs,), activation='relu'))
undersample_model.add(keras.layers.Dense(32, activation='relu'))
undersample_model.add(keras.layers.Dense(1, activation='sigmoid'))

In [58]:
# Print the layer-by-layer structure of the network.
undersample_model.summary()

In [60]:
# A Keras model has to be compiled before it can be trained. The settings are
# the same as those used for the oversampled network so the two are comparable.
undersample_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
undersample_model.fit(X_train, y_train, validation_split=0.2, batch_size=25, epochs=20, shuffle=True)

In [94]:
# Network outputs (sigmoid activations) for the original test fold.
undersample_predictions = undersample_model.predict(original_X_test)

In [80]:
# Threshold the network outputs at 0.5 to get 0/1 labels. The comparison is
# vectorized instead of looping over the rows in Python, and ravel() flattens
# the result to one label per row.
undersample_fraud_predictions = (np.asarray(undersample_predictions) >= 0.5).astype(int).ravel()

In [95]:
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    `cm` is the matrix of integer counts, `classes` the tick labels used on
    both axes, `title` the plot title and `cmap` the colour map of the image.
    Every cell is annotated with its count.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_of_max = cm.max() / 2.
    for (row, col), count in np.ndenumerate(cm):
        text_colour = "white" if count > half_of_max else "black"
        plt.text(col, row, format(count, 'd'), horizontalalignment="center",
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [64]:
# Confusion matrix of the undersample-trained network on the original test
# fold.
undersample_cm = confusion_matrix(original_y_test, undersample_fraud_predictions)
undersample_cm

In [65]:
# Display names for classes 0 and 1, in that order.
labels = ['No Fraud', 'Fraud']
fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(undersample_cm, labels, title='Random Undersampling \n Confusion Matrix')
plt.show()

We can see that the recall of this model has improved compared to the models we implemented previously.

In [96]:
# oversample dataset using SMOTE
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(original_X_train, original_y_train)

In [79]:
# Same architecture as the undersampled network, sized to the SMOTE-resampled
# training data: an input-sized hidden layer, a 32-unit hidden layer and a
# single sigmoid output.
n_inputs = Xsm_train.shape[1]

oversample_model = keras.models.Sequential()
oversample_model.add(keras.layers.Dense(n_inputs, input_shape=(n_inputs,), activation='relu'))
oversample_model.add(keras.layers.Dense(32, activation='relu'))
oversample_model.add(keras.layers.Dense(1, activation='sigmoid'))

In [68]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as
# the reported metric.
oversample_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [69]:
# Train on the SMOTE-resampled data, holding out 20% of it for validation.
oversample_model.fit(Xsm_train, ysm_train, validation_split=0.2, batch_size=300, epochs=20, shuffle=True)

In [70]:
# Network outputs (sigmoid activations) for the original test fold.
oversample_predictions = oversample_model.predict(original_X_test)

In [97]:
# Threshold the network outputs at 0.5 to get 0/1 labels. The comparison is
# vectorized instead of looping over the rows in Python, and ravel() flattens
# the result to one label per row.
oversample_fraud_predictions = (np.asarray(oversample_predictions) >= 0.5).astype(int).ravel()

In [72]:
# Confusion matrix of the SMOTE-trained network on the original test fold.
oversample_cm = confusion_matrix(original_y_test, oversample_fraud_predictions)
oversample_cm

In [73]:
# Plot the confusion matrix of the SMOTE-trained network; `labels` is the
# class-name list defined in an earlier cell.
fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(oversample_cm, labels, title='Oversampling (SMOTE) \n Confusion Matrix')
plt.show()

The neural network we fitted on the oversampled dataset has lower recall than the previous neural network. However, its precision has improved significantly.

# Conclusion

When we are building a classifier, both precision and recall are important, but it all comes down to the task we are given. In this particular project, higher recall means we detect more of the frauds, which leads to a safer platform, whereas higher precision means that fewer legitimate transactions are wrongly flagged as fraud, which leads to customer satisfaction (customers avoid having their accounts blocked). Moreover, we can still improve on our oversampled dataset by removing outliers, and also by fine-tuning our neural network models.