# Credit Card Fraud Detection

# Load Dataset

In [None]:
import pandas as pd

In [None]:
# Read the credit-card transactions CSV from the Kaggle input directory.
# For a local run, point the path at './creditcard.csv' instead.
raw_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/fraud-detection/creditcard.csv',
)

# Cell output: (number of rows, number of columns).
raw_df.shape

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics, transposed so that each column of raw_df becomes a row.
raw_df.describe().transpose()

In [None]:
# Number of missing values in every column.
raw_df.isnull().sum()

### Data Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram grid of raw_df's columns, drawn in a single 16x12 figure.
raw_df.hist(figsize=(16,12))
plt.show()

In [None]:
# numeric_columns = (list(raw_df.loc[:, ['Time','Amount']]))

# fig = plt.figure(figsize=(20, 50))
# rows, cols = 10, 3
# for idx, num in enumerate(numeric_columns[:30]):
#     ax = fig.add_subplot(rows, cols, idx+1)
#     ax.grid(alpha = 0.7, axis ="both")
#     sns.kdeplot(x = num, fill = True,color ="#3386FF",linewidth=0.6, data = raw_df)
#     ax.legend()
# fig.tight_layout()
# fig.show()

### Genuine vs Fraud Comparison

In [None]:
# Share of each class, expressed as a percentage of all transactions.
class_counts = raw_df.Class.value_counts()
n_total = sum(class_counts)
pie_val = [class_counts[label] / n_total * 100 for label in (0, 1)]

label_names = ['Genuine','Fraud']
class_colors = ['#66b3ff','#ffcc99']

# Left panel: percentage pie. Right panel: absolute counts per class.
fig, (pie_ax, ax) = plt.subplots(1, 2, figsize=(12,6))

pie_ax.pie(
    pie_val,
    labels=label_names,
    autopct='%1.2f%%',
    startangle=90,
    explode=(0.1,0.1),
    colors=class_colors,
    wedgeprops={'linewidth': 1, 'antialiased' : True},
)
pie_ax.set_title('Genuine vs Fraud Transaction Percentage')

sns.countplot(data=raw_df, x='Class', palette=class_colors, ax=ax)
# Write the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
ax.set_xticklabels(label_names)
ax.set_title('Genuine and Fraud Transaction Comparation')

plt.show()

In [None]:
# One panel per (column, statistic) pair, comparing the two classes.
# Every panel computes its own statistic, so the "Time Median" panel really
# shows the median of Time rather than re-plotting the previous panel's values.
panels = [
    ('Amount', 'median'),
    ('Amount', 'mean'),
    ('Time', 'median'),
    ('Time', 'mean'),
]

fig, axs = plt.subplots(2, 2, figsize=(12,12))
for ax, (column, stat) in zip(axs.flatten(), panels):
    # Per-class statistic; groupby orders the groups by class label (0, then 1),
    # which matches the tick labels set below.
    per_class = raw_df.groupby('Class')[column].agg(stat)
    sns.barplot(x=list(range(len(per_class))),
                y=[float(value) for value in per_class],
                palette=['#66b3ff','#ffcc99'],
                ax=ax)
    # Write the value on top of every bar.
    for container in ax.containers:
        ax.bar_label(container)
    ax.set_xticklabels(['Genuine','Fraud'])
    ax.set_title(f'Genuine vs Fraud {column} {stat.capitalize()}')

plt.show()

Both Amount and Time show very significant differences between their mean and their median, due to the influence of outliers.

### Feature Correlation

In [None]:
# Pairwise correlation between all features (the 'Class' target is excluded).
feature_corr = raw_df.drop(columns='Class').corr()

plt.figure(figsize=(12,8))
sns.heatmap(feature_corr, cmap='magma')

Amount and Time have a weak to moderate correlation with some of the features in the data set.

### Checking Outlier Data

In [None]:
# Column labels from 'V1' through 'Amount' (label slice, both ends included).
numeric_features = raw_df.loc[:, 'V1':'Amount'].columns.tolist()

# checking boxplots
def boxplots_custom(dataset, columns_list, rows, cols, suptitle, figsize=(16,25)):
    """
    Draw one horizontal boxplot per column on a ``rows`` x ``cols`` grid.

    Each panel is titled with the column name and the column's skewness
    rounded to two decimals. Grid cells that are not needed (fewer columns
    than ``rows * cols``) are hidden instead of being left as empty axes.

    Parameters
    ----------
    dataset : object indexable by column name (``dataset[name]``) whose
        columns provide ``.skew()``; called below with a pandas DataFrame.
    columns_list : iterable of column names to plot, in panel order.
    rows, cols : int
        Shape of the subplot grid; must provide at least one cell per column.
    suptitle : str
        Title placed above the whole figure.
    figsize : tuple, default (16, 25)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None. The figure is created on the current pyplot state.
    """
    columns_list = list(columns_list)
    # squeeze=False always yields a 2-D array of axes, so flatten() also
    # works for a 1x1, 1xN or Nx1 grid.
    fig, axs = plt.subplots(rows, cols, sharey=True, figsize=figsize, squeeze=False)
    fig.suptitle(suptitle,y=1, size=25)
    axs = axs.flatten()
    for i, data in enumerate(columns_list):
        sns.boxplot(data=dataset[data], orient='h', ax=axs[i])
        skewness = round(dataset[data].skew(axis = 0, skipna = True),2)
        axs[i].set_title(f'{data}, skewness is: {skewness}')
    # Hide the grid cells that received no boxplot.
    for unused_ax in axs[len(columns_list):]:
        unused_ax.set_visible(False)
        
# Boxplot grid (10 rows x 3 columns) covering every selected feature.
boxplots_custom(
    dataset=raw_df,
    columns_list=numeric_features,
    rows=10,
    cols=3,
    suptitle='Boxplots for each variable',
)
plt.tight_layout()

There are many outliers in every feature, so let's try removing them with the interquartile range (IQR) method.

#### Removing Outlier with IQR

In [None]:
import numpy as np
from collections import Counter

In [None]:
def IQR_method(df, min_outliers, features):
    """
    Find rows that are Tukey-IQR outliers in more than ``min_outliers`` columns.

    For every column in ``features`` a value is an outlier when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th and
    75th percentiles of that column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; it is not modified.
    min_outliers : int
        A row is returned only when it is an outlier in strictly more than
        this many of the inspected columns.
    features : iterable of column labels
        Columns of ``df`` to inspect. May be empty, in which case no row is
        flagged.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection
        (column by column, then in index order).

    Side effects
    ------------
    Prints the number of flagged rows, i.e. the number of rows a caller would
    remove by dropping the returned labels.
    """
    # Number of columns in which each index label was flagged.
    outlier_counts = Counter()

    for column in features:
        # 1st and 3rd quartiles of the column
        Q1, Q3 = np.percentile(df[column], [25, 75])
        # Tukey fence distance: 1.5 times the interquartile range
        outlier_step = 1.5 * (Q3 - Q1)
        is_outlier = (df[column] < Q1 - outlier_step) | (df[column] > Q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    # Keep only the observations flagged in more than `min_outliers` columns.
    multiple_outliers = [k for k, v in outlier_counts.items() if v > min_outliers]

    # Report the rows actually selected, not the outliers of the last column.
    print('Total number of deleted outliers is:', len(multiple_outliers))

    return multiple_outliers

In [None]:
# Every column from 'Time' through 'Amount' takes part in the outlier search.
numeric_columns = raw_df.loc[:, 'Time':'Amount'].columns.tolist()

# Rows flagged in more than one column are dropped and the index is renumbered.
iqr_outliers = IQR_method(raw_df, 1, numeric_columns)
raw_df2 = raw_df.drop(index=iqr_outliers).reset_index(drop=True)

In [None]:
# Class balance that remains after the IQR-based row removal.
raw_df2['Class'].value_counts()

Removing the outliers discards too much data, so we revert to the original data set without outlier removal.

In [None]:
# Discard the outlier-filtered frame; the cells below keep working with raw_df.
del raw_df2

# Preprocessing Data

### Drop Unused Feature

In [None]:
# raw_df.drop('Time',axis=1, inplace=True)

### Split Data

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Features are every column except the target; the target is 'Class'.
X = raw_df.drop('Class', axis=1)
y = raw_df['Class']

# One entry per cross-validation fold.
X_train_cv = []
X_test_cv = []
y_train_cv = []
y_test_cv = []

skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for train_index, test_index in skf.split(X, y):
    # split() yields integer positions, so rows are selected with .iloc;
    # .loc would only be equivalent for a default 0..n-1 index.
    # .copy() gives every fold its own frame, so the in-place scaling done
    # later neither touches X nor assigns into a slice of it.
    X_train_cv.append(X.iloc[train_index].copy())
    X_test_cv.append(X.iloc[test_index].copy())
    y_train_cv.append(y.iloc[train_index].copy())
    y_test_cv.append(y.iloc[test_index].copy())



In [None]:
# Class balance of the first fold: training labels first, then test labels.
for fold_labels in (y_train_cv[0], y_test_cv[0]):
    print(fold_labels.value_counts())

In [None]:
from sklearn.preprocessing import StandardScaler

### Scale Data

In [None]:
# Only these two columns are standardised; every other column is left as is.
scaled_cols = ["Amount","Time"]
scaler = StandardScaler()

for fold_train, fold_test in zip(X_train_cv, X_test_cv):
    # Fit the scaler on the training part of the fold ...
    fold_train[scaled_cols] = scaler.fit_transform(fold_train[scaled_cols])
    # ... and apply that same fit to the matching test part.
    fold_test[scaled_cols] = scaler.transform(fold_test[scaled_cols])

# Model Training

In [None]:
import time
import numpy
from sklearn.metrics import classification_report, precision_recall_curve, f1_score, auc, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Accumulator for per-fold result records appended by the model cells.
model_result = list()

### Logistic Regression

In [None]:
# %%time
# logr = LogisticRegression(random_state=1)
# logr.fit(X_train, y_train)
# y_preds = logr.predict(X_test)
# print(classification_report(y_test,y_preds))
# y_probs = logr.predict_proba(X_test)
# # keep probabilities for the positive outcome only
# y_probs = y_probs[:, 1]
# # predict class values
# yhat = logr.predict(X_test)
# lr_precision, lr_recall, _ = precision_recall_curve(y_test, y_probs)
# lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# # summarize scores
# print(f'Logistic: f1 = {round(lr_f1,3)} auc = {round(lr_auc,3)}')
# # plot the precision-recall curves
# # no_skill = len(y_test[y_test==1]) / len(y_test)
# # plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# plt.plot(lr_recall, lr_precision, label='Logistic')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.legend()
# plt.show()


In [None]:
# 5-fold evaluation of Logistic Regression: per-fold F1 and precision-recall
# curve, plus one pooled precision-recall curve over all folds.
logr = LogisticRegression(random_state=1)

fig, axes = plt.subplots()
y_proba = []    # positive-class probabilities, one array per fold
f1_list = []    # F1 score of each fold
time_list = []  # fit() wall-clock seconds of each fold
for i in range(len(X_train_cv)):
    print(f'fold #{i+1}')
    start_time = time.time()
    logr.fit(X_train_cv[i],y_train_cv[i])
    elapsed_time = time.time() - start_time
    print(f'training time {elapsed_time}')
    # Exactly one entry per fold (the fit duration only), so the average
    # printed below is a true average training time.
    time_list.append(elapsed_time)

    # probabilities for the positive outcome only
    y_probs = logr.predict_proba(X_test_cv[i])[:, 1]
    # predict class values
    yhat = logr.predict(X_test_cv[i])
    f_score = f1_score(y_test_cv[i], yhat)
    f1_list.append(f_score)
    print('f1 score:',f_score)

    res_precision, res_recall, _ = precision_recall_curve(y_test_cv[i], y_probs)
    aupr = average_precision_score(y_test_cv[i], y_probs)
    lab = 'Fold %d AUPR = %.4f' % (i+1, aupr)
#     model_result.append({'model':'Logistic Regression', 'rec': res_recall, 'prec':res_precision, 'aupr':aupr})
    axes.step(res_recall, res_precision, label=lab, alpha=0.5)
    y_proba.append(y_probs)

print()
print('Avg. Training time:',sum(time_list)/len(time_list))
print('Avg. F1-Score Positive class:',sum(f1_list)/len(f1_list))

# Pool the labels and scores of all folds for a single overall curve.
y_real = numpy.concatenate(y_test_cv)
y_proba = numpy.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR = %.4f' % (average_precision_score(y_real, y_proba))
axes.step(recall, precision, label=lab, lw=2, color='black')
axes.set_xlabel('Recall')
axes.set_ylabel('Precision')
axes.legend(loc='lower left', fontsize='small')
axes.set_title('Precision-Recall Curve for Positive Class')
plt.show()

### Random Forest

In [None]:
# rfc = RandomForestClassifier(random_state=1)
# rfc.fit(X_train, y_train)
# y_preds = rfc.predict(X_test)
# print(classification_report(y_test,y_preds))
# y_probs = rfc.predict_proba(X_test)
# # keep probabilities for the positive outcome only
# y_probs = y_probs[:, 1]
# # predict class values
# yhat = rfc.predict(X_test)
# lr_precision, lr_recall, _ = precision_recall_curve(y_test, y_probs)
# lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# # summarize scores
# print(f'Logistic: f1 = {round(lr_f1,3)} auc = {round(lr_auc,3)}')
# # plot the precision-recall curves
# # no_skill = len(y_test[y_test==1]) / len(y_test)
# # plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# plt.plot(lr_recall, lr_precision, label='Logistic')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.legend()
# plt.show()

In [None]:
# 5-fold evaluation of Random Forest: per-fold F1 and precision-recall curve,
# plus one pooled precision-recall curve over all folds.
rfc = RandomForestClassifier(n_estimators=100,random_state=1)

fig, axes = plt.subplots()
y_proba = []    # positive-class probabilities, one array per fold
f1_list = []    # F1 score of each fold
time_list = []  # fit() wall-clock seconds of each fold
for i in range(len(X_train_cv)):
    print(f'fold #{i+1}')
    start_time = time.time()
    rfc.fit(X_train_cv[i],y_train_cv[i])
    elapsed_time = time.time() - start_time
    print(f'training time {elapsed_time}')
    # Exactly one entry per fold (the fit duration only), so the average
    # printed below is a true average training time.
    time_list.append(elapsed_time)

    # probabilities for the positive outcome only
    y_probs = rfc.predict_proba(X_test_cv[i])[:, 1]
    # predict class values
    yhat = rfc.predict(X_test_cv[i])
    f_score = f1_score(y_test_cv[i], yhat)
    f1_list.append(f_score)
    print('f1 score:',f_score)

    res_precision, res_recall, _ = precision_recall_curve(y_test_cv[i], y_probs)
    aupr = average_precision_score(y_test_cv[i], y_probs)
    lab = 'Fold %d AUPR = %.4f' % (i+1, aupr)
#     model_result.append({'model':'Random Forest', 'rec': res_recall, 'prec':res_precision, 'aupr':aupr})
    axes.step(res_recall, res_precision, label=lab, alpha=0.5)
    y_proba.append(y_probs)

print()
print('Avg. Training time:',sum(time_list)/len(time_list))
print('Avg. F1-Score Positive class:',sum(f1_list)/len(f1_list))

# Pool the labels and scores of all folds for a single overall curve.
y_real = numpy.concatenate(y_test_cv)
y_proba = numpy.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR = %.4f' % (average_precision_score(y_real, y_proba))
axes.step(recall, precision, label=lab, lw=2, color='black')
axes.set_xlabel('Recall')
axes.set_ylabel('Precision')
axes.legend(loc='lower left', fontsize='small')
axes.set_title('Precision-Recall Curve for Positive Class')
plt.show()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# knn = KNeighborsClassifier(n_neighbors=9)
# knn.fit(X_train, y_train)

In [None]:
# y_preds = knn.predict(X_test)
# print(classification_report(y_test,y_preds))

In [None]:
# y_probs = knn.predict_proba(X_test)
# # keep probabilities for the positive outcome only
# y_probs = y_probs[:, 1]
# # predict class values
# yhat = knn.predict(X_test)
# lr_precision, lr_recall, _ = precision_recall_curve(y_test, y_probs)
# lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# # summarize scores
# print(f'Logistic: f1 score positive class = {round(lr_f1,3)} auc = {round(lr_auc,3)}')
# # plot the precision-recall curves
# # no_skill = len(y_test[y_test==1]) / len(y_test)
# # plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# plt.plot(lr_recall, lr_precision, label='RFC')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.legend()
# plt.show()

In [None]:
# 5-fold evaluation of k-nearest neighbours: per-fold F1 and precision-recall
# curve, plus one pooled precision-recall curve over all folds.
knn = KNeighborsClassifier(n_neighbors=9)

fig, axes = plt.subplots()
y_proba = []    # positive-class probabilities, one array per fold
f1_list = []    # F1 score of each fold
time_list = []  # fit() wall-clock seconds of each fold
for i in range(len(X_train_cv)):
    print(f'fold #{i+1}')
    start_time = time.time()
    knn.fit(X_train_cv[i],y_train_cv[i])
    elapsed_time = time.time() - start_time
    print(f'training time {elapsed_time}')
    # Exactly one entry per fold (the fit duration only), so the average
    # printed below is a true average training time.
    time_list.append(elapsed_time)

    # probabilities for the positive outcome only
    y_probs = knn.predict_proba(X_test_cv[i])[:, 1]
    # predict class values
    yhat = knn.predict(X_test_cv[i])
    f_score = f1_score(y_test_cv[i], yhat)
    f1_list.append(f_score)
    print('f1 score:',f_score)

    res_precision, res_recall, _ = precision_recall_curve(y_test_cv[i], y_probs)
    aupr = average_precision_score(y_test_cv[i], y_probs)
    lab = 'Fold %d AUPR = %.4f' % (i+1, aupr)
    # Record the fold under the model that actually produced it.
    model_result.append({'model':'KNN', 'rec': res_recall, 'prec':res_precision, 'aupr':aupr})
    axes.step(res_recall, res_precision, label=lab, alpha=0.5)
    y_proba.append(y_probs)

print()
print('Avg. Training time:',sum(time_list)/len(time_list))
print('Avg. F1-Score Positive class:',sum(f1_list)/len(f1_list))

# Pool the labels and scores of all folds for a single overall curve.
y_real = numpy.concatenate(y_test_cv)
y_proba = numpy.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR = %.4f' % (average_precision_score(y_real, y_proba))
axes.step(recall, precision, label=lab, lw=2, color='black')
axes.set_xlabel('Recall')
axes.set_ylabel('Precision')
axes.legend(loc='lower left', fontsize='small')
axes.set_title('Precision-Recall Curve for Positive Class')
plt.show()