<h2> Necessary functions </h2>

In [None]:
# import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix,accuracy_score, fbeta_score
import sklearn.metrics as metrics
import seaborn as sns

In [None]:
def PrintStats(y_test, pred, output_print=False):
    """Print classification metrics for one set of predictions and plot the ROC curve.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, passed unchanged to the sklearn metric functions.
    pred : array-like
        Predicted labels, passed unchanged to the sklearn metric functions
        and to PlotROC.
    output_print : bool, default False
        When True, the computed metrics are returned as a list.

    Returns
    -------
    list or None
        [accuracy, precision, recall, F1, F-beta, AUC] when ``output_print``
        is True, otherwise None. All values except AUC are rounded to two
        decimals.
    """
    f1Score = round(f1_score(y_test, pred), 2)
    # Round to 2 decimals like every other metric; without the second argument
    # round() collapses the score to an integer.
    fbetaScore = round(fbeta_score(y_test, pred, beta=1.25), 2)
    recallScore = round(recall_score(y_test, pred), 2)
    precscore = round(precision_score(y_test, pred), 2)
    accScore = round(accuracy_score(y_test, pred), 2)

    # ROC curve is drawn as a side effect; the returned AUC is kept unrounded
    auc = PlotROC(y_test, pred)

    print("Accuracy for Model : {acc_score}".format(acc_score = accScore))
    print("Precision for Model : {prec_score}".format(prec_score = precscore))
    print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = recallScore))
    print("F1 Score for Model : {f1_score}".format(f1_score = f1Score))
    print("F-Beta Score for Model : {fbeta_score}".format(fbeta_score = fbetaScore))

    output = [accScore, precscore, recallScore, f1Score, fbetaScore, auc]

    # Only return when asked, so a bare call at the end of a cell shows nothing
    if output_print:
        return output

<h2> Random Oversampling Method & Other Functions</h2>

In [None]:
def Oversample(X_train, Y_train, print_output=False, random_state=None):
    """Balance a binary training set by randomly oversampling the minority class.

    The smaller of the two classes (labels 0 and 1) is resampled with
    replacement until it has as many rows as the larger class, then all rows
    are shuffled.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Binary labels (0 / 1).
    print_output : bool, default False
        When True, print the class counts after oversampling.
    random_state : int, numpy RandomState or None, default None
        Forwarded to ``DataFrame.sample`` for reproducible resampling/shuffling.

    Returns
    -------
    (X_train, Y_train) : tuple of numpy arrays
        Oversampled features of shape (n, n_features) and labels of shape (n,).

    Raises
    ------
    ValueError
        If either class 0 or class 1 has no samples.
    """
    X_train = np.asarray(X_train)
    # Accept both 1-D and column-vector labels
    Y_column = np.asarray(Y_train).reshape(-1, 1)
    # The label is stored right after the features, whatever their count
    label_col = X_train.shape[1]

    Train_set = np.concatenate((X_train, Y_column), axis=1)

    # Convert back to a dataframe for random oversampling
    df = pd.DataFrame(Train_set)

    # Divide by class
    labels = df.iloc[:, label_col]
    df_class_0 = df[labels == 0]
    df_class_1 = df[labels == 1]

    if df_class_0.empty or df_class_1.empty:
        raise ValueError("Oversample needs at least one sample of each class (0 and 1)")

    # Decide majority/minority from the actual counts, not value_counts() order
    if len(df_class_0) >= len(df_class_1):
        majority, minority = df_class_0, df_class_1
    else:
        majority, minority = df_class_1, df_class_0

    minority_over = minority.sample(len(majority), replace=True, random_state=random_state)
    df = pd.concat([majority, minority_over], axis=0)

    # Shuffle rows; sample() returns a new frame, so the result must be assigned
    df = df.sample(frac=1, random_state=random_state)

    Y_train = df.iloc[:, label_col].values
    X_train = df.iloc[:, 0:label_col].values

    if print_output:
        print('Random over-sampling:')
        print(df.iloc[:, label_col].value_counts())

    return X_train, Y_train

def Convert_prob_to_class(Y_test_hat, threshold=0.5):
    """Threshold an array of scores into 0/1 class labels, in place.

    Parameters
    ----------
    Y_test_hat : numpy.ndarray
        Scores to convert. The array is modified in place.
    threshold : float, default 0.5
        Values strictly greater than this become 1; values less than or equal
        to it become 0.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in, for convenience.
    """
    # Build both masks before writing so the first assignment cannot
    # influence the second.
    positive = Y_test_hat > threshold
    negative = Y_test_hat <= threshold  # includes the exact-threshold case
    Y_test_hat[positive] = 1
    Y_test_hat[negative] = 0
    return Y_test_hat

def PlotROC(y_test,pred):
    """Draw the ROC curve for the given labels/scores and return its AUC.

    The curve is computed with ``metrics.roc_curve`` and the area with
    ``metrics.auc``; the figure is shown immediately via ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, pred)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)

    # Work on the current axes, exactly what the pyplot helpers target
    axes = plt.gca()
    axes.set_title('Receiver Operating Characteristic')
    axes.plot(false_pos_rate, true_pos_rate, 'b', label = 'AUC = %0.2f' % area_under_curve)
    axes.legend(loc = 'lower right')
    # Diagonal reference line of a random classifier
    axes.plot([0, 1], [0, 1],'r--')
    axes.set_xlim([0, 1])
    axes.set_ylim([0, 1])
    axes.set_ylabel('True Positive Rate')
    axes.set_xlabel('False Positive Rate')
    plt.show()

    return area_under_curve

<h2> Set Dataframe </h2>

In [None]:
# Load the training data from a CSV file located in the notebook's working directory
df = pd.read_csv('creditcard-training set v2.csv')
# Show the first rows of the loaded frame as the cell output
df.head()

<H2>Renaming Columns</H2>

In [None]:
# Map the verbose CSV headers to short column names
new_names = {
    'Seconds since reference time': 'Time',
    'Amount': 'Amount',
    'Fraud? (1: Fraud, 0:  No Fraud)': 'Class',
}
# 'Feature 1' .. 'Feature 29' become 'F1' .. 'F29'
new_names.update(('Feature ' + str(k), 'F' + str(k)) for k in range(1, 30))
df.rename(columns=new_names, inplace=True)

<h2>Generate Chart</h2>

In [None]:
# Plot one histogram per column to inspect each distribution.
# The axes are created explicitly and handed to df.hist; otherwise df.hist
# opens its own figure and the one created here is left blank.
for column_name in df.columns:
    fig, ax = plt.subplots(figsize=(17,10))
    df.hist(column=column_name, ax=ax)
    ax.set_xlabel(column_name, fontsize=15)
    ax.set_ylabel("Frequency", fontsize=15)
    plt.show()

# From these charts, the distribution of feature 23 (F23) appears to be uniform.
# It carries no useful signal and would only add noise to the model, so it is dropped later.


<h2>Check for duplications & % of Null Values</h2>

In [None]:
# Count fully duplicated rows (duplicated() flags every repeat after the first)
count = int(df.duplicated().sum())
if count > 0:
    print("There are",count,"duplicated rows")
else:
    print("There are no duplicated rows")

# Per-column null totals and percentages, each sorted from most to fewest nulls
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending = False)
percent = (null_mask.sum()/null_mask.count()*100).sort_values(ascending = False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()

<H2>Analyze dataset based on classes</H2>

In [None]:
# NOTE(review): sns.distplot and the `bw` keyword of sns.kdeplot are deprecated
# in newer seaborn releases - confirm the installed version before upgrading.

# Density plot of the transaction time, one curve per class
for i in range(2):
    subset = df[df['Class'] == i]
    # Draw the density plot
    sns.distplot(subset['Time'], hist = False, kde = True,
                 kde_kws = {'linewidth': 3},
                 label = i)
# Plot formatting
plt.legend(prop={'size': 16}, title = 'Fraud')
plt.title('Credit Card Transactions Time Density Plot')
plt.xlabel('Seconds since reference time')
plt.ylabel('Density')
plt.show()

# Box plots of the transaction amount per class, with and without outliers
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
sns.boxplot(ax = ax1, x="Class", y="Amount", hue="Class",data=df, palette="PRGn",showfliers=True).set_title("Box Plot with Outliers")
sns.boxplot(ax = ax2, x="Class", y="Amount", hue="Class",data=df, palette="PRGn",showfliers=False).set_title("Box Plot without Outliers")
plt.show()

# Density plot of every feature, split by class
var = df.columns.values
feature_names = [name for name in var if name != "Class"]

t0 = df.loc[df['Class'] == 0]
t1 = df.loc[df['Class'] == 1]

sns.set_style('whitegrid')

# Size the grid from the number of features (4 per row) instead of a fixed
# 8x4 layout, and draw on the axes returned by subplots so that no extra
# empty figure is created.
n_cols = 4
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3.5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, feature_names):
    sns.kdeplot(t0[feature], bw=0.5, label="Fraud = 0", ax=ax)
    sns.kdeplot(t1[feature], bw=0.5, label="Fraud = 1", ax=ax)
    ax.set_xlabel(feature, fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=9)

# Hide any panels left over when the feature count is not a multiple of 4
for ax in flat_axes[len(feature_names):]:
    ax.set_visible(False)
plt.show()

<h2> Clean the data: drop rows that contain null values </h2>

In [None]:
# Drop rows that contain null values, then drop feature 23.
# Done as one non-in-place chain; errors='ignore' keeps the cell re-runnable
# after F23 has already been removed.
df = df.dropna().drop(['F23'], axis=1, errors='ignore')

# Feature columns used for modelling: Time, F1..F29 without F23, then Amount
columns = ["Time"] + ["F" + str(k) for k in range(1, 30) if k != 23] + ["Amount"]

<h2>Split into training and testing data</h2>

In [None]:
# Build the feature matrix and the label column vector, then split into
# train and test data.
# DataFrame.as_matrix was removed from pandas; selecting the columns and
# taking .values yields the same array.
X = df[columns].values
Y = df.Class.values.reshape(-1, 1)

# NOTE(review): the scaler is fit on the full dataset before splitting, so
# test-set statistics influence the training features. Later cells use this
# scaled X directly, so it is left unchanged here.
sc = StandardScaler()
X = sc.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

<h2>Train Model </h2>

In [None]:
# Split data into train and test set (seeded so the reported metrics are reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Oversample the training set only. Oversample draws rows with pandas'
# sample(), which uses numpy's global RNG when no random_state is given,
# so seed it here for repeatable results.
np.random.seed(1)
X_train, Y_train = Oversample(X_train,Y_train)

model = LogisticRegression(C=1)

model.fit(X_train, Y_train.ravel())

# predict() returns class labels; the conversion call is kept so the labels
# are guaranteed to be 0/1 before scoring
Y_test_hat = model.predict(X_test)
Convert_prob_to_class(Y_test_hat)

# Print metrics
PrintStats(Y_test,Y_test_hat)

<h2>Cross-Validation</h2>

In [None]:
# Define parameters for k-fold cross-validation
n_splits = 5
kf = KFold(n_splits)
# Running sum of [accuracy, precision, recall, F1, F-beta, AUC] over the folds
Metric_array = np.zeros(6)

# Start k-fold cross-validation; folds are numbered from 1 for display
for counter, (train_index, test_index) in enumerate(kf.split(X), start=1):

    # Split into training and testing set
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Oversampling of training set ONLY
    X_train, Y_train = Oversample(X_train,Y_train)

    # Refit the model (defined in the training cell) on this fold
    model.fit(X_train, Y_train)

    # Predict the held-out fold
    Y_test_hat = model.predict(X_test)
    Convert_prob_to_class(Y_test_hat)

    print('\nMetrics for fold number: ' + str(counter))

    # Print metrics and accumulate them to average after the end of cv
    Metric_array += PrintStats(Y_test,Y_test_hat,True)

# Average the metrics; np.around returns a new array, so keep the result
Metric_array /= n_splits
Metric_array = np.around(Metric_array, decimals=2)

print('\nSummary metrics:')
print("Accuracy for Model : {acc_score}".format(acc_score = Metric_array[0]))
print("Precision for Model : {prec_score}".format(prec_score = Metric_array[1]))
print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = Metric_array[2]))
print("F1 Score for Model : {f1_score}".format(f1_score = Metric_array[3]))
print("F-Beta Score for Model : {fbeta_score}".format(fbeta_score = Metric_array[4]))
print("AUC for Model : {auc}".format(auc = Metric_array[5]))