Hi there! In this notebook, we will do a little EDA and some baseline classification on the Credit Card Fraud Detection dataset. Bear in mind that this is a work in progress. 

Here we load packages and import data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the credit-card transactions CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv("../input/creditcard.csv")

We want to know which variables in the dataset give the best class separation. To do this, we can simply visualize the distribution of class instances for each variable. This will allow us to discard ones that have poor separation right off the bat. A more complex model would take into account class separation in conditional distributions, but for what we want to accomplish right now, treating each variable independently is a fine assumption.

In [None]:
# Draw, for every column of `df`, overlaid histograms of the fraud (red) and
# non-fraud (blue) rows so that class separation can be judged per variable.
n_features = len(df.columns)
n_cols = 6
# Ceiling division: always enough rows to show every column, instead of a
# hard-coded grid that would silently drop columns beyond 36.
n_rows = -(-n_features // n_cols)

fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 15), squeeze=False)

# The class masks do not depend on the column being plotted; compute them once.
is_fraud = df["Class"] == 1
is_non_fraud = df["Class"] == 0

for feature, axis in enumerate(ax.ravel()):
    if feature >= n_features:
        # Spare cell at the end of the grid: hide it rather than draw an empty frame.
        axis.set_axis_off()
        continue

    column = df.iloc[:, feature]
    fraud = column[is_fraud].values
    non_fraud = column[is_non_fraud].values

    # Weighting every sample by 1/size makes each histogram sum to 1, so the
    # two classes can be compared despite their very different row counts.
    axis.hist(non_fraud,
              color="blue",
              weights=np.full(non_fraud.shape, 1. / non_fraud.size),
              bins=15,
              alpha=0.5)
    axis.hist(fraud,
              color="red",
              weights=np.full(fraud.shape, 1. / fraud.size),
              bins=8,
              alpha=0.5)
    axis.set_title(df.columns[feature])

plt.tight_layout()
plt.show()

Out of curiosity, let's see what a subset of these points looks like in the input space. 

In [None]:
# Pairwise view of a random sample of rows: class-normalised histograms on the
# diagonal, fraud (red) vs. non-fraud (blue) scatter plots everywhere else.
selected_features = ["V3", "V4", "V10", "V11", "V12", "V14", "V16", "V17", "Class"]

# Sample distinct rows; the default (replace=True) can draw the same
# transaction several times. Cap the sample size so small frames still work.
n_samples = min(10000, len(df))
idx = np.random.choice(np.arange(len(df)), n_samples, replace=False)

df_vis = df.iloc[idx,:]

# Build the class masks from the sample itself. A mask taken from the full
# `df` has a different index, so pandas would have to reindex it onto `df_vis`
# (and warn about it) on every lookup.
fraud_rows = df_vis[df_vis["Class"] == 1]
non_fraud_rows = df_vis[df_vis["Class"] == 0]

n_selected = len(selected_features)
fig, ax = plt.subplots(n_selected, n_selected, figsize=(15,15), sharex=True, sharey=True)
for i, featurei in enumerate(selected_features):
    for j, featurej in enumerate(selected_features):
        # Two-column arrays: column 0 is featurei, column 1 is featurej.
        fraud = fraud_rows.loc[:, [featurei, featurej]].values
        non_fraud = non_fraud_rows.loc[:, [featurei, featurej]].values

        if i == j:
            # Weights of 1/size make each class histogram sum to 1.
            ax[i,j].hist(non_fraud[:,0], color="blue", alpha=0.5, weights=np.full(non_fraud[:,0].shape, 1. / non_fraud[:,0].size))
            ax[i,j].hist(fraud[:,0], color="red", alpha=0.5, weights=np.full(fraud[:,0].shape, 1. / fraud[:,0].size))
        else:
            ax[i,j].scatter(non_fraud[:,0], non_fraud[:,1], color="blue", marker='.', alpha=0.5)
            ax[i,j].scatter(fraud[:,0], fraud[:,1], color="red",  marker='.', alpha=0.5)

plt.show()

For baseline classification, we need to split the data into training and testing sets. We will use a relatively high test proportion due to the highly unbalanced nature of the dataset. 

In [None]:
# Random train/test split restricted to `selected_features`, followed by
# mean-centring and range-scaling of the input columns.
test_proportion = 0.4

m = len(df)
test_size = int(test_proportion * m)
indices = np.random.permutation(m)
train_indices, test_indices = indices[test_size:], indices[:test_size]

train = df.iloc[train_indices,:].loc[:,selected_features]
test = df.iloc[test_indices,:].loc[:,selected_features]

is_input_column = train.columns != "Class"
train_data = train.loc[:, is_input_column]
test_data = test.loc[:, is_input_column]

# Scaling statistics come from the training set ONLY and are reused for the
# test set. Normalising the test set with its own mean/range would put the two
# sets on different scales and leak test-set information into the evaluation.
feature_mean = train_data.mean()
feature_range = train_data.max() - train_data.min()

train_data = (train_data - feature_mean) / feature_range
test_data = (test_data - feature_mean) / feature_range

train_label = train["Class"].values
test_label = test["Class"].values

As a baseline, we will see how a logistic regression performs.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve

# Baseline 1: logistic regression with the library's default settings.
model = LogisticRegression()
model.fit(train_data, train_label)

# Use the continuous decision scores, not hard labels: the precision-recall
# metrics below need a ranking of the test rows.
prediction = model.decision_function(test_data)

precision, recall, _ = precision_recall_curve(test_label, prediction)
auprc = average_precision_score(test_label, prediction)

# Precision-recall curve on the held-out rows.
plt.plot(recall, precision)
plt.grid()
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

print("Area under PR curve:", auprc)

The support vector machine with RBF kernel performs about the same as the LR.

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, precision_recall_curve

# Baseline 2: support vector classifier with the library's default settings.
model = SVC()
model.fit(train_data, train_label)

# Continuous decision scores give the ranking that the precision-recall
# metrics below are computed from.
prediction = model.decision_function(test_data)

precision, recall, _ = precision_recall_curve(test_label, prediction)
auprc = average_precision_score(test_label, prediction)

# Precision-recall curve on the held-out rows.
plt.plot(recall, precision)
plt.grid()
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

print("Area under PR curve:", auprc)

A more complex model may perform better. To test this, we will train a neural network on ALL of the inputs in the dataset, not just the few we selected earlier. 

In [None]:
# Fresh random train/test split that keeps EVERY column of `df` (no
# feature selection), followed by mean-centring and range-scaling of the inputs.
test_proportion = 0.4

m = len(df)
test_size = int(test_proportion * m)
indices = np.random.permutation(m)
train_indices, test_indices = indices[test_size:], indices[:test_size]

train = df.iloc[train_indices,:]
test = df.iloc[test_indices,:]

is_input_column = train.columns != "Class"
train_data = train.loc[:, is_input_column]
test_data = test.loc[:, is_input_column]

# Scaling statistics come from the training set ONLY and are reused for the
# test set. Normalising the test set with its own mean/range would put the two
# sets on different scales and leak test-set information into the evaluation.
feature_mean = train_data.mean()
feature_range = train_data.max() - train_data.min()

train_data = (train_data - feature_mean) / feature_range
test_data = (test_data - feature_mean) / feature_range

train_label = train["Class"].values
test_label = test["Class"].values

These are just a few helper functions to feed data into the network. 

In [None]:
def to_one_hot(c, depth):
    """Return the one-hot encoding of the class index (or indices) ``c``.

    Row ``k`` of a ``depth`` x ``depth`` identity matrix is the one-hot vector
    for class ``k``, so indexing that matrix with ``c`` selects the encodings.
    A scalar ``c`` gives a vector of length ``depth``; an integer array of
    length ``n`` gives an ``(n, depth)`` array.
    """
    return np.eye(depth)[c, :]

def train_batch(batch_size):
    """Yield the training set as consecutive mini-batches.

    Reads the module-level ``train_data`` (DataFrame) and ``train_label``
    (array of class indices) and walks through them in order.

    Args:
        batch_size: Number of rows per batch; must be a positive integer.

    Yields:
        ``(features, targets)`` tuples, where ``features`` is the batch's rows
        as an array and ``targets`` is the matching one-hot label array with
        two columns. The last batch is shorter than ``batch_size`` when the
        number of rows is not an exact multiple, so no rows are skipped.

    Raises:
        ValueError: If ``batch_size`` is not positive. As this is a generator,
            the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Stepping with range() covers the trailing partial batch as well;
    # int(len / batch_size) iterations would drop those rows every epoch.
    for start in range(0, len(train_data), batch_size):
        end = start + batch_size

        train_data_batch = train_data.iloc[start:end].values
        # to_one_hot already handles a whole array of labels in one call.
        train_label_batch = to_one_hot(train_label[start:end], depth=2)

        yield train_data_batch, train_label_batch
        
def get_test_data():
    """Return the whole test set as ``(features, one-hot targets)``.

    Reads the module-level ``test_data`` and ``test_label``; the labels are
    encoded with ``to_one_hot`` using a depth of 2.
    """
    def encode(labels):
        return to_one_hot(labels, depth=2)

    features = test_data.values
    targets = np.apply_along_axis(encode, 0, test_label)
    return features, targets

In this TensorFlow session, we train a relatively simple network containing just one small hidden layer, and evaluate its performance using average precision, like the other classifiers. 

In [None]:
import tensorflow as tf
from sklearn.metrics import average_precision_score, precision_recall_curve

# Graph inputs fed at run time: 30 values per row for the inputs, 2 per row for
# the one-hot targets. The leading None leaves the number of rows open.
x = tf.placeholder(dtype=tf.float32, shape=[None, 30])  # inputs
t = tf.placeholder(dtype=tf.float32, shape=[None, 2])  # targets

def predict(x): # model
    """Build a one-hidden-layer network on ``x`` and return its output logits.

    Creates new randomly initialised weight and bias variables each time it is
    called: a 30 -> 10 hidden layer with ReLU, followed by a linear 10 -> 2
    output layer.

    Args:
        x: Float tensor whose last dimension is 30 (must match the width of
            the input placeholder).

    Returns:
        Tensor with 2 columns holding the raw, unnormalised class scores.
    """
    size_l1 = 10
    weights = {"l1": tf.Variable(tf.random_normal([30, size_l1])),
               "output": tf.Variable(tf.random_normal([size_l1, 2]))}

    biases = {"l1": tf.Variable(tf.random_normal([size_l1])),
              "output": tf.Variable(tf.random_normal([2]))}

    h1 = tf.add(tf.matmul(x, weights["l1"]), biases["l1"])
    h1 = tf.nn.relu(h1)

    # No activation on the output layer. The result is passed to
    # softmax_cross_entropy_with_logits, which expects unscaled logits; a ReLU
    # here would clamp every negative score to zero, limiting how strongly the
    # network can prefer one class and blocking gradients for clamped units.
    output = tf.add(tf.matmul(h1, weights["output"]), biases["output"])

    return output

y = predict(x)  # logits

# Training objective: softmax cross-entropy, averaged over the rows fed in.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=t, logits=y)
loss = tf.reduce_mean(per_example_loss)
optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)

# Fraction of rows whose highest-scoring class matches the target class.
predicted_class = tf.argmax(y, 1)
target_class = tf.argmax(t, 1)
correct = tf.equal(predicted_class, target_class)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))  # confusion matrix accuracy

# Class probabilities, used as the ranking score for the precision-recall curve.
decision_variable = tf.nn.softmax(y)

# Train the network for a few epochs, then score it on the held-out test set
# with the same precision-recall metrics used for the other classifiers.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # train phase
    batch_size = 128
    n_epochs = 3
    for epoch in range(n_epochs):
        # Sum of the per-batch mean losses over the epoch.
        epoch_loss = 0
        for batch_x, batch_t in train_batch(batch_size):
            _, curr_loss = sess.run([optimizer, loss], feed_dict={t: batch_t, x: batch_x})
            epoch_loss += curr_loss

        print("Epoch " + str(epoch+1) + " loss: " + str(epoch_loss))

    # test phase
    test_x, test_t = get_test_data()
    test_y = sess.run(decision_variable, feed_dict={x: test_x})

    # Column 1 of the one-hot targets (and of the softmax output) is the fraud
    # class, Class == 1. Scoring column 0 would measure how well the network
    # finds the non-fraud majority, which is not comparable with the logistic
    # regression and SVM results, both of which score the fraud class.
    fraud_column = 1
    auprc = average_precision_score(test_t[:, fraud_column], test_y[:, fraud_column])
    precision, recall, _ = precision_recall_curve(test_t[:, fraud_column], test_y[:, fraud_column])

    # plot pr curve (labelled like the other classifiers' plots)
    plt.plot(recall, precision)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid()
    plt.show()

    # print average precision
    print("Area under PR curve:", auprc)

Clearly, the network easily outperforms other classifiers when it comes to fraud detection.