Goal:  The goal of this project is to classify transactions into fraudulent or not fraudulent.
Plan:
1.  Examine the data - plot and look at distributions and correlation between variables
2.  Transform the data - We expect that the principal components were normalized prior to being transformed by PCA.  We should consider transformation of the Amount column.
3.  Split into test and train data
4.  Modeling - We will try a number of different classification models.  Let's use 10-fold cross validation to tune any parameters if necessary.
    *  Decision Tree (CART)
    *  Logistic Regression
    *  Support Vector Machine
    *  Random Forest
    *  XGBoost
    *  Neural Network

In [1]:
from __future__ import print_function   # use print as a function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab                            # plot matplotlib plots inline

In [2]:
dat = pd.read_csv("C:\Users\Craig\Documents\GitHubData\creditcard.csv")

In [3]:
# all variables are Principal Components and anonymized except Time (time since first transaction) and amount (amount of transaction)
# When class == 1, it is a fraudulent transaction
print(dat.head())
print(dat.shape)  # 284,807 rows x 31 columns

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...         V21       V22       V23       V24  \
0  0.098698  0.363787  ...   -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425  ...   -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  ...    0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024  ...   -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  ...   -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28  Amount  Class  
0  0.128539 -0.189115

In [4]:
# What is the range of V1?
dat["V1"].min(), dat["V1"].max()

(-56.407509631328999, 2.4549299912112099)

In [5]:
# Histogram of V1
# It appears that non-fraudulent purchases have more positive values and are not as left skewed as fradulent purchases
positive = dat.loc[dat["Class"] == 1]
negative = dat.loc[dat["Class"] != 1]
n, bins, patches = plt.hist(positive["V1"], 50, normed=1, facecolor='green', alpha = 0.75)
n, bins, patches = plt.hist(negative["V1"], 50, normed=1, facecolor='blue', alpha = 0.75)
plt.xlabel("V1")
plt.ylabel("Probability")
plt.show()

In [6]:
# Histogram of V9
# similar patter as V1
positive = dat.loc[dat["Class"] == 1]
negative = dat.loc[dat["Class"] != 1]
n, bins, patches = plt.hist(positive["V9"], 50, normed=1, facecolor='green', alpha = 0.75)
n, bins, patches = plt.hist(negative["V9"], 50, normed=1, facecolor='blue', alpha = 0.75)
plt.xlabel("V1")
plt.ylabel("Probability")
plt.show()

In [7]:
# Correlation between V1 and V2 (should be none bc they are principal components and therefore orthogonal)
dat["V1"].corr(dat["V2"])  # -7.608e-17 - None!

-7.6084975097731361e-17

In [8]:
dat.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [9]:
# Split data into training and test sets, let's remove Time here bc I don't really want to mess with it now
from sklearn.model_selection import train_test_split
y = dat["Class"].values
X = dat.drop(["Class", "Time"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=802)

In [10]:
X_train.shape  # 213,605 x 29
X_test.shape   # 71,202 x 29
len(y_train)   # 213,605
len(y_test)    # 71,202

71202

In [11]:
# We will now standardize the Amount column - fit the standard scaler on the training data and then apply it to the test data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train) # fit to training data
X_test_std = scaler.transform(X_test)       # apply to test data

Due to the unbalanced nature of the data, we will use AUC as a measure of accuracy rather than true positive rate or overall success rate.

# Baseline AUC

In [12]:
# Baseline AUC:  50% - if we assume NONE or ALL of the transactions are fraudulent, we'd be at 50% AUC.
# Baseline AUC 2:  If we randomly choose 130 transactions as fraudulent, we'd be around 50% AUC (50.29%)
from sklearn.metrics import roc_auc_score

y_true = y_test
y_scores = np.zeros(len(y_true))
y_scores = [y + 1 for y in y_scores]
roc_auc_score(y_true, y_scores)

# We know there are 130 actual fraudulent transactions in the test set.  What we if we randomly choose 130 of the 71202?
y_scores = np.zeros(len(y_true) - 130)
one = np.ones(130)
y_scores = np.append(y_scores,  one)

# shuffle y_scores
np.random.shuffle(y_scores)
print(y_scores)

roc_auc_score(y_true, y_scores)

[ 0.  0.  0. ...,  0.  0.  0.]


0.49908543448896892

# Decision Tree (CART)

In [13]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Initialize the Model
tree = DecisionTreeClassifier()

# Train the model
tree.fit(X_train_std, y_train)

# Predict on test set
test_pred = tree.predict(X_test_std)

# Decision Tree AUC: 87.29%
roc_auc_score(y_test, test_pred)

0.88054410695113083

# Logistic Regression
* Baseline AUC:  50%
* Decision Tree AUC:  87.28%

In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# initialize the model
logistic = LogisticRegression(C=1000.0, random_state=802, penalty='l2', solver='lbfgs')

# Train the model
logistic.fit(X_train_std, y_train)

# Predict on test set
test_pred = logistic.predict(X_test_std)

# Logistic Regression AUC:  79.61%
roc_auc_score(y_test, test_pred)

0.79607645984137432

# Support Vector Machine
* Baseline AUC:          50.00%
* Decision Tree (CART):  87.28%
* Logistic Regression:   79.61%

In [None]:
# Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# We have a ton of samples, let's use an out of the box SVC with linear kernel

# initialize the model
sv_class = SVC(kernel='linear')

# fit the model on the training data
sv_class.fit(X_train_std, y_train)

# test prediction
test_pred = sv_class.predict(X_test_std)

# out of the box SVM with linear kernel, AUC:  89.22%
roc_auc_score(y_test, test_pred)

# Random Forest
* Baseline AUC:          50.00%
* Decision Tree (CART):  87.28%
* Logistic Regression:   79.61%
* Linear SVM Classifier:  89.22%

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# random forests are pretty good out of the box, but let's try tuning on the number of trees in the forest, 
# and the number of features randomly chosen at each tree split.

# initialize the model
rf = RandomForestClassifier(verbose=1)

# set up grid search parameters
param_grid = {"n_estimators": [100,500],
              "max_features": [1, "sqrt", "log2"]}

# run grid search
grid_search = GridSearchCV(rf, param_grid=param_grid, verbose=1, n_jobs=3)

# fit the model
grid_search.fit(X_train_std, y_train)

# predict
test_pred = grid_search.predict(X_test_std)

# Random Forest AUC:  89.23%  (500 trees, sqrt(n features) sampled
roc_auc_score(y_test, test_pred)


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.3s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.5s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.9s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    7.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]

0.89229362206906093

In [23]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=1, warm_start=False)

# Gradient Boosting
* Baseline AUC:          50.00%
* Decision Tree (CART):  87.28%
* Logistic Regression:   79.61%
* Linear SVM Classifier: 89.22%
* Random Forest:         89.23%

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# initialize the model - need to reduce the learning rate to avoid 'seesawing' farther and farther from the min
gb = GradientBoostingClassifier(verbose=1, learning_rate=0.025, n_estimators=500)

# fit the model
gb.fit(X_train_std, y_train)

# predict
test_pred = gb.predict(X_test_std)

# Random Forest AUC:  89.21%  (500 trees, 2.5% learning rate)
roc_auc_score(y_test, test_pred)

      Iter       Train Loss   Remaining Time 
         1           0.0101            6.88m
         2           0.0097            6.94m
         3           0.0095            6.97m
         4           0.0093            6.95m
         5           0.0091            6.90m
         6           0.0089            6.90m
         7           0.0087            6.86m
         8           0.0086            6.88m
         9           0.0085            6.84m
        10           0.0084            6.83m
        20           0.0076            6.66m
        30           0.0071            6.48m
        40           0.0067            6.31m
        50           0.0065            6.15m
        60           0.0063            6.00m
        70           0.0061            5.88m
        80           0.0060            5.74m
        90           0.0059            5.59m
       100           0.0058            5.45m
       200           0.0050            4.20m
       300           0.0040            2.78m
       40

0.89212477920548605

# Neural Network Classifier
* Baseline AUC:          50.00%
* Decision Tree (CART):  87.28%
* Logistic Regression:   79.61%
* Linear SVM Classifier: 89.22%
* Random Forest:         89.23%
* Gradient Boosting:     89.22%
* Neural Network:        89.60%

In [46]:
from sklearn.neural_network import MLPClassifier

# initialize the model - try 4 hidden layers, reducing in size.
# Use relu to speed things up, adaptive learning as loss decreases
nn = MLPClassifier(hidden_layer_sizes=(1000,100,50), activation='relu',solver='sgd',
                   learning_rate = 'adaptive', learning_rate_init = 0.1, max_iter=200,verbose=True)

# fit the model
nn.fit(X_train_std, y_train)

# predict
test_pred = nn.predict(X_test_std)

# NN AUC:  89.61% (alpha=default, (100,100,50))
roc_auc_score(y_test, test_pred)

Iteration 1, loss = 0.00604466
Iteration 2, loss = 0.00295988
Iteration 3, loss = 0.00262334
Iteration 4, loss = 0.00243080
Iteration 5, loss = 0.00222736
Iteration 6, loss = 0.00212663
Iteration 7, loss = 0.00197242
Iteration 8, loss = 0.00181449
Iteration 9, loss = 0.00175208
Iteration 10, loss = 0.00161562
Iteration 11, loss = 0.00158075
Iteration 12, loss = 0.00149277
Iteration 13, loss = 0.00153898
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.020000
Iteration 14, loss = 0.00128761
Iteration 15, loss = 0.00105539
Iteration 16, loss = 0.00096347
Iteration 17, loss = 0.00090381
Iteration 18, loss = 0.00086748
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.004000
Iteration 19, loss = 0.00078683
Iteration 20, loss = 0.00076882
Iteration 21, loss = 0.00075719
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0

0.89609756519932116

In [17]:
# out of the box SVM AUC:
roc_auc_score(y_test, test_pred)

0.89220920063727349