# 3.6.1 Credit card fraud detection

from [this dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud)

<em>"Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise."</em>

In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("/Users/gemma/Documents/data science/DS Bootcamp/creditcard.csv")

In [3]:
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [4]:
df[['Time','Class']].describe()

Unnamed: 0,Time,Class
count,284807.0,284807.0
mean,94813.859575,0.001727
std,47488.145955,0.041527
min,0.0,0.0
25%,54201.5,0.0
50%,84692.0,0.0
75%,139320.5,0.0
max,172792.0,1.0


## sub sampling

since the df is too large to run easily on my computer, and since we were asked to focus on identifying fraud when the df has a smaller amount of posiitve fraud samples, im going to take a subsample of the df that includes all of the positive fraud and an equivalent # of negative fraud examples to test on. then at the end ill run the model on the full df.

In [5]:
fraud = df[df['Class'] == 1]

x = len(fraud)*4

nofraud = df[df['Class'] == 0]
nofraud = nofraud[:x]

subsample = pd.concat([fraud, nofraud], axis=0)

# could also do subsample = fraud.append(nofraud)

print(len(df)/5)

print(len(subsample))

56961.4
2460


# k nearest neighbors

### w/ original df

In [6]:
neighbors = KNeighborsClassifier(n_neighbors=4)

In [7]:
X = df.drop('Class',1)
Y = df['Class']
neighbors.fit(X,Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [8]:
score = cross_val_score(neighbors, X, Y, cv=15)

In [9]:
score

array([0.01816937, 0.30566674, 0.17521593, 0.06188119, 0.07494207,
       0.19887291, 0.28077105, 0.2535419 , 0.23310686, 0.22836678,
       0.17180176, 0.26586612, 0.30980723, 0.21257769, 0.99831455])

In [10]:
np.std(score)

0.21664339514318484

In [11]:
# w/ holdout groups
target2 = df['Class']
data2 = df.drop('Class',1)

X_train, X_test, y_train, y_test = train_test_split(data2, target2, test_size=0.2, random_state=20)

print('\nWith 20% Holdout: ' + str(neighbors.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(neighbors.fit(data2, target2).score(data2, target2)))


With 20% Holdout: 0.9984726659878516
Testing on Sample: 0.9984621164507895


In [12]:
np.mean(score)

0.252593475620769

### w/ subsample

In [13]:
X = subsample.drop('Class',1)
Y = subsample['Class']
neighbors.fit(X,Y)

score = cross_val_score(neighbors, X, Y, cv=5)
score

array([0.90263692, 1.        , 1.        , 1.        , 1.        ])

In [14]:
print('Mean {} and Standard deviation {}'.format(np.mean(score),np.std(score)))

Mean 0.98052738336714 and Standard deviation 0.03894523326572008


In [8]:
# w/ holdout groups
target2 = subsample['Class']
data2 = subsample.drop('Class',1)

X_train, X_test, y_train, y_test = train_test_split(data2, target2, test_size=0.2, random_state=20)

print('\nWith 20% Holdout: ' + str(neighbors.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(neighbors.fit(data2, target2).score(data2, target2)))



With 20% Holdout: 1.0
Testing on Sample: 0.998780487804878


### SVM w/ subsample

In [15]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
#n_samples = len(digits.images)
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

X = subsample.drop('Class',1)
y = subsample['Class']


# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    
    print(sklearn.metrics.confusion_matrix(y_true, y_pred, labels=True))

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.917 (+/-0.019) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.958 (+/-0.034) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.919 (+/-0.019) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.969 (+/-0.039) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.919 (+/-0.019) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.969 (+/-0.039) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.919 (+/-0.019) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.969 (+/-0.039) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.999 (+/-0.002) for {'C': 1, 'kernel': 'linear'}
0.999 (+/-0.002) for {'C': 10, 'kernel': 'linear'}
0.999 (+/-0.002) for {'C': 100, 'kernel': 'linear'}
0.999 (+/-0.002) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on 

TypeError: iteration over a 0-d array

## Random forest

In [16]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()

X = subsample.drop('Class',1)
Y = subsample['Class']

X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

array([0.97975709, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.99591837, 1.        ])

Scores of 10 holdout groups with the Random Forest model

In [17]:
scores = cross_val_score(rfc, X, Y, cv=10)

print('Mean {} and Standard deviation {}'.format(np.mean(scores),np.std(scores)))

Mean 0.9890655343800401 and Standard deviation 0.030190831406171684


Standard deviation of the cross validation scores

In [18]:
X = subsample.drop('Class',1)
Y = subsample['Class']

X = pd.get_dummies(X)
X = X.dropna(axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.5, random_state=0)

rfc.fit(X_train, y_train)

y_true, y_pred = y_test, rfc.predict(X_test)

sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)

array([[978,   0],
       [  2, 250]])

A confusion matrix for the subsample with the Random Forest model

In [19]:
X = df.drop('Class',1)
Y = df['Class']

X = pd.get_dummies(X)
X = X.dropna(axis=1)

scores_big = cross_val_score(rfc, X, Y, cv=10)

After running the Random Forest model on the full dataset, scores from cross validation with 10 folds

In [20]:
print('Mean {} and Standard deviation {}'.format(np.mean(scores_big),np.std(scores_big)))

Mean 0.902612148373603 and Standard deviation 0.2902793917447926


Standard deviation of the cross validation scores

In [21]:
X = df.drop('Class',1)
Y = df['Class']

X = pd.get_dummies(X)
X = X.dropna(axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.5, random_state=0)

rfc.fit(X_train, y_train)

y_true, y_pred = y_test, rfc.predict(X_test)

sklearn.metrics.confusion_matrix(y_true, y_pred, labels=True, sample_weight=None)

TypeError: iteration over a 0-d array

## Logistic regression

In [22]:
from sklearn.linear_model import LogisticRegression

# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression(C=1e9)
X = subsample.drop('Class',1)
y = subsample['Class']

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[ 1.69823865e-04 -1.28377774e-01 -3.85428401e-01 -7.71441003e-01
   5.13066302e-02 -1.96244962e-01 -1.84187997e-02  1.60354549e-02
  -3.47696702e-02  5.32797437e-04 -1.01849430e-01 -1.95349270e-03
  -4.28422513e-01  1.73698707e-01  9.97028609e-03 -2.71866397e-01
  -8.88259715e-02 -5.56668920e-02  8.39520835e-02 -5.36765226e-02
   9.54900770e-03  5.19380827e-02  2.13729522e-01 -1.89733698e-02
   1.38204479e-02 -2.12896236e-01 -3.37443680e-02 -3.05242502e-02
  -2.14981614e-02 -1.08978118e-02]]
[-1.16277502]

 Accuracy by admission status
Class     0    1
row_0           
0      1966    3
1         2  489

 Percentage accuracy
0.9979674796747967
