In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,confusion_matrix

<p>false positive = not fraud and detected as fraud
<p>false negative = was fraud and detected as not fraud
<p>true positive = fraud and detected as fraud
<p>true negative = not fraud and detected as not fraud

### Load data and print columns.

In [4]:
df = pd.read_csv('./sample-1-25.csv/sample-1-25.csv')

In [5]:
df.head()

Unnamed: 0,FRD_IND,APPRD_AUTHZN_CNT,AVG_DLY_AUTHZN_AMT,MRCH_CATG_CD,POS_ENTRY_MTHD_CD,RCURG_AUTHZN_IND,DISTANCE_FROM_HOME,ACCT_CURR_BAL,AUTHZN_AMT,AUTHZN_OUTSTD_AMT,PLSTC_ISU_DUR
0,0,3,7,5812,90,0,0.0,2498.21,15.52,94.28,805
1,0,1,6,5541,90,0,0.0,88.88,46.56,56.31,68
2,0,4,0,5965,81,0,358.8503,1934.36,16.34,927.13,190
3,0,0,3,5542,90,0,4.293625,916.04,1.0,0.0,217
4,0,1,2,5814,90,0,509.83035,166.62,10.66,10.66,110


In [6]:
df.columns

Index(['FRD_IND', 'APPRD_AUTHZN_CNT', 'AVG_DLY_AUTHZN_AMT', 'MRCH_CATG_CD',
       'POS_ENTRY_MTHD_CD', 'RCURG_AUTHZN_IND', 'DISTANCE_FROM_HOME',
       'ACCT_CURR_BAL', 'AUTHZN_AMT', 'AUTHZN_OUTSTD_AMT', 'PLSTC_ISU_DUR'],
      dtype='object')

### Training/testing split.

In [7]:
y_col, x_col = df.columns[0], df.columns[1::]

In [8]:
x, xt, y, yt = train_test_split(df[x_col], df[y_col], test_size=0.20, random_state=444)

In [9]:
x.head()

Unnamed: 0,APPRD_AUTHZN_CNT,AVG_DLY_AUTHZN_AMT,MRCH_CATG_CD,POS_ENTRY_MTHD_CD,RCURG_AUTHZN_IND,DISTANCE_FROM_HOME,ACCT_CURR_BAL,AUTHZN_AMT,AUTHZN_OUTSTD_AMT,PLSTC_ISU_DUR
695527,0,0,3063,1,0,346.78033,0.0,0.0,0.0,123
110525,3,6,6011,2,0,17.270464,232.82,63.0,317.97,642
1910345,1,1,5812,90,0,15.674769,282.81,4.33,4.33,284
1052733,1,2,5812,90,0,22.744324,411.73,5.02,5.02,464
820788,1,3,5812,90,0,53.759247,3664.23,34.12,35.12,397


### Basic SVM model.

In [10]:
xtrain_normal = preprocessing.normalize(x, norm='l2')
xtest_normal = preprocessing.normalize(xt, norm='l2')

In [11]:
clf = svm.LinearSVC(max_iter=500)

In [13]:
clf.fit(xtrain_normal, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
y_pred = clf.predict(xtest_normal)

In [19]:
cm = confusion_matrix(yt, y_pred)

In [21]:
tn, fp, fn, tp = cm.ravel()

In [22]:
cm # Missed all fraudulant transactions since class are so imbalanced

array([[397516,      0],
       [   485,      0]])

In [33]:
c = Counter(yt)
c

Counter({0: 397516, 1: 485})

In [34]:
cp = Counter(y_pred)
cp

Counter({0: 398001})

## NN model

In [7]:
from sklearn.neural_network import MLPClassifier

In [57]:
mlp = MLPClassifier(solver='adam', activation='relu', alpha=1e-5, hidden_layer_sizes=(5,2), max_iter=500)

In [58]:
mlp.fit(xtrain_normal, y)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [59]:
predict_train = mlp.predict(xtest_normal)

In [60]:
nncm = confusion_matrix(yt, predict_train) # missed all fraudulent examples becuase of class imabalance?

In [61]:
nncm

array([[397516,      0],
       [   485,      0]])

## SMOTE

In [19]:
# from imblearn.over_sampling import SMOTE

from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

In [20]:
# x_resampled, y_resampled = SMOTE().fit_resample(x, y)

oversample = RandomOverSampler()
x_resampled, y_resampled = oversample.fit_sample(x, y)



In [21]:
countSMOTE = Counter(y_resampled)
print(countSMOTE)

Counter({0: 1589998, 1: 1589998})


## SVM with Balanced Classes

In [15]:
clf = svm.LinearSVC(max_iter=500)

In [16]:
x_resampled_normal = preprocessing.normalize(x_resampled, norm='l2')
clf.fit(x_resampled_normal, y_resampled)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=500,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [17]:
resampled_predict = clf.predict(xtest_normal)

In [18]:
svmcm = confusion_matrix(yt, resampled_predict)
svmcm

array([[     1, 397515],
       [     0,    485]], dtype=int64)

In [88]:
tn, fp, fn, tp = svmcm.ravel()
print(Counter(resampled_predict))
print(Counter(yt))

Counter({0: 310288, 1: 87713})
Counter({0: 397516, 1: 485})


In [89]:
print(classification_report(yt, resampled_predict))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88    397516
           1       0.00      0.60      0.01       485

   micro avg       0.78      0.78      0.78    398001
   macro avg       0.50      0.69      0.44    398001
weighted avg       1.00      0.78      0.88    398001



## NN with Balanced Classes

In [68]:
mlp = MLPClassifier(solver='adam', activation='relu', alpha=1e-5, hidden_layer_sizes=(5,2), max_iter=500)

In [69]:
x_resampled_normal = preprocessing.normalize(x_resampled, norm='l2')
mlp.fit(x_resampled_normal, y_resampled)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [96]:
resampled_predict = mlp.predict(xtest_normal)

In [97]:
nncm = confusion_matrix(yt, resampled_predict)

In [98]:
nncm

array([[284361, 113155],
       [   149,    336]])

In [99]:
tn, fp, fn, tp = nncm.ravel()
print(Counter(resampled_predict))
print(Counter(yt))

Counter({0: 284510, 1: 113491})
Counter({0: 397516, 1: 485})


In [100]:
print(classification_report(yt, resampled_predict))

              precision    recall  f1-score   support

           0       1.00      0.72      0.83    397516
           1       0.00      0.69      0.01       485

   micro avg       0.72      0.72      0.72    398001
   macro avg       0.50      0.70      0.42    398001
weighted avg       1.00      0.72      0.83    398001



## Cross Validation

In [50]:
from sklearn.model_selection import KFold

array([0, 0, 0, ..., 1, 1, 1])

In [103]:
conf_matrix_list_of_arrays = []
kf = KFold(n_splits=5, random_state=None, shuffle=False)

print(len(x_resampled))
for train_index, test_index in kf.split(x_resampled):
    
    X_train, X_valid = x_resampled[train_index], x_resampled[test_index]
    y_train, y_valid = y_resampled[train_index], y_resampled[test_index]
    
    X_train = preprocessing.normalize(X_train, norm='l2')
    X_valid = preprocessing.normalize(X_test, norm='l2')
    
    clf.fit(X_train, y_train)
    print(confusion_matrix(y_valid, clf.predict(X_test)))

3179996
[[396230 238913]
 [   167    690]]
[[396112 239119]
 [   158    610]]
[[249066  70558]
 [121778 194597]]
[[     0      0]
 [349594 286405]]
[[     0      0]
 [349958 286041]]


## AutoEncoder

In [12]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score

In [14]:
# Shape of input and latent variable

n_input = 10

# Encoder structure
n_encoder1 = 7
n_encoder2 = 5

n_latent = 2

# Decoder structure
n_decoder2 = 5
n_decoder1 = 7

In [42]:
reg = MLPRegressor(hidden_layer_sizes = (n_encoder1, n_encoder2, n_latent, n_decoder2, n_decoder1), 
                   activation = 'relu', 
                   solver = 'adam', 
                   learning_rate_init = 0.001, 
                   max_iter = 20, 
                   tol = 0.0000001, 
                   verbose = True)

In [43]:
# Loss is too big
reg.fit(np.array(x), np.array(x))

Iteration 1, loss = 314234.10897845
Iteration 2, loss = 281684.36462762
Iteration 3, loss = 280569.45757405
Iteration 4, loss = 279832.43158658




MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(7, 5, 2, 5, 7), learning_rate='constant',
       learning_rate_init=0.001, max_iter=20, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=1e-07,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [68]:
ser = xt.iloc[1]
example1 = np.array(ser)

x_reconst = reg.predict(example1)


array([0.0013167])

In [2]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras import optimizers
from keras.optimizers import Adam

In [19]:
autoencoder = Sequential()
autoencoder.add(Dense(7,  activation='elu', input_shape=(10,)))
autoencoder.add(Dense(2,    activation='linear', name="bottleneck"))
autoencoder.add(Dense(7,  activation='elu'))
autoencoder.add(Dense(10,  activation='sigmoid'))
autoencoder.compile(loss='mean_squared_error', optimizer = Adam())

In [33]:
trained_model = autoencoder.fit(np.array(x), np.array(x), batch_size=512, epochs=10, verbose=1, validation_data=(np.array(xt), np.array(xt)))
encoder = Model(autoencoder.input, autoencoder.get_layer('bottleneck').output)

Train on 1592003 samples, validate on 398001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [24]:
from keras.datasets import mnist

In [25]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
train_x = x_train.reshape(60000, 784) / 255
val_x = x_test.reshape(10000, 784) / 255