In [77]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available in the input directory.
input_dir_listing = os.listdir("../input")
print(input_dir_listing)

# Any results you write to the current directory are saved as output.

In [78]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
# NOTE(review): the shuffle() calls further down pass their own random_state=42, and
# Keras weight initialisation / dropout presumably draw from the backend's own RNG,
# which this seed may not control — confirm before relying on exact reproducibility.
seed = 7
np.random.seed(seed)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Read the CSV file into a DataFrame.

In [79]:
# Load the transactions table and preview its first rows.
csv_path = '../input/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()

Count the rows in each class. Class 1 (fraud detected) is very rare.

In [80]:
# Split the rows by label and measure how imbalanced the two classes are.
# Nothing is removed in this cell; the undersampling itself happens in later cells.
df_major = df[df['Class']==0]
df_minor = df[df['Class']==1]
# Count the label-filtered frames directly. Unpacking df.Class.value_counts() only
# works when exactly two labels are present and relies on frequency ordering to
# decide which number is "major"; len() ties each count to its label explicitly.
major_count = len(df_major)
minor_count = len(df_minor)
print(major_count)
print(minor_count)
print('ratio imbalance dataset:',major_count/minor_count)

This is a highly imbalanced data set, so I need to rebalance it before training the model.
First, I randomly split the minority class into 80% for training and 20% held out for evaluation (dev).

In [81]:
# Randomise the row order inside each class (fixed random_state for repeatability).
df_major, df_minor = (
    shuffle(frame, random_state=42) for frame in (df_major, df_minor)
)

# Minority class: the first 80% of the shuffled rows go to training,
# the remaining rows are held out as the dev portion.
perc = 0.8
minor_data_train = int(perc * minor_count)
df_minor_train = df_minor.iloc[:minor_data_train]
df_minor_dev = df_minor.iloc[minor_data_train:]

In [86]:
# Undersample the majority class so the training set holds `ratio_imb` majority
# rows for every minority training row.
ratio_imb = 2.0
major_data_train = int(ratio_imb*minor_data_train)
# The dev slice ends at ratio_imb * major_data_train (positional index into the
# shuffled majority frame). Clamp it so it can never fall before the end of the
# training slice: with ratio_imb < 1 the unclamped index would make the dev slice
# empty and let the test slice overlap the training rows.
major_dev_end = max(major_data_train, int(ratio_imb*major_data_train))
df_major_train = df_major.iloc[:major_data_train]
df_major_dev = df_major.iloc[major_data_train:major_dev_end]
# Everything after the dev slice becomes the (large) majority test portion.
df_major_test = df_major.iloc[major_dev_end:]

Now the majority-to-minority ratio is 2.0 in the training set.

In [87]:
# Sanity check: majority/minority ratio actually present in the training portions.
major_c = df_major_train.Class.value_counts()
minor_c = df_minor_train.Class.value_counts()
# Each value_counts() result is a Series; calling int() on a Series is deprecated
# in recent pandas, so reduce it to a scalar with .sum() (the total row count).
print('ratio imbalance dataset:',int(major_c.sum())/int(minor_c.sum()))

In [88]:
# Assemble each split from its majority and minority parts, then shuffle the
# result so the two classes are interleaved instead of stacked one after the other.
# NOTE(review): df_minor_dev supplies the fraud rows for BOTH df_dev and df_test,
# so the two evaluation sets share the same positive examples.
df_train = shuffle(pd.concat([df_major_train, df_minor_train]), random_state=42)
df_dev = shuffle(pd.concat([df_major_dev, df_minor_dev]), random_state=42)
df_test = shuffle(pd.concat([df_major_test, df_minor_dev]), random_state=42)

As all inputs are PCA components, I do not drop any of them; the only columns I exclude from the features are Time and Amount.

In [90]:
feature_train = df_train.drop(['Time', 'Amount'], axis=1)
target_train = df_train['Class']
feature_dev = df_dev.drop(['Time', 'Amount'], axis=1)
target_dev = df_dev['Class']

Normalize features by using standard method

In [91]:
scalar = StandardScaler()
scalar.fit(feature_train)
X_train = scalar.transform(feature_train)
y_train = target_train
X_dev = scalar.transform(feature_dev)
y_dev = target_dev

In [92]:
X_train.shape

In [93]:
y_train.shape

I use Deep Learning with three layers NN and dropout for deruce overfitting

In [94]:
# create model three layers
model = Sequential()
model.add(Dense(100, input_dim=29, kernel_initializer='uniform',  activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, kernel_initializer='uniform',  activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,kernel_initializer='uniform', activation='sigmoid'))

# compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit model
train_history = model.fit(X_train, y_train, epochs=50, batch_size=128, verbose=0)
plt.plot(train_history.history['acc'])

Look like overfit, so I test dev set

In [95]:
X_dev.shape

In [96]:
y_dev.shape

In [102]:
prediction = model.predict_classes(X_dev)
print(classification_report(y_dev, prediction))
print(confusion_matrix(y_dev, prediction))

It is a good prediction because there is no false fruad detection (class1 predict as 0)

**Expand to use larger data set**

In [99]:
feature_test = df_test.drop(['Time', 'Amount'], axis=1)
target_test = df_test['Class']
X_test = scalar.transform(feature_test)
y_test = target_test

In [100]:
X_test.shape

In [101]:
y_test.shape

In [103]:
# Sequential.predict_classes was removed from newer Keras releases; thresholding the
# sigmoid output at 0.5 produces the same 0/1 labels on old and new versions.
prediction_test = (model.predict(X_test) > 0.5).astype('int32')
print(classification_report(y_test, prediction_test))
print(confusion_matrix(y_test, prediction_test))

Conclusion
1. Undersampling is a good strategy for handling an imbalanced dataset.
2. In this case, a model trained on a small data set can be applied to a larger data set efficiently.
3. It is a good predictor because there is no missed fraud (no class 1 predicted as 0).
4. The test set has 99 transactions from fraudulent credit cards, and the predictor detects all of them.
5. The predictor flags some transactions from normal credit cards as fraud; this is OK (a human can check them later).


Thank You!