In [1]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.combine import SMOTEENN 
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources

### In this notebook I am going to explore using DNNClassifier, a premade neural network by tensorflow, to create a fraud risk identifier. 

In [6]:
# Read the transactions CSV into the DataFrame used throughout the notebook.
DATA_PATH = "../input/creditcard.csv"
df = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first rows of the loaded data.
df.head()

In [8]:
# Summary statistics of the loaded data, before any resampling or scaling.
df.describe()

### First we will use seaborn to visualize the data. For each feature I will plot the distribution of the fraudulent and the genuine transactions overlaid on the same figure, so the two classes can be compared directly.

In [9]:
# For each feature in columns 1-28, overlay the distribution of the fraud rows
# (red) on the distribution of the genuine rows (blue), one figure per feature.
is_fraud = df.Class == 1
is_genuine = df.Class == 0
for column in df.columns[1:29]:
    fig, ax = plt.subplots(figsize=(16, 4))
    sns.distplot(df.loc[is_fraud, column], bins=60, color='#FF2731', ax=ax)
    sns.distplot(df.loc[is_genuine, column], bins=60, color='#349EB8', ax=ax)
    ax.grid(True)
    ax.set_title('histogram of feature: ' + str(column))

    plt.show()
    
    

The next step is to address the class imbalance in the data and then create the training sets `X` and `y`. To handle the imbalance I have decided to use the SMOTE algorithm (Synthetic Minority Over-sampling Technique), through imblearn's `SMOTEENN`, which combines SMOTE with a nearest-neighbour cleaning step. The idea is to undersample the majority class (genuine transactions) and oversample the minority class (fraudulent transactions) by creating synthetic minority class members. These synthetic members are placed along the line segments joining a minority class sample to its minority class nearest neighbors. The algorithm is from Nitesh V. Chawla et al. (2002): take the difference between a minority class feature vector and one of its nearest neighbors, multiply this difference by a random number between 0 and 1, and add it to the minority class feature vector, i.e. $x_{new} = x_i + \lambda\,(x_{nn} - x_i)$ with $\lambda \in [0, 1]$.

In [10]:
# Separate the features (every column except 'Class') from the target, then hold
# out 20% of the rows as a test set.
# The split is stratified on the label: fraud is the minority class, so an
# unstratified random split can leave the train and test sets with different
# fraud ratios. The fixed random_state keeps the split reproducible.
X = df.loc[:, df.columns != 'Class']
y = df.Class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42, stratify=y)
type(X_test)


In [11]:
# Rebalance the training set with SMOTEENN from imblearn. Only the training
# split is resampled; X_test / y_test are left untouched.
# NOTE: this cell overwrites X_train / y_train, so re-running it on its own
# resamples data that was already resampled. Re-run the split cell first.
sme = SMOTEENN(random_state=33)

# `fit_sample` was renamed to `fit_resample` in later imbalanced-learn releases
# and eventually removed; use whichever method this installation provides.
resample = getattr(sme, "fit_resample", None) or sme.fit_sample
X_train, y_train = resample(X_train, y_train)

# The scaling cell below indexes the result positionally (X_train[:, 0]), so
# hand over plain NumPy arrays whatever container the sampler returned.
X_train, y_train = np.asarray(X_train), np.asarray(y_train)
print(type(X_train))
print(y_train.shape)

In [12]:
# Standardise every feature to mean 0 / std 1, which helps TensorFlow models.
#
# The scaler is fitted on the training data ONLY and the same transform is then
# applied to the test data. Scaling the test set with its own mean/std (as an
# independent preprocessing.scale call would) puts train and test in different
# feature spaces, because the training set has been resampled and the test set
# has not. It also leaks test-set statistics into the preprocessing.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(np.asarray(X_train, dtype=float))
X_test = scaler.transform(np.asarray(X_test, dtype=float))

# Feature names are every column except the label, selected by name so this
# stays in sync with how X was built instead of relying on a column position.
features_temp = df.columns[df.columns != 'Class'].values
X_train = pd.DataFrame(data = X_train, columns=features_temp)
X_test = pd.DataFrame(data = X_test, columns=features_temp)
# np.asarray lets this accept an ndarray, Series or single-column DataFrame.
y_train = pd.DataFrame(data = np.asarray(y_train), columns = ["Class"])
X_train.describe()

In [13]:
# Summary statistics of the scaled test features, for comparison with the
# training-set summary shown by the previous cell.
X_test.describe()

### Now I will create and train a model using DNNClassifier premade neural network from tensorflow

In [14]:
# One numeric feature column per column of the training DataFrame.
my_feature_columns = [
    tf.feature_column.numeric_column(key=name) for name in X_train.columns
]

In [15]:
# Build a DNN with 2 hidden layers of 10 units each.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # Binary problem: Class is either 0 (genuine) or 1 (fraud).
    n_classes=2)

In [16]:
# Parameters
# Number of steps passed to classifier.train.
train_steps = 500 
# Number of examples per batch produced by the training input function.
batch_size = 1000


In [17]:
def train_input_fn(features, labels, batch_size):
    """Build the training input pipeline for the estimator.

    Args:
        features: mapping-like object (e.g. a DataFrame) of feature name to
            values; it is converted with ``dict(features)``.
        labels: labels aligned with the rows of ``features``.
        batch_size: number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(features, labels)`` slices that is
        shuffled with a buffer of 500000, repeated, and batched.
    """
    feature_dict = dict(features)
    examples = tf.data.Dataset.from_tensor_slices((feature_dict, labels))

    # Order matters: shuffle the examples first, then repeat, then batch.
    examples = examples.shuffle(500000)
    examples = examples.repeat()
    return examples.batch(batch_size)

# Train on the resampled training set for `train_steps` steps. The lambda defers
# building the input pipeline until the estimator asks for it.
classifier.train(
    input_fn=lambda: train_input_fn(X_train, y_train, batch_size),
    steps=train_steps,
)

### We can see the model learned something, since the loss decreased during training. Let's find out what it learned.

In [18]:
# Batch size used for evaluation. It has its own name so the training
# `batch_size` is not overwritten: reusing that name here would make a re-run of
# the training cell silently train with 100-example batches.
eval_batch_size = 100


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction.

    Args:
        features: mapping-like object (e.g. a DataFrame) of feature name to
            values; it is converted with ``dict(features)``.
        labels: labels aligned with ``features``, or ``None`` to build a
            features-only dataset (prediction).
        batch_size: number of examples per batch; must not be ``None``.

    Returns:
        A batched ``tf.data.Dataset`` of ``features`` or of
        ``(features, labels)``. Unlike the training pipeline, it is neither
        shuffled nor repeated.

    Raises:
        AssertionError: if ``batch_size`` is ``None``.
    """
    # Validate up front, before any dataset is built.
    assert batch_size is not None, "batch_size must not be None"

    features = dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset and batch the examples.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    return dataset.batch(batch_size)


# Evaluate the model on the held-out test set.
eval_result = classifier.evaluate(
    input_fn=lambda: eval_input_fn(X_test, y_test, eval_batch_size))

In [19]:
# Display the metrics returned by classifier.evaluate.
eval_result

### Turns out our SMOTEENN resampling overvalued the minority class, causing our model to pick the minority class 100% of the time. It's necessary to revisit the data cleansing process.