In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data-stage2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,category,amt,is_fraud,hour,trans_count_7d,trans_count_30d,time_diff
0,0,misc_net,4.97,0,1,0.0,0.0,0.0
1,1,grocery_pos,107.23,0,1,0.0,0.0,0.0
2,2,entertainment,220.11,0,1,0.0,0.0,0.0
3,3,gas_transport,45.0,0,1,0.0,0.0,0.0
4,4,misc_pos,41.96,0,1,0.0,0.0,0.0


In [2]:
df = df.drop(columns = ['Unnamed: 0'])
df = pd.get_dummies(df, columns = ['category'])
df.head()


Unnamed: 0,amt,is_fraud,hour,trans_count_7d,trans_count_30d,time_diff,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,107.23,0,1,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,220.11,0,1,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,45.0,0,1,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,41.96,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [3]:
df_y_removed = df.drop(columns = ['is_fraud'])

# standardization
scaler = StandardScaler()
scaled = scaler.fit_transform(df_y_removed)
df_standardized = pd.DataFrame(scaled, columns = ['amt', 'hour', 'trans_count_7d', 'trans_count_30d', 'time_diff', 'category_entertainment', 'category_food_dining', 'category_gas_transport','category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel']) 
# add is_fraud column back into data set
df_standardized['is_fraud'] = df['is_fraud']

## Create Model

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [6]:
# import libraries
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from numpy import array 
from numpy import argmax 
from math import sqrt
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [7]:
model = keras.models.Sequential([keras.Input(shape=(19),)])

layer1_num_units = np.ceil(.9*19)
layer2_num_units = np.ceil(.8*layer1_num_units)
layer3_num_units = np.ceil(.6*layer2_num_units)
layer4_num_units = np.ceil(.7*layer3_num_units)

# Create hidden layers
model.add(
    layers.Dense(
        units=layer1_num_units,
        activation="relu",
    )
)

model.add(
   layers.Dense(
        units=layer2_num_units,
        activation="relu",
    )
)

model.add(
   layers.Dense(
        units=layer3_num_units,
        activation="relu",
    )
)

model.add(
   layers.Dense(
        units=layer4_num_units,
        activation="relu",
    )
)

# Single output layer
model.add(
    layers.Dense(
        units=1, 
        activation="sigmoid"
    )
)

# Compile model, tune learning rate
model.compile(
    optimizer=keras.optimizers.SGD(1e-2),
    loss=keras.losses.BinaryCrossentropy(), 
    metrics=[
        keras.metrics.BinaryAccuracy(name="accuracy"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall")
    ]
)


In [21]:
# oversampling (using training set)
# can lead to overfitting
smote = SMOTE()

In [22]:
# k-fold Cross Validation

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle = True)

# get X and Y
X, Y = df_standardized.drop(columns = ['is_fraud']), df_standardized['is_fraud']

i = 0
for train_i, test_i in kf.split(X):
    start_train, stop_train = train_i[0], train_i[-1]+1
    start_test, stop_test = test_i[0], test_i[-1]+1
    
    x_train, y_train = X[start_train:stop_train], Y[start_train:stop_train],
    x_test, y_test = X[start_test:stop_test], Y[start_test:stop_test]
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
    
    history = model.fit(
    x_train_smote,
    y_train_smote,
    epochs=10,
    validation_data=(x_test, y_test)
    )
    i += 1
    print("\n=====", i, "=====")
    print("Evaluate on test data")
    results = model.evaluate(x_test, y_test)
    print("loss, accuracy, precision, recall:", results, "\n\n")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

===== 1 =====
Evaluate on test data
loss, accuracy, precision, recall: [0.06374754756689072, 0.9743729829788208, 0.1659836769104004, 0.9736815094947815] 


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

===== 2 =====
Evaluate on test data
loss, accuracy, precision, recall: [0.05099762976169586, 0.9806762933731079, 0.20796658098697662, 0.964563250541687] 


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

===== 3 =====
Evaluate on test data
loss, accuracy, precision, recall: [0.05894952267408371, 0.9756789803504944, 0.17368139326572418, 0.9761682748794556] 


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

===== 4 =====
Evaluate on test data
loss, accuracy, precision, recall: [0.05770333111286163, 0.9763899445533752, 0.1777718961238861, 0.9741995930671692] 


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

===== 5 =====
Evaluate on test data
loss, accuracy, precision, recall: [0.05816253647208214, 0.9763171076774597, 0.17739228904247284, 0.9748212695121765] 


