In [14]:
# imports
import typing
import os
import math
import sys
import tensorflow as tf
from tensorflow import keras
import numpy as np
from numpy import genfromtxt
from numpy.lib import recfunctions as rfn
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, Math, Latex
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


In [15]:
# colab config
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/My Drive/cs470')

# Data Preprocessing

## Data Loading

The dataset was downloaded from Kaggle. It contains 32 Columns of 1,000,000 entries. Since it is 216.59 MB, it may take time to load.

_in this project, "label" will refer to column categories, and "data_actual" will refer to the actual value of the data_

In [16]:
data = pd.read_csv("Base.csv") # load csv from drive
classification = data.pop("fraud_bool") # remove classification label from dataset
labels = data.keys() # get column labels

# Data Analysis
## Balance check
Our dataset is extremely imbalanced. This is due to the fact that fraudulence overwhelmingly rare in the dataset.

In [17]:
Latex(f'In the complete data set, {classification.sum()} are fraud ({100*classification.sum()/len(classification):.3f}%)')

<IPython.core.display.Latex object>

# One-hot vectorization

Machine learning algorithms have a hard time understanding strings. To confront this, we replace a column of multiple unique string values with multiple columns for each unique category. These columns contain a boolean to indicate which category it was. This is alternatively known as a _One-hot_.

In [18]:
# one-hotify labels
labels_categorical = ["payment_type", "employment_status", "housing_status", "source", "device_os"] # list that contains columns to be binarized
labels_vectorized = [] # list that contains all newly created binarized columns
for label in labels_categorical:
    label_index = data.columns.get_loc(label)
    column_binarized = pd.get_dummies(data[label])
    for label_binarized in column_binarized:
        # new label joins the category with the original column name
        label_binarized_new = label + "_" + label_binarized
        data.insert(
            label_index,
            label_binarized_new,
            column_binarized[label_binarized])
        labels_vectorized.append(label_binarized_new)
    del data[label]
# prove that removal occurred and that new columns were added
assert("payment_type" not in data.columns and "device_os" not in data.columns and "device_os_windows" in data.columns)

# Minimum-Maximum Normalization

The backpropagation algorithm will eventually attempt to normalize the range of values within a column to become from 0 to 1. We can save time by preprocessing the data beforehand, shaving seconds if not minutes off of training.

In [19]:
# minimum-maximum normalization
for label in labels:
    if label not in labels_categorical: # check only the ones that were not categorical (this implies columns of numbers, not strings)
        scaler = MinMaxScaler() # scales data between 0 and 1
        data[label] = scaler.fit_transform(data[[label]])

# Data Partitioning

Finally, the data will be partitioned into both training and evaluation subsets.

In [20]:
# data partitioning
training_ratio = 0.85 # this percent will be used for training
training_index = math.floor(len(data)*training_ratio)
data_training = (data[:training_index]) # training data
labels_training = pd.DataFrame(classification[:training_index]) # training data's actual value
print(f"Training data: {len(data_training)}")
display(data_training[3:10])
data_evaluation = (data[training_index:]) # evaluation data
labels_evaluation  = pd.DataFrame(classification[training_index:]) # evaluation data's actual value

Training data: 850000


Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type_AE,payment_type_AD,payment_type_AC,...,session_length_in_minutes,device_os_x11,device_os_windows,device_os_other,device_os_macintosh,device_os_linux,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
3,1.0,0.15951,0.0,0.053613,0.5,0.000243,0.111493,0,0,0,...,0.035074,0,0,0,0,1,0.0,0.666667,0.0,1.0
4,1.0,0.596414,0.0,0.51049,0.5,5.7e-05,0.114854,0,0,0,...,0.054951,0,0,0,1,0,1.0,0.666667,0.0,1.0
5,0.25,0.14392,0.0,0.072261,0.25,0.00036,0.115048,0,1,0,...,0.066917,0,1,0,0,0,0.0,0.666667,0.0,1.0
6,0.75,0.321554,0.0,0.356643,0.25,0.000391,0.118701,0,0,0,...,0.029448,0,1,0,0,0,1.0,0.666667,0.0,1.0
7,1.0,0.064816,0.0,0.044289,0.5,0.00044,0.111021,0,0,0,...,0.041859,0,0,0,0,1,0.0,0.666667,0.0,1.0
8,0.75,0.065937,0.0,0.151515,0.375,0.000264,0.109649,0,0,0,...,0.036541,0,1,0,0,0,1.0,0.666667,0.0,1.0
9,1.0,0.700096,0.0,0.142191,0.375,0.000214,0.112542,0,0,0,...,0.288158,0,1,0,0,0,0.0,0.666667,0.0,1.0


income                              1.000000
name_email_similarity               0.159510
prev_address_months_count           0.000000
current_address_months_count        0.053613
customer_age                        0.500000
days_since_request                  0.000243
intended_balcon_amount              0.111493
payment_type_AE                     0.000000
payment_type_AD                     0.000000
payment_type_AC                     0.000000
payment_type_AB                     1.000000
payment_type_AA                     0.000000
zip_count_4w                        0.120764
velocity_6h                         0.214831
velocity_24h                        0.335657
velocity_4w                         0.047137
bank_branch_count_8w                0.805451
date_of_birth_distinct_emails_4w    0.153846
employment_status_CG                0.000000
employment_status_CF                0.000000
employment_status_CE                0.000000
employment_status_CD                0.000000
employment

## Class Imbalance
Calculate percentage of training dataset that is true and false respectively.

In [21]:
weight_for_1 = labels_training.sum() / len(labels_training)
weight_for_0 = 1 - weight_for_1
class_weight = {0: weight_for_0, 1: weight_for_1}
print("weight for 0:",weight_for_0)
print("weight for 1:",weight_for_1)

weight for 0: fraud_bool    0.988733
dtype: float64
weight for 1: fraud_bool    0.011267
dtype: float64


# Fraud Distribution

In [23]:
# plots two features' distributions on a single plot
def distribution(label_0, label_1):
    print(labels_training.shape)
    fraud = pd.DataFrame(columns=data_training.columns)
    fraud_not = pd.DataFrame(columns=data_training.columns)
    for row in data_training:
        if labels_training[row]:
            fraud.append(row)
        else:
            fraud_not.append(row)

    sns.jointplot(x=fraud[label_0], y=fraud[label_1],
                kind='hex', xlim=(-5,5), ylim=(-5,5))
    plt.suptitle("Fraudulent distribution")

    sns.jointplot(x=fraud_not[label_0], y=fraud_not[label_1],
                kind='hex', xlim=(-5,5), ylim=(-5,5))
    _ = plt.suptitle("Non-fraudulent distribution")

distribution("income", "name_email_similarity")

# Model Design

We will be using a simple Multilayer Neural Network for this project. This is because our data is simply just numbers, which have no spatial significance (like an image), so we will not be using a Convolutional Neural Network.

In [24]:
# model design

model = keras.Sequential(
    [
        keras.layers.Dense(256, activation="relu", input_shape=(data_training.shape[-1],)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

# Model Compilation

In [25]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.TruePositives(), tf.keras.metrics.AUC(curve="PR", num_thresholds=50)])
model.build()
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 256)               13568     
                                                                 
 dense_5 (Dense)             (None, 256)               65792     
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 256)               65792     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 1)                 257       
                                                                 
Total params: 145,409
Trainable params: 145,409
Non-tr

# Model Training

In [26]:
history = model.fit(
    x=data_training,
    y=labels_training,
    class_weight=class_weight,
    epochs=3)

Epoch 1/3


2022-12-12 23:39:32.525731: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 5406/26563 [=====>........................] - ETA: 5:10 - loss: 0.0015 - binary_accuracy: 0.9886 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - false_positives_1: 1.0000 - false_negatives_1: 1977.0000 - true_negatives_1: 171014.0000 - true_positives_1: 0.0000e+00 - auc_1: 0.0115

In [None]:
history = model.evaluate(data_evaluation, labels_evaluation)

In [None]:
display(model.predict(np.array(data_evaluation.iloc[5])))

In [None]:
print(data_evaluation.shape)