In [30]:
# imports
import typing
import os
import math
import tensorflow as tf
from tensorflow import keras
import numpy as np
from numpy import genfromtxt
from numpy.lib import recfunctions as rfn
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, Math, Latex
print(tf.__version__)

2.11.0


In [16]:
# colab config
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/My Drive/cs470')

# Data Preprocessing

## Data Loading

The dataset was downloaded from Kaggle. It contains 32 columns and 1,000,000 rows. Because the file is 216.59 MB, it may take a while to load.

_in this project, "label" will refer to column categories, and "data_actual" will refer to the actual value of the data_

In [17]:
# Read the full CSV into memory (path is relative to the working directory).
data = pd.read_csv("Base.csv")
# Split the fraud target off the feature frame; pop() also removes the column from `data`.
classification = data.pop("fraud_bool")
# Names of the remaining (feature) columns.
labels = data.columns

# Data Analysis
## Balance check
Our dataset is extremely imbalanced. This is due to the fact that fraud is overwhelmingly rare in the dataset.

In [33]:
# Count the fraud rows once, then report the count and its share of all rows.
fraud_count = classification.sum()
fraud_percent = 100 * fraud_count / len(classification)
Latex(f'In the complete data set, {fraud_count} are fraud ({fraud_percent:.3f}%)')


<IPython.core.display.Latex object>

# One-hot vectorization

Machine learning algorithms cannot operate directly on strings. To address this, we replace each categorical column with one new column per unique category. Each new column holds a boolean-style indicator that marks whether the row belongs to that category. This technique is known as _one-hot encoding_.

In [18]:
# one-hotify labels
# Each categorical column is replaced, at its original position, by one
# indicator column per category, named "<column>_<category>".
labels_categorical = ["payment_type", "employment_status", "housing_status", "source", "device_os"] # list that contains columns to be binarized
labels_vectorized = [] # list that contains all newly created binarized columns
for label in labels_categorical:
    label_index = data.columns.get_loc(label)
    # Pin the indicator dtype. pandas >= 2.0 defaults to bool, and a frame that
    # mixes bool with float columns converts to an object array, which
    # TensorFlow cannot turn into a tensor. uint8 is what older pandas
    # versions produced by default, so results there are unchanged.
    column_binarized = pd.get_dummies(data[label], dtype="uint8")
    for label_binarized in column_binarized:
        # new label joins the category with the original column name
        label_binarized_new = f"{label}_{label_binarized}"
        # Every indicator is inserted at the same index, so the indicators of
        # one column end up in reverse category order; order is irrelevant
        # to the model.
        data.insert(
            label_index,
            label_binarized_new,
            column_binarized[label_binarized])
        labels_vectorized.append(label_binarized_new)
    del data[label]
# prove that removal occurred and that new columns were added
assert not set(labels_categorical) & set(data.columns), "categorical columns were not removed"
assert set(labels_vectorized) <= set(data.columns), "one-hot columns are missing"
assert "device_os_windows" in data.columns, "expected one-hot column device_os_windows"

# Minimum-Maximum Normalization

Neural networks train faster and more stably when every input feature lies in a similar range; backpropagation does not rescale the inputs for us. We therefore rescale each numeric column to the range 0 to 1 beforehand, which can shave seconds, if not minutes, off of training.

In [19]:
# minimum-maximum normalization
# Rescale every originally numeric column to the [0, 1] range, one column at a
# time. The categorical columns were already replaced by one-hot indicators
# and are skipped.
scaler = MinMaxScaler()
labels_numeric = [name for name in labels if name not in labels_categorical]
for label in labels_numeric:
    # fit_transform expects a 2-D input, hence the single-column frame
    data[label] = scaler.fit_transform(data[[label]])

# Data Partitioning

Finally, the data will be partitioned into both training and evaluation subsets.

In [20]:
# data partitioning
# Sequential (unshuffled) split: the leading `training_ratio` share of the rows
# is used for training and the remainder is held out for evaluation.
training_ratio = 0.85
training_index = math.floor(len(data) * training_ratio)

# feature rows
data_training = data.iloc[:training_index]
data_evaluation = data.iloc[training_index:]

# matching fraud values, wrapped as single-column frames
labels_training = classification.iloc[:training_index].to_frame()
labels_evaluation = classification.iloc[training_index:].to_frame()

# Model Design

We will be using a simple multilayer neural network (a multilayer perceptron) for this project. Our data is tabular: the columns are plain numbers with no spatial relationship to one another (unlike the pixels of an image), so a Convolutional Neural Network would offer no benefit.

In [54]:
# model design
# Fully connected binary classifier: three 256-unit ReLU hidden layers, with
# 30% dropout after the second and third, feeding a single sigmoid output.
feature_count = data_training.shape[-1]  # one input per feature column

model = keras.Sequential()
model.add(keras.layers.Dense(256, activation="relu", input_shape=(feature_count,)))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

# Model Compilation

In [55]:
# Metrics reported alongside the loss: accuracy, precision and recall, plus
# the four raw confusion-matrix counts.
tracked_metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.FalsePositives(),
    tf.keras.metrics.FalseNegatives(),
    tf.keras.metrics.TrueNegatives(),
    tf.keras.metrics.TruePositives(),
]
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=tracked_metrics,
)
model.build()
# Print the layer-by-layer parameter summary.
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 256)               13568     
                                                                 
 dense_46 (Dense)            (None, 256)               65792     
                                                                 
 dropout_14 (Dropout)        (None, 256)               0         
                                                                 
 dense_47 (Dense)            (None, 256)               65792     
                                                                 
 dropout_15 (Dropout)        (None, 256)               0         
                                                                 
 dense_48 (Dense)            (None, 1)                 257       
                                                                 
Total params: 145,409
Trainable params: 145,409
Non-t

# Model Training

In [56]:
# Train on the training partition for three passes over the data; fit()
# returns a History object holding the per-epoch loss and metric values.
history = model.fit(data_training, labels_training, epochs=3)

Epoch 1/3


2022-12-12 17:14:08.729965: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 353600000 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


In [59]:
# Score the trained model on the held-out partition; evaluate() returns the
# loss followed by the compiled metric values.
# NOTE(review): this rebinds `history`, replacing the History object returned by fit().
history = model.evaluate(x=data_evaluation, y=labels_evaluation)



In [None]:
# Predict the fraud probability for a single held-out row (position 5).
# The one-row float batch is built from `data_evaluation`, the evaluation
# partition created in the data-partitioning cell, so this cell also runs
# after Restart Kernel -> Run All.
model.predict(data_evaluation.iloc[[5]].to_numpy(dtype="float32"))

2022-12-11 16:07:14.649445: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




array([[0.00150064]], dtype=float32)