In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score

In [10]:
# read in data 
df = pd.read_csv('../../../data/model_data_2.csv')
df.head()

Unnamed: 0,trans_date_trans_time,amt,lat,long,city_pop,dob,unix_time,merch_lat,merch_long,is_fraud,...,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,F,M
0,0.573869,4.97,36.0788,-81.1781,3495,573868800.0,1325376018,36.011293,-82.048315,0,...,0,0,1,0,0,0,0,0,1,0
1,0.267235,107.23,48.8878,-118.2105,149,267235200.0,1325376044,49.159047,-118.186462,0,...,0,0,0,0,0,0,0,0,1,0
2,-0.250906,220.11,42.1808,-112.262,4154,-250905600.0,1325376051,43.150704,-112.154481,0,...,0,0,0,0,0,0,0,0,0,1
3,-0.093744,45.0,46.2306,-112.1138,1939,-93744000.0,1325376076,47.034331,-112.561071,0,...,0,0,0,0,0,0,0,0,0,1
4,0.512352,41.96,38.4207,-79.4629,99,512352000.0,1325376186,38.674999,-78.632459,0,...,0,0,0,1,0,0,0,0,0,1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 26 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  float64
 1   amt                    1296675 non-null  float64
 2   lat                    1296675 non-null  float64
 3   long                   1296675 non-null  float64
 4   city_pop               1296675 non-null  int64  
 5   dob                    1296675 non-null  float64
 6   unix_time              1296675 non-null  int64  
 7   merch_lat              1296675 non-null  float64
 8   merch_long             1296675 non-null  float64
 9   is_fraud               1296675 non-null  int64  
 10  entertainment          1296675 non-null  int64  
 11  food_dining            1296675 non-null  int64  
 12  gas_transport          1296675 non-null  int64  
 13  grocery_net            1296675 non-null  int64  
 14  grocery_pos       

In [12]:
# split into label (isFraud) and data, drop label from data 
y = df['is_fraud'].to_numpy()

df.drop(columns=['is_fraud'], inplace=True)
X = df.to_numpy()

In [13]:
# split data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
# rescale x_train to prepare for model
scaler = MinMaxScaler(feature_range=(0,1))
x_train = scaler.fit_transform(x_train)

In [15]:
# develop model
reg = tf.keras.regularizers.l2(0.001)

model = Sequential(
    [
        Dense(32, activation='relu', input_shape=(25,)),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ], name = 'model'
)

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

# apply class weights to balance data
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.array([0,1]), y=y_train)

# train model
history = model.fit(x_train, y_train, batch_size=32, epochs=11, class_weight={0:class_weights[0], 1:class_weights[1]})

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [16]:
# test model
y_pred = model.predict(x_test)

# round predictions to binary classification
rounded_pred = np.round(y_pred)

#compute accuracy
accuracy = accuracy_score(y_test, rounded_pred)
accuracy



0.9944858966202017