# Cross Validation for Tuned Neural Network Model

### 0. Read Data from Previous Notebook

In [1]:
import pandas as pd

# Load the dataset written out by the previous notebook and show a preview.
DATA_PATH = 'data-stage2.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,category,amt,is_fraud,hour,trans_count_7d,trans_count_30d,time_diff
0,misc_net,4.97,0,1,0.0,0.0,0.000000
1,grocery_pos,107.23,0,1,0.0,0.0,0.000000
2,entertainment,220.11,0,1,0.0,0.0,0.000000
3,gas_transport,45.00,0,1,0.0,0.0,0.000000
4,misc_pos,41.96,0,1,0.0,0.0,0.000000
...,...,...,...,...,...,...,...
1852389,health_fitness,43.77,0,1,39.0,167.0,4.619444
1852390,kids_pets,111.84,0,1,62.0,272.0,2.706389
1852391,kids_pets,86.88,0,1,67.0,277.0,0.201111
1852392,travel,7.99,0,1,36.0,192.0,3.340278


### 1. Scaling and Encoding

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn import set_config

# Columns are routed to a transformer by dtype:
#   * float64 columns -> min-max scaling
#   * object columns  -> one-hot encoding
# Any column matched by neither selector is passed through unchanged.
float_columns = make_column_selector(dtype_include="float64")
object_columns = make_column_selector(dtype_include="object")

preprocessor = ColumnTransformer(
    [
        ("num", MinMaxScaler(), float_columns),
        ("cat", OneHotEncoder(), object_columns),
    ],
    remainder="passthrough",
)

# Render estimators as an HTML diagram, then display the pipeline step.
set_config(display="diagram")

preprocessor

In [3]:
# Split the frame into the feature matrix and the target column.
# `y` is kept as a single-column DataFrame (double brackets), not a Series.
# NOTE(review): the preprocessor is fit on the full dataset here, before any
# train/test fold is created, so held-out rows contribute to the scaling
# range. Consider fitting it on the training rows of each fold instead.
feature_frame = df.drop(columns=['is_fraud'])
X = preprocessor.fit_transform(feature_frame)
y = df[['is_fraud']]

### 2. Create Neural Network Model with the Best Set of Hyper-parameters

Please refer to `hyperparameter_tuning_v2.ipynb` for how these hyper-parameters were obtained with Keras Tuner.

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
# Feed-forward binary classifier: 19 input features, four ReLU hidden layers
# of decreasing width, and a single sigmoid output unit.

# `shape` must be a tuple. The original `shape=(19),` passed the bare int 19
# (the comma belonged to the argument list, not to the tuple).
model = keras.models.Sequential([keras.Input(shape=(19,))])

# Hidden-layer widths, each a fraction of the previous layer's width.
# np.ceil returns a float, while Dense expects an integer unit count, so each
# result is cast to int (giving 18, 15, 9 and 7 units).
layer1_num_units = int(np.ceil(.9*19))
layer2_num_units = int(np.ceil(.8*layer1_num_units))
layer3_num_units = int(np.ceil(.6*layer2_num_units))
layer4_num_units = int(np.ceil(.7*layer3_num_units))

# Create hidden layers
for num_units in (
    layer1_num_units,
    layer2_num_units,
    layer3_num_units,
    layer4_num_units,
):
    model.add(
        layers.Dense(
            units=num_units,
            activation="relu",
        )
    )

# Single output layer: sigmoid yields a value in (0, 1) for the binary target
model.add(
    layers.Dense(
        units=1,
        activation="sigmoid"
    )
)

# Compile the model with SGD at learning rate 1e-2 and binary cross-entropy.
# Accuracy, precision and recall are tracked as metrics.
model.compile(
    optimizer=keras.optimizers.SGD(1e-2),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        keras.metrics.BinaryAccuracy(name="accuracy"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall")
    ]
)


### 3. 5-fold Cross Validation

In [9]:
from imblearn.over_sampling import SMOTE

# Oversampler that is fit only on the training portion of each fold, so the
# held-out rows stay untouched.
# random_state pins the synthetic samples so a re-run gives the same data.
# NOTE(review): `n_jobs` is deprecated in recent imbalanced-learn releases;
# confirm against the installed version.
smote = SMOTE(n_jobs=-1, random_state=42)

In [None]:
# k-fold Cross Validation
#
# For each of the 5 folds: oversample the training rows with SMOTE, train a
# freshly initialised copy of the network, and evaluate it on the held-out
# rows. Per-fold metrics are printed, followed by their mean.

from sklearn.model_selection import KFold

# Fixed seed so the fold assignment is the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_results = []
for i, (train_i, test_i) in enumerate(kf.split(X), start=1):
    # With shuffle=True the fold indices are not contiguous, so rows must be
    # selected with the index arrays themselves. Slicing from the first to the
    # last index would cover almost the whole dataset for both sets, making
    # the training and test rows overlap.
    x_train, y_train = X[train_i], y.iloc[train_i]
    x_test, y_test = X[test_i], y.iloc[test_i]

    # Oversample the training rows only; the test rows keep their original
    # class balance.
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Train a new copy of the architecture in every fold. Re-fitting the same
    # `model` object would carry weights learned in earlier folds (whose
    # training rows include this fold's test rows) into this evaluation.
    # clone_model creates new layers with newly initialised weights; the
    # compile settings mirror those used when `model` was compiled.
    fold_model = keras.models.clone_model(model)
    fold_model.compile(
        optimizer=keras.optimizers.SGD(1e-2),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[
            keras.metrics.BinaryAccuracy(name="accuracy"),
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall")
        ]
    )

    # `use_multiprocessing` only applies to generator / Sequence inputs, so it
    # is not passed for in-memory arrays.
    history = fold_model.fit(
        x_train_smote,
        y_train_smote,
        epochs=50,
        validation_data=(x_test, y_test)
    )

    print("\n=====", i, "=====")
    print("Evaluate on test data")
    results = fold_model.evaluate(x_test, y_test)
    print("loss, accuracy, precision, recall:", results, "\n\n")
    fold_results.append(results)

# Cross-validated estimate: average of each metric over the folds.
print(
    "Mean over folds (loss, accuracy, precision, recall):",
    np.mean(fold_results, axis=0)
)