In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras

print(f'pd=={pd.__version__}')
print(f'np=={np.__version__}')
print(f'sns=={sns.__version__}')

pd==2.2.2
np==1.26.4
sns==0.13.2


In [2]:
with open('../../../config.yaml', 'r') as f:
    config = yaml.safe_load(f)
type(config)

dict

In [3]:
TARGET_FEATURE = config['data']['target_feature']
INDEX_COLUMN = config['data']['index_column']
RANDOM_STATE = config['model']['random_state']
TARGET_FEATURE, INDEX_COLUMN, RANDOM_STATE

('isFraud', 'TransactionID', 42)

In [4]:
path_data = config['data']['path']
filename_train_pp = config['data']['train']['transaction']['pp']

dataset_pp = pd.read_csv(path_data+filename_train_pp, index_col=INDEX_COLUMN) # make sure files are not offloaded to iCloud
dataset_pp.shape

(590540, 274)

In [5]:
N_FEATURES = dataset_pp.shape[0]

In [6]:
dataset_pp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 2987000 to 3577539
Columns: 274 entries, isFraud to P_domain_risk_group
dtypes: float64(266), int64(8)
memory usage: 1.2 GB


In [7]:
# Ensure chronological order of transactions
dataset_pp = dataset_pp.sort_values(by='TransactionDT', ascending=True)
dataset_pp['TransactionDT']

TransactionID
2987000       86400
2987001       86401
2987002       86469
2987003       86499
2987004       86506
             ...   
3577535    15811047
3577536    15811049
3577537    15811079
3577538    15811088
3577539    15811131
Name: TransactionDT, Length: 590540, dtype: int64

In [8]:
# Data split: train, val, test

# Split data into Train, Val, Test : 70, 20, 10
# Split into chronological chunks - better representation of real life inference

#     train       val   test
# X X X X X X X | X X | X  Datapoints
# ------------------------> t

m = dataset_pp.shape[0]

y: pd.Series = dataset_pp[TARGET_FEATURE]
X: pd.DataFrame = dataset_pp.drop(columns=TARGET_FEATURE)

X_train = X.iloc[:int(m*0.7)]
y_train = y.iloc[:int(m*0.7)]
X_val = X.iloc[int(m*0.7):int(0.9*m)]
y_val = y.iloc[int(m*0.7):int(0.9*m)]
X_test = X.iloc[int(0.9*m):]
y_test = y.iloc[int(0.9*m):]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((413378, 273), (413378,), (118108, 273), (118108,), (59054, 273), (59054,))

In [9]:
# Scalers with default settings, which one yields the best result?
scaler_mm = MinMaxScaler(feature_range=(0, 1))
scaler_rb = RobustScaler(with_centering=True, with_scaling=True)
scaler_st = StandardScaler(with_mean=True, with_std=True)
scaler_st

In [13]:
# Data scaling
scaler_st.fit(X=X_train)
X_train_sc = pd.DataFrame(data=scaler_st.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val_sc = pd.DataFrame(data=scaler_st.transform(X_val), index=X_val.index, columns=X_val.columns)
X_test_sc = pd.DataFrame(data=scaler_st.transform(X_test), index=X_test.index, columns=X_test.columns)

X_train_sc.shape, X_val_sc.shape, X_test_sc.shape

((413378, 273), (118108, 273), (59054, 273))

In [10]:
# DL Models
# DNN
# CNN
# DNN/CNN + GMB features
# DNN/CNN + Autoencoders

In [63]:
# Dense Neural Network
estimator_dnn = keras.Sequential(layers=[
    keras.layers.Dense(units=64, activation='relu'),
    keras.layers.Dense(units=64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units=32, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

estimator_dnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['f1_score', 'AUC'] # keras.metrics.AUC
)

In [64]:
history = estimator_dnn.fit(
    x=X_train_sc, y=y_train, validation_data=(X_val_sc, y_val),
    epochs=8, batch_size=32,
    # class_weight={0:1, 1:8},
)

Epoch 1/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 674us/step - AUC: 0.7990 - f1_score: 0.0675 - loss: 0.1271 - val_AUC: 0.8386 - val_f1_score: 0.0641 - val_loss: 0.1216
Epoch 2/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 635us/step - AUC: 0.8628 - f1_score: 0.0670 - loss: 0.1019 - val_AUC: 0.8389 - val_f1_score: 0.0641 - val_loss: 0.1128
Epoch 3/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 674us/step - AUC: 0.8725 - f1_score: 0.0681 - loss: 0.0980 - val_AUC: 0.8397 - val_f1_score: 0.0641 - val_loss: 0.1105
Epoch 4/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 650us/step - AUC: 0.8817 - f1_score: 0.0675 - loss: 0.0935 - val_AUC: 0.8462 - val_f1_score: 0.0642 - val_loss: 0.1080
Epoch 5/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 661us/step - AUC: 0.8875 - f1_score: 0.0686 - loss: 0.0924 - val_AUC: 0.8534 - val_f1_score: 0.0642 - val_loss: 0.1074
Epoc

In [65]:
y_prob:np.ndarray = estimator_dnn.predict(x=X_test_sc).ravel()
y_pred = (y_prob >= 0.5).astype(int)

report_dnn = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_dnn = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_dnn)
print(roc_aur_dnn)

[1m1846/1846[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 306us/step
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     56841
           1       0.79      0.25      0.38      2213

    accuracy                           0.97     59054
   macro avg       0.88      0.62      0.68     59054
weighted avg       0.96      0.97      0.96     59054

0.8468132696327592


In [70]:
# Convolutional Neural Network

estimator_cnn = keras.Sequential(layers=[
    keras.layers.Reshape(target_shape=(-1, 3)),
    keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    keras.layers.MaxPool1D(pool_size=2),
    keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    keras.layers.MaxPool1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=1, activation='sigmoid')
])

estimator_cnn.compile(
    optimizer='adam',
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.F1Score, keras.metrics.AUC]
)

In [71]:
history = estimator_cnn.fit(
    x=X_train_sc, y=y_train, validation_data=(X_val_sc, y_val),
    epochs=8, batch_size=32,
    )

Epoch 1/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 5ms/step - auc_8: 0.7985 - f1_score: 0.0680 - loss: 0.1218 - val_auc_8: 0.8434 - val_f1_score: 0.0641 - val_loss: 0.1069
Epoch 2/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 5ms/step - auc_8: 0.8595 - f1_score: 0.0669 - loss: 0.1017 - val_auc_8: 0.8466 - val_f1_score: 0.0641 - val_loss: 0.1037
Epoch 3/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 5ms/step - auc_8: 0.8684 - f1_score: 0.0671 - loss: 0.0976 - val_auc_8: 0.8415 - val_f1_score: 0.0641 - val_loss: 0.1089
Epoch 4/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 5ms/step - auc_8: 0.8784 - f1_score: 0.0675 - loss: 0.0943 - val_auc_8: 0.8263 - val_f1_score: 0.0641 - val_loss: 0.1135
Epoch 5/8
[1m12919/12919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 5ms/step - auc_8: 0.8820 - f1_score: 0.0683 - loss: 0.0933 - val_auc_8: 0.8324 - val_f1_score: 0.0641 - val_los

In [73]:
y_prob:np.ndarray = estimator_cnn.predict(x=X_test_sc).ravel()
y_pred = (y_prob >= 0.5).astype(int)

report_dnn = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_dnn = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_dnn)
print(roc_aur_dnn)

[1m1846/1846[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     56841
           1       0.74      0.27      0.40      2213

    accuracy                           0.97     59054
   macro avg       0.86      0.63      0.69     59054
weighted avg       0.96      0.97      0.96     59054

0.8463360662482664
