In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

import xgboost as xgb

import tensorflow as tf
import keras

print(f'pd=={pd.__version__}')
print(f'np=={np.__version__}')
print(f'sns=={sns.__version__}')

pd==2.2.2
np==1.26.4
sns==0.13.2


In [3]:
kaggle_dataset_path = "/Users/oskarwallberg/Desktop/kaggle-datasets/ieee-fraud-detection/"
name_dataset = "train_transaction_pp_fs1.csv"

dataset_pp = pd.read_csv(filepath_or_buffer=kaggle_dataset_path+name_dataset, index_col="TransactionID")

dataset_pp.shape

(590540, 274)

In [27]:
TARGET_FEATURE = "isFraud"
INPUT_FEATURES = dataset_pp.shape[1]-1 # except target feature


In [4]:
dataset_pp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 2987000 to 3577539
Columns: 274 entries, isFraud to P_domain_risk_group
dtypes: float64(266), int64(8)
memory usage: 1.2 GB


In [5]:
dataset_pp

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V326,V327,V328,V335,V336,V337,V338,V339,P_domain_fraud_rate,P_domain_risk_group
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.50,0.020399,13926,-1.0,150.0,0.077282,142.0,0.066785,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.029538,1
2987001,0,86401,29.00,0.020399,2755,404.0,150.0,0.034331,102.0,0.066785,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.043542,1
2987002,0,86469,59.00,0.020399,4663,490.0,150.0,0.034756,166.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.094584,2
2987003,0,86499,50.00,0.020399,18132,567.0,150.0,0.034331,117.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.022757,1
2987004,0,86506,50.00,0.047662,4497,514.0,150.0,0.034331,102.0,0.066785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043542,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577535,0,15811047,49.00,0.020399,6550,-1.0,150.0,0.034756,226.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.029538,1
3577536,0,15811049,39.50,0.020399,10444,225.0,150.0,0.034331,224.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.043542,1
3577537,0,15811079,30.95,0.020399,12037,595.0,150.0,0.034331,224.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.043542,1
3577538,0,15811088,117.00,0.020399,7826,481.0,150.0,0.034331,224.0,0.024263,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.021811,1


In [6]:
y = dataset_pp[TARGET_FEATURE]
X = dataset_pp.drop(columns=TARGET_FEATURE)


# Split data into Train, Val, Test : 70, 20, 10
# Split into chronological chunks - better representation of real life inference
# X X X X X X X X X X X X X X X X X X X X X X X X X X X  |  X X X X X X X X X X X X X X X X X X  Datapoints
# ---------------------------------------------------------------------------------------------> t

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2)

X_train: pd.DataFrame
X_val: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.DataFrame
y_val: pd.DataFrame
y_test: pd.DataFrame

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((425188, 273), (425188,), (106298, 273), (106298,), (59054, 273), (59054,))

In [7]:
sc = StandardScaler(with_mean=True, with_std=True)

sc

In [8]:
sc.fit(X=X_train)
X_train_sc = pd.DataFrame(data=sc.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val_sc = pd.DataFrame(data=sc.transform(X_val), index=X_val.index, columns=X_val.columns)
X_test_sc = pd.DataFrame(data=sc.transform(X_test), index=X_test.index, columns=X_test.columns)

X_train_sc.shape, X_val_sc.shape, X_test_sc.shape

((425188, 273), (106298, 273), (59054, 273))

In [9]:
logreg = LogisticRegression(
    class_weight={0:1, 1:8},
    max_iter=400,
    verbose=True,
    n_jobs=-1,
)

logreg

In [10]:
logreg.fit(X=X_train_sc, y=y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          274     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  2.74901D-01


 This problem is unconstrained.



At iterate   50    f=  3.57701D-01    |proj g|=  5.20631D-03

At iterate  100    f=  3.55744D-01    |proj g|=  5.25682D-04

At iterate  150    f=  3.55418D-01    |proj g|=  3.08630D-04

At iterate  200    f=  3.55297D-01    |proj g|=  3.48761D-04

At iterate  250    f=  3.55251D-01    |proj g|=  1.42685D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  274    274    301      1     0     0   9.956D-05   3.552D-01
  F =  0.35523901465109781     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [14]:
y_pred_lr = logreg.predict(X_test_sc)
y_prob_lr = logreg.predict_proba(X_test_sc)[:, 1]

report_lr = classification_report(y_true=y_test, y_pred=y_pred_lr)
rocauc_lr = roc_auc_score(y_true=y_test, y_score=y_prob_lr)

print(report_lr)
print(f"ROC AUC for Logistic Regression: {rocauc_lr}")

              precision    recall  f1-score   support

           0       0.98      0.97      0.97     56995
           1       0.35      0.45      0.39      2059

    accuracy                           0.95     59054
   macro avg       0.66      0.71      0.68     59054
weighted avg       0.96      0.95      0.95     59054

ROC AUC for Logistic Regression: 0.8478214967435135


In [22]:
xgbst = xgb.XGBClassifier(
    n_estimators=100, 
    max_depth=3, 
    learning_rate=0.1, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    use_label_encoder=False, 
    eval_metric="logloss",
    # eval_metric=eval_auc
    scale_pos_weight=8,
)

xgbst

In [23]:
xgbst.fit(X_train_sc, y_train)

In [60]:
y_pred_xgb = xgbst.predict(X_test_sc)
y_prob_xgb = xgbst.predict_proba(X_test_sc)[:, 1]

report_xgb = classification_report(y_true=y_test, y_pred=y_pred_xgb)
rocauc_xgb = roc_auc_score(y_true=y_test, y_score=y_prob_xgb)

print(report_xgb)
print(f"ROC AUC for XGradientBoost: {rocauc_xgb}")

              precision    recall  f1-score   support

           0       0.98      0.97      0.98     56995
           1       0.42      0.51      0.46      2059

    accuracy                           0.96     59054
   macro avg       0.70      0.74      0.72     59054
weighted avg       0.96      0.96      0.96     59054

ROC AUC for XGradientBoost: 0.8813155222966527


In [42]:
denseNN = keras.models.Sequential([
    keras.layers.InputLayer(shape=(INPUT_FEATURES,)),
    keras.layers.Dense(units=128, activation=keras.activations.relu),
    keras.layers.Dense(units=64, activation=keras.activations.relu),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(units=1, activation=keras.activations.sigmoid)
])


denseNN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['f1_score'],
)

In [43]:
history = denseNN.fit(X_train_sc, y_train, epochs=10, batch_size=32)
history

Epoch 1/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 615us/step - f1_score: 0.0685 - loss: 0.1213
Epoch 2/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 643us/step - f1_score: 0.0684 - loss: 0.1042
Epoch 3/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 657us/step - f1_score: 0.0665 - loss: 0.0948
Epoch 4/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 772us/step - f1_score: 0.0681 - loss: 0.0945
Epoch 5/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 659us/step - f1_score: 0.0685 - loss: 0.0922
Epoch 6/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 695us/step - f1_score: 0.0679 - loss: 0.0893
Epoch 7/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 668us/step - f1_score: 0.0665 - loss: 0.0868
Epoch 8/10
[1m13288/13288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 718us/step - f1_score: 0.0684 - los

<keras.src.callbacks.history.History at 0x2959fffa0>

In [44]:
test_loss, test_acc = denseNN.evaluate(X_test_sc, y_test)
test_loss, test_acc

[1m1846/1846[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 368us/step - f1_score: 0.0691 - loss: 0.0901


(0.08946581184864044, 0.067718006670475)

In [59]:
y_prob_dnn = denseNN.predict(X_test_sc)
y_pred_dnn = np.zeros(y_prob_dnn.shape)
y_pred_dnn[y_prob_dnn >= 0.5] = 1

report_dnn = classification_report(y_true=y_test, y_pred=y_pred_dnn)
rocauc_dnn = roc_auc_score(y_true=y_test, y_score=y_prob_dnn)

print(report_dnn)
print(f"ROC AUC for DenseNeuralNetwork: {rocauc_dnn}")

[1m1846/1846[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 295us/step
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     56995
           1       0.81      0.42      0.56      2059

    accuracy                           0.98     59054
   macro avg       0.90      0.71      0.77     59054
weighted avg       0.97      0.98      0.97     59054

ROC AUC for Logistic Regression: 0.8998594493411973
