In [38]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler      # better for heavy outliers
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score


In [3]:
# Load the training transactions table of the ieee-fraud-detection Kaggle dataset.
# The dataset directory can be overridden with the IEEE_FRAUD_DATA_DIR environment
# variable; when it is unset the original local path is used, so existing
# behaviour is unchanged.
DATA_DIR = Path(os.environ.get(
    'IEEE_FRAUD_DATA_DIR',
    '/Users/oskarwallberg/desktop/kaggle-datasets/ieee-fraud-detection',
))
train_trns = pd.read_csv(DATA_DIR / 'train_transaction.csv')

In [5]:
# Column names 'V1' .. 'V339' (range end is exclusive, hence 340).
V_COLS = [f'V{idx}' for idx in range(1, 340)]
# Sanity-check the size of the loaded table as (rows, columns).
# A bare `train_trns` expression used to precede this line; only the last
# expression of a cell is displayed, so it had no effect and was removed.
train_trns.shape

(590540, 394)

In [10]:
# Take an independent copy of the V-columns so later edits never touch train_trns.
df_vesta = train_trns.loc[:, V_COLS].copy()
df_vesta.shape

(590540, 339)

In [11]:
# Replace every missing value with 0, then confirm that no NaN is left anywhere
# in the frame (the check reduces over columns first, then over the result).
df_vesta = df_vesta.fillna(value=0)
df_vesta.isnull().any().any()

False

In [18]:
# PCA keeping the first 20 principal components.
# random_state is pinned so the decomposition is reproducible when scikit-learn
# auto-selects a randomized SVD solver for an input of this size; it has no
# effect on the exact solvers.
# NOTE(review): in the recorded output the first component alone explains ~93%
# of the variance — the input does not appear to be scaled before PCA; confirm
# that this is intended.
pca = PCA(n_components=20, random_state=42)

In [14]:
# Fit the PCA on the imputed V-columns and wrap the projected values in a
# DataFrame (default integer column labels, one column per component).
df_vesta_pca = pd.DataFrame(pca.fit_transform(df_vesta))
df_vesta_pca.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-6601.867648,-771.684940,30.216280,-14.377035,-129.030500,-25.546991,-48.703378,-74.773042,56.323594,121.195005,45.843863,1.297666,-88.302091,9.619239,74.801683,-17.590068,82.408244,0.416635,-11.268793,6.688108
1,-6601.684180,-899.892623,-46.124201,-113.540356,-125.728317,-13.812270,-108.929904,-72.496693,137.572856,76.898491,34.981168,44.515439,-46.791725,-22.515119,37.610411,6.896409,52.207367,14.002531,-12.398344,13.069701
2,-6601.684198,-899.892648,-46.124051,-113.540305,-125.728477,-13.812400,-108.929948,-72.496388,137.572141,76.897919,34.980768,44.515203,-46.791400,-22.514918,37.609964,6.896862,52.205678,14.001569,-12.397243,13.068760
3,-6605.178432,1552.029273,1341.062515,1461.659013,177.882277,-140.940829,285.031583,79.142574,-1043.028179,-361.097290,-47.843218,-107.024659,-368.588521,569.199667,599.335927,-123.332504,-48.993274,97.771226,-200.837024,-97.320611
4,163764.430416,-666.416155,10.427825,-160.431347,302.187767,6808.009875,993.578223,340.558061,53.539676,191.778153,-4.901771,-14.948000,-227.720742,-11.887488,277.218913,136.649356,-96.689027,23.057719,269.290222,-357.793131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,-6601.763009,-871.482665,-27.428737,-88.487791,-115.041338,-17.075980,-93.979338,-52.556907,82.077141,63.069495,20.110210,11.401478,-17.187513,-7.999996,12.571788,-4.141023,14.213045,0.408153,5.834001,-16.968262
590536,-6601.684198,-899.892648,-46.124051,-113.540306,-125.728477,-13.812400,-108.929948,-72.496388,137.572141,76.897919,34.980768,44.515204,-46.791401,-22.514918,37.609964,6.896862,52.205678,14.001570,-12.397243,13.068760
590537,-6601.684205,-899.892660,-46.124034,-113.540341,-125.728560,-13.812434,-108.930033,-72.496354,137.572228,76.898004,34.980746,44.515223,-46.791455,-22.515204,37.609812,6.896925,52.205820,14.001357,-12.397024,13.068890
590538,-6605.151349,1151.039445,1367.207335,1815.839818,8.506943,-339.139244,1899.748523,-30.315834,-725.403462,348.832906,57.446032,-629.271377,94.467990,167.263803,46.860740,-327.101918,1.488797,-153.308356,285.911633,-447.686752


In [17]:
# Running total of the explained-variance ratio: entry k is the share of
# variance captured by the first k+1 components.
pca.explained_variance_ratio_.cumsum()

array([0.92920758, 0.99344518, 0.99531193, 0.99649527, 0.99738471,
       0.99812689, 0.99874799, 0.99908903, 0.99926784, 0.99940869,
       0.9995124 , 0.99959921, 0.99967346, 0.99973058, 0.99977739,
       0.99981061, 0.99984107, 0.9998622 , 0.99987784, 0.99989044])

In [21]:
# Name of the label column; verify that the label has no missing entries.
TARGET_FEATURE = "isFraud"
train_trns[TARGET_FEATURE].isnull().any()

False

In [47]:
# Model inputs: the imputed V-columns as features (not the PCA projection)
# and the fraud flag as the label. Display both shapes as a row-count check.
X, y = df_vesta, train_trns[TARGET_FEATURE]
(X.shape, y.shape)

((590540, 339), (590540,))

In [48]:
# Seed for the fold assignment so cross-validation results are repeatable.
RANDOM_STATE = 42

# Scaler that centers and scales each feature; it is re-fitted on the training
# part of every fold in the cross-validation loop.
rb = RobustScaler(with_centering=True, with_scaling=True)

# Gradient-boosted binary classifier: 100 trees, learning rate 0.1.
xgbclf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1
)

# Stratified K-fold with shuffling. Without a fixed random_state the shuffle
# produces different folds on every run, so the reported metrics could not be
# reproduced.
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

In [49]:
# Per-fold validation ROC-AUC; entry i is written by iteration i of the loop.
# (Previously this array was allocated but never filled, so it stayed all zeros.)
scores = np.zeros(N_FOLDS)

# NOTE roughly 3.5% of rows are flagged as fraudulent in each validation fold
# (e.g. 4132 of 118108 in the recorded output).
for i, (train_idx, val_idx) in enumerate(skf.split(X=X, y=y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the validation fold.
    X_train_sc = rb.fit_transform(X=X_train)
    X_val_sc = rb.transform(X=X_val)

    xgbclf.fit(X=X_train_sc, y=y_train)
    y_pred = xgbclf.predict(X=X_val_sc)
    # Predicted probability of the positive class (column 1), used for ROC-AUC.
    y_score: np.ndarray = xgbclf.predict_proba(X=X_val_sc)[:, 1]

    # Compute the fold's ROC-AUC once, keep it for later aggregation, and print it.
    fold_auc = roc_auc_score(y_true=y_val, y_score=y_score)
    scores[i] = fold_auc

    print(f'Fold {i+1}/{N_FOLDS}')
    print(classification_report(y_true=y_val, y_pred=y_pred))
    print(f'ROC-AUC score: {fold_auc}')

Fold 1/5
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113976
           1       0.89      0.30      0.45      4132

    accuracy                           0.97    118108
   macro avg       0.93      0.65      0.72    118108
weighted avg       0.97      0.97      0.97    118108

ROC-AUC score: 0.8634090008742181
Fold 2/5
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113976
           1       0.88      0.31      0.45      4132

    accuracy                           0.97    118108
   macro avg       0.93      0.65      0.72    118108
weighted avg       0.97      0.97      0.97    118108

ROC-AUC score: 0.8625917072027052
Fold 3/5
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.90      0.30      0.46      4133

    accuracy                           0.97    118108
   macro avg       0.94      0.65