In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

import xgboost as xgb
import catboost as cat
import lightgbm as lgb

print(f'pd=={pd.__version__}')
print(f'np=={np.__version__}')
print(f'sns=={sns.__version__}')

pd==2.2.2
np==1.26.4
sns==0.13.2


In [132]:
with open('../../../config.yaml', 'r') as f:
    config = yaml.safe_load(f)
type(config)

dict

In [133]:
TARGET_FEATURE = config['data']['target_feature']
INDEX_COLUMN = config['data']['index_column']
RANDOM_STATE = config['model']['random_state']
TARGET_FEATURE, INDEX_COLUMN, RANDOM_STATE

('isFraud', 'TransactionID', 42)

In [135]:
path_data = config['data']['path']
filename_train_pp = config['data']['train']['transaction']['pp']

dataset_pp = pd.read_csv(path_data+filename_train_pp, index_col=INDEX_COLUMN)
dataset_pp.shape

(590540, 274)

In [7]:
dataset_pp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 2987000 to 3577539
Columns: 274 entries, isFraud to P_domain_risk_group
dtypes: float64(266), int64(8)
memory usage: 1.2 GB


In [8]:
# Ensure chronological order of transactions
dataset_pp = dataset_pp.sort_values(by='TransactionDT', ascending=True)
dataset_pp['TransactionDT']

TransactionID
2987000       86400
2987001       86401
2987002       86469
2987003       86499
2987004       86506
             ...   
3577535    15811047
3577536    15811049
3577537    15811079
3577538    15811088
3577539    15811131
Name: TransactionDT, Length: 590540, dtype: int64

In [9]:
# Data split: train, val, test

# Split data into Train, Val, Test : 70, 20, 10
# Split into chronological chunks - better representation of real life inference

#     train       val   test
# X X X X X X X | X X | X  Datapoints
# ------------------------> t

m = dataset_pp.shape[0]

y: pd.Series = dataset_pp[TARGET_FEATURE]
X: pd.DataFrame = dataset_pp.drop(columns=TARGET_FEATURE)

X_train = X.iloc[:int(m*0.7)]
y_train = y.iloc[:int(m*0.7)]
X_val = X.iloc[int(m*0.7):int(0.9*m)]
y_val = y.iloc[int(m*0.7):int(0.9*m)]
X_test = X.iloc[int(0.9*m):]
y_test = y.iloc[int(0.9*m):]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((413378, 273), (413378,), (118108, 273), (118108,), (59054, 273), (59054,))

In [10]:
# Scalers with default settings, which one yields the best result?
scaler_mm = MinMaxScaler(feature_range=(0, 1))
scaler_rb = RobustScaler(with_centering=True, with_scaling=True)
scaler_st = StandardScaler(with_mean=True, with_std=True)
scaler_st

In [11]:
# Data scaling
scaler_st.fit(X=X_train)
X_train_sc = pd.DataFrame(data=scaler_st.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val_sc = pd.DataFrame(data=scaler_st.transform(X_val), index=X_val.index, columns=X_val.columns)
X_test_sc = pd.DataFrame(data=scaler_st.transform(X_test), index=X_test.index, columns=X_test.columns)

X_train_sc.shape, X_val_sc.shape, X_test_sc.shape

((413378, 273), (118108, 273), (59054, 273))

In [82]:
estimator_xgb = xgb.XGBClassifier(
    n_estimators=200,       # Number of boosting rounds
    learning_rate=0.1,      # Step size shrinkage
    max_depth=8,            # Maximum depth of a tree
    min_child_weight=0.1,     # Minimum sum of instance weight (hessian) needed in a child
    gamma=0,                # Minimum loss reduction required to make a further partition
    # alpha=10,                # L1 Lasso regularisation
    early_stopping_rounds=10,
    eval_metric='auc',
    subsample=0.6,          # Subsample ratio of the training instances
    colsample_bytree=0.8,   # Subsample ratio of columns when constructing each tree
    objective='binary:logistic',  # Binary classification objective
    n_jobs=-1,              # Use all available cores
    random_state=RANDOM_STATE         # Seed for reproducibility
)

estimator_xgb.fit(
    X=X_train_sc, y=y_train, 
    eval_set=[(X_train_sc, y_train), (X_val_sc, y_val)],
)

y_pred = estimator_xgb.predict(X_test_sc)
y_prob = estimator_xgb.predict_proba(X_test_sc)[:, 1]

report_xgb = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_xgb = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_xgb)
print(roc_aur_xgb)

[0]	validation_0-auc:0.80893	validation_1-auc:0.78355
[1]	validation_0-auc:0.82484	validation_1-auc:0.80266
[2]	validation_0-auc:0.83687	validation_1-auc:0.80855
[3]	validation_0-auc:0.86076	validation_1-auc:0.83171
[4]	validation_0-auc:0.86408	validation_1-auc:0.83396
[5]	validation_0-auc:0.86726	validation_1-auc:0.83274
[6]	validation_0-auc:0.87183	validation_1-auc:0.83562
[7]	validation_0-auc:0.87634	validation_1-auc:0.83611
[8]	validation_0-auc:0.87871	validation_1-auc:0.83883
[9]	validation_0-auc:0.88046	validation_1-auc:0.84040
[10]	validation_0-auc:0.88171	validation_1-auc:0.84330
[11]	validation_0-auc:0.88311	validation_1-auc:0.84513
[12]	validation_0-auc:0.88396	validation_1-auc:0.84530
[13]	validation_0-auc:0.88576	validation_1-auc:0.84774
[14]	validation_0-auc:0.88776	validation_1-auc:0.84900
[15]	validation_0-auc:0.88947	validation_1-auc:0.84989
[16]	validation_0-auc:0.89076	validation_1-auc:0.85136
[17]	validation_0-auc:0.89185	validation_1-auc:0.85216
[18]	validation_0-au

```
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     56841
           1       0.80      0.31      0.44      2213

    accuracy                           0.97     59054
   macro avg       0.89      0.65      0.71     59054
weighted avg       0.97      0.97      0.96     59054

0.9032448057337353
```

In [131]:
estimator_cat = cat.CatBoostClassifier(
    iterations=400,        # Number of boosting rounds
    learning_rate=0.1,      # Step size shrinkage
    depth=10,               # Depth of the tree
    l2_leaf_reg=3,          # L2 regularization term on weights
    loss_function='Logloss',# Binary classification objective
    border_count=32,        # Number of splits for numerical features
    verbose=100,            # Print the progress after each 100 iterations
    random_state=RANDOM_STATE,         # Seed for reproducibility
    colsample_bylevel = 0.6,
    early_stopping_rounds=10,
    eval_metric='AUC',
    scale_pos_weight=8,
)

estimator_cat.fit(
    X=X_train_sc, y=y_train,
    eval_set=[(X_train_sc, y_train), (X_val_sc, y_val)],
    use_best_model=True,
)

y_pred = estimator_cat.predict(X_test_sc)
y_prob = estimator_cat.predict_proba(X_test_sc)[:, 1]

report_cat = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_cat = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_cat)
print(roc_aur_cat)

0:	test: 0.8258262	test1: 0.7928062	best: 0.7928062 (0)	total: 314ms	remaining: 2m 5s
100:	test: 0.9404926	test1: 0.8820432	best: 0.8820432 (100)	total: 30.2s	remaining: 1m 29s
200:	test: 0.9758069	test1: 0.9005670	best: 0.9005670 (200)	total: 1m	remaining: 59.8s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.9023367517
bestIteration = 225

Shrink model to first 226 iterations.
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     56841
           1       0.50      0.51      0.50      2213

    accuracy                           0.96     59054
   macro avg       0.74      0.74      0.74     59054
weighted avg       0.96      0.96      0.96     59054

0.8917945280694478


```
           0       0.98      0.97      0.97     56841
           1       0.37      0.51      0.43      2213

    accuracy                           0.95     59054
   macro avg       0.68      0.74      0.70     59054
weighted avg       0.96      0.95      0.95     59054

0.8820835620196221
```

In [None]:
# NOTE Only do if sufficient time

estimator_lgb = lgb.LGBMClassifier(
    n_estimators=100,       # Number of boosting rounds
    learning_rate=0.1,      # Step size shrinkage
    max_depth=-1,           # No limit on tree depth
    num_leaves=31,          # Maximum number of leaves in one tree
    min_child_weight=0.001, # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,          # Subsample ratio of the training instances
    colsample_bytree=0.8,   # Subsample ratio of columns when constructing each tree
    objective='binary',     # Binary classification objective
    n_jobs=-1,              # Use all available cores
    random_state=RANDOM_STATE         # Seed for reproducibility
)

estimator_lgb.fit(X=X_train_sc, y=y_train)

y_pred = estimator_lgb.predict(X_test_sc)
y_prob = estimator_lgb.predict_proba(X_test_sc)[:, 1]

report_lgb = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_lgb = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_lgb)
print(roc_aur_lgb)