In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score, f1_score, classification_report

print(f'pd=={pd.__version__}')
print(f'np=={np.__version__}')
print(f'sns=={sns.__version__}')

pd==2.2.2
np==1.26.4
sns==0.13.2


In [177]:
with open('../../../config.yaml', 'r') as f:
    config = yaml.safe_load(f)
type(config)

dict

In [178]:
TARGET_FEATURE = config['data']['target_feature']
INDEX_COLUMN = config['data']['index_column']
RANDOM_STATE = config['model']['random_state']
TARGET_FEATURE, INDEX_COLUMN, RANDOM_STATE

('isFraud', 'TransactionID', 42)

In [6]:
path_data = config['data']['path']
filename_train_pp = config['data']['train']['transaction']['pp']

dataset_pp = pd.read_csv(path_data+filename_train_pp, index_col=INDEX_COLUMN)
dataset_pp.shape

(590540, 274)

In [7]:
dataset_pp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 2987000 to 3577539
Columns: 274 entries, isFraud to P_domain_risk_group
dtypes: float64(266), int64(8)
memory usage: 1.2 GB


In [8]:
# Ensure chronological order of transactions
dataset_pp = dataset_pp.sort_values(by='TransactionDT', ascending=True)
dataset_pp['TransactionDT']

TransactionID
2987000       86400
2987001       86401
2987002       86469
2987003       86499
2987004       86506
             ...   
3577535    15811047
3577536    15811049
3577537    15811079
3577538    15811088
3577539    15811131
Name: TransactionDT, Length: 590540, dtype: int64

In [9]:
# Data split: train, val, test

# Split data into Train, Val, Test : 70, 20, 10
# Split into chronological chunks - better representation of real life inference

#     train       val   test
# X X X X X X X | X X | X  Datapoints
# ------------------------> t

m = dataset_pp.shape[0]

y: pd.Series = dataset_pp[TARGET_FEATURE]
X: pd.DataFrame = dataset_pp.drop(columns=TARGET_FEATURE)

X_train = X.iloc[:int(m*0.7)]
y_train = y.iloc[:int(m*0.7)]
X_val = X.iloc[int(m*0.7):int(0.9*m)]
y_val = y.iloc[int(m*0.7):int(0.9*m)]
X_test = X.iloc[int(0.9*m):]
y_test = y.iloc[int(0.9*m):]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape


((413378, 273), (413378,), (118108, 273), (118108,), (59054, 273), (59054,))

In [10]:
# Scalers with default settings, which one yields the best result?
scaler_mm = MinMaxScaler(feature_range=(0, 1))
scaler_rb = RobustScaler(with_centering=True, with_scaling=True)
scaler_st = StandardScaler(with_mean=True, with_std=True)
scaler_st

In [11]:
# Data scaling
scaler_st.fit(X=X_train)
X_train_sc = pd.DataFrame(data=scaler_st.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val_sc = pd.DataFrame(data=scaler_st.transform(X_val), index=X_val.index, columns=X_val.columns)
X_test_sc = pd.DataFrame(data=scaler_st.transform(X_test), index=X_test.index, columns=X_test.columns)

X_train_sc.shape, X_val_sc.shape, X_test_sc.shape

((413378, 273), (118108, 273), (59054, 273))

In [12]:
# Models:
# kNN - KNearestNeighbours
# NB - NaiveBayes
# LR - LogisticRegression
# SVC - SupportVectorMachine
# DT - DecisionTrees
# RF - RandomForest

# Scikit-Learn Scoring Strings
# https://scikit-learn.org/stable/modules/model_evaluation.html 

In [191]:
estimator_lr = LogisticRegression(
    penalty='l2', solver='lbfgs',
    max_iter=1000, 
    C=0.0001,
    fit_intercept=True,
    class_weight={0:1, 1:8},
    verbose=True,
    random_state=RANDOM_STATE,
)

estimator_lr.fit(X=X_train_sc, y=y_train)

y_pred = estimator_lr.predict(X=X_test_sc)
y_prob = estimator_lr.predict_proba(X=X_test_sc)[:, 1]

report_lr = classification_report(y_true=y_test, y_pred=y_pred)
roc_aur_lr = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_lr)
print(roc_aur_lr)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          274     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  2.74230D-01


 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  274     46     53      1     0     0   8.809D-05   3.758D-01
  F =  0.37579196953600780     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
              precision    recall  f1-score   support

           0       0.98      0.93      0.96     56841
           1       0.24      0.52      0.33      2213

    accuracy                           0.92     59054
   macro avg       0.61      0.73      0.64     59054
weighted avg       0.95      0.92      0.93     59054

0.8439065559025676


```
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     56841
           1       0.32      0.45      0.37      2213

    accuracy                           0.94     59054
   macro avg       0.65      0.71      0.67     59054
weighted avg       0.95      0.94      0.95     59054

0.8436407380278231
```

In [187]:
estimator_rf = RandomForestClassifier(
    n_estimators=400,
    criterion='entropy',
    min_samples_split = 2,
    min_samples_leaf = 1,
    min_weight_fraction_leaf = 0,
    max_features = 'sqrt',
    max_leaf_nodes = None,
    verbose=True,
    class_weight=None, # {0:12, 1:1} Weighting has opposite effect on recall
    random_state=RANDOM_STATE,
)

estimator_rf.fit(X=X_val_sc, y=y_val)

y_pred = estimator_rf.predict(X=X_test_sc)
y_prob = estimator_rf.predict_proba(X=X_test_sc)[:, 1]

report_rf = classification_report(y_true=y_test, y_pred=y_pred)
roc_auc_rf = roc_auc_score(y_true=y_test, y_score=y_prob)

print(report_rf)
print(roc_auc_rf)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    8.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   36.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s


              precision    recall  f1-score   support

           0       0.97      1.00      0.99     56841
           1       0.87      0.29      0.44      2213

    accuracy                           0.97     59054
   macro avg       0.92      0.64      0.71     59054
weighted avg       0.97      0.97      0.97     59054

0.9009746215517679


```
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     56841
           1       0.87      0.31      0.46      2213

    accuracy                           0.97     59054
   macro avg       0.92      0.65      0.72     59054
weighted avg       0.97      0.97      0.97     59054

0.894709378432555
```

THE PLAN MOVING FORWARD

Build all of the models - atleast on a basic level
Embed the preprocessing steps into functions to be able to preprocess the test dataset similarly
From the local testing, select up to 5 models to submit test predictions for
Compare the results and see which models score the best 
Consider adding SMOTE technique to see if the models learn better from that 
test both with and without feature selection (how did it affect performance )
Which models are complex to build and tune (if any?) and which are easy

Models that qualified:
Standard:
- LogisticRegression
- RandomForest

GBMs:
- XGBoost
- CatBoost
- LightGBM

DL:
- Dense Neural Network
- Convolutional Neural Network

Extra:
- GMB + NN (DNN with GMB features)
- AutoEncoder + DNN


In [None]:
estimator_nb = GaussianNB()

estimator_nb.fit(X=X_train_sc, y=y_train)

y_pred = estimator_nb.predict(X=X_val_sc)
y_prob = estimator_nb.predict_proba(X=X_val_sc)[:, 1]

report_nb = classification_report(y_true=y_val, y_pred=y_pred)
roc_auc_nb = roc_auc_score(y_true=y_val, y_score=y_prob)

print(report_nb)
print(roc_auc_nb)

In [None]:
# NOTE Use param search here, fast training and unsure about parameters

estimator_dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features=None,
    min_impurity_decrease=0,
    class_weight='balanced',
)

estimator_dt.fit(X=X_train_sc, y=y_train)

y_pred = estimator_dt.predict(X=X_val_sc)
y_prob = estimator_dt.predict_proba(X=X_val_sc)[:, 1]

report_dt = classification_report(y_true=y_val, y_pred=y_pred)
roc_auc_dt = roc_auc_score(y_true=y_val, y_score=y_prob)

print(report_dt)
print(roc_auc_dt)

In [None]:
estimator_knn = KNeighborsClassifier(n_neighbors=2)

estimator_knn.fit(X=X_train_sc, y=y_train)

y_pred = estimator_knn.predict(X=X_val_sc)
y_prob = estimator_knn.predict_proba(X=X_val_sc)[:, 1]

report_knn = classification_report(y_true=y_val, y_pred=y_pred)
roc_auc_knn = roc_auc_score(y_true=y_val, y_score=y_prob)

print(report_knn)
print(roc_auc_knn)

In [None]:
# SVC is too inefficient for this amount of data
# Did not complete in over 30 mins of training - is generally recommended for data less than 10,000 rows