In [1]:
import pandas as pd
import gc
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import gc
gc.enable()

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

import os

Reading the Proccessed Data from the Data Pre-Proccessing Step

In [2]:
# Read the 'train_12.pkl' pickle file and load it into the 'train' DataFrame
train = pd.read_pickle('./train_12.pkl')

# Read the 'test_12.pkl' pickle file and load it into the 'test' DataFrame
test = pd.read_pickle('./test_12.pkl')

Drop the 'Date' Columns from dataframe

In [3]:
# Date columns are not indicators
train.drop('TransactionDT', axis=1, inplace=True)
train.drop('DT', axis=1, inplace=True)

test.drop('TransactionDT', axis=1, inplace=True)
test.drop('DT', axis=1, inplace=True)

Define Train and Test sets

In [4]:
# Target variable for training set (y_train)
y_train = train['isFraud']

# Independent variables for training set (X_train)
X_train = train.drop(['isFraud'], axis=1)

# Target variable for test set (y_test)
y_test = test['isFraud']

# Independent variables for test set (X_test)
X_test = test.drop(['isFraud'], axis=1)

In [5]:
# Get the count of negative and positive examples
count_negative = (y_train == 0).sum()
count_positive = (y_train == 1).sum()

# Calculate the value of scale_pos_weight
scale_pos_weight = math.sqrt(count_negative / count_positive)

Run the LightGBM model with Default Parameters

In [6]:
# Defualt Parameter
lgbclf = lgb.LGBMClassifier(
  random_state=1003,
  scale_pos_weight=scale_pos_weight,
  metric='auc',
  objective= 'binary',
  device = 'gpu')
lgbclf.fit(X_train,y_train)

#prediction
y_pred_lgbm = lgbclf.predict(X_test)

# Probas for train
y_train_lgbm_proba = lgbclf.predict_proba(X_train)[:, 1]  

train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Probas for test
y_test_lgbm_proba = lgbclf.predict_proba(X_test)[:, 1]  
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

[LightGBM] [Info] Number of positive: 15563, number of negative: 427342
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9894
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 216
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 46 dense feature groups (20.27 MB) transferred to GPU in 0.026963 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035138 -> initscore=-3.312688
[LightGBM] [Info] Start training from score -3.312688
Train AUC: 0.9444135149441979
Test AUC: 0.8918848765731431


Hyper Parameter Optimization using Random_Search as a sklearn built in function

In [7]:
params={
 "learning_rate"    : [0.01, 0.05, 0.10, 0.15 ] ,
 "max_depth"        : [ 3, 6, 9, 12, 15],
 "num_leaves"       : [ 10, 500, 1000 ],
 "n_estimators"     : [ 0.1, 1, 10 , 100, 1000 ],
 "subsample"        : [ 0.1, 0.2, 0.3, 0.4, 0.5 , 0.7 , 0.8 , 0.9],
 "reg_alpha"        : [ 0.1, 0.3 , 0.6, 1 ],
 "colsample_bytree" : [ 0.1, 0.2, 0.3, 0.4, 0.5 , 0.7 , 0.8 , 0.9 ]
}

from sklearn.model_selection import RandomizedSearchCV

lgbclf = lgb.LGBMClassifier()

random_search=RandomizedSearchCV(
  lgbclf,
  param_distributions=params,
  n_iter=10,
  scoring='roc_auc',
  n_jobs=-1,
  cv=5,
  verbose=3)

random_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 15563, number of negative: 427342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9889
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035138 -> initscore=-3.312688
[LightGBM] [Info] Start training from score -3.312688


In [8]:
random_search.best_params_

{'subsample': 0.7,
 'reg_alpha': 0.3,
 'num_leaves': 500,
 'n_estimators': 1000,
 'max_depth': 12,
 'learning_rate': 0.1,
 'colsample_bytree': 0.5}

Second Model With Optimized Parameters

In [9]:
lgbclf_1 = lgb.LGBMClassifier(
             
        random_state=1003,
        scale_pos_weight=scale_pos_weight,
        metric='auc',
        objective= 'binary',
        device = 'gpu',
        subsample= 0.7,
        reg_alpha= 0.3,
        num_leaves=500,
        n_estimators= 1000,
        max_depth=12,
        learning_rate=0.1,
        colsample_bytree= 0.5
)

lgbclf_1.fit(X_train,y_train)

#prediction
y_pred_lgbm_1 = lgbclf_1.predict(X_test)

# Probas for train
y_train_lgbm_proba = lgbclf_1.predict_proba(X_train)[:, 1]  
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Probas for test
y_test_lgbm_proba = lgbclf_1.predict_proba(X_test)[:, 1]  
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

[LightGBM] [Info] Number of positive: 15563, number of negative: 427342
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9894
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 216
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 46 dense feature groups (20.27 MB) transferred to GPU in 0.026472 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035138 -> initscore=-3.312688
[LightGBM] [Info] Start training from score -3.312688
Train AUC: 1.0
Test AUC: 0.898795621577638


Using Feature Importance to Extract the most important Features

In [10]:
# Feature Importance
cols = list( X_train.columns)
feature_imp = pd.DataFrame(
  sorted(zip(lgbclf_1.feature_importances_, cols), 
         key=lambda x: x[0], 
         reverse=True), 
  columns=['Value', 'Feature'])
feature_imp.to_clipboard()

In [11]:
feature_imp.head(50)

Unnamed: 0,Value,Feature
0,31581,card1
1,28536,TransactionAmt
2,23987,card2_target_encoded
3,17195,addr_target_encoded
4,16455,addr1_target_encoded
5,13567,id_02
6,12638,dist1
7,11872,TransactionAmt_decimal
8,11406,D15
9,10444,card5_target_encoded


In [12]:
# Select the top 50 important features from X_train
selected_features = feature_imp.head(50)['Feature'].tolist()

# Creating new x dataframes
X_train_lgbm_2 = X_train[selected_features ] 
X_test_lgbm_2 = X_test[selected_features ] 

In [None]:
# Extracted Features which using for thr transformation for the Deployments
X_train_lgbm_2 = X_train[['card1',
 'TransactionAmt',
 'card2_target_encoded',
 'addr_target_encoded',
 'addr1_target_encoded',
 'dist1',
 'D15',
 'id_02',
 'TransactionAmt_decimal',
 'C1',
 'D4',
 'card5_target_encoded',
 'D2',
 'D10',
 'D11',
 'V307',
 'D1',
 'D8',
 'D5',
 'V310',
 'D3',
 'id_05',
 'V127',
 'id_06',
 'V314',
 'D9',
 'V264',
 'V312',
 'id_01',
 'D14',
 'V203',
 'C5',
 'D6',
 'V283',
 'D12',
 'V36',
 'V96',
 'V221',
 'V62',
 'V82',
 'V282',
 'V54',
 'V76',
 'V37',
 'V20',
 'V5',
 'V44',
 'V285',
 'V77',
 'V56']]



In [None]:
X_test_lgbm_2 = X_test[['card1',
 'TransactionAmt',
 'card2_target_encoded',
 'addr_target_encoded',
 'addr1_target_encoded',
 'dist1',
 'D15',
 'id_02',
 'TransactionAmt_decimal',
 'C1',
 'D4',
 'card5_target_encoded',
 'D2',
 'D10',
 'D11',
 'V307',
 'D1',
 'D8',
 'D5',
 'V310',
 'D3',
 'id_05',
 'V127',
 'id_06',
 'V314',
 'D9',
 'V264',
 'V312',
 'id_01',
 'D14',
 'V203',
 'C5',
 'D6',
 'V283',
 'D12',
 'V36',
 'V96',
 'V221',
 'V62',
 'V82',
 'V282',
 'V54',
 'V76',
 'V37',
 'V20',
 'V5',
 'V44',
 'V285',
 'V77',
 'V56']]

Third Run the model with top 50 important features (We did with 100 as well. but it results to more overfitted model)

In [14]:
lgbclf_2 = lgb.LGBMClassifier(
              
        random_state=1003,
        scale_pos_weight=scale_pos_weight,
        metric='auc',
        objective= 'binary',
        device = 'gpu',
        subsample= 0.7,
        reg_alpha= 0.3,
        num_leaves=500,
        n_estimators= 1000,
        max_depth=12,
        learning_rate=0.1,
        colsample_bytree= 0.5
      
    )

lgbclf_2.fit(X_train_lgbm_2,y_train)

#prediction
y_pred_lgbm_3 = lgbclf_2.predict(X_test_lgbm_2)

# Probas for train
y_train_lgbm_proba = lgbclf_2.predict_proba(X_train_lgbm_2)[:, 1]  
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Probas for test
y_test_lgbm_proba = lgbclf_2.predict_proba(X_test_lgbm_2)[:, 1]  
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

[LightGBM] [Info] Number of positive: 15563, number of negative: 427342
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 7502
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 50
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 30 dense feature groups (13.52 MB) transferred to GPU in 0.026195 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035138 -> initscore=-3.312688
[LightGBM] [Info] Start training from score -3.312688
Train AUC: 0.9999992038460231
Test AUC: 0.8762438871223236


Tunning the Hyperparameters to reduce the Overfitting Issue :

      To Reduce the Model Complexity we shoulde  Decrease "num_leaves" and "max_depth"

      To Increase Regularization , we need to increase "reg_alpha"

      To reduce the Overfitting, we may need to increase the "subsample" ratio to use a larger share of data for each boosting round.

      Reducing the "learning_rate" slow down the convergence of the training process and can lead to more robust model


In [17]:
lgbclf_3 = lgb.LGBMClassifier(
              
    random_state=1003,
        scale_pos_weight=scale_pos_weight,
        metric='auc',
        objective= 'binary',
        device = 'gpu',
        subsample= 0.8,
        reg_alpha= 0.5,
        num_leaves=20,
        n_estimators= 1000,
        max_depth=9,
        learning_rate=0.01,        
        colsample_bytree= 0.5,
        init_score = 0.18,
        boosting_type = 'gbdt'
    )

lgbclf_3.fit(X_train_lgbm_2,y_train)

#prediction
y_pred_lgbm_3 = lgbclf_3.predict(X_test_lgbm_2)

#classification report
print(classification_report(y_test, y_pred_lgbm_3))

#confusion matrix
print(confusion_matrix(y_test, y_pred_lgbm_3, normalize='true'))

# Probas for train
y_train_lgbm_proba = lgbclf_3.predict_proba(X_train_lgbm_2)[:, 1]  
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Probas for test
y_test_lgbm_proba = lgbclf_3.predict_proba(X_test_lgbm_2)[:, 1]  
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

[LightGBM] [Info] Number of positive: 15563, number of negative: 427342
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 7502
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 50
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 30 dense feature groups (13.52 MB) transferred to GPU in 0.020733 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035138 -> initscore=-3.312688
[LightGBM] [Info] Start training from score -3.312688
              precision    recall  f1-score   support

           0       0.98      0.99      0.98    142535
           1       0.49      0.40      0.44      5100

    accuracy                           0.96    147635
   macro avg       0.73      0.69      0

Cross Validation

In [19]:
from sklearn.model_selection import KFold
n_fold = 4
folds = KFold(n_splits=n_fold,shuffle=True)

print(folds)


lgb_CV=train.copy()
lgb_CV['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    
    lgbclf = lgb.LGBMClassifier(
        random_state=1003,
        scale_pos_weight=scale_pos_weight,
        metric='auc',
        objective= 'binary',
        device = 'gpu',
        subsample= 0.8,
        reg_alpha= 0.5,
        num_leaves=20,
        n_estimators= 1000,
        max_depth=9,
        learning_rate=0.01,        
        colsample_bytree= 0.5,
        init_score = 0.18,
        boosting_type = 'gbdt'
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    lgbclf.fit(X_train_,y_train_)
    
    del X_train_,y_train_
    print('finish train')
    pred=lgbclf.predict_proba(X_test)[:,1]
    val=lgbclf.predict_proba(X_valid)[:,1]
    print('finish pred')
    del lgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    del pred

KFold(n_splits=4, random_state=None, shuffle=True)
0
[LightGBM] [Info] Number of positive: 11645, number of negative: 320533
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9874
[LightGBM] [Info] Number of data points in the train set: 332178, number of used features: 216
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 46 dense feature groups (15.21 MB) transferred to GPU in 0.022416 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035057 -> initscore=-3.315108
[LightGBM] [Info] Start training from score -3.315108
finish train
finish pred
ROC accuracy: 0.9204336311743208
1
[LightGBM] [Info] Number of positive: 11660, number of negative: 320519
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins