In [None]:
import pandas as pd
import gc
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the model-fitting and
# hyperparameter-search cells — consider narrowing the filter to specific
# warning categories.
warnings.filterwarnings("ignore")

import gc
# Make sure automatic garbage collection is switched on.
gc.enable()

import logging
# Root logger with its level set to DEBUG, so records of every severity pass
# its level check.
# NOTE(review): DEBUG on the root logger can surface verbose third-party logs.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

import os

Reading the Processed Data from the Data Pre-Processing Step

In [None]:
# Locations of the pickled frames produced by the pre-processing step.
TRAIN_PICKLE = './train_12.pkl'
TEST_PICKLE = './test_12.pkl'

# Load both splits into DataFrames (train first, then test).
# NOTE: unpickling can execute arbitrary code — only load files that come
# from a trusted source.
train = pd.read_pickle(TRAIN_PICKLE)
test = pd.read_pickle(TEST_PICKLE)

Drop the Date Columns from the Train and Test DataFrames

In [None]:
# The date columns are not indicators, so remove them from both frames.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
DATE_COLUMNS = ['TransactionDT', 'DT']

train = train.drop(columns=DATE_COLUMNS, errors='ignore')
test = test.drop(columns=DATE_COLUMNS, errors='ignore')

Define Train and Test sets

In [None]:
# Name of the label column; every other column is treated as a predictor.
TARGET_COLUMN = 'isFraud'

# Training split: label vector, then feature matrix without the label
y_train = train[TARGET_COLUMN]
X_train = train.drop(columns=[TARGET_COLUMN])

# Test split: label vector, then feature matrix without the label
y_test = test[TARGET_COLUMN]
X_test = test.drop(columns=[TARGET_COLUMN])

In [None]:
# Class balance of the training labels: number of 0s and number of 1s.
count_negative = y_train.eq(0).sum()
count_positive = y_train.eq(1).sum()

# Positive-class weight: the square root of the negative/positive ratio,
# i.e. a softened version of the plain ratio.
imbalance_ratio = count_negative / count_positive
scale_pos_weight = math.sqrt(imbalance_ratio)

Run the LightGBM Model with Default Parameters

In [None]:
# Baseline: LightGBM with default hyperparameters.  Only the seed, the class
# weight, the objective/metric and the device are set explicitly.
baseline_params = dict(
    random_state=1003,
    scale_pos_weight=scale_pos_weight,
    metric='auc',
    objective='binary',
    device='gpu',
)
lgbclf = lgb.LGBMClassifier(**baseline_params)
lgbclf.fit(X_train, y_train)

# Hard class predictions on the test set
y_pred_lgbm = lgbclf.predict(X_test)

# Train AUC, computed from the positive-class probability
# (second column returned by predict_proba)
y_train_lgbm_proba = lgbclf.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Test AUC, computed the same way
y_test_lgbm_proba = lgbclf.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

Hyperparameter Optimization Using scikit-learn's Built-in RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
params = {
    "learning_rate": [0.01, 0.05, 0.10, 0.15],
    "max_depth": [3, 6, 9, 12, 15],
    "num_leaves": [10, 500, 1000],
    # n_estimators is a number of boosting rounds and must be a positive
    # integer.  The former candidate 0.1 made every draw that picked it fail
    # (and the failure was hidden by the global warning filter), so it is
    # no longer part of the grid.
    "n_estimators": [1, 10, 100, 1000],
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
    # is > 0; it is left at its default here, so this dimension likely has
    # no effect on the fitted models.
    "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9],
    "reg_alpha": [0.1, 0.3, 0.6, 1],
    "colsample_bytree": [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9],
}

# Base estimator: same seed, objective and class weighting as the models
# trained in this notebook, so the search tunes the configuration that is
# actually used afterwards.
lgbclf = lgb.LGBMClassifier(
    random_state=1003,
    objective='binary',
    scale_pos_weight=scale_pos_weight,
)

# 10 random draws, each scored by 5-fold cross-validated ROC AUC.
# random_state fixes which combinations are drawn so a re-run is reproducible.
random_search = RandomizedSearchCV(
    lgbclf,
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=1003,
)

random_search.fit(X_train, y_train)

In [None]:
# Show the parameter combination that achieved the best cross-validated score
random_search.best_params_

Second Model With Optimized Parameters

In [None]:
# Second model: LightGBM with hard-coded hyperparameter values.
# NOTE(review): these values are presumably copied from an earlier
# `random_search.best_params_` result — confirm they match the latest search.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# `subsample_freq` is not set here, so `subsample=0.3` likely has no effect.
tuned_params = dict(
    random_state=1003,
    scale_pos_weight=scale_pos_weight,
    metric='auc',
    objective='binary',
    device='gpu',
    subsample=0.3,
    reg_alpha=0.3,
    num_leaves=500,
    n_estimators=1000,
    max_depth=9,
    learning_rate=0.15,
    colsample_bytree=0.3,
)
lgbclf_1 = lgb.LGBMClassifier(**tuned_params)

lgbclf_1.fit(X_train, y_train)

# Hard class predictions on the test set
y_pred_lgbm_1 = lgbclf_1.predict(X_test)

# Train AUC from the positive-class probability
y_train_lgbm_proba = lgbclf_1.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Test AUC from the positive-class probability
y_test_lgbm_proba = lgbclf_1.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')

Using Feature Importance to Extract the most important Features

In [None]:
# Rank every training column by the importance lgbclf_1 assigned to it,
# highest first.  sorted() is stable, so tied features keep their original
# column order.
cols = list(X_train.columns)
ranked_pairs = sorted(
    zip(lgbclf_1.feature_importances_, cols),
    key=lambda pair: pair[0],
    reverse=True,
)
feature_imp = pd.DataFrame(ranked_pairs, columns=['Value', 'Feature'])

# Convenience export of the table to the system clipboard.  Machines without
# a clipboard mechanism (headless servers, hosted kernels) make pandas raise
# a RuntimeError subclass; report it instead of aborting the notebook run.
try:
    feature_imp.to_clipboard()
except RuntimeError as clipboard_error:
    print(f'Clipboard export skipped: {clipboard_error}')

In [None]:
# Display the first 50 rows of the importance table
# (it is sorted by importance, highest first)
feature_imp.head(50)

In [None]:
# Names of the highest-importance features, taken from the top of the
# ranked importance table.
TOP_N_FEATURES = 50
selected_features = feature_imp['Feature'].head(TOP_N_FEATURES).tolist()

# Restrict both feature matrices to that subset.
# NOTE(review): the following cells reassign X_train_lgbm_2 / X_test_lgbm_2
# from a hard-coded feature list, so these two frames get overwritten.
X_train_lgbm_2 = X_train[selected_features]
X_test_lgbm_2 = X_test[selected_features]

In [None]:
# Fixed list of extracted features that is used for the transformation in
# deployment.  Kept explicit so the model input does not depend on a re-run
# of the feature-importance ranking.
deployment_features = [
    'card1', 'TransactionAmt', 'card2_target_encoded', 'addr_target_encoded',
    'addr1_target_encoded', 'dist1', 'D15', 'id_02', 'TransactionAmt_decimal',
    'C1', 'D4', 'card5_target_encoded', 'D2', 'D10', 'D11', 'V307', 'D1',
    'D8', 'D5', 'V310', 'D3', 'id_05', 'V127', 'id_06', 'V314', 'D9', 'V264',
    'V312', 'id_01', 'D14', 'V203', 'C5', 'D6', 'V283', 'D12', 'V36', 'V96',
    'V221', 'V62', 'V82', 'V282', 'V54', 'V76', 'V37', 'V20', 'V5', 'V44',
    'V285', 'V77', 'V56',
]

# Training matrix restricted to the pinned features, in the listed order
X_train_lgbm_2 = X_train[deployment_features]



In [None]:
# Build the test matrix from exactly the columns of the training matrix, in
# the same order.  Deriving the list from X_train_lgbm_2 (instead of repeating
# the 50 names by hand) means the two feature sets cannot drift apart when
# the training list is edited.
X_test_lgbm_2 = X_test[X_train_lgbm_2.columns.tolist()]

Third Run: Train the Model with the Top 50 Most Important Features (we also tried the top 100, but that resulted in a more overfitted model)

In [None]:
# Third model: the same hard-coded hyperparameter values, trained on the
# reduced feature matrices X_train_lgbm_2 / X_test_lgbm_2.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# `subsample_freq` is not set here, so `subsample=0.3` likely has no effect.
reduced_model_params = dict(
    random_state=1003,
    scale_pos_weight=scale_pos_weight,
    metric='auc',
    objective='binary',
    device='gpu',
    subsample=0.3,
    reg_alpha=0.3,
    num_leaves=500,
    n_estimators=1000,
    max_depth=9,
    learning_rate=0.15,
    colsample_bytree=0.3,
)
lgbclf_2 = lgb.LGBMClassifier(**reduced_model_params)

lgbclf_2.fit(X_train_lgbm_2, y_train)

# Hard class predictions on the reduced test set
y_pred_lgbm_3 = lgbclf_2.predict(X_test_lgbm_2)

# Train AUC from the positive-class probability
y_train_lgbm_proba = lgbclf_2.predict_proba(X_train_lgbm_2)[:, 1]
train_auc = roc_auc_score(y_train, y_train_lgbm_proba)
print(f'Train AUC: {train_auc}')

# Test AUC from the positive-class probability
y_test_lgbm_proba = lgbclf_2.predict_proba(X_test_lgbm_2)[:, 1]
test_auc = roc_auc_score(y_test, y_test_lgbm_proba)
print(f'Test AUC: {test_auc}')