# Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from bayes_opt import BayesianOptimization


from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, f1_score

import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# References:
# https://www.kaggle.com/code/ashishpatel26/kfold-lightgbm
# https://www.kaggle.com/code/sz8416/simple-bayesian-optimization-for-lightgbm/notebook
# https://www.kaggle.com/code/jsaguiar/lightgbm-7th-place-solution
# https://medium.com/thecyphy/home-credit-default-risk-part-2-84b58c1ab9d5

# Data Cleaning & Feature Engineering

In [3]:
def one_hot_encoder(df):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    df_all : pandas.DataFrame
        The non-object columns of ``df`` followed by the indicator columns
        produced by ``OneHotEncoder`` (named via ``get_feature_names_out()``).
        Rows keep the index of ``df``.
    new_columns : list
        Names of the columns in ``df_all`` that were not present in ``df``.
    """
    original_columns = set(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

    # Nothing to encode: skip the encoder and hand back an independent copy.
    if not categorical_columns:
        return df.copy(), []

    ohe = OneHotEncoder(handle_unknown='ignore')
    cat_ohe = ohe.fit_transform(df[categorical_columns]).toarray()

    # Concatenate the numerical columns and the transformed categorical columns.
    # The encoded block must carry the caller's index: pd.concat(axis=1) aligns
    # on index labels, so a default RangeIndex would misalign rows (and create
    # all-NaN rows) whenever ``df`` arrives filtered, sorted or re-indexed.
    df_numeric = df.drop(categorical_columns, axis=1)
    df_encoded = pd.DataFrame(cat_ohe, columns=ohe.get_feature_names_out(), index=df.index)
    df_all = pd.concat([df_numeric, df_encoded], axis=1)
    new_columns = [c for c in df_all.columns if c not in original_columns]

    return df_all, new_columns

In [4]:
def _add_application_ratio_features(df):
    """Add the four ratio columns derived from the application table, in place."""
    df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TERM'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']


def application_preprocessing(train, test):
    """Add ratio features and label-encode the categorical columns of both frames.

    Both frames are modified in place and also returned, so the usual call is
    ``train, test = application_preprocessing(train, test)``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Application tables. Each must contain AMT_CREDIT, AMT_INCOME_TOTAL,
        AMT_ANNUITY, DAYS_EMPLOYED and DAYS_BIRTH, and ``test`` must contain
        every object-dtype column found in ``train``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        The same two objects, with four new ratio columns and with every column
        that was object-dtype in ``train`` replaced by integer codes.
    """
    for frame in (train, test):
        _add_application_ratio_features(frame)

    categorical_columns = [col for col in train.columns if train[col].dtype == 'object']

    for col in categorical_columns:
        le = LabelEncoder()
        # Fit on the values of both frames so a category that only occurs in
        # ``test`` does not make transform() raise "previously unseen labels".
        # When test holds no new categories the codes equal a train-only fit.
        le.fit(pd.concat([train[col], test[col]], ignore_index=True))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

    return train, test

In [5]:
def previous_application_preprocessing(df):
    """Aggregate the previous-application table to one row per SK_ID_CURR.

    Parameters
    ----------
    df : pandas.DataFrame
        Previous-application rows. Must contain SK_ID_CURR, the DAYS_* and AMT_*
        columns referenced below, and a NAME_CONTRACT_STATUS column whose
        one-hot encoding yields the ``_Approved`` and ``_Refused`` indicators.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR. Columns are ``PREV_*`` aggregates over all rows,
        plus ``APPROVED_*`` and ``REFUSED_*`` numeric aggregates restricted to
        approved / refused applications (NaN where a client has none).
    """
    def aggregate(frame, spec, prefix):
        # Group per client and flatten the (column, statistic) MultiIndex into
        # names of the form <prefix><column>_<STATISTIC>.
        agg = frame.groupby('SK_ID_CURR').agg(spec)
        agg.columns = pd.Index([prefix + e[0] + "_" + e[1].upper() for e in agg.columns.tolist()])
        return agg

    df, cat_cols = one_hot_encoder(df)

    # The value 365243 in these DAYS_* columns is treated as "missing".
    # Assign the result back rather than calling replace(inplace=True) on a
    # column selection: that chained form operates on a temporary object and
    # does not update ``df`` when pandas Copy-on-Write is active.
    sentinel_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    df[sentinel_columns] = df[sentinel_columns].replace(365243, np.nan)
    df['APP_CREDIT_PERC'] = df['AMT_CREDIT']/df['AMT_APPLICATION']

    num_aggregations = {
        'AMT_ANNUITY': [ 'max', 'mean'],
        'AMT_APPLICATION': [ 'max','mean'],
        'AMT_CREDIT': [ 'max', 'mean'],
        'APP_CREDIT_PERC': [ 'max', 'mean'],
        'AMT_DOWN_PAYMENT': [ 'max', 'mean'],
        'AMT_GOODS_PRICE': [ 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': [ 'max', 'mean'],
        'RATE_DOWN_PAYMENT': [ 'max', 'mean'],
        'DAYS_DECISION': [ 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Indicator columns are averaged, i.e. the share of rows in each category.
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    df_agg = aggregate(df, {**num_aggregations, **cat_aggregations}, 'PREV_')

    # Previous Applications: Approved Applications - only numerical features
    approved_agg = aggregate(df[df['NAME_CONTRACT_STATUS_Approved'] == 1], num_aggregations, 'APPROVED_')
    df_agg = df_agg.join(approved_agg, how='left', on='SK_ID_CURR')

    # Previous Applications: Refused Applications - only numerical features
    refused_agg = aggregate(df[df['NAME_CONTRACT_STATUS_Refused'] == 1], num_aggregations, 'REFUSED_')
    df_agg = df_agg.join(refused_agg, how='left', on='SK_ID_CURR')

    return df_agg

In [6]:
# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance_preprocessing(bureau_balance, bureau):
    """Combine bureau_balance with bureau and aggregate to one row per SK_ID_CURR.

    bureau_balance is first summarised per SK_ID_BUREAU and joined onto bureau;
    the enriched bureau table is then aggregated per SK_ID_CURR three times:
    over all credits (``BURO_`` prefix), over rows whose CREDIT_ACTIVE_Active
    indicator is 1 (``ACTIVE_``) and over rows whose CREDIT_ACTIVE_Closed
    indicator is 1 (``CLOSED_``). The last two use the numeric spec only.

    Returns the joined aggregate frame, indexed by SK_ID_CURR.
    """
    def flatten(columns, prefix=''):
        # (column, statistic) pairs -> "<prefix><column>_<STATISTIC>"
        return pd.Index([prefix + name + "_" + stat.upper() for name, stat in columns.tolist()])

    balance_encoded, balance_dummies = one_hot_encoder(bureau_balance)
    bureau_encoded, bureau_dummies = one_hot_encoder(bureau)

    # Step 1: one summary row per bureau credit, attached to the bureau table.
    balance_spec = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    balance_spec.update({dummy: ['mean'] for dummy in balance_dummies})
    balance_by_credit = balance_encoded.groupby('SK_ID_BUREAU').agg(balance_spec)
    balance_by_credit.columns = flatten(balance_by_credit.columns)

    bureau_encoded = bureau_encoded.join(balance_by_credit, how='left', on='SK_ID_BUREAU')
    bureau_encoded = bureau_encoded.drop(columns=['SK_ID_BUREAU'])

    # Step 2: aggregation specs for the enriched bureau table.
    numeric_spec = {
        'DAYS_CREDIT': [ 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': [ 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    categorical_spec = {dummy: ['mean'] for dummy in bureau_dummies}
    # The bureau_balance indicators were already averaged per credit above,
    # hence the "_MEAN" suffix on the column names looked up here.
    categorical_spec.update({dummy + "_MEAN": ['mean'] for dummy in balance_dummies})

    # Step 3: all credits, then the Active and Closed subsets (numeric only).
    per_client = bureau_encoded.groupby('SK_ID_CURR').agg({**numeric_spec, **categorical_spec})
    per_client.columns = flatten(per_client.columns, 'BURO_')

    status_subsets = (('CREDIT_ACTIVE_Active', 'ACTIVE_'), ('CREDIT_ACTIVE_Closed', 'CLOSED_'))
    for indicator, prefix in status_subsets:
        subset = bureau_encoded[bureau_encoded[indicator] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(numeric_spec)
        subset_agg.columns = flatten(subset_agg.columns, prefix)
        per_client = per_client.join(subset_agg, how='left', on='SK_ID_CURR')

    return per_client

In [7]:
def credit_card_balance_preprocessing(cc):
    """Aggregate the credit-card balance table to one row per SK_ID_CURR.

    Every column left after one-hot encoding and dropping SK_ID_PREV is
    summarised with max, mean, sum and var; columns are named
    ``CC_<column>_<STATISTIC>``. ``CC_COUNT`` holds the number of rows per
    client. The returned frame is indexed by SK_ID_CURR.
    """
    encoded, _ = one_hot_encoder(cc)
    encoded = encoded.drop(columns=['SK_ID_PREV'])

    by_client = encoded.groupby('SK_ID_CURR')

    # General aggregations over every remaining column.
    summary = by_client.agg(['max', 'mean', 'sum', 'var'])
    summary.columns = pd.Index(
        [f'CC_{column}_{stat.upper()}' for column, stat in summary.columns.tolist()]
    )

    # Count credit card lines
    summary['CC_COUNT'] = by_client.size()

    return summary

In [8]:
def pos_cash_preprocessing(pos):
    """Aggregate the POS/cash balance table to one row per SK_ID_CURR.

    MONTHS_BALANCE, SK_DPD and SK_DPD_DEF get the statistics listed below and
    every one-hot indicator column is averaged. Columns are named
    ``POS_<column>_<STATISTIC>``; ``POS_COUNT`` is the number of rows per
    client. The returned frame is indexed by SK_ID_CURR.
    """
    encoded, dummy_columns = one_hot_encoder(pos)

    spec = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    spec.update({dummy: ['mean'] for dummy in dummy_columns})

    by_client = encoded.groupby('SK_ID_CURR')
    summary = by_client.agg(spec)
    summary.columns = pd.Index(
        [f'POS_{column}_{stat.upper()}' for column, stat in summary.columns.tolist()]
    )

    # Count pos cash accounts
    summary['POS_COUNT'] = by_client.size()

    return summary

In [9]:
def installments_payments_preprocessing(ins):
    """Aggregate the installments-payments table to one row per SK_ID_CURR.

    Derived per-row columns:
      * PAYMENT_PERC - AMT_PAYMENT / AMT_INSTALMENT
      * PAYMENT_DIFF - AMT_INSTALMENT - AMT_PAYMENT
      * DPD - DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT, floored at 0
      * DBD - DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT, floored at 0

    Returns a frame indexed by SK_ID_CURR whose columns are named
    ``INSTAL_<column>_<STATISTIC>``, plus ``INSTAL_COUNT`` (rows per client).
    """
    ins, cat_cols = one_hot_encoder(ins)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values).
    # Vectorised replacement for a per-element Python lambda: values that are
    # not > 0 (negative, zero and NaN alike) become 0. The difference is cast
    # to float64 so the aggregations below do not run in a downcast dtype.
    days_late = (ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']).astype('float64')
    days_early = (ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']).astype('float64')
    ins['DPD'] = days_late.where(days_late > 0, 0.0)
    ins['DBD'] = days_early.where(days_early > 0, 0.0)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum','min','std' ],
        'DBD': ['max', 'mean', 'sum','min','std'],
        'PAYMENT_PERC': [ 'max','mean',  'var','min','std'],
        'PAYMENT_DIFF': [ 'max','mean', 'var','min','std'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum','min','std'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum','std'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum','std']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    by_client = ins.groupby('SK_ID_CURR')
    ins_agg = by_client.agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = by_client.size()

    return ins_agg

# Code for Reducing Memory Usage

In [10]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 and float32, falling back to float64. A dtype is
    accepted only when the column's min and max lie strictly inside its range.
    Columns whose dtype is not in the numeric list are left untouched.

    NOTE(review): float16 keeps roughly three significant decimal digits, so
    float columns that fit its range lose precision - confirm this is
    acceptable for the features fed to the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory usage after optimisation and the
        percentage saved. When False, print nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First integer type whose open range contains the column wins;
            # if none does, the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Importing Required Files

In [11]:
# import data files
def _load_table(file_name, label):
    """Read one competition CSV, downcast its numeric columns and print its shape."""
    table = reduce_mem_usage(pd.read_csv('../input/home-credit-default-risk/' + file_name))
    print(label + ': ', table.shape)
    return table


train = _load_table('application_train.csv', 'train')

test = _load_table('application_test.csv', 'test')

previous_application = _load_table('previous_application.csv', 'previous_application')

bureau_balance = _load_table('bureau_balance.csv', 'bureau_balance')

bureau = _load_table('bureau.csv', 'bureau')

credit_card_balance = _load_table('credit_card_balance.csv', 'credit_card_balance')

POS_CASH_balance = _load_table('POS_CASH_balance.csv', 'POS_CASH_balance')

installments_payments = _load_table('installments_payments.csv', 'installments_payments')

# Data Processing using Functions

In [12]:
# data preprocessing for every dataset
# Each auxiliary table is reduced to one aggregate row per SK_ID_CURR so it can
# be joined onto the application tables in the next cell.
# NOTE: application_preprocessing modifies `train` and `test` in place, and the
# names previous_application, credit_card_balance and installments_payments are
# rebound here from the raw tables to their aggregates - reload the data before
# re-running this cell.
train, test = application_preprocessing(train, test)
previous_application = previous_application_preprocessing(previous_application)
bureau_and_balance = bureau_and_balance_preprocessing(bureau_balance, bureau)
credit_card_balance = credit_card_balance_preprocessing(credit_card_balance)
pos_cash = pos_cash_preprocessing(POS_CASH_balance)
installments_payments = installments_payments_preprocessing(installments_payments)

In [13]:
## merge dataset using ID
def _join_aggregates(base, aggregates):
    """Left-join each aggregate table onto ``base`` via its SK_ID_CURR column, in order."""
    merged = base
    for aggregate in aggregates:
        merged = merged.join(aggregate, how='left', on='SK_ID_CURR')
    return merged


# The same five aggregate tables, in the same order, go onto both frames.
df_train, df_test = (
    _join_aggregates(
        application,
        (bureau_and_balance, previous_application, pos_cash, installments_payments, credit_card_balance),
    )
    for application in (train, test)
)

In [14]:
# Downcast the merged frames, then turn the infinities produced by the ratio
# features (division by zero) into missing values.
df_train, df_test = reduce_mem_usage(df_train), reduce_mem_usage(df_test)

for merged_frame in (df_train, df_test):
    merged_frame.replace([np.inf, -np.inf], np.nan, inplace=True)

In [15]:
import gc

# Release every table that has already been merged into df_train / df_test.
# `bureau`, `bureau_and_balance` and `pos_cash` were previously left alive even
# though nothing below uses them, so their memory was never returned.
del (train, test, previous_application, bureau_balance, bureau, bureau_and_balance,
     credit_card_balance, POS_CASH_balance, pos_cash, installments_payments)

gc.collect()

# Train Test Split

In [16]:
# Separate the label from the features; the test frame is copied as-is.
y_train = df_train['TARGET']
X_train = df_train.drop(columns='TARGET')

X_test = df_test.copy()

In [17]:
import re

# Any run of characters other than ASCII letters, digits and underscore.
_UNSAFE_NAME_CHARS = re.compile('[^A-Za-z0-9_]+')


def _sanitize_column_name(name):
    """Strip every character that is not a letter, digit or underscore."""
    return _UNSAFE_NAME_CHARS.sub('', name)


X_train = X_train.rename(columns=_sanitize_column_name)
X_test = X_test.rename(columns=_sanitize_column_name)

In [18]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

In [19]:
# Class balance: number of rows per TARGET value.
df_train.TARGET.value_counts()

In [20]:
# Ratio of TARGET == 0 rows to TARGET == 1 rows, computed from the data rather
# than from hard-coded counts so it stays correct if the input changes.
target_counts = df_train['TARGET'].value_counts()
print(target_counts.loc[0] / target_counts.loc[1])

# XGBoost Model

In [21]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

### Tuning using Bayesian Optimization

In [49]:
def xgb_evaluation(max_depth, min_child_weight, gamma, subsample, colsample_bytree, colsample_bylevel,
                   colsample_bynode, reg_alpha, reg_lambda, scale_pos_weight):
    '''
    Objective function for Bayesian Optimization of XGBoost's hyperparameters. Takes the hyperparameters
    as input, and returns the Cross-Validation AUC as output.

    Reads the module-level X_train and y_train.

    Inputs: Hyperparameters to be tuned.
        max_depth, min_child_weight, gamma, subsample, colsample_bytree, colsample_bylevel,
        colsample_bynode, reg_alpha, reg_lambda, scale_pos_weight

        max_depth, min_child_weight and scale_pos_weight are rounded to the nearest integer
        before being passed to XGBoost; the others are used as given.

    Returns:
        CV ROC-AUC Score computed on the out-of-fold predictions of a 3-fold stratified split.
    '''
    params = {
        'learning_rate' : 0.01,
        'n_estimators' : 10000,
        'tree_method' : 'gpu_hist',
        'gpu_id' : 0,
        'max_depth' : int(round(max_depth)),
        'min_child_weight' : int(round(min_child_weight)),
        'subsample': subsample,
        'gamma' : gamma,
        'colsample_bytree' : colsample_bytree,
        'colsample_bylevel' : colsample_bylevel,
        'colsample_bynode' : colsample_bynode,
        'reg_alpha' : reg_alpha,
        'reg_lambda' : reg_lambda,
        'scale_pos_weight' : int(round(scale_pos_weight)),
        'random_state' : 51412,
        # Configured on the estimator, as the final model in this notebook does.
        # Passing these to fit() is no longer supported by XGBoost 2.0+.
        'eval_metric' : 'auc',
        'early_stopping_rounds' : 200
    }

    #defining the Cross-Validation Strategy
    stratified_cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 33)
    cv_preds = np.zeros(X_train.shape[0])

    #iterating over each fold, training the model, and making Out of Fold Predictions
    for train_indices, cv_indices in stratified_cv.split(X_train, y_train):

        x_tr = X_train.iloc[train_indices]
        y_tr = y_train.iloc[train_indices]
        x_cv = X_train.iloc[cv_indices]
        y_cv = y_train.iloc[cv_indices]

        # NOTE(review): the held-out fold is used both for early stopping and
        # for scoring, so the returned AUC is slightly optimistic.
        xgbc = XGBClassifier(**params)
        xgbc.fit(x_tr, y_tr, eval_set = [(x_cv, y_cv)], verbose = False)

        # Predict with the trees up to the best early-stopping iteration.
        # iteration_range replaces the removed ntree_limit / best_ntree_limit.
        cv_preds[cv_indices] = xgbc.predict_proba(
            x_cv, iteration_range = (0, xgbc.best_iteration + 1))[:,1]
        gc.collect()

    return roc_auc_score(y_train, cv_preds)
  
# Search for the best hyperparameters with the objective function defined above.
# Each entry maps a hyperparameter name to the (lower, upper) range to explore.
xgb_search_space = {
    'max_depth' : (5,15),
    'min_child_weight' : (5,80),
    'gamma' : (0.2,1),
    'subsample' : (0.5,1),
    'colsample_bytree' : (0.5,1),
    'colsample_bylevel' : (0.3,1),
    'colsample_bynode' : (0.3,1),
    'reg_alpha' : (0.001, 0.3),
    'reg_lambda' : (0.001, 0.3),
    'scale_pos_weight' : (1,11),
}

bopt_xgb = BayesianOptimization(xgb_evaluation, xgb_search_space, random_state = 55)

# 5 initial points, then 10 optimisation iterations.
bopt_xgb.maximize(init_points = 5, n_iter = 10)

In [25]:
# Best score found by the optimiser and the parameters that produced it.
best_run = bopt_xgb.max
print('AUC: ', best_run['target'])
print('parameters: ', best_run['params'])

In [26]:
# Convert the tuned values to the form the objective actually trained with:
# xgb_evaluation rounds all three of these parameters to integers before
# fitting, so scale_pos_weight has to be rounded here as well.
max_params = bopt_xgb.max['params']
for int_param in ('max_depth', 'min_child_weight', 'scale_pos_weight'):
    max_params[int_param] = int(round(max_params[int_param]))

### Best XGBoost Model

In [37]:
# Final model. Only the XGBClassifier name is imported in this notebook
# (`from xgboost import XGBClassifier`); the `xgboost` module itself is never
# imported, so the class must be referenced directly.
# NOTE(review): the values below are hard-coded rather than read from
# `max_params`, and scale_pos_weight is not set although it is part of the
# search space - confirm both are intentional.
clf_xgb = XGBClassifier(
 n_estimators = 10000,
 learning_rate = 0.01,
 colsample_bylevel = 0.916,
 colsample_bynode = 0.5289,
 colsample_bytree = 0.52050,
 gamma = 0.85416,
 max_depth = 6,
 min_child_weight = 67,
 reg_alpha = 0.24312,
 reg_lambda = 0.104283,
 subsample = 0.854,
 tree_method = 'gpu_hist',
 gpu_id = 0,
 eval_metric='auc',
 early_stopping_rounds=200,
 random_state = 51412)

In [38]:
# clf_xgb is configured with early_stopping_rounds, which requires at least one
# validation set in fit(); without eval_set XGBoost refuses to train. Hold out
# one stratified fold (20% of the rows) to monitor AUC and stop on it.
fit_indices, valid_indices = next(
    StratifiedKFold(n_splits = 5, shuffle = True, random_state = 33).split(X_train, y_train))

clf_xgb.fit(X_train.iloc[fit_indices], y_train.iloc[fit_indices],
            eval_set = [(X_train.iloc[valid_indices], y_train.iloc[valid_indices])],
            verbose = False)

### Making Predictions

In [39]:
# Hard class labels for the training and the test feature matrices.
y_train_predict, y_test_predict = (
    clf_xgb.predict(features) for features in (X_train, X_test)
)

In [41]:
# Column 1 of predict_proba: the predicted probability of the second class
# (TARGET == 1 for a binary 0/1 label) for every test row.
y_test_pred=clf_xgb.predict_proba(X_test)[:,1]

### Preparing Submission File

In [50]:
def _floor_at_zero(value):
    """Return 0 for a negative value and the value itself otherwise."""
    return 0 if value < 0 else value


# Fill the sample submission with the predicted probabilities. Values are
# assigned by position, so the row order must match the test frame.
submission = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')
submission['TARGET'] = y_test_pred
submission['TARGET'] = submission['TARGET'].apply(_floor_at_zero)
submission.to_csv('submission_xgb.csv', index=False)
submission.head()