# Home Credit Default Risk


#     Baseline and Improvement

<!-- https://blog.ml.cmu.edu/2020/08/31/3-baselines/-->
<div> <img src="./image/baseline.jpg" alt="Drawing" style="width: 650px;"/></div>

<!--![](./image/baseline.jpg) -->

# Creating Baseline

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import gc
import os 

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
###################################################################################
# The `reduce_mem_usage()` function is commonly used in Kaggle competitions to 
# reduce the memory usage of a pandas DataFrame. It optimizes the data types of 
# the DataFrame columns to occupy less memory without losing significant information. 
# This helps in handling large datasets efficiently, improving performance and 
# avoiding memory limitations.
###################################################################################

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


In [3]:
####################################################
# train    = 'application_train.csv'
# used in: 
# 1) baseline
# 2) feature engineering 1

#pos_cash = 'POS_CASH_balance.csv' 
# used in: 
# feature engineering 2

# test     = 'application_test.csv'
# used for: 
# kaggle submission
####################################################

dir_ = './data'

train    = 'application_train.csv'
test     = 'application_test.csv'
pos_cash = 'POS_CASH_balance.csv' 

application_train = pd.read_csv(os.path.join(dir_, train))
application_train = reduce_mem_usage(application_train)
print(application_train.shape)
display(application_train.sample(10))


Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%
(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
277817,421871,0,Cash loans,F,N,Y,1,90000.0,315000.0,20259.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
299164,446578,1,Cash loans,F,N,Y,0,135000.0,232434.0,10368.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
179485,307989,0,Cash loans,F,N,Y,0,157500.0,263686.5,16258.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,6.0
306292,454871,0,Revolving loans,F,N,Y,0,180000.0,180000.0,9000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
214591,348660,0,Cash loans,M,N,Y,0,144000.0,276277.5,15115.5,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0
173697,301298,0,Cash loans,F,Y,Y,2,225000.0,510853.5,43974.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
250182,389467,0,Cash loans,F,Y,Y,0,112500.0,263686.5,26208.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
84861,198458,0,Cash loans,F,N,Y,1,360000.0,1288350.0,37800.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
156020,280863,0,Cash loans,F,N,Y,0,135000.0,450000.0,21888.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
284812,429848,0,Cash loans,F,N,Y,0,49500.0,85320.0,4891.5,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0


## Create dataset


In [4]:
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

print(application_train.shape)
print(x_train.shape)
print(y_train.shape)

(307511, 122)
(307511, 120)
(307511,)


#### Convert category datatype

In [5]:
print (x_train.info())

for col in x_train.columns:
    if (x_train[col].dtype=="0") or (x_train[col].dtype=="object"):
        x_train[col] = x_train[col].astype("category")
print (x_train.info())

x_train.iloc[:,:10].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 120 entries, NAME_CONTRACT_TYPE to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float16(61), float32(4), int16(2), int32(1), int8(36), object(16)
memory usage: 90.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 120 entries, NAME_CONTRACT_TYPE to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: category(16), float16(61), float32(4), int16(2), int32(1), int8(36)
memory usage: 58.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   NAME_CONTRACT_TYPE  307511 non-null  category
 1   CODE_GENDER         307511 non-null  category
 2   FLAG_OWN_CAR        307511 non-null  category
 3   FLAG_OWN_REALTY     307511 non-null  category
 4   CNT_CHILDREN        307511 non-null  int8    
 5   AMT_INCOME_TOTAL    3075

## Validation design
#### 0 vs 1 target ratio 

In [6]:
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


0    282686
1     24825
Name: TARGET, dtype: int64

In [7]:
print(x_train.shape)
x_train.sample(10)

(307511, 120)


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
108206,Cash loans,M,N,Y,0,157500.0,2025000.0,53419.5,2025000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
270997,Cash loans,F,N,Y,0,103500.0,405000.0,22099.5,405000.0,Unaccompanied,...,0,0,0,0,0.0,1.0,0.0,0.0,0.0,1.0
56109,Cash loans,F,N,N,3,112500.0,187704.0,12672.0,148500.0,Unaccompanied,...,0,0,0,0,,,,,,
300563,Cash loans,F,Y,Y,3,135000.0,723996.0,34960.5,585000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
14384,Cash loans,F,Y,Y,0,171000.0,265851.0,21442.5,229500.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,2.0
56048,Cash loans,M,N,N,0,72000.0,295668.0,11277.0,193500.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,5.0,0.0,2.0
180459,Cash loans,M,N,N,2,99000.0,263686.5,15268.5,238500.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
32394,Cash loans,F,Y,Y,1,112500.0,545040.0,25537.5,450000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
296424,Cash loans,F,Y,N,0,108000.0,454500.0,27193.5,454500.0,Family,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
196042,Cash loans,M,N,Y,0,202500.0,695439.0,27076.5,580500.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# https://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation

<!-- https://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation  -->
### Cross Validation
Cross Validation: Splits the data into k "random" folds

<div> <img src="./image/CrossValidation.png" alt="Drawing" style="width: 450px;"/></div>

### Stratified Cross Validation  --> **This project use this!!**
Stratified Cross Valiadtion: Splits the data into k folds, making sure each fold is an appropriate representative of the original data. (class distribution, mean, variance, etc)


Stratified merit-based approach addresses imbalanced data by considering the relative importance or performance of different classes, ensuring proportional representation during sampling or modeling.
<div> <img src="./image/StratifiedCrossValidation.png" alt="Drawing" style="width: 600px;"/></div>

## 7.3.5 Training the model
#### Script 7-9: Split train and validation dataset

In [9]:
# Create index list for each fold
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# For example index list on fold 0
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# Split train and validation dataset
x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


In [10]:
# https://machinelearningmastery.com/tour-of-evaluation-metrics-for-imbalanced-classification/

<!-- https://machinelearningmastery.com/tour-of-evaluation-metrics-for-imbalanced-classification/ -->
<div> <img src="./image/ImbalancedData_AUC_Jason.png" alt="Drawing" style="width: 650px;"/></div>

#### Model training

In [11]:
params = {
    'metric': 'auc',
    "random_state": 123,
}
# Training the model
model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          early_stopping_rounds=100,
          verbose=100
         )

#model.fit(x_tr, y_tr)
#results = cross_val_score(model, x_va, y_va, cv = kfold) 

# Save the model
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

[100]	training's auc: 0.805563	valid_1's auc: 0.75705


#### Model Valuation

<div> <img src="./image/ChatGPT_Predict_Proba.png" alt="Drawing" style="width: 650px;"/></div>

In [12]:
# AUC on training data
y_tr_pred = model.predict_proba(x_tr)[:,1]
metric_tr = roc_auc_score(y_tr, y_tr_pred)

# AUC on validation data
y_va_pred = model.predict_proba(x_va)[:,1]
metric_va = roc_auc_score(y_va, y_va_pred)

metrics = []
metrics.append([nfold, metric_tr, metric_va])

# Results
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

# [auc] tr:0.8126, va:0.7586

[auc] tr:0.8056, va:0.7571


#### Get predicted Out Of Fold (OOF)

In [13]:
train_oof = np.zeros(len(x_train))
train_oof[idx_va] = y_va_pred

#### Get importance of explanatory variables

In [14]:
# Feature importances
imp_fold = pd.DataFrame({"col":x_train.columns, "imp":model.feature_importances_, "nfold":nfold})
# Sanity check (Top 10 feature importance）
display(imp_fold.sort_values("imp", ascending=False)[:10])

imp = pd.DataFrame()
imp = pd.concat([imp, imp_fold])

Unnamed: 0,col,imp,nfold
38,ORGANIZATION_TYPE,432,0
39,EXT_SOURCE_1,233,0
41,EXT_SOURCE_3,216,0
40,EXT_SOURCE_2,187,0
6,AMT_CREDIT,148,0
7,AMT_ANNUITY,142,0
15,DAYS_BIRTH,133,0
8,AMT_GOODS_PRICE,108,0
18,DAYS_ID_PUBLISH,100,0
16,DAYS_EMPLOYED,88,0


#### Model validation (summary of all folds)

In [15]:
metrics = np.array(metrics)
print(metrics)

print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    metrics[:,1].mean(), metrics[:,1].std(),
    metrics[:,2].mean(), metrics[:,2].std(),
))

# Calculate the OOF value
print("[oof] {:.4f}".format(
    roc_auc_score(y_train, train_oof)
))

[[0.         0.80556334 0.75705038]]
[cv] tr:0.8056+-0.0000, va:0.7571+-0.0000
[oof] 0.5103


#### Get predicted OOF (summary of all folds)

In [16]:
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031197
3,100006,0,0.0
4,100007,0,0.0


#### Get feature importance  (summary of all folds)

In [17]:
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
imp.columns = ["col", "imp", "imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,142.0,
1,AMT_CREDIT,148.0,
2,AMT_GOODS_PRICE,108.0,
3,AMT_INCOME_TOTAL,52.0,
4,AMT_REQ_CREDIT_BUREAU_DAY,7.0,


#### Define training function

In [18]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        
        # make dataset
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y[idx_tr], input_id.loc[idx_tr, :]
        x_va, y_va, id_va = input_x.loc[idx_va, :], input_y[idx_va], input_id.loc[idx_va, :]
        print(x_tr.shape, x_va.shape)
        
        # train
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  early_stopping_rounds=100,
                  verbose=100
                 )
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)
        
        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))
        
        # oof
        train_oof[idx_va] = y_va_pred
        
        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])
      
    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    print("[oof] {:.4f}".format(
        roc_auc_score(input_y, train_oof)
    ))
    
    # oof
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof})
    ], axis=1)
    
    # importance
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]
    
    return train_oof, imp, metrics

In [19]:
x_train.shape

(307511, 120)

#### Run training process

In [20]:
params = {
    'metric': 'auc',
    "random_state": 123,
}

train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[100]	training's auc: 0.805563	valid_1's auc: 0.75705
[auc] tr:0.8056, va:0.7571
-------------------- 1 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.807226	valid_1's auc: 0.757708
[auc] tr:0.8072, va:0.7577
-------------------- 2 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.806607	valid_1's auc: 0.759367
[auc] tr:0.8066, va:0.7594
-------------------- 3 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.807561	valid_1's auc: 0.754912
[auc] tr:0.8076, va:0.7549
-------------------- 4 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.808651	valid_1's auc: 0.750329
[auc] tr:0.8087, va:0.7503
-------------------- result --------------------
[[0.         0.80556334 0.75705038]
 [1.         0.80722559 0.75770765]
 [2.         0.80660738 0.75936687]
 [3.         0.80756082 0.75491247]
 [4.         0.80865059 0.75032941]]
[cv] t

#### Checking the feature importance 

In [21]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
102,ORGANIZATION_TYPE,423.2,11.322544
36,EXT_SOURCE_1,235.8,8.526429
38,EXT_SOURCE_3,219.2,2.588436
37,EXT_SOURCE_2,185.2,8.497058
1,AMT_CREDIT,136.4,9.964939
22,DAYS_BIRTH,130.0,7.745967
0,AMT_ANNUITY,123.2,13.00769
24,DAYS_ID_PUBLISH,109.6,7.668116
2,AMT_GOODS_PRICE,106.6,5.458938
23,DAYS_EMPLOYED,90.4,6.730527


# Model Prediction
#### ... on test.csv dataset

In [22]:
application_test = pd.read_csv(os.path.join(dir_, test))
application_test = reduce_mem_usage(application_test)
print(application_test.shape)
display(application_test.sample(10))

x_test = application_test.drop(columns=["SK_ID_CURR" ])
id_test = application_test[["SK_ID_CURR"]]

# Change categorical type as category
for col in x_test.columns:
    if (x_test[col].dtype=="0") or (x_test[col].dtype=="object"):
        x_test[col] = x_test[col].astype("category")
        

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%
(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
48285,452759,Cash loans,F,Y,N,1,405000.0,500490.0,48888.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
44490,425338,Cash loans,F,N,Y,1,112500.0,481176.0,24696.0,360000.0,...,0,0,0,0,,,,,,
30197,319602,Cash loans,F,N,Y,0,319500.0,757606.5,28692.0,625500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
46852,442328,Cash loans,F,N,N,1,315000.0,177768.0,9076.5,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
9316,167898,Cash loans,F,N,N,0,157500.0,49752.0,5224.5,45000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
24229,276550,Revolving loans,M,Y,Y,1,135000.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
7069,151678,Cash loans,F,N,Y,0,112500.0,815733.0,29430.0,688500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
47857,449760,Cash loans,M,Y,N,0,135000.0,431280.0,22149.0,360000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
47957,450428,Cash loans,F,Y,N,0,360000.0,450000.0,35685.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
33333,343080,Cash loans,M,N,Y,0,225000.0,573408.0,31234.5,495000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### Read trainded model

In [23]:
with open("model_lgb_fold0.pickle", "rb") as f:
    model = pickle.load(f)

#### Prediction

In [24]:
test_pred_fold = model.predict_proba(x_test)[:,1]
test_pred = np.zeros((len(x_test), 5))

# Prediction on first fold 
test_pred[:, 0] = test_pred_fold

In [25]:
# Calculate the average prediction for each fold
test_pred_mean = test_pred.mean(axis=1)

df_test_pred = pd.concat([
        id_test,
        pd.DataFrame({"pred": test_pred_mean}),
    ], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006279
1,100005,0.025689
2,100013,0.004323
3,100028,0.008313
4,100038,0.030235


In [26]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    pred = np.zeros((len(input_x), len(list_nfold)))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, nfold] = model.predict_proba(input_x)[:,1]
    
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}),
    ], axis=1)
    
    print("Done.")
    
    return pred

#### Run prediction

In [27]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### Create kaggle submission file

In [28]:
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())

df_submit.to_csv("a0_submission_baseline.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.041861
1,100005,0.129015
2,100013,0.02404
3,100028,0.039112
4,100038,0.154073


# Feature Engineering
## Feature Engineering #1 added combination features

<!-- https://blog.ml.cmu.edu/2020/08/31/3-baselines/-->
<div> <img src="./image/CombineColumns.png" alt="Drawing" style="width: 350px;"/></div>



In [29]:
display(application_train["DAYS_EMPLOYED"].value_counts())
print("Ratio of postive value : {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).mean()))
print("Number of postive value: {}".format((application_train["DAYS_EMPLOYED"]>0).sum()))


 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-13961         1
-11827         1
-10176         1
-9459          1
-8694          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

Ratio of postive value : 0.1801
Number of postive value: 55374


#### Dealing with missing values（converted 365243 to null）

In [30]:
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

#### Hypothesis-based feature generation

In [31]:
# Feature combination

# Feature 1: INCOME_div_PERSON
application_train['INCOME_div_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']

# Feature 2: INCOME_div_EMPLOYED
application_train['INCOME_div_EMPLOYED'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_EMPLOYED']

# Feature 3: Add stats as mean, max, etc
application_train["EXT_SOURCE_mean"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_max"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_train["EXT_SOURCE_min"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_train["EXT_SOURCE_std"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_train["EXT_SOURCE_count"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)

# Feature 4: DAYS_EMPLOYED_div_BIRTH
application_train['DAYS_EMPLOYED_div_BIRTH'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']

# Feature 5: ANNUITY_div_INCOME
application_train['ANNUITY_div_INCOME'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']

# Feature 6: ANNUITY_div_CREDIT
application_train['ANNUITY_div_CREDIT'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

In [32]:
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if (x_train[col].dtype=="0") or (x_train[col].dtype=="object"):
        x_train[col] = x_train[col].astype("category")

In [33]:
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 130) (61503, 130)
[100]	training's auc: 0.813555	valid_1's auc: 0.762234
[auc] tr:0.8136, va:0.7622
-------------------- 1 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.814392	valid_1's auc: 0.765774
[auc] tr:0.8144, va:0.7658
-------------------- 2 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.812038	valid_1's auc: 0.766051
[auc] tr:0.8120, va:0.7661
-------------------- 3 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.814433	valid_1's auc: 0.761373
[auc] tr:0.8144, va:0.7614
-------------------- 4 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.814147	valid_1's auc: 0.757861
[auc] tr:0.8141, va:0.7579
-------------------- result --------------------
[[0.         0.81355544 0.76223382]
 [1.         0.81439179 0.76577447]
 [2.         0.81203825 0.76605053]
 [3.         0.81443332 0.76137314]
 [4.         0.81414662 0.75786134]]
[cv] 

In [34]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
112,ORGANIZATION_TYPE,381.4,19.603571
10,ANNUITY_div_CREDIT,308.2,14.307341
41,EXT_SOURCE_3,129.0,5.744563
24,DAYS_BIRTH,116.2,8.786353
39,EXT_SOURCE_1,108.8,7.049823
44,EXT_SOURCE_mean,108.0,6.78233
27,DAYS_ID_PUBLISH,85.0,7.314369
0,AMT_ANNUITY,81.6,9.838699
2,AMT_GOODS_PRICE,79.0,5.385165
1,AMT_CREDIT,70.4,5.22494


#### Create test dataset

In [35]:

application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)


application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)
application_test['DAYS_EMPLOYED_div_BIRTH'] = application_test['DAYS_EMPLOYED'] / application_test['DAYS_BIRTH']
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']
application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']


x_test = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]


for col in x_test.columns:
    if (x_test[col].dtype=="0") or (x_test[col].dtype=="object"):
        x_test[col] = x_test[col].astype("category")


In [36]:
print(x_train.shape)
print(x_test.shape)

(307511, 130)
(48744, 130)


In [37]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [38]:
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("a1_submission_FeatureEngineering1.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04262
1,100005,0.114618
2,100013,0.021747
3,100028,0.043205
4,100038,0.180968


## Feature engineering #2:  added POS_CASH_balance.csv

<!-- https://blog.ml.cmu.edu/2020/08/31/3-baselines/-->
<div> <img src="./image/combine.png" alt="Drawing" style="width: 300px;"/></div>

In [39]:
pos = pd.read_csv(os.path.join(dir_, pos_cash))
pos = reduce_mem_usage(pos)
print(pos.shape)
display(pos.sample(10))

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 238.45 MB
Decreased by 60.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
6986571,1735553,320625,-50,12.0,1.0,Active,0,0
415259,2717377,425659,-71,12.0,12.0,Active,0,0
8821976,2046141,218617,-30,24.0,19.0,Active,0,0
1917441,2538519,359101,-10,12.0,12.0,Active,0,0
9613022,2234123,275760,-1,10.0,4.0,Active,0,0
9369691,1468460,129932,-7,12.0,3.0,Active,0,0
5455205,1365779,200649,-35,24.0,22.0,Active,0,0
7931271,1726434,438305,-6,36.0,34.0,Active,0,0
3385416,2083374,158346,-39,1.0,0.0,Active,0,0
4368239,2508258,400655,-43,6.0,5.0,Active,0,0


#### Convert categorical variables to numeric with one-hot-encoding

In [40]:
# https://medium.com/hackernoon/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f

<!-- https://medium.com/hackernoon/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f -->
<div> <img src="./image/Onehot_1.png" alt="Drawing" style="width: 650px;"/></div>
<div> <img src="./image/Onehot_2.png" alt="Drawing" style="width: 650px;"/></div>

In [41]:
pos_ohe = pd.get_dummies(pos, columns=["NAME_CONTRACT_STATUS"], dummy_na=True)
col_ohe = sorted(list(set(pos_ohe.columns) - set(pos.columns)))
print(len(col_ohe))
col_ohe

10


['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_nan']

#### adding more statistics columns based on  SK_ID_CURR using groupby

In [42]:
pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(
    {
        
        "MONTHS_BALANCE": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT_FUTURE": ["mean", "std", "min", "max"],
        "SK_DPD": ["mean", "std", "min", "max"],
        "SK_DPD_DEF": ["mean", "std", "min", "max"],
        
        "NAME_CONTRACT_STATUS_Active": ["mean"],
        "NAME_CONTRACT_STATUS_Amortized debt": ["mean"],
        "NAME_CONTRACT_STATUS_Approved": ["mean"],
        "NAME_CONTRACT_STATUS_Canceled": ["mean"],
        "NAME_CONTRACT_STATUS_Completed": ["mean"],
        "NAME_CONTRACT_STATUS_Demand": ["mean"],
        "NAME_CONTRACT_STATUS_Returned to the store": ["mean"],
        "NAME_CONTRACT_STATUS_Signed": ["mean"],
        "NAME_CONTRACT_STATUS_XNA": ["mean"],
        "NAME_CONTRACT_STATUS_nan": ["mean"],
        
        "SK_ID_PREV":["count", "nunique"],
    }
)


pos_ohe_agg.columns = [i + "_" + j for i,j in pos_ohe_agg.columns]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

(337252, 33)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_std,MONTHS_BALANCE_min,MONTHS_BALANCE_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_std,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_FUTURE_mean,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100001,-72.555556,20.863312,-96,-53,4.0,0.0,4.0,4.0,1.444336,...,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,9,2
1,100002,-10.0,5.627314,-19,-1,24.0,0.0,24.0,24.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1
2,100003,-43.785714,24.640162,-77,-18,10.109375,2.806597,6.0,12.0,5.785156,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28,3
3,100004,-25.5,1.290994,-27,-24,3.75,0.5,3.0,4.0,2.25,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4,1
4,100005,-20.0,3.316625,-25,-15,11.703125,0.948683,9.0,12.0,7.199219,...,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,11,1


#### Merge train and pos files using SK_ID_CURR as a key

In [43]:
df_train = pd.merge(application_train, pos_ohe_agg, on="SK_ID_CURR", how="left")
print(df_train.shape)
df_train.head()

(307511, 164)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


In [44]:
# Save merged train dataset

merged_train = 'merged_train.csv'
df_train.to_csv(os.path.join(dir_, merged_train))

In [45]:
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if (x_train[col].dtype=="0") or (x_train[col].dtype=="object"):
        x_train[col] = x_train[col].astype("category")

In [46]:
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.821034	valid_1's auc: 0.769234
[auc] tr:0.8210, va:0.7692
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.820685	valid_1's auc: 0.77298
[auc] tr:0.8207, va:0.7730
-------------------- 2 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.821858	valid_1's auc: 0.771818
[auc] tr:0.8219, va:0.7718
-------------------- 3 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.821348	valid_1's auc: 0.769572
[auc] tr:0.8213, va:0.7696
-------------------- 4 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.822686	valid_1's auc: 0.763546
[auc] tr:0.8227, va:0.7635
-------------------- result --------------------
[[0.         0.82103437 0.76923391]
 [1.         0.82068489 0.77297952]
 [2.         0.82185786 0.77181802]
 [3.         0.82134843 0.76957235]
 [4.         0.8226858  0.76354639]]
[cv] t

In [47]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
134,ORGANIZATION_TYPE,359.2,8.01249
10,ANNUITY_div_CREDIT,222.6,2.966479
49,EXT_SOURCE_3,97.4,7.300685
32,DAYS_BIRTH,94.6,11.081516
52,EXT_SOURCE_mean,91.6,4.09878
47,EXT_SOURCE_1,87.2,7.52994
21,CNT_INSTALMENT_FUTURE_mean,84.0,9.77241
35,DAYS_ID_PUBLISH,69.2,2.774887
0,AMT_ANNUITY,63.6,4.827007
23,CNT_INSTALMENT_FUTURE_std,63.2,4.868265


In [61]:
df_test = pd.merge(application_test, pos_ohe_agg, on="SK_ID_CURR", how="left")

# Save merged test dataset

merged_test = 'merged_test.csv'
df_test.to_csv(os.path.join(dir_, merged_test))

In [48]:
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

for col in x_test.columns:
    if (x_test[col].dtype=="0") or (x_test[col].dtype=="object"):
        x_test[col] = x_test[col].astype("category")

In [49]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [50]:
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("a2_submission_FeatureEngineering2.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.037767
1,100005,0.10573
2,100013,0.03026
3,100028,0.041845
4,100038,0.216325


# Light GBM model hyperparameter tuning


In [51]:
col_filter = sorted(list(imp.sort_values("imp", ascending=False)[:100]["col"]))
# col_filter

## Run hyperparameter optimization with optuna

In [52]:
import optuna


In [53]:
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if (x_train[col].dtype=="0") or (x_train[col].dtype=="object"):
        x_train[col] = x_train[col].astype("category")

#### Define the objective function

In [54]:
# Hyperparameters -- constants 
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "bagging_freq": 1,
    "random_state": 123,
}

# Define the objective function
def objective(trial):
    # Hyperparameters -- variables 
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    params_tuning.update(params_base)
    
    # Model training and validation 
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold = [0]  # To speed up the process, only the first fold will be used
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr, :], y_train[idx_tr]
        x_va, y_va = x_train.loc[idx_va, :], y_train[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  early_stopping_rounds=100,
                  verbose=0,
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred) # Set the evaluation metric to AUC (Area Under the Curve)
        list_metrics.append(metric_va)
    
    # Average of the valuation metrics
    metrics = np.mean(list_metrics)
    
    return metrics

#### Optimization Process 

-Important Note
Even if the seed is fixed in Optuna, there is no guarantee of reproducibility in the search results. 
Therefore, the results may differ each run.
If you require reproducibility, please set parallelization to none (n_jobs=1).

In [55]:
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=5)

[32m[I 2023-05-28 15:33:48,342][0m A new study created in memory with name: no-name-5cd39a84-a92b-4739-ba15-a2ed68c1d149[0m
[32m[I 2023-05-28 15:34:49,084][0m Trial 0 finished with value: 0.7726843253155904 and parameters: {'num_leaves': 72, 'min_child_samples': 32, 'min_sum_hessian_in_leaf': 0.004582436023061751, 'feature_fraction': 0.7409252925900391, 'bagging_fraction': 0.9932767945167148, 'lambda_l1': 4.509990484361508, 'lambda_l2': 0.2304137805098549}. Best is trial 0 with value: 0.7726843253155904.[0m
[32m[I 2023-05-28 15:34:59,852][0m Trial 3 finished with value: 0.7725126470742152 and parameters: {'num_leaves': 109, 'min_child_samples': 109, 'min_sum_hessian_in_leaf': 8.47895307204338e-05, 'feature_fraction': 0.7630207328213399, 'bagging_fraction': 0.9340128938596983, 'lambda_l1': 4.50172933043339, 'lambda_l2': 0.041294280571166964}. Best is trial 0 with value: 0.7726843253155904.[0m
[32m[I 2023-05-28 15:35:40,898][0m Trial 5 finished with value: 0.7717600086950583 a

[32m[I 2023-05-28 15:41:42,270][0m Trial 20 finished with value: 0.7734721386398695 and parameters: {'num_leaves': 174, 'min_child_samples': 14, 'min_sum_hessian_in_leaf': 8.069383843944009e-05, 'feature_fraction': 0.7137625902309237, 'bagging_fraction': 0.9986063667250312, 'lambda_l1': 5.1081565882914015, 'lambda_l2': 0.6185665676324995}. Best is trial 20 with value: 0.7734721386398695.[0m
[32m[I 2023-05-28 15:42:08,593][0m Trial 22 finished with value: 0.7713202577581789 and parameters: {'num_leaves': 72, 'min_child_samples': 9, 'min_sum_hessian_in_leaf': 0.009849950963265021, 'feature_fraction': 0.7113520164474019, 'bagging_fraction': 0.7157819089573046, 'lambda_l1': 20.447870010956976, 'lambda_l2': 0.5138577467021287}. Best is trial 20 with value: 0.7734721386398695.[0m
[32m[I 2023-05-28 15:42:11,988][0m Trial 23 finished with value: 0.7710191083596709 and parameters: {'num_leaves': 78, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.008044595319661623, 'feature_fracti

[32m[I 2023-05-28 15:48:08,205][0m Trial 42 finished with value: 0.7748298402233156 and parameters: {'num_leaves': 32, 'min_child_samples': 91, 'min_sum_hessian_in_leaf': 0.0029183469962990602, 'feature_fraction': 0.5806211907674361, 'bagging_fraction': 0.9291681451992353, 'lambda_l1': 7.328902822317442, 'lambda_l2': 0.16474271074865926}. Best is trial 40 with value: 0.7749842017330482.[0m
[32m[I 2023-05-28 15:48:26,840][0m Trial 43 finished with value: 0.7735297387702812 and parameters: {'num_leaves': 33, 'min_child_samples': 90, 'min_sum_hessian_in_leaf': 0.0033158345226905325, 'feature_fraction': 0.5558269883540323, 'bagging_fraction': 0.9307338002306628, 'lambda_l1': 40.6302050038806, 'lambda_l2': 0.18109877200900043}. Best is trial 40 with value: 0.7749842017330482.[0m
[32m[I 2023-05-28 15:49:04,482][0m Trial 44 finished with value: 0.7735610627820759 and parameters: {'num_leaves': 34, 'min_child_samples': 88, 'min_sum_hessian_in_leaf': 0.0023134440746155494, 'feature_frac

#### Confirmation of the optimization results

In [56]:
trial = study.best_trial
print("acc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7759


{'num_leaves': 9,
 'min_child_samples': 96,
 'min_sum_hessian_in_leaf': 0.00256322488863174,
 'feature_fraction': 0.5812861069467355,
 'bagging_fraction': 0.9281514312192686,
 'lambda_l1': 6.8267287761485544,
 'lambda_l2': 0.17892062255143676}



<div> <img src="./image/hyperparameter.png" alt="Drawing" style="width: 350px;"/></div>


#### Script 7-52: Find best hyperparameters

In [57]:
params_best = trial.params
params_best.update(params_base)
display(params_best)



{'num_leaves': 9,
 'min_child_samples': 96,
 'min_sum_hessian_in_leaf': 0.00256322488863174,
 'feature_fraction': 0.5812861069467355,
 'bagging_fraction': 0.9281514312192686,
 'lambda_l1': 6.8267287761485544,
 'lambda_l2': 0.17892062255143676,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'verbosity': -1,
 'learning_rate': 0.05,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'random_state': 123}

#### Script 7-53: Train model with best hyperparameters

In [58]:

train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                    params=params_best,
                                   )



-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.765571	valid_1's auc: 0.758153
[200]	training's auc: 0.778053	valid_1's auc: 0.767225
[300]	training's auc: 0.78542	valid_1's auc: 0.770731
[400]	training's auc: 0.790937	valid_1's auc: 0.772669
[500]	training's auc: 0.795902	valid_1's auc: 0.773889
[600]	training's auc: 0.800108	valid_1's auc: 0.774282
[700]	training's auc: 0.804199	valid_1's auc: 0.774666
[800]	training's auc: 0.808092	valid_1's auc: 0.775149
[900]	training's auc: 0.811767	valid_1's auc: 0.775339
[1000]	training's auc: 0.815254	valid_1's auc: 0.775564
[1100]	training's auc: 0.818908	valid_1's auc: 0.775625
[1200]	training's auc: 0.822111	valid_1's auc: 0.775704
[1300]	training's auc: 0.825312	valid_1's auc: 0.77585
[auc] tr:0.8241, va:0.7759
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.764833	valid_1's auc: 0.759916
[200]	training's auc: 0.77689	valid_1's auc: 0.769018
[300]

#### Script 7-54: Create submission dataset

In [59]:
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

for col in x_test.columns:
    if (x_test[col].dtype=="0") or (x_test[col].dtype=="object"):
        x_test[col] = x_test[col].astype("category")

# predict
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

# make submission-file
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("a3_submission_HyperParameterTuning.csv", index=None)


-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.
(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.042383
1,100005,0.1245
2,100013,0.028526
3,100028,0.047363
4,100038,0.193946




<div> <img src="./image/kaggle_submission.png" alt="Drawing" style="width: 750px;"/></div>


<!-- https://www.kaggle.com/code/pestipeti/probing-private-lb-->
<div> <img src="./image/public_vs_private.png" alt="Drawing" style="width: 450px;"/></div>





<div> <img src="./image/end.jpg" alt="Drawing" style="width: 450px;"/></div>


In [60]:
##########################################################################
# Book reference:
# https://www.ric.co.jp/pdfs/contents/pdfs/1326_support.pdf
##########################################################################