In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [54]:
# Load the data
application_test = pd.read_csv('./HomeCreditCSV/application_test.csv')
application_train = pd.read_csv('./HomeCreditCSV/application_train.csv')

**Data preprocessing**

In [55]:
def split_feature_type(data):
    categorical_list = []
    discrete_list = []
    continuos_list = []
    for i in data.columns.tolist():
        if data[i].dtype == 'object':
            categorical_list.append(i)
        elif data[i].nunique() < 20:
            discrete_list.append(i)
        else:
            continuos_list.append(i)
    return categorical_list, discrete_list, continuos_list

def label_encoder(data):
    categorical_cols = data.select_dtypes(include='object').columns
    for i in categorical_cols:
        encoder = LabelEncoder()
        data[i] = encoder.fit_transform(data[i])

In [56]:
# Imputing missing data by SimpleImputer 
categorical_list, discrete_list, continuos_list = split_feature_type(application_train)
discrete_list.remove('TARGET')

application_train[categorical_list] = SimpleImputer(
    strategy='most_frequent').fit_transform(
    application_train[categorical_list])

application_train[discrete_list] = SimpleImputer(
    strategy='most_frequent').fit_transform(
    application_train[discrete_list])

application_train[continuos_list] = SimpleImputer(
    strategy='median').fit_transform(
    application_train[continuos_list])

application_test[categorical_list] = SimpleImputer(
    strategy='most_frequent').fit_transform(
    application_test[categorical_list])

application_test[discrete_list] = SimpleImputer(
    strategy='most_frequent').fit_transform(
    application_test[discrete_list])

application_test[continuos_list] = SimpleImputer(
    strategy='median').fit_transform(
    application_test[continuos_list])

label_encoder(application_train)
label_encoder(application_test)

In [57]:
X_train = application_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y_train = application_train.TARGET
X_pred = application_test.drop(['SK_ID_CURR'], axis=1)
y_pred = np.zeros(X_pred.shape[0])
folds = KFold(n_splits=5)
validation_pred = np.zeros(X_train.shape[0])

# Problem 1

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
for train_index, validation_index in folds.split(X_train):
    train_x, train_y = X_train.iloc[train_index], y_train[train_index]
    validation_x, validation_y = X_train.iloc[validation_index], y_train[validation_index]
    
    model_sk = lgb.LGBMClassifier(max_depth=7, learning_rate=0.05, min_split_gain=0.01, 
                                  early_stopping_rounds = 100, n_estimators= 2000,
                                  min_child_weight=40, num_leaves=24)
    
    
    model_sk.fit(train_x, train_y, early_stopping_rounds=100, 
                 eval_set = [(validation_x, validation_y)], eval_metric = 'auc', verbose = 200)
    # Estimation of validation set
    validation_pred[validation_index] = model_sk.predict(validation_x, num_iteration=model_sk.best_iteration_)
    print('AUC score: {}'.format(roc_auc_score(validation_y, validation_pred[validation_index])))

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.759242	valid_0's binary_logloss: 0.244332
[400]	valid_0's auc: 0.761539	valid_0's binary_logloss: 0.243642
Early stopping, best iteration is:
[463]	valid_0's auc: 0.761649	valid_0's binary_logloss: 0.243592
AUC score: 0.5102507661362059
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.759564	valid_0's binary_logloss: 0.249955
[400]	valid_0's auc: 0.761377	valid_0's binary_logloss: 0.249382
Early stopping, best iteration is:
[494]	valid_0's auc: 0.761716	valid_0's binary_logloss: 0.249285
AUC score: 0.5128723337275438
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.751654	valid_0's binary_logloss: 0.248823
[400]	valid_0's auc: 0.753431	valid_0's binary_logloss: 0.248368
Early stopping, best iteration is:
[393]	valid_0's auc: 0.753498	valid_0's binary_logloss: 0.248352
AUC score: 0.5091098595577611
Training until validation scores d

# Problem 2

In [8]:
from sklearn.model_selection import GridSearchCV
# Create params list to search
modelParams = {
    'num_leaves': [12,16,20],
    'min_split_gain': [0.01,0.02],
}
model = lgb.LGBMClassifier(max_depth=7, learning_rate=0.05, min_split_gain=0.01,
                           n_estimators= 2000, min_child_weight=40, num_leaves=24)
grid = GridSearchCV(model, modelParams, verbose=0, cv=5, n_jobs=2)
grid.fit(X_train,y_train)
print(grid.best_params_)
print(grid.best_score_)

{'min_split_gain': 0.02, 'num_leaves': 12}
0.9194727994176329


In [9]:
grid.best_estimator_

LGBMClassifier(learning_rate=0.05, max_depth=7, min_child_weight=40,
               min_split_gain=0.02, n_estimators=2000, num_leaves=12)

# Problem 3

[[Parameter tuning with Gradient Boosting Machine]](https://www.kaggle.com/willkoehrsen/intro-to-model-tuning-grid-and-random-search)<br>
 - In my implemetation of learning and verification, I use **LightGBMClassifer** which is an API for Gradient Boosting Machine for skcitlearn library. The published notebook use the Gradient Boosting Machine implementation instead for greater accuracy and speed.<br>
 - This notebook work with a subset of the data consisting of 10000 rows since Hyperparameter tuning is extremely computationally expensive and working with the full dataset in a Kaggle Kernel would not be feasible for more than a few search iterations.<br>
 - In this notebook, Grid Search and Random Search are two of approaches used for hyperparameters tuning.<br>
[[Bayesian optimization for LightGBM]](https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm)
 <br>
 - This notebook use **Bayesian Optimization** technique for hyperparameters tuning
 - The **Bayesian Optimization** is an optimization technique based on **Bayes theorem** which direct the search in order to find the minimum or maximum of an objective function.

# Problem 4

After doing some researches on both papers, I have decided to choose **BayesianOptimization** for tuning hyperparameters while still use **LightGBM** as the model for prediction.

In [50]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

For doing optimization with Bayesian model, we have to pass in an objective function in order for the Optimizer to find the greatest set of hyperparameters. <code>lgb_eval</code> is the objective function in this case.
<br>
**Cross Validation** can also be performed by inside this function using **Light GBM** API. This will return the best splits which has the greatest AUC score.

In [66]:
train_set = lgb.Dataset(data = X_train, label = y_train)

def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, 
             lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    hyperParameters = {
        'application': 'binary',
        'num_iterations': 4000, 
        'learning_rate': 0.05, 
        'early_stopping_round': 100, 
        'metric':'auc',
        'num_leaves' : round(num_leaves),
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'bagging_fraction': max(min(bagging_fraction, 1), 0),
        'max_depth' : round(max_depth),
        'lambda_l1' : max(lambda_l1, 0),
        'lambda_l2' : max(lambda_l2, 0),
        'min_split_gain' : min_split_gain,
        'min_child_weight' : min_child_weight
    }
    
    cv_result = lgb.cv(hyperParameters, train_set, nfold=5, 
                       seed=42, stratified=True, verbose_eval =200, metrics=['auc'])
    return max(cv_result['auc-mean'])

We creeate a **BayesianOptimizer** which takes the above function as objective function and range of parameters are specified in types of a dictionary.

In [67]:
lgbOptimizer = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'lambda_l1': (0, 5),
                                        'lambda_l2': (0, 3),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50)}, random_state=0)

# Doing optimization

In [68]:
lgbOptimizer.maximize(init_points=10, n_iter=20)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, numbe



[200]	cv_agg's auc: 0.758871 + 0.004577








[400]	cv_agg's auc: 0.76046 + 0.00453301








| [0m 1       [0m | [0m 0.7605  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.69    [0m | [0m 34.07   [0m | [0m 0.04432 [0m | [0m 42.73   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To



[400]	cv_agg's auc: 0.760312 + 0.00419772


[600]	cv_agg's auc: 0.760636 + 0.00425725


| [95m 2       [0m | [95m 0.7607  [0m | [95m 0.9927  [0m | [95m 0.4068  [0m | [95m 3.959   [0m | [95m 1.587   [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 25.83   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

[400]	cv_agg's auc: 0.760344 + 0.00418311


| [0m 3       [0m | [0m 0.7605  [0m | [0m 0.804   [0m | [0m 0.7661  [0m | [0m 3.891   [0m | [0m 2.61    [0m | [0m 8.905   [0m | [0m 40.96   [0m | [0m 0.04669 [0m | [0m 40.39   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

[200]	cv_agg's auc: 0.759243 + 0.00423709








[400]	cv_agg's auc: 0.760354 + 0.00410458






| [0m 4       [0m | [0m 0.7605  [0m | [0m 0.8237  [0m | [0m 0.6119  [0m | [0m 0.7168  [0m | [0m 2.834   [0m | [0m 7.082   [0m | [0m 23.66   [0m | [0m 0.02719 [0m | [0m 40.26   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[200]	cv_agg's auc: 0.758633 + 0.00460421










[400]	cv_agg's auc: 0.760162 + 0.00441271






| [0m 5       [0m | [0m 0.7602  [0m | [0m 0.8912  [0m | [0m 0.5547  [0m | [0m 0.09395 [0m | [0m 1.853   [0m | [0m 7.442   [0m | [0m 32.76   [0m | [0m 0.09443 [0m | [0m 38.32   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

[400]	cv_agg's auc: 0.760492 + 0.00418109


[600]	cv_agg's auc: 0.760704 + 0.00424932
| [95m 6       [0m | [95m 0.7608  [0m | [95m 0.8719  [0m | [95m 0.4496  [0m | [95m 3.488   [0m | [95m 0.1807  [0m | [95m 7.66    [0m | [95m 35.18   [0m | [95m 0.02183 [0m | [95m 26.71   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wis

[200]	cv_agg's auc: 0.757524 + 0.00430732


[400]	cv_agg's auc: 0.760197 + 0.00406019


[600]	cv_agg's auc: 0.760768 + 0.00388045


| [95m 8       [0m | [95m 0.7609  [0m | [95m 0.9306  [0m | [95m 0.3026  [0m | [95m 2.332   [0m | [95m 0.7333  [0m | [95m 5.634   [0m | [95m 9.967   [0m | [95m 0.06598 [0m | [95m 26.9    [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]

| [0m 9       [0m | [0m 0.7607  [0m | [0m 0.8393  [0m | [0m 0.395   [0m | [0m 4.105   [0m | [0m 0.2913  [0m | [0m 8.343   [0m | [0m 9.324   [0m | [0m 0.09767 [0m | [0m 33.84   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

[400]	cv_agg's auc: 0.760559 + 0.00412945
[600]	cv_agg's auc: 0.760816 + 0.00406105
| [0m 10      [0m | [0m 0.7608  [0m | [0m 0.9954  [0m | [0m 0.5839  [0m | [0m 3.696   [0m | [0m 0.1176  [0m | [0m 6.128   [0m | [0m 10.41   [0m | [0m 0.03032 [0m | [0m 26.49   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486


[200]	cv_agg's auc: 0.756797 + 0.00433095










[400]	cv_agg's auc: 0.759562 + 0.00421225










[600]	cv_agg's auc: 0.760231 + 0.00431606






| [0m 11      [0m | [0m 0.7603  [0m | [0m 0.9133  [0m | [0m 0.672   [0m | [0m 0.287   [0m | [0m 1.777   [0m | [0m 5.141   [0m | [0m 10.93   [0m | [0m 0.03257 [0m | [0m 24.33   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

[400]	cv_agg's auc: 0.75892 + 0.00462096


[600]	cv_agg's auc: 0.759714 + 0.00435529


| [0m 12      [0m | [0m 0.7599  [0m | [0m 1.0     [0m | [0m 0.192   [0m | [0m 3.553   [0m | [0m 1.292   [0m | [0m 5.531   [0m | [0m 9.612   [0m | [0m 0.1     [0m | [0m 27.64   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[400]	cv_agg's auc: 0.760486 + 0.00422849
[600]	cv_agg's auc: 0.760726 + 0.00414091
| [0m 13      [0m | [0m 0.7608  [0m | [0m 0.8729  [0m | [0m 0.5144  [0m | [0m 3.165   [0m | [0m 1.62    [0m | [0m 8.711   [0m | [0m 39.32   [0m | [0m 0.03409 [0m | [0m 27.24   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 2



[200]	cv_agg's auc: 0.757815 + 0.00442114








[400]	cv_agg's auc: 0.760351 + 0.00435668








[600]	cv_agg's auc: 0.76071 + 0.00433161








| [0m 14      [0m | [0m 0.7608  [0m | [0m 0.9212  [0m | [0m 0.3702  [0m | [0m 3.659   [0m | [0m 2.989   [0m | [0m 6.279   [0m | [0m 33.83   [0m | [0m 0.02548 [0m | [0m 30.52   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[600]	cv_agg's auc: 0.759304 + 0.0046746


| [0m 15      [0m | [0m 0.7594  [0m | [0m 0.9175  [0m | [0m 0.2053  [0m | [0m 0.7007  [0m | [0m 0.203   [0m | [0m 8.891   [0m | [0m 41.4    [0m | [0m 0.06127 [0m | [0m 25.83   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

[200]	cv_agg's auc: 0.758772 + 0.00433235








[400]	cv_agg's auc: 0.760503 + 0.00435847




| [0m 16      [0m | [0m 0.7606  [0m | [0m 0.9723  [0m | [0m 0.4481  [0m | [0m 1.993   [0m | [0m 1.737   [0m | [0m 7.396   [0m | [0m 22.67   [0m | [0m 0.03566 [0m | [0m 42.06   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[400]	cv_agg's auc: 0.760516 + 0.00419519


[600]	cv_agg's auc: 0.760685 + 0.00427797
| [0m 18      [0m | [0m 0.7609  [0m | [0m 0.8104  [0m | [0m 0.5932  [0m | [0m 4.229   [0m | [0m 1.605   [0m | [0m 7.63    [0m | [0m 47.09   [0m | [0m 0.05909 [0m | [0m 25.5    [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, numbe

| [0m 19      [0m | [0m 0.7606  [0m | [0m 0.927   [0m | [0m 0.6955  [0m | [0m 4.428   [0m | [0m 1.535   [0m | [0m 7.735   [0m | [0m 47.68   [0m | [0m 0.006278[0m | [0m 25.91   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

[400]	cv_agg's auc: 0.760407 + 0.00450999
| [0m 20      [0m | [0m 0.7605  [0m | [0m 0.9974  [0m | [0m 0.8639  [0m | [0m 2.259   [0m | [0m 0.3818  [0m | [0m 6.531   [0m | [0m 10.49   [0m | [0m 0.03078 [0m | [0m 26.67   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to



[400]	cv_agg's auc: 0.760666 + 0.00406893


[600]	cv_agg's auc: 0.761007 + 0.00412079
| [95m 21      [0m | [95m 0.7611  [0m | [95m 0.878   [0m | [95m 0.4182  [0m | [95m 2.423   [0m | [95m 1.168   [0m | [95m 5.846   [0m | [95m 10.16   [0m | [95m 0.087   [0m | [95m 26.94   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

[200]	cv_agg's auc: 0.758255 + 0.00434407


[400]	cv_agg's auc: 0.760514 + 0.00421695


[600]	cv_agg's auc: 0.760883 + 0.00425151


| [0m 22      [0m | [0m 0.7609  [0m | [0m 0.9783  [0m | [0m 0.5151  [0m | [0m 1.606   [0m | [0m 1.612   [0m | [0m 5.911   [0m | [0m 10.23   [0m | [0m 0.03086 [0m | [0m 26.9    [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the tr



[200]	cv_agg's auc: 0.757223 + 0.00432797












[400]	cv_agg's auc: 0.760175 + 0.00429115












[600]	cv_agg's auc: 0.760806 + 0.00422679




| [0m 23      [0m | [0m 0.7608  [0m | [0m 0.8731  [0m | [0m 0.6143  [0m | [0m 1.785   [0m | [0m 1.061   [0m | [0m 5.139   [0m | [0m 10.91   [0m | [0m 0.01383 [0m | [0m 26.92   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] [binary:Bo

[400]	cv_agg's auc: 0.756617 + 0.00448556


[600]	cv_agg's auc: 0.758419 + 0.00436846


[800]	cv_agg's auc: 0.758531 + 0.00406907


| [0m 24      [0m | [0m 0.7586  [0m | [0m 0.8842  [0m | [0m 0.1709  [0m | [0m 1.94    [0m | [0m 2.395   [0m | [0m 5.867   [0m | [0m 10.66   [0m | [0m 0.05848 [0m | [0m 26.21   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[200]	cv_agg's auc: 0.758505 + 0.00439195








[400]	cv_agg's auc: 0.760256 + 0.00430886






| [0m 25      [0m | [0m 0.7603  [0m | [0m 0.8748  [0m | [0m 0.3377  [0m | [0m 2.064   [0m | [0m 0.4128  [0m | [0m 6.614   [0m | [0m 23.01   [0m | [0m 0.08979 [0m | [0m 41.71   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the tr



[400]	cv_agg's auc: 0.760254 + 0.00400017


| [0m 26      [0m | [0m 0.7604  [0m | [0m 0.882   [0m | [0m 0.8611  [0m | [0m 1.724   [0m | [0m 1.462   [0m | [0m 5.962   [0m | [0m 10.38   [0m | [0m 0.0695  [0m | [0m 27.6    [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the tr

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486


[200]	cv_agg's auc: 0.758135 + 0.0043071








[400]	cv_agg's auc: 0.760407 + 0.00433686






[600]	cv_agg's auc: 0.76038 + 0.00432738
| [0m 27      [0m | [0m 0.7605  [0m | [0m 0.8203  [0m | [0m 0.8639  [0m | [0m 2.333   [0m | [0m 1.967   [0m | [0m 6.181   [0m | [0m 15.71   [0m | [0m 0.03701 [0m | [0m 33.66   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[200]	cv_agg's auc: 0.751743 + 0.00468214


[400]	cv_agg's auc: 0.757731 + 0.00440285




[600]	cv_agg's auc: 0.758849 + 0.00443058
| [0m 28      [0m | [0m 0.759   [0m | [0m 0.8039  [0m | [0m 0.1564  [0m | [0m 0.9863  [0m | [0m 0.7033  [0m | [0m 6.506   [0m | [0m 42.62   [0m | [0m 0.06218 [0m | [0m 26.0    [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to

[200]	cv_agg's auc: 0.757642 + 0.00473434


[400]	cv_agg's auc: 0.760293 + 0.00456255


[600]	cv_agg's auc: 0.760752 + 0.00441102




| [0m 29      [0m | [0m 0.7608  [0m | [0m 0.8997  [0m | [0m 0.287   [0m | [0m 2.081   [0m | [0m 1.231   [0m | [0m 5.923   [0m | [0m 10.39   [0m | [0m 0.09111 [0m | [0m 27.26   [0m |
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11287
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[600]	cv_agg's auc: 0.758444 + 0.00436335


[800]	cv_agg's auc: 0.758822 + 0.00415193


| [0m 30      [0m | [0m 0.7589  [0m | [0m 0.9869  [0m | [0m 0.1704  [0m | [0m 3.825   [0m | [0m 2.348   [0m | [0m 7.645   [0m | [0m 47.1    [0m | [0m 0.01479 [0m | [0m 25.6    [0m |


After doing maximization, I find the set of hyperparameters which make the objective function reach the maxima and use this set for constructing the prediction model, **LGBMClassifier**. Output the result and submit it to kaggle.

In [89]:
opt_params = lgbOptimizer.max['params']
opt_params['num_leaves'] = int(round(opt_params['num_leaves']))
opt_params['max_depth'] = int(round(opt_params['max_depth']))
model = lgb.LGBMClassifier(**opt_params)
model.fit(X_train, y_train)
                        
# Predictions on the test data
preds = model.predict_proba(X_pred)[:, 1]



# Problem 5

In [90]:
submission = pd.DataFrame({'SK_ID_CURR': np.int32(application_test.SK_ID_CURR)
                           , 'TARGET': preds})
submission.to_csv('tuning_submission.csv', index = False)

![title](img/pic2.png)

The reuslt of my submission this time is much better compared to that of Class Assignment in Week 4. During doing problem 4, I have grasped some of the ideas:
 - Bayesian Optimization works very well with huge dataset while GridSearch can not perform effectively when dealing with huge set of hyperparameters and dataset.
 - Cross Validation is a wonderful technique for evaluating how good a model perform on the train set as well as preventing overfitting problems.