In [65]:
## Importing liberies or dependencies 
import time
import itertools
import re
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from scipy import stats

pd.set_option('display.max_rows', 50)
pd.set_option("display.max_columns", 50)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# PROBLEM 1

## Cross validation

In [66]:
## Reading or loading CSV data files
train = pd.read_csv("application_train.csv")
test  = pd.read_csv("application_test.csv")

In [67]:
## Selecting the features to be used.
feats = ['EXT_SOURCE_1', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'DAYS_EMPLOYED',
         'DAYS_BIRTH', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
         'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION']

In [68]:
## Missing values processing performing 0 filling or avarage value filling
train["EXT_SOURCE_1"] = train["EXT_SOURCE_1"].fillna(0)
train["EXT_SOURCE_2"] = train["EXT_SOURCE_1"].fillna(0)
train["EXT_SOURCE_3"] = train["EXT_SOURCE_1"].fillna(0)
train["AMT_ANNUITY"] = train["AMT_ANNUITY"].fillna(train["AMT_ANNUITY"].mean())
train["AMT_GOODS_PRICE"] = train["AMT_GOODS_PRICE"].fillna(train["AMT_GOODS_PRICE"].mean())

test["EXT_SOURCE_1"] = test["EXT_SOURCE_1"].fillna(0)
test["EXT_SOURCE_2"] = test["EXT_SOURCE_1"].fillna(0)
test["EXT_SOURCE_3"] = test["EXT_SOURCE_1"].fillna(0)
test["AMT_ANNUITY"] = test["AMT_ANNUITY"].fillna(test["AMT_ANNUITY"].mean())
test["AMT_GOODS_PRICE"] = test["AMT_GOODS_PRICE"].fillna(test["AMT_GOODS_PRICE"].mean())

In [69]:
X_train = train[feats]
y_train = train["TARGET"]

In [70]:
## Cross-validation using Random Forest (K = 2)
scores = []

kf = KFold(n_splits=2, shuffle=True, random_state=1)
for tr_idx, va_idx in kf.split(X_train):
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    
    rand = RandomForestClassifier()
    rand.fit(tr_x, tr_y)
    
    pred = rand.predict_proba(va_x)[:, 1]
    score = roc_auc_score(va_y, pred)
    scores.append(score)
    
print("Average auc_score: %.4f" % np.mean(scores))

Average auc_score: 0.6323


# PROBLEM 2

## GridSearch

In [71]:
params = {"n_estimators":[100, 200],
           "max_depth":[3, 5, 7]
         }

grid_search = GridSearchCV(RandomForestClassifier(), params,
                           cv=2, n_jobs=-1, return_train_score=True, scoring="roc_auc")
grid_search.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7], 'n_estimators': [100, 200]},
             return_train_score=True, scoring='roc_auc')

In [72]:
grid_search.best_params_

{'max_depth': 7, 'n_estimators': 200}

# PROBLEM 3

## Survey from Kaggle Notebooks. Find and list different ideas from Kaggle's Notebooks.

1. Gradient Boosting Machine: A strong predictive baseline model these days is ligth-gbm.

* The time required for model training is short. High memory efficiency because the measured value is treated as a histogram.

* Can handle missing values as they are, there is no need to perform conversion processing such as scaling, because the magnitude relationship makes sense.

* The number of estimators of the gradient boosting model does not need to be adjusted as a hyperparameter by using early stopping.

* The "Leaf-wise" format complicates the model and is prone to overfitting if not properly adjusted with hyperparameters.

2. Objective function: A function that returns a score that is an index for optimizing hyperparameters. The function you want to minimize / maximize. Auc for binary classification



3. Domain: A trial combination of model hyperparameters. That range. See the official documentation



4. Algorithm: How to determine the selection of hyperparameters to try. Since humans determine the trial range, it is not always possible to find globally optimized parameters. Very high computational cost to try all combinations. On the contrary, if you try all combinations, you will be able to find an answer that is close to the overall optimal solution.

Humans set the trial range, and it is not always possible to find a truly totally optimized combination of parameters.
Since the combination of parameters is randomly determined and learning / evaluation is performed, a combination close to the optimum solution can be investigated with a small number of trials.
Since it is a random selection, if the number of trials is too low, it may be far from the optimal solution.
It is efficient to perform a random search at the beginning, determine the peripheral values ​​of the optimum solution of the parameters, and then perform a grid search.

While randomly determining the parameters, repeat learning and evaluation around the highly evaluated parameter combinations to find an answer close to the optimal solution in a short time.


5. Execution history: A data structure containing scores obtained from each set and objective function. Easy to understand if made with df.

# PROBLEM 4

## Creating a model with high generalization performance


In [73]:
## Reloading of data
train = pd.read_csv("application_train.csv")
test  = pd.read_csv("application_test.csv")

train_x = train.drop(["TARGET", "SK_ID_CURR"], axis=1)
train_y = train["TARGET"]
test_x = test.copy()

print(train_x.shape)
print(test_x.shape)

(307511, 120)
(48744, 121)


In [74]:
## Create a label encoder object
le = LabelEncoder()
le_count = 0

## Iterate through the columns
for col in train_x:
    if train_x[col].dtype == 'object':
        ## If 2 or fewer unique categories
        if len(list(train[col].unique())) <= 2:
            ## Train on the training data
            le.fit(train_x[col])
            ## Transform both training and testing data
            train_x[col] = le.transform(train_x[col])
            test_x[col] = le.transform(test_x[col])
            
            ## Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

## one-hot encoding of categorical variables
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

print('Training Features shape: ', train_x.shape)
print('Testing Features shape: ', test_x.shape)

3 columns were label encoded.
Training Features shape:  (307511, 241)
Testing Features shape:  (48744, 239)


In [75]:
train_x, test_x = train_x.align(test_x, join="inner", axis=1)

print('Training Features shape: ', train_x.shape)
print('Testing Features shape: ', test_x.shape)


Training Features shape:  (307511, 238)
Testing Features shape:  (48744, 238)


Since the number of data is large, this time we will compare each model using 50,000 rows of train data.

In [76]:
## If JSON characters are included in the column name, an error will occur.
train_x = train_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test_x  =  test_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


## Random state should be described so that the correspondence between x and y does not change.
train_x_all = train_x.copy()
train_y_all = train_y.copy()

n = 50000
train_x = train_x.sample(n, random_state=0)
train_y = train_y.sample(n, random_state=0)

print(train_x_all.shape)
print(train_y_all.shape)
print(train_x.shape)
print(train_y.shape)

(307511, 238)
(307511,)
(50000, 238)
(50000,)


In [77]:
## Create training data and test data
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.25, random_state=1)

## lgb Create a dataset for learning
train_set = lgb.Dataset(data = X_train, label = y_train)
test_set = lgb.Dataset(data = X_test, label = y_test)
train_test_set = lgb.Dataset(data = train_x, label = train_y)

## Create df for storing results
results = pd.DataFrame(columns=["Andy", "Train_Time", "Val_Score", "Test_Score", "keggle_Score"],
           index=list(range(7)))

i = 0

In [78]:
## 1. No cross-validation
model = lgb.LGBMClassifier()
default_params = model.get_params()

## Exclude the number of classifiers from hyperparameters ⇒ For early stopping
del default_params["n_estimators"]

start = time.time()

model.fit(X_train, y_train)

t = time.time() - start

preds = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, preds)

results.loc[i, :] = ["not cv", t, np.nan, auc, np.nan]
i += 1

display(results)

Unnamed: 0,Andy,Train_Time,Val_Score,Test_Score,keggle_Score
0,not cv,7.55708,,0.727502,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,


In [79]:
## 2 Perform cross-validation (k = 4.9)
andy = {"cv(k=4)": 4, "cv(k=9)": 9}

for andy, cv in andy.items():
    
    start = time.time()
    cv_result = lgb.cv(default_params, train_set, num_boost_round=10000,
                       early_stopping_rounds=100, metrics="auc", nfold=cv, seed=1)
    t = time.time() - start
    
    results.loc[i, :] = [andy, t, np.nan, cv_result["auc-mean"][-1], np.nan]
    i += 1
    
display(results)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10808
[LightGBM] [Info] Number of data points in the train set: 28125, number of used features: 219
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10808
[LightGBM] [Info] Number of data points in the train set: 28125, number of used features: 219
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10808
[LightGBM] [Info] Number of data points in the train set: 28125, number of used features: 219
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10808
[LightGBM] [Info] Number of data points in the train set: 28125, number of used features: 219
[LightGBM] [Info] Start training from score 0.078507
[LightGBM] [Info] Start training from score 0.078471
[LightGBM] [Info] 

Unnamed: 0,Andy,Train_Time,Val_Score,Test_Score,keggle_Score
0,not cv,7.55708,,0.727502,
1,cv(k=4),18.4874,,0.737986,
2,cv(k=9),30.8559,,0.739489,
3,,,,,
4,,,,,
5,,,,,
6,,,,,


# PROBLEM 5

## Final model selection

In [80]:
display(results)

Unnamed: 0,Andy,Train_Time,Val_Score,Test_Score,keggle_Score
0,not cv,7.55708,,0.727502,
1,cv(k=4),18.4874,,0.737986,
2,cv(k=9),30.8559,,0.739489,
3,,,,,
4,,,,,
5,,,,,
6,,,,,


As shown in the results in the table above, using Bayesian optimization for the light-gbm model resulted in the model with the highest generalization performance. In addition, this time, we compared the scores at the time of verification using training data of 50,000 samples, and the larger the number of cross-validation division.