In [74]:
import numbers
import operator
import os
import pickle as pkl
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook as tqdm

%matplotlib inline

## Dataset Preparation

In [51]:
# Directory holding the Home Credit CSV files.
# Set the HOME_CREDIT_DATA_DIR environment variable to point the notebook at
# another location; when it is unset the original path is used unchanged.
data_path = os.path.join(
    os.environ.get(
        'HOME_CREDIT_DATA_DIR',
        '/Users/liujiayao/Desktop/3001 Python/Project/home-credit-default-risk/',
    ),
    '',  # guarantees a trailing separator: file names are appended with `+`
)

In [52]:
# Load the application table; `data_path` is a string prefix, so the file name
# is appended directly to it.
train = pd.read_csv(filepath_or_buffer=data_path + 'application_train.csv')

In [53]:
def feature_normalization(dataset):
    """Min-max scale the continuous numeric columns of ``dataset`` to [0, 1].

    Columns are selected as follows:

    * only int16/32/64 and float16/32/64 columns are considered;
    * columns whose name contains ``FLAG_``, ``REG_``, ``LIVE_`` or
      ``HOUR_APPR_PROCESS_START`` are left untouched;
    * ``SK_ID_CURR`` and ``TARGET`` are left untouched. They are excluded by
      name, so a frame that lacks ``TARGET`` (or orders its columns
      differently) does not lose real features from the scaling.

    A column whose max equals its min is mapped to 0.0 instead of NaN (0/0),
    so a constant feature cannot wipe out rows in a later ``dropna()``.
    Missing values stay missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to scale. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the selected columns rescaled.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    selected = dataset.select_dtypes(include=numerics)
    # exclude FLAG_ish indicator variables
    columns_sel = selected.columns[~selected.columns.str.contains(pat = 'FLAG_|REG_|LIVE_|HOUR_APPR_PROCESS_START')]
    # exclude SK_ID_CURR and target by name rather than by position
    columns_sel = columns_sel[~columns_sel.isin(['SK_ID_CURR', 'TARGET'])]
    if len(columns_sel) == 0:
        # nothing to scale
        return dataset

    col_min = dataset[columns_sel].min(axis=0)
    col_range = dataset[columns_sel].max(axis=0) - col_min
    # A zero range means a constant column; divide by 1 so it becomes 0.0.
    col_range = col_range.replace(0, 1)
    dataset[columns_sel] = (dataset[columns_sel] - col_min) / col_range
    return dataset
# Rescale the selected numeric columns of the training frame.
train = feature_normalization(dataset=train)

['CNT_CHILDREN' 'AMT_INCOME_TOTAL' 'AMT_CREDIT' 'AMT_ANNUITY'
 'AMT_GOODS_PRICE' 'REGION_POPULATION_RELATIVE' 'DAYS_BIRTH'
 'DAYS_EMPLOYED' 'DAYS_REGISTRATION' 'DAYS_ID_PUBLISH' 'OWN_CAR_AGE'
 'CNT_FAM_MEMBERS' 'REGION_RATING_CLIENT' 'REGION_RATING_CLIENT_W_CITY'
 'EXT_SOURCE_1' 'EXT_SOURCE_2' 'EXT_SOURCE_3' 'APARTMENTS_AVG'
 'BASEMENTAREA_AVG' 'YEARS_BEGINEXPLUATATION_AVG' 'YEARS_BUILD_AVG'
 'COMMONAREA_AVG' 'ELEVATORS_AVG' 'ENTRANCES_AVG' 'FLOORSMAX_AVG'
 'FLOORSMIN_AVG' 'LANDAREA_AVG' 'LIVINGAPARTMENTS_AVG' 'LIVINGAREA_AVG'
 'NONLIVINGAPARTMENTS_AVG' 'NONLIVINGAREA_AVG' 'APARTMENTS_MODE'
 'BASEMENTAREA_MODE' 'YEARS_BEGINEXPLUATATION_MODE' 'YEARS_BUILD_MODE'
 'COMMONAREA_MODE' 'ELEVATORS_MODE' 'ENTRANCES_MODE' 'FLOORSMAX_MODE'
 'FLOORSMIN_MODE' 'LANDAREA_MODE' 'LIVINGAPARTMENTS_MODE'
 'LIVINGAREA_MODE' 'NONLIVINGAPARTMENTS_MODE' 'NONLIVINGAREA_MODE'
 'APARTMENTS_MEDI' 'BASEMENTAREA_MEDI' 'YEARS_BEGINEXPLUATATION_MEDI'
 'YEARS_BUILD_MEDI' 'COMMONAREA_MEDI' 'ELEVATORS_MEDI' 'ENTRANCES_

In [85]:
# Keep only rows with no missing value in any column (the describe() output
# recorded further down shows 8602 rows remaining in that run).
train = train.dropna(axis=0, how='any')

In [86]:
# One-hot encode the non-numeric columns, then summarise the resulting frame.
train_with_dummy = pd.get_dummies(data=train)
train_with_dummy.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
count,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,...,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0,8602.0
mean,279002.948617,0.061149,0.031712,0.001686,0.163545,0.11678,0.147982,0.321978,0.622322,0.040748,...,0.005464,0.065682,0.012323,0.017205,0.011741,0.476633,0.397698,0.018717,0.991514,0.008486
std,103218.951947,0.239617,0.04079,0.001209,0.116083,0.064691,0.107098,0.226553,0.183721,0.005421,...,0.07372,0.247741,0.110328,0.130043,0.107726,0.499483,0.489451,0.13553,0.091735,0.091735
min,100083.0,0.0,0.0,6.9e-05,0.0,0.002738,0.001122,0.0,0.02221,0.009231,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,188965.75,0.0,0.0,0.000935,0.067416,0.069937,0.057239,0.142153,0.50334,0.038573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,278239.0,0.0,0.0,0.001435,0.134831,0.107617,0.113356,0.27633,0.639543,0.042363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,370083.5,0.0,0.052632,0.002089,0.228001,0.150948,0.214366,0.421848,0.764628,0.044616,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
max,456226.0,1.0,0.263158,0.038251,1.0,0.8712,1.0,1.0,0.98726,0.046738,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [87]:
# Label vector, and the feature matrix without the label and the row identifier.
y_train = train_with_dummy.loc[:, 'TARGET']
X_train = train_with_dummy.drop(['TARGET', 'SK_ID_CURR'], axis=1)

## Random Forest

### Hyperparameter Tuning: max_depth, n_estimators, min_samples_split, max_features

In [111]:
# Base random-forest estimator with a fixed seed; the grid searches below take
# it as the estimator whose hyper-parameters they vary.
rf = RandomForestClassifier(random_state=0)

In [101]:

# Coarse random-forest grid: 2 x 2 x 2 x 2 = 16 combinations.
# Bound to `param_grid1` only. The earlier chained assignment also rebound the
# shared name `param_grid`, so a search in another cell that reads `param_grid`
# could silently pick up this grid when cells were run out of order.
param_grid1 = [{'max_depth': [5,10],'n_estimators':[50,100],
               'min_samples_split': [100,300],'max_features':['sqrt','log2']}]

# 2-fold cross-validated search, ranked by ROC AUC.
rf_grid = GridSearchCV(rf,
                    param_grid1,
                    cv = 2,scoring = 'roc_auc'
                  # n_jobs = -1
                   )


In [102]:
# Run the coarse grid search on the prepared training data.
rf_grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [5, 10], 'n_estimators': [50, 100], 'min_samples_split': [100, 300], 'max_features': ['sqrt', 'log2']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [91]:
# Full random-forest grid: 6 x 10 x 7 x 2 = 840 combinations, i.e. 4200 fits
# with 5-fold CV. The recorded serial run of this cell was interrupted, so the
# fits are spread over all available cores (n_jobs=-1).
# The grid has its own name so that it cannot be confused with the
# `param_grid` used by the gradient-boosting search.
rf_param_grid_full = [{'max_depth': [5,10,20,30,40,50],'n_estimators':np.arange(50,550,50),
               'min_samples_split': np.arange(10,320,50),'max_features':['sqrt','log2']}]
grid = GridSearchCV(rf,
                    rf_param_grid_full,
                    cv = 5,scoring = 'roc_auc',
                    n_jobs = -1
                   )
grid.fit(X_train,y_train)


KeyboardInterrupt: 

In [103]:
# Show the hyper-parameter combination that scored best in the coarse search.
rf_grid.best_params_ 

{'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_split': 100,
 'n_estimators': 100}

In [108]:
# Keep a handle on the estimator the coarse search selected.
rf_model = rf_grid.best_estimator_

In [110]:
# Show the best cross-validated score; rf_grid is built with scoring='roc_auc'.
rf_grid.best_score_

0.7035331649708001

In [109]:
# Fit the selected forest on the full training data.
# NOTE(review): the recorded GridSearchCV repr shows refit=True, so
# best_estimator_ was presumably already fitted on this data - confirm whether
# this second fit is still needed.
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Gradient Boosting Tree

### Hyperparameter Tuning: loss, learning_rate, max_depth, min_samples_split

In [64]:
# Candidate values for each gradient-boosting hyper-parameter.
loss, learning_rate = ['deviance', 'exponential'], [0.01, 0.05, 0.1]
max_depth = [5, 10, 20, 30, 40, 50]
# 10, 60, ..., 310
min_samples_split = np.arange(start=10, stop=320, step=50)

In [72]:
# Every (loss, learning_rate, max_depth, min_samples_split) combination,
# materialised as a list of tuples.
param_grid = [
    combo
    for combo in product(loss, learning_rate, max_depth, min_samples_split)
]

In [76]:
# Gradient-boosting search space, keyed by estimator parameter name.
param_grid = dict(
    loss=['deviance', 'exponential'],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[5, 10, 20, 30, 40, 50],
    min_samples_split=np.arange(10, 320, 50),
)

In [75]:
# Base gradient-boosting estimator with a fixed seed; the grid search below
# takes it as the estimator whose hyper-parameters it varies.
gbdt = GradientBoostingClassifier(random_state = 0)

In [112]:
# 2-fold grid search over the gradient-boosting search space.
# scoring='roc_auc' matches the metric used by the random-forest searches, so
# the best scores of the two model families are comparable; without it the
# search would rank candidates by the estimator's default score instead.
# n_jobs=-1 spreads the fits over all available cores; the recorded serial run
# of this cell was interrupted before it finished.
gbdt_grid = GridSearchCV(gbdt, param_grid, cv = 2,
                         scoring = 'roc_auc',
                         n_jobs = -1)
gbdt_grid.fit(X_train,y_train)

KeyboardInterrupt: 