In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold,cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from bayes_opt import BayesianOptimization
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
import gc
gc.enable()

In [2]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    print (categorical_columns)
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

print('Read data and test')
data = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
print('Shapes : ', data.shape, test.shape)

y = data['TARGET']
del data['TARGET']
gc.collect()
all_data = pd.concat((data, test)).reset_index(drop=True)

docs = [_f for _f in all_data.columns if 'FLAG_DOC' in _f]
live = [_f for _f in all_data.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
inc_by_org = all_data[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
all_data['NEW_INC_BY_ORG'] = all_data['ORGANIZATION_TYPE'].map(inc_by_org)
all_data['NEW_EXT_SOURCES_MEAN'] = all_data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
all_data['NEW_INC_PER_CHLD'] = all_data['AMT_INCOME_TOTAL'] /  all_data['CNT_CHILDREN']
all_data['NEW_DOC_IND_KURT'] = all_data[docs].kurtosis(axis=1)
all_data['NEW_LIVE_IND_SUM'] = all_data[live].sum(axis=1)
all_data['NEW_CREDIT_TO_GOODS_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_GOODS_PRICE']
all_data['NEW_SCORES_STD'] = all_data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
#all_data['NEW_SCORES_STD'] = all_data['NEW_SCORES_STD'].fillna(all_data['NEW_SCORES_STD'].mean())
all_data['NEW_CAR_TO_BIRTH_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_BIRTH']
all_data['NEW_CAR_TO_EMPLOY_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_PHONE_TO_BIRTH_RATIO'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_BIRTH']
all_data['NEW_PHONE_TO_DAYS_EMPLOYED'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_SOURCES_PROD'] = all_data['EXT_SOURCE_1'] * all_data['EXT_SOURCE_2'] * all_data['EXT_SOURCE_3']
print all_data.shape
for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    all_data[bin_feature], uniques = pd.factorize(all_data[bin_feature])
all_data, cat_cols = one_hot_encoder(all_data)
print all_data.shape
# Some simple new features (percentages)
#all_data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
n_data = data.shape[0]

all_data['DAYS_EMPLOYED_PERC'] = all_data['DAYS_EMPLOYED'] / all_data['DAYS_BIRTH']
all_data['INCOME_CREDIT_PERC'] = all_data['AMT_INCOME_TOTAL'] / all_data['AMT_CREDIT']
all_data['INCOME_PER_PERSON'] = all_data['AMT_INCOME_TOTAL'] / all_data['CNT_FAM_MEMBERS']
all_data['ANNUITY_INCOME_PERC'] = all_data['AMT_ANNUITY'] / all_data['AMT_INCOME_TOTAL']
all_data['PAYMENT_RATE'] = all_data['AMT_CREDIT'] / all_data['AMT_ANNUITY']
print all_data.shape
del data,test
gc.collect()

bureau = pd.read_csv('./input/bureau.csv')
buro_bal = pd.read_csv('./input/bureau_balance.csv')
buro_bal, bb_cat = one_hot_encoder(buro_bal)
bureau, bureau_cat = one_hot_encoder(bureau)
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max','mean']}
for col in bb_cat:
    bb_aggregations[col] = ['mean']
    bb_agg = buro_bal.groupby('SK_ID_BUREAU').agg(bb_aggregations)
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
bb_agg['BB_COUNT'] = buro_bal.groupby('SK_ID_BUREAU').size()
print bb_agg.shape
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
bureau.drop(columns= 'SK_ID_BUREAU', inplace= True)
del buro_bal,bb_agg
gc.collect()

num_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'CNT_CREDIT_PROLONG': ['sum','mean'],
    'AMT_CREDIT_SUM': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'DAYS_CREDIT_UPDATE': ['min','max','mean'],
    'AMT_ANNUITY': ['max', 'mean'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_MEAN': ['mean'],
    'BB_COUNT': ['mean', 'sum']
}
cat_aggregations = {}
for cat in bureau_cat: cat_aggregations[cat] = ['mean']
for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
bureau_agg = bureau.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
bureau_agg['BURO_COUNT'] = bureau.groupby('SK_ID_CURR').size()
print bureau_agg.shape

active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
active_agg.columns = pd.Index(['ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
active_agg['ACT_COUNT'] = active.groupby('SK_ID_CURR').size()
bureau_agg = bureau_agg.reset_index().join(active_agg, how='left', on='SK_ID_CURR')
print bureau_agg.shape
del active, active_agg
gc.collect()

closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(dict({'DAYS_ENDDATE_FACT': ['min','max','mean']},**num_aggregations))
closed_agg.columns = pd.Index(['CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
closed_agg['CLS_COUNT'] = closed.groupby('SK_ID_CURR').size()
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
print bureau_agg.shape
del closed, closed_agg, bureau
gc.collect()

all_data = all_data.merge(bureau_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del bureau_agg
gc.collect()

prev = pd.read_csv('./input/previous_application.csv')
prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
# Days 365.243 values -> nan
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# Add feature: value ask / value received percentage
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
# Previous applications numeric features
num_aggregations = {
    'AMT_ANNUITY': ['min','max', 'mean'],
    'AMT_APPLICATION': ['min','max', 'mean'],
    'AMT_CREDIT': ['min','max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'SELLERPLACE_AREA': ['min', 'max', 'mean'],
    'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['min', 'max', 'mean'],
    'DAYS_TERMINATION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum']
}
# Previous applications categorical features
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']

prev_agg = prev.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
prev_agg['PREV_COUNT'] = prev.groupby('SK_ID_CURR').size()
print prev_agg.shape
# Previous Applications: Approved Applications - only numerical features
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APR_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
approved_agg['APR_COUNT'] = approved.groupby('SK_ID_CURR').size()
prev_agg = prev_agg.reset_index().join(approved_agg, how='left', on='SK_ID_CURR')
print prev_agg.shape
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REF_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
refused_agg['REF_COUNT'] = refused.groupby('SK_ID_CURR').size()
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
print prev_agg.shape
del refused, refused_agg, approved, approved_agg, prev
gc.collect()

all_data = all_data.merge(prev_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del prev_agg
gc.collect()

pos = pd.read_csv('./input/POS_CASH_balance.csv')
pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
# Features
aggregations = {
    'MONTHS_BALANCE': ['min','max', 'mean'],
    'CNT_INSTALMENT': ['sum', 'mean'],
    'SK_DPD': ['sum','max','min','mean'],
    'SK_DPD_DEF': ['sum','max', 'min','mean']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']

pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
# Count pos cash accounts
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
print pos_agg.shape
del pos
gc.collect()

all_data = all_data.join(pos_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del pos_agg
gc.collect()

ins = pd.read_csv('./input/installments_payments.csv')
ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
# Percentage and difference paid in each installment (amount paid and installment value)
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
# Days past due and days before due (no negative values)
ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
    'DAYS_INSTALMENT': ['max', 'mean', 'sum']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Count installments accounts
ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
print ins_agg.shape
del ins
gc.collect()

all_data = all_data.join(ins_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del ins_agg
gc.collect()

cc = pd.read_csv('./input/credit_card_balance.csv')
cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
# Features: Perform aggregations
num_aggregations = {
    'MONTHS_BALANCE': ['min','max', 'mean'],
    'AMT_BALANCE': ['min','max', 'mean','sum'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['min','max', 'mean','sum','var'],
    'AMT_DRAWINGS_ATM_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_OTHER_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_POS_CURRENT': ['min','max', 'mean','sum'],
    'AMT_INST_MIN_REGULARITY': ['min','max', 'mean','sum','var'],
    'AMT_PAYMENT_CURRENT':['min','max', 'mean','sum'],
    'AMT_PAYMENT_TOTAL_CURRENT':['min','max', 'mean','sum'],
    'AMT_RECEIVABLE_PRINCIPAL':['min','max', 'mean','sum','var'],
    'AMT_RECIVABLE':['min','max', 'mean','sum','var'],
    'AMT_TOTAL_RECEIVABLE':['min','max', 'mean','sum','var'],
    'CNT_DRAWINGS_ATM_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_OTHER_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_POS_CURRENT':['mean','sum'],
    'CNT_INSTALMENT_MATURE_CUM':['mean','sum'],
    'SK_DPD':['max', 'mean', 'sum'],
    'SK_DPD_DEF':['max', 'mean', 'sum']
}
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']
# General aggregations
cc_agg = cc.groupby('SK_ID_CURR').agg(dict(num_aggregations,**cat_aggregations))
cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
# Count credit card lines
cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()

#cc_agg['INSTALLMENTS_PER_LOAN'] = cc_agg['CNT_INSTALMENT_MATURE_CUM_SUM'] / cc_agg['CC_COUNT']
del cc
print cc_agg.shape
gc.collect()

all_data = all_data.join(cc_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del cc_agg
gc.collect()

Read data and test
('Shapes : ', (307511, 122), (48744, 121))
(356255, 133)
['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
(356255, 266)
(356255, 271)
['STATUS']
['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
(817395, 13)
(305811, 67)
(305811, 100)
(305811, 135)
(356255, 405)
['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']
(338857, 208)
(338857, 258)
(338857, 307)
(356255, 711)
['NAME_CONTRACT_STATUS']
(337252, 24)
(356255, 735)
[]
(339587, 2

7

In [3]:
features_with_no_imp_at_least_twice = [
    'ACTIVE_CNT_CREDIT_PROLONG_SUM', 'ACTIVE_CREDIT_DAY_OVERDUE_MEAN', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'BURO_CNT_CREDIT_PROLONG_SUM', 'BURO_CREDIT_ACTIVE_Bad debt_MEAN', 'BURO_CREDIT_ACTIVE_nan_MEAN',
    'BURO_CREDIT_CURRENCY_currency 1_MEAN', 'BURO_CREDIT_CURRENCY_currency 2_MEAN', 'BURO_CREDIT_CURRENCY_currency 3_MEAN',
    'BURO_CREDIT_CURRENCY_currency 4_MEAN', 'BURO_CREDIT_CURRENCY_nan_MEAN', 'BURO_CREDIT_DAY_OVERDUE_MAX', 'BURO_CREDIT_DAY_OVERDUE_MEAN',
    'BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN', 'BURO_CREDIT_TYPE_Interbank credit_MEAN', 'BURO_CREDIT_TYPE_Loan for business development_MEAN',
    'BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN', 'BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN',
    'BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN', 'BURO_CREDIT_TYPE_Mobile operator loan_MEAN',
    'BURO_CREDIT_TYPE_Real estate loan_MEAN', 'BURO_CREDIT_TYPE_Unknown type of loan_MEAN', 'BURO_CREDIT_TYPE_nan_MEAN',
    'BURO_MONTHS_BALANCE_MAX_MAX', 'BURO_STATUS_2_MEAN_MEAN', 'BURO_STATUS_3_MEAN_MEAN', 'BURO_STATUS_4_MEAN_MEAN', 'BURO_STATUS_5_MEAN_MEAN',
    'BURO_STATUS_nan_MEAN_MEAN', 'CC_AMT_DRAWINGS_ATM_CURRENT_MIN', 'CC_AMT_DRAWINGS_CURRENT_MIN', 'CC_AMT_DRAWINGS_OTHER_CURRENT_MAX',
    'CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN', 'CC_AMT_DRAWINGS_OTHER_CURRENT_MIN', 'CC_AMT_DRAWINGS_OTHER_CURRENT_SUM',
    'CC_AMT_DRAWINGS_OTHER_CURRENT_VAR', 'CC_AMT_INST_MIN_REGULARITY_MIN', 'CC_AMT_PAYMENT_TOTAL_CURRENT_MIN', 'CC_AMT_PAYMENT_TOTAL_CURRENT_VAR',
    'CC_AMT_RECIVABLE_SUM', 'CC_AMT_TOTAL_RECEIVABLE_MAX', 'CC_AMT_TOTAL_RECEIVABLE_MIN', 'CC_AMT_TOTAL_RECEIVABLE_SUM', 'CC_AMT_TOTAL_RECEIVABLE_VAR',
    'CC_CNT_DRAWINGS_ATM_CURRENT_MIN', 'CC_CNT_DRAWINGS_CURRENT_MIN', 'CC_CNT_DRAWINGS_OTHER_CURRENT_MAX', 'CC_CNT_DRAWINGS_OTHER_CURRENT_MEAN',
    'CC_CNT_DRAWINGS_OTHER_CURRENT_MIN', 'CC_CNT_DRAWINGS_OTHER_CURRENT_SUM', 'CC_CNT_DRAWINGS_OTHER_CURRENT_VAR', 'CC_CNT_DRAWINGS_POS_CURRENT_SUM',
    'CC_CNT_INSTALMENT_MATURE_CUM_MAX', 'CC_CNT_INSTALMENT_MATURE_CUM_MIN', 'CC_COUNT', 'CC_MONTHS_BALANCE_MAX', 'CC_MONTHS_BALANCE_MEAN',
    'CC_MONTHS_BALANCE_MIN', 'CC_MONTHS_BALANCE_SUM', 'CC_NAME_CONTRACT_STATUS_Active_MAX', 'CC_NAME_CONTRACT_STATUS_Active_MIN',
    'CC_NAME_CONTRACT_STATUS_Approved_MAX', 'CC_NAME_CONTRACT_STATUS_Approved_MEAN', 'CC_NAME_CONTRACT_STATUS_Approved_MIN',
    'CC_NAME_CONTRACT_STATUS_Approved_SUM', 'CC_NAME_CONTRACT_STATUS_Approved_VAR', 'CC_NAME_CONTRACT_STATUS_Completed_MAX',
    'CC_NAME_CONTRACT_STATUS_Completed_MEAN', 'CC_NAME_CONTRACT_STATUS_Completed_MIN', 'CC_NAME_CONTRACT_STATUS_Completed_SUM', 'CC_NAME_CONTRACT_STATUS_Completed_VAR',
    'CC_NAME_CONTRACT_STATUS_Demand_MAX', 'CC_NAME_CONTRACT_STATUS_Demand_MEAN', 'CC_NAME_CONTRACT_STATUS_Demand_MIN', 'CC_NAME_CONTRACT_STATUS_Demand_SUM',
    'CC_NAME_CONTRACT_STATUS_Demand_VAR', 'CC_NAME_CONTRACT_STATUS_Refused_MAX', 'CC_NAME_CONTRACT_STATUS_Refused_MEAN', 'CC_NAME_CONTRACT_STATUS_Refused_MIN',
    'CC_NAME_CONTRACT_STATUS_Refused_SUM', 'CC_NAME_CONTRACT_STATUS_Refused_VAR', 'CC_NAME_CONTRACT_STATUS_Sent proposal_MAX',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN', 'CC_NAME_CONTRACT_STATUS_Sent proposal_MIN', 'CC_NAME_CONTRACT_STATUS_Sent proposal_SUM',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_VAR', 'CC_NAME_CONTRACT_STATUS_Signed_MAX', 'CC_NAME_CONTRACT_STATUS_Signed_MEAN', 'CC_NAME_CONTRACT_STATUS_Signed_MIN',
    'CC_NAME_CONTRACT_STATUS_Signed_SUM', 'CC_NAME_CONTRACT_STATUS_Signed_VAR', 'CC_NAME_CONTRACT_STATUS_nan_MAX', 'CC_NAME_CONTRACT_STATUS_nan_MEAN',
    'CC_NAME_CONTRACT_STATUS_nan_MIN', 'CC_NAME_CONTRACT_STATUS_nan_SUM', 'CC_NAME_CONTRACT_STATUS_nan_VAR', 'CC_SK_DPD_DEF_MAX',
    'CC_SK_DPD_DEF_MIN', 'CC_SK_DPD_DEF_SUM', 'CC_SK_DPD_DEF_VAR', 'CC_SK_DPD_MAX', 'CC_SK_DPD_MEAN', 'CC_SK_DPD_MIN', 'CC_SK_DPD_SUM',
    'CC_SK_DPD_VAR', 'CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN', 'CLOSED_AMT_CREDIT_SUM_LIMIT_SUM', 'CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN',
    'CLOSED_CNT_CREDIT_PROLONG_SUM', 'CLOSED_CREDIT_DAY_OVERDUE_MAX', 'CLOSED_CREDIT_DAY_OVERDUE_MEAN', 'CLOSED_MONTHS_BALANCE_MAX_MAX',
    'CNT_CHILDREN', 'ELEVATORS_MEDI', 'ELEVATORS_MODE', 'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes', 'ENTRANCES_MODE', 'FLAG_CONT_MOBILE',
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_EMAIL', 'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_OWN_CAR', 'FLOORSMAX_MODE',
    'FONDKAPREMONT_MODE_not specified', 'FONDKAPREMONT_MODE_org spec account', 'FONDKAPREMONT_MODE_reg oper account', 'FONDKAPREMONT_MODE_reg oper spec account',
    'HOUSETYPE_MODE_block of flats', 'HOUSETYPE_MODE_specific housing', 'HOUSETYPE_MODE_terraced house', 'LIVE_REGION_NOT_WORK_REGION',
    'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_EDUCATION_TYPE_Academic degree', 'NAME_FAMILY_STATUS_Civil marriage', 'NAME_FAMILY_STATUS_Single / not married',
    'NAME_FAMILY_STATUS_Unknown', 'NAME_FAMILY_STATUS_Widow', 'NAME_HOUSING_TYPE_Co-op apartment', 'NAME_HOUSING_TYPE_With parents',
    'NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Maternity leave', 'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_Student',
    'NAME_INCOME_TYPE_Unemployed', 'NAME_TYPE_SUITE_Children', 'NAME_TYPE_SUITE_Family', 'NAME_TYPE_SUITE_Group of people',
    'NAME_TYPE_SUITE_Other_A', 'NAME_TYPE_SUITE_Other_B', 'NAME_TYPE_SUITE_Spouse, partner', 'NAME_TYPE_SUITE_Unaccompanied',
    'NEW_RATIO_BURO_AMT_CREDIT_SUM_DEBT_MEAN', 'NEW_RATIO_BURO_AMT_CREDIT_SUM_LIMIT_SUM', 'NEW_RATIO_BURO_AMT_CREDIT_SUM_OVERDUE_MEAN',
    'NEW_RATIO_BURO_CNT_CREDIT_PROLONG_SUM', 'NEW_RATIO_BURO_CREDIT_DAY_OVERDUE_MAX', 'NEW_RATIO_BURO_CREDIT_DAY_OVERDUE_MEAN', 'NEW_RATIO_BURO_MONTHS_BALANCE_MAX_MAX',
    'NEW_RATIO_PREV_AMT_DOWN_PAYMENT_MIN', 'NEW_RATIO_PREV_RATE_DOWN_PAYMENT_MAX', 'OCCUPATION_TYPE_Cleaning staff', 'OCCUPATION_TYPE_Cooking staff',
    'OCCUPATION_TYPE_HR staff', 'OCCUPATION_TYPE_IT staff', 'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Managers',
    'OCCUPATION_TYPE_Private service staff', 'OCCUPATION_TYPE_Realty agents', 'OCCUPATION_TYPE_Sales staff', 'OCCUPATION_TYPE_Secretaries',
    'OCCUPATION_TYPE_Security staff', 'OCCUPATION_TYPE_Waiters/barmen staff', 'ORGANIZATION_TYPE_Advertising', 'ORGANIZATION_TYPE_Agriculture',
    'ORGANIZATION_TYPE_Business Entity Type 1', 'ORGANIZATION_TYPE_Business Entity Type 2', 'ORGANIZATION_TYPE_Cleaning', 'ORGANIZATION_TYPE_Culture',
    'ORGANIZATION_TYPE_Electricity', 'ORGANIZATION_TYPE_Emergency', 'ORGANIZATION_TYPE_Government', 'ORGANIZATION_TYPE_Hotel', 'ORGANIZATION_TYPE_Housing',
    'ORGANIZATION_TYPE_Industry: type 1', 'ORGANIZATION_TYPE_Industry: type 10', 'ORGANIZATION_TYPE_Industry: type 11', 'ORGANIZATION_TYPE_Industry: type 12',
    'ORGANIZATION_TYPE_Industry: type 13', 'ORGANIZATION_TYPE_Industry: type 2', 'ORGANIZATION_TYPE_Industry: type 3', 'ORGANIZATION_TYPE_Industry: type 4',
    'ORGANIZATION_TYPE_Industry: type 5', 'ORGANIZATION_TYPE_Industry: type 6', 'ORGANIZATION_TYPE_Industry: type 7', 'ORGANIZATION_TYPE_Industry: type 8',
    'ORGANIZATION_TYPE_Insurance', 'ORGANIZATION_TYPE_Legal Services', 'ORGANIZATION_TYPE_Mobile', 'ORGANIZATION_TYPE_Other', 'ORGANIZATION_TYPE_Postal',
    'ORGANIZATION_TYPE_Realtor', 'ORGANIZATION_TYPE_Religion', 'ORGANIZATION_TYPE_Restaurant', 'ORGANIZATION_TYPE_Security',
    'ORGANIZATION_TYPE_Security Ministries', 'ORGANIZATION_TYPE_Services', 'ORGANIZATION_TYPE_Telecom', 'ORGANIZATION_TYPE_Trade: type 1',
    'ORGANIZATION_TYPE_Trade: type 2', 'ORGANIZATION_TYPE_Trade: type 3', 'ORGANIZATION_TYPE_Trade: type 4', 'ORGANIZATION_TYPE_Trade: type 5',
    'ORGANIZATION_TYPE_Trade: type 6', 'ORGANIZATION_TYPE_Trade: type 7',
    'ORGANIZATION_TYPE_Transport: type 1', 'ORGANIZATION_TYPE_Transport: type 2', 'ORGANIZATION_TYPE_Transport: type 4', 'ORGANIZATION_TYPE_University',
    'ORGANIZATION_TYPE_XNA', 'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN', 'POS_NAME_CONTRACT_STATUS_Approved_MEAN', 'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
    'POS_NAME_CONTRACT_STATUS_Demand_MEAN', 'POS_NAME_CONTRACT_STATUS_XNA_MEAN', 'POS_NAME_CONTRACT_STATUS_nan_MEAN', 'PREV_CHANNEL_TYPE_Car dealer_MEAN',
    'PREV_CHANNEL_TYPE_nan_MEAN', 'PREV_CODE_REJECT_REASON_CLIENT_MEAN', 'PREV_CODE_REJECT_REASON_SYSTEM_MEAN', 'PREV_CODE_REJECT_REASON_VERIF_MEAN',
    'PREV_CODE_REJECT_REASON_XNA_MEAN', 'PREV_CODE_REJECT_REASON_nan_MEAN', 'PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN', 'PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN',
    'PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Buying a home_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a new car_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Buying a used car_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Education_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Everyday expenses_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Furniture_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Hobby_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Journey_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Money for a third person_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Other_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Payments on other loans_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_XAP_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_nan_MEAN', 'PREV_NAME_CLIENT_TYPE_XNA_MEAN',
    'PREV_NAME_CLIENT_TYPE_nan_MEAN', 'PREV_NAME_CONTRACT_STATUS_Unused offer_MEAN', 'PREV_NAME_CONTRACT_STATUS_nan_MEAN', 'PREV_NAME_CONTRACT_TYPE_XNA_MEAN',
    'PREV_NAME_CONTRACT_TYPE_nan_MEAN', 'PREV_NAME_GOODS_CATEGORY_Additional Service_MEAN', 'PREV_NAME_GOODS_CATEGORY_Animals_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Auto Accessories_MEAN', 'PREV_NAME_GOODS_CATEGORY_Clothing and Accessories_MEAN', 'PREV_NAME_GOODS_CATEGORY_Construction Materials_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Direct Sales_MEAN', 'PREV_NAME_GOODS_CATEGORY_Education_MEAN', 'PREV_NAME_GOODS_CATEGORY_Fitness_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Gardening_MEAN', 'PREV_NAME_GOODS_CATEGORY_Homewares_MEAN', 'PREV_NAME_GOODS_CATEGORY_House Construction_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Insurance_MEAN', 'PREV_NAME_GOODS_CATEGORY_Jewelry_MEAN', 'PREV_NAME_GOODS_CATEGORY_Medical Supplies_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Medicine_MEAN', 'PREV_NAME_GOODS_CATEGORY_Office Appliances_MEAN', 'PREV_NAME_GOODS_CATEGORY_Other_MEAN', 'PREV_NAME_GOODS_CATEGORY_Tourism_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Vehicles_MEAN', 'PREV_NAME_GOODS_CATEGORY_Weapon_MEAN', 'PREV_NAME_GOODS_CATEGORY_XNA_MEAN', 'PREV_NAME_GOODS_CATEGORY_nan_MEAN',
    'PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN', 'PREV_NAME_PAYMENT_TYPE_Non-cash from your account_MEAN', 'PREV_NAME_PAYMENT_TYPE_nan_MEAN',
    'PREV_NAME_PORTFOLIO_Cars_MEAN', 'PREV_NAME_PORTFOLIO_nan_MEAN', 'PREV_NAME_PRODUCT_TYPE_nan_MEAN', 'PREV_NAME_SELLER_INDUSTRY_Construction_MEAN',
    'PREV_NAME_SELLER_INDUSTRY_Furniture_MEAN', 'PREV_NAME_SELLER_INDUSTRY_Industry_MEAN', 'PREV_NAME_SELLER_INDUSTRY_Jewelry_MEAN', 'PREV_NAME_SELLER_INDUSTRY_MLM partners_MEAN',
    'PREV_NAME_SELLER_INDUSTRY_Tourism_MEAN', 'PREV_NAME_SELLER_INDUSTRY_nan_MEAN', 'PREV_NAME_TYPE_SUITE_Group of people_MEAN', 'PREV_NAME_YIELD_GROUP_nan_MEAN',
    'PREV_PRODUCT_COMBINATION_POS industry without interest_MEAN', 'PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN', 'PREV_PRODUCT_COMBINATION_POS others without interest_MEAN',
    'PREV_PRODUCT_COMBINATION_nan_MEAN', 'PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN', 'REFUSED_AMT_DOWN_PAYMENT_MAX', 'REFUSED_AMT_DOWN_PAYMENT_MEAN',
    'REFUSED_RATE_DOWN_PAYMENT_MIN', 'REG_CITY_NOT_WORK_CITY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel',
    'WALLSMATERIAL_MODE_Wooden', 'WEEKDAY_APPR_PROCESS_START_FRIDAY', 'WEEKDAY_APPR_PROCESS_START_THURSDAY', 'WEEKDAY_APPR_PROCESS_START_TUESDAY'
]

In [4]:
exclude_fea = [x for x in all_data.columns.values if x in features_with_no_imp_at_least_twice]
exclude_fea.append('SK_ID_CURR') 

In [5]:
len(exclude_fea)

275

In [6]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
drop_col = missing_data[missing_data['Missing Ratio']==100]
all_data.drop(drop_col.index.values,1,inplace=True)

data = all_data[:n_data]
test = all_data[n_data:]
#del avg_buro, avg_prev,avg_pos,avg_cc_bal,avg_inst
del all_data
gc.collect()
print data.shape
print test.shape

(307511, 830)
(48744, 830)


In [3]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
drop_col = missing_data[missing_data['Missing Ratio']==100]
missing_data = missing_data[missing_data['Missing Ratio']<100]
all_data.drop(drop_col.index.values,1,inplace=True)
for i in missing_data.index.values:
    all_data[i] = all_data[i].fillna(-999999999)
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
print missing_data
all_data[all_data==np.inf]=999999999
all_data[all_data==-np.inf]=-999999999
data = all_data[:n_data]
test = all_data[n_data:]
#del avg_buro, avg_prev,avg_pos,avg_cc_bal,avg_inst
del all_data
gc.collect()
print data.shape
print test.shape

exclude_fea = []
exclude_fea.append('SK_ID_CURR') 
exclude_fea

Empty DataFrame
Columns: [Missing Ratio]
Index: []
(307511, 830)
(48744, 830)


['SK_ID_CURR']

In [4]:
NFOLDS=5
SEED = 1700
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
feats = [f for f in data.columns if f not in exclude_fea]

In [5]:
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    def predict(self, x):
        return self.clf.predict(x)
    def predict_proba(self,x):
        return self.clf.predict_proba(x)[:, 1]

In [6]:
def get_oof(clff,param, x_train, y_train, x_test):
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((NFOLDS, x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds.split(x_train)):
        x_tr = x_train[feats].iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train[feats].iloc[test_index]
        y_te = y_train.iloc[test_index]
        clf = SklearnHelper(clf=clff, seed=SEED, params=param)
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)
        oof_test_skf[i, :] = clf.predict_proba(x_test[feats])
        print('Fold %2d Train AUC : %.6f' % (i + 1, roc_auc_score(y_tr, clf.predict_proba(x_tr))))
        print('Fold %2d Test AUC : %.6f' % (i + 1, roc_auc_score(y_te, oof_train[test_index])))
        del clf
        gc.collect()
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [7]:
log_params = {
    'solver': 'sag',
    'max_iter': 500, 
    'C': 10,
    'random_state': 1700,
    'verbose': 10,
    'n_jobs':-1
}

In [8]:
oof_train, oof_test = get_oof(LogisticRegression,log_params,data, y, test)

max_iter reached after 728 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 12.1min finished


Fold  1 Train AUC : 0.613096
Fold  1 Test AUC : 0.613403
max_iter reached after 532 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  8.9min finished


Fold  2 Train AUC : 0.614944
Fold  2 Test AUC : 0.607860
max_iter reached after 540 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  9.0min finished


Fold  3 Train AUC : 0.612972
Fold  3 Test AUC : 0.617557
max_iter reached after 532 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  8.9min finished


Fold  4 Train AUC : 0.613295
Fold  4 Test AUC : 0.612553
max_iter reached after 532 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  8.9min finished


Fold  5 Train AUC : 0.614724
Fold  5 Test AUC : 0.609208


In [20]:
oof_train_csv = pd.DataFrame()
oof_train_csv['ID'] = data.SK_ID_CURR
oof_train_csv['xgb_oof_train'] = oof_train

oof_train_csv.to_csv('./output/xgb_oof_train2.csv', index=False)

oof_test_csv = pd.DataFrame()
oof_test_csv['ID'] = test.SK_ID_CURR
oof_test_csv['xgb_oof_test'] = oof_test

oof_test_csv.to_csv('./output/xgb_oof_test2.csv', index=False)

In [17]:
NFOLDS=5
SEED = 1700
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
feats = [f for f in data.columns if f not in exclude_fea]

In [8]:
len(feats)

555

In [18]:
def get_xgb_oof(x_train, y_train, x_test):
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((NFOLDS, x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds.split(x_train)):
        x_tr = x_train[feats].iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train[feats].iloc[test_index]
        y_te = y_train.iloc[test_index]
        clf = XGBClassifier(learning_rate =0.01,
                            n_estimators=10000, 
                            max_depth=4, 
                            min_child_weight=5, 
                            subsample=0.8, 
                            colsample_bytree=0.8,
                            objective= 'binary:logistic', 
                            scale_pos_weight=2.5, 
                            random_state=SEED,
                            reg_lambda = 1.2,
                            tree_method='gpu_hist')

        clf.fit(x_tr, y_tr, eval_set=[(x_tr, y_tr), (x_te, y_te)], 
                eval_metric= 'auc', verbose= 100, early_stopping_rounds= 300)

        oof_train[test_index] = clf.predict_proba(x_te)[:,1]
        oof_test_skf[i, :] = clf.predict_proba(x_test[feats])[:,1]
        print('Fold %2d Train AUC : %.6f' % (i + 1, roc_auc_score(y_tr, clf.predict_proba(x_tr)[:,1])))
        print('Fold %2d Test AUC : %.6f' % (i + 1, roc_auc_score(y_te, oof_train[test_index])))
        del clf,x_tr,y_tr,x_te,y_te
        gc.collect()
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [19]:
oof_train, oof_test = get_xgb_oof(data, y, test)

[0]	validation_0-auc:0.720789	validation_1-auc:0.716476
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 300 rounds.
[100]	validation_0-auc:0.742665	validation_1-auc:0.736431
[200]	validation_0-auc:0.752352	validation_1-auc:0.743934
[300]	validation_0-auc:0.763455	validation_1-auc:0.752831
[400]	validation_0-auc:0.774074	validation_1-auc:0.761076
[500]	validation_0-auc:0.781714	validation_1-auc:0.766764
[600]	validation_0-auc:0.787406	validation_1-auc:0.770945
[700]	validation_0-auc:0.792121	validation_1-auc:0.774145
[800]	validation_0-auc:0.796083	validation_1-auc:0.776596
[900]	validation_0-auc:0.799564	validation_1-auc:0.778524
[1000]	validation_0-auc:0.802624	validation_1-auc:0.780137
[1100]	validation_0-auc:0.805573	validation_1-auc:0.781445
[1200]	validation_0-auc:0.808177	validation_1-auc:0.782493
[1300]	validation_0-auc:0.810571	validation_1-auc:0.783404
[1400]	validation_0-auc:0.81

[5600]	validation_0-auc:0.87417	validation_1-auc:0.798273
Stopping. Best iteration:
[5324]	validation_0-auc:0.871171	validation_1-auc:0.798301

Fold  2 Train AUC : 0.874440
Fold  2 Test AUC : 0.798292
[0]	validation_0-auc:0.719003	validation_1-auc:0.717515
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 300 rounds.
[100]	validation_0-auc:0.741807	validation_1-auc:0.739089
[200]	validation_0-auc:0.751615	validation_1-auc:0.747706
[300]	validation_0-auc:0.762571	validation_1-auc:0.755977
[400]	validation_0-auc:0.773321	validation_1-auc:0.764036
[500]	validation_0-auc:0.78116	validation_1-auc:0.769443
[600]	validation_0-auc:0.787139	validation_1-auc:0.773206
[700]	validation_0-auc:0.79186	validation_1-auc:0.776113
[800]	validation_0-auc:0.795877	validation_1-auc:0.778312
[900]	validation_0-auc:0.799512	validation_1-auc:0.780221
[1000]	validation_0-auc:0.80272	validation_1-auc:0.781603
[1100]	

[6200]	validation_0-auc:0.881331	validation_1-auc:0.794109
[6300]	validation_0-auc:0.882304	validation_1-auc:0.794091
[6400]	validation_0-auc:0.883325	validation_1-auc:0.794115
[6500]	validation_0-auc:0.884396	validation_1-auc:0.794082
[6600]	validation_0-auc:0.885342	validation_1-auc:0.794123
[6700]	validation_0-auc:0.886297	validation_1-auc:0.794121
[6800]	validation_0-auc:0.887271	validation_1-auc:0.794102
Stopping. Best iteration:
[6588]	validation_0-auc:0.885214	validation_1-auc:0.79415

Fold  4 Train AUC : 0.888148
Fold  4 Test AUC : 0.794084
[0]	validation_0-auc:0.722516	validation_1-auc:0.713287
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 300 rounds.
[100]	validation_0-auc:0.743444	validation_1-auc:0.731682
[200]	validation_0-auc:0.753557	validation_1-auc:0.739609
[300]	validation_0-auc:0.764411	validation_1-auc:0.748078
[400]	validation_0-auc:0.774767	validation_1-auc:0.75632


In [9]:
def get_lgb_oof(x_train, y_train, x_test):
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((NFOLDS, x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds.split(x_train)):
        x_tr = x_train[feats].iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train[feats].iloc[test_index]
        y_te = y_train.iloc[test_index]
        clf = LGBMClassifier(
            n_estimators=10000,
            learning_rate=0.01,
            num_leaves=31,
            colsample_bytree=0.9338690,
            subsample=0.97291513,
            max_depth=8,
            reg_alpha=0.06663684,
            reg_lambda=0.0988343,
            min_split_gain=0.07916152,
            min_child_weight=82.8850799,
            random_state=1700,
        )
        clf.fit(x_tr, y_tr, 
            eval_set= [(x_tr, y_tr), (x_te, y_te)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=300  #30
           )

        oof_train[test_index] = clf.predict_proba(x_te, num_iteration=clf.best_iteration_)[:,1]
        oof_test_skf[i, :] = clf.predict_proba(x_test[feats], num_iteration=clf.best_iteration_)[:,1]
        print('Fold %2d Train AUC : %.6f' % (i + 1, roc_auc_score(y_tr, clf.predict_proba(x_tr, num_iteration=clf.best_iteration_)[:,1])))
        print('Fold %2d Test AUC : %.6f' % (i + 1, roc_auc_score(y_te, oof_train[test_index])))
        del clf,x_tr,y_tr,x_te,y_te
        gc.collect()
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [10]:
oof_train, oof_test = get_lgb_oof(data, y, test)

Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.747738	valid_1's auc: 0.739531
[200]	training's auc: 0.756677	valid_1's auc: 0.744977
[300]	training's auc: 0.769407	valid_1's auc: 0.754124
[400]	training's auc: 0.781641	valid_1's auc: 0.763256
[500]	training's auc: 0.792469	valid_1's auc: 0.770741
[600]	training's auc: 0.80074	valid_1's auc: 0.776132
[700]	training's auc: 0.807385	valid_1's auc: 0.779873
[800]	training's auc: 0.812949	valid_1's auc: 0.782526
[900]	training's auc: 0.817778	valid_1's auc: 0.78436
[1000]	training's auc: 0.822059	valid_1's auc: 0.785877
[1100]	training's auc: 0.826026	valid_1's auc: 0.786973
[1200]	training's auc: 0.829681	valid_1's auc: 0.787944
[1300]	training's auc: 0.832903	valid_1's auc: 0.788772
[1400]	training's auc: 0.836207	valid_1's auc: 0.789485
[1500]	training's auc: 0.839298	valid_1's auc: 0.790071
[1600]	training's auc: 0.842129	valid_1's auc: 0.790551
[1700]	training's auc: 0.84485	valid_1's auc: 0.7910

[1400]	training's auc: 0.835507	valid_1's auc: 0.79001
[1500]	training's auc: 0.838424	valid_1's auc: 0.790406
[1600]	training's auc: 0.841253	valid_1's auc: 0.790909
[1700]	training's auc: 0.843902	valid_1's auc: 0.791231
[1800]	training's auc: 0.846494	valid_1's auc: 0.791554
[1900]	training's auc: 0.849046	valid_1's auc: 0.791801
[2000]	training's auc: 0.851625	valid_1's auc: 0.792072
[2100]	training's auc: 0.854049	valid_1's auc: 0.792356
[2200]	training's auc: 0.856483	valid_1's auc: 0.792584
[2300]	training's auc: 0.85884	valid_1's auc: 0.792754
[2400]	training's auc: 0.860963	valid_1's auc: 0.792952
[2500]	training's auc: 0.863159	valid_1's auc: 0.793116
[2600]	training's auc: 0.865318	valid_1's auc: 0.793329
[2700]	training's auc: 0.867484	valid_1's auc: 0.793396
[2800]	training's auc: 0.869587	valid_1's auc: 0.793496
[2900]	training's auc: 0.871675	valid_1's auc: 0.793539
[3000]	training's auc: 0.873656	valid_1's auc: 0.793575
[3100]	training's auc: 0.875543	valid_1's auc: 0.7

In [14]:
test['TARGET'] = oof_test

test[['SK_ID_CURR', 'TARGET']].to_csv('submission.csv', index=False)