In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import gc
%matplotlib inline

In [2]:
sns.set_style('whitegrid')
train_data = pd.read_csv('E:/pnk/kaggle_credit/application_train.csv')
test_data = pd.read_csv('E:/pnk/kaggle_credit/application_test.csv')
bureau_data = pd.read_csv('E:/pnk/kaggle_credit/bureau.csv')
previous_data = pd.read_csv('E:/pnk/kaggle_credit/previous_application.csv')

# 探索bureau文件特征

In [3]:
bureau_data.drop(labels=['DAYS_ENDDATE_FACT','AMT_ANNUITY'], axis=1, inplace=True)

In [4]:
num_variables = []
cat_variables = []
for col in bureau_data.columns:
    if bureau_data[col].dtype=='object':
        cat_variables.append(col)
    else:
        num_variables.append(col)

In [5]:
num_dict = {}
for col in num_variables:
    num_dict[col] = bureau_data[col].count()/len(bureau_data)
print(num_dict)

{'SK_ID_CURR': 1.0, 'SK_ID_BUREAU': 1.0, 'DAYS_CREDIT': 1.0, 'CREDIT_DAY_OVERDUE': 1.0, 'DAYS_CREDIT_ENDDATE': 0.93850426583579383, 'AMT_CREDIT_MAX_OVERDUE': 0.34486736408401636, 'CNT_CREDIT_PROLONG': 1.0, 'AMT_CREDIT_SUM': 0.99999242613147765, 'AMT_CREDIT_SUM_DEBT': 0.84988068244051018, 'AMT_CREDIT_SUM_LIMIT': 0.65522585275933509, 'AMT_CREDIT_SUM_OVERDUE': 1.0, 'DAYS_CREDIT_UPDATE': 1.0}


In [6]:
def fill_with0(origin_data, fillmid_cols):
    for i in fillmid_cols:
        origin_data[i].fillna(0,inplace=True)
    return
fill_with0(bureau_data, list(num_dict.keys()))

In [7]:
bureau_data = pd.get_dummies(bureau_data)

In [8]:
bureau_data.columns

Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE',
       'DAYS_CREDIT_ENDDATE', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
       'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
       'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'CREDIT_ACTIVE_Active',
       'CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Closed', 'CREDIT_ACTIVE_Sold',
       'CREDIT_CURRENCY_currency 1', 'CREDIT_CURRENCY_currency 2',
       'CREDIT_CURRENCY_currency 3', 'CREDIT_CURRENCY_currency 4',
       'CREDIT_TYPE_Another type of loan', 'CREDIT_TYPE_Car loan',
       'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit',
       'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit',
       'CREDIT_TYPE_Loan for business development',
       'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
       'CREDIT_TYPE_Loan for the purchase of equipment',
       'CREDIT_TYPE_Loan for working capital replenishment',
       'CREDIT_TYPE_Microloan', 'CR

In [9]:
sum_features = ['CREDIT_DAY_OVERDUE','AMT_CREDIT_MAX_OVERDUE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM',
               'AMT_CREDIT_SUM_DEBT','AMT_CREDIT_SUM_LIMIT','AMT_CREDIT_SUM_OVERDUE']
for col in bureau_data.columns:
    if col.startswith('CREDIT_ACTIVE') or col.startswith('CREDIT_CURRENCY') or col.startswith('CREDIT_TYPE'):
        sum_features.append(col)
minmax_features = []
for col in bureau_data.columns:
    if col not in ['SK_ID_CURR','SK_ID_BUREAU'] and col not in sum_features:
        minmax_features.append(col)
sum_features.append('SK_ID_CURR')
minmax_features.append('SK_ID_CURR')
print(sum_features)
print(minmax_features)

['CREDIT_DAY_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_ACTIVE_Active', 'CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Closed', 'CREDIT_ACTIVE_Sold', 'CREDIT_CURRENCY_currency 1', 'CREDIT_CURRENCY_currency 2', 'CREDIT_CURRENCY_currency 3', 'CREDIT_CURRENCY_currency 4', 'CREDIT_TYPE_Another type of loan', 'CREDIT_TYPE_Car loan', 'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit', 'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit', 'CREDIT_TYPE_Loan for business development', 'CREDIT_TYPE_Loan for purchase of shares (margin lending)', 'CREDIT_TYPE_Loan for the purchase of equipment', 'CREDIT_TYPE_Loan for working capital replenishment', 'CREDIT_TYPE_Microloan', 'CREDIT_TYPE_Mobile operator loan', 'CREDIT_TYPE_Mortgage', 'CREDIT_TYPE_Real estate loan', 'CREDIT_TYPE_Unknown type of loan', 'SK_ID_CURR']
['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPD

In [10]:
bureau_sum = bureau_data[sum_features].groupby(by=['SK_ID_CURR']).agg('sum')
bureau_sum.reset_index(inplace=True)

In [11]:
bureau_min = bureau_data[minmax_features].groupby(by=['SK_ID_CURR']).agg('min')
bureau_min.columns=['DAYS_CREDIT_MIN','DAYS_CREDIT_ENDDATE_MIN','DAYS_CREDIT_UPDATE_MIN']
bureau_min.reset_index(inplace=True)

In [12]:
bureau_max = bureau_data[minmax_features].groupby(by=['SK_ID_CURR']).agg('max')
bureau_max.columns=['DAYS_CREDIT_MAX','DAYS_CREDIT_ENDDATE_MAX','DAYS_CREDIT_UPDATE_MAX']
bureau_max.reset_index(inplace=True)

In [13]:
bureau_sum = bureau_sum.merge(bureau_min, on='SK_ID_CURR', how='inner')
bureau_sum = bureau_sum.merge(bureau_max, on='SK_ID_CURR', how='inner')

In [427]:
bureau_data.describe()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
count,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,...,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0
mean,278214.9,5924434.0,-1142.108,0.8181666,479.1227,1319.262,0.006410406,354991.9,116506.0,4081.739,...,5.826053e-07,0.001150645,2.330421e-06,1.10695e-05,0.0002732419,0.007231879,5.826053e-07,0.01071469,1.573034e-05,0.0003233459
std,102938.6,532265.7,795.1649,36.54443,4839.776,121006.5,0.09622391,1149807.0,626405.8,36571.69,...,0.0007632858,0.03390165,0.00152657,0.003327068,0.01652778,0.08473242,0.0007632858,0.1029558,0.00396612,0.01797892
min,100001.0,5000000.0,-2922.0,0.0,-42060.0,0.0,0.0,0.0,-4705600.0,-586406.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,188866.8,5463954.0,-1666.0,0.0,-1074.0,0.0,0.0,51300.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278055.0,5926304.0,-987.0,0.0,-237.0,0.0,0.0,125518.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,367426.0,6385681.0,-474.0,0.0,389.0,0.0,0.0,315000.0,1975.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,456255.0,6843457.0,0.0,2792.0,31199.0,115987200.0,9.0,585000000.0,170100000.0,4705600.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
bureau_data.dtypes.value_counts()

uint8      23
float64     6
int64       6
dtype: int64

# 探索previous_application文件特征

In [14]:
previous_data.drop(labels=['WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START',
                          'FLAG_LAST_APPL_PER_CONTRACT','NFLAG_LAST_APPL_IN_DAY',
                          'RATE_DOWN_PAYMENT','RATE_INTEREST_PRIMARY','RATE_INTEREST_PRIVILEGED',
                          'DAYS_FIRST_DRAWING','DAYS_FIRST_DUE','DAYS_LAST_DUE_1ST_VERSION',
                          'DAYS_LAST_DUE','DAYS_TERMINATION',], axis=1, inplace=True)

In [15]:
num_variables = []
cat_variables = []
for col in previous_data.columns:
    if previous_data[col].dtype=='object':
        cat_variables.append(col)
    else:
        num_variables.append(col)

In [16]:
num_dict = {}
for col in num_variables:
    num_dict[col] = previous_data[col].count()/len(previous_data)
print(num_dict)

{'SK_ID_PREV': 1.0, 'SK_ID_CURR': 1.0, 'AMT_ANNUITY': 0.77713334937918133, 'AMT_APPLICATION': 1.0, 'AMT_CREDIT': 0.99999940127432774, 'AMT_DOWN_PAYMENT': 0.46363519884278304, 'AMT_GOODS_PRICE': 0.76918227245131465, 'DAYS_DECISION': 1.0, 'SELLERPLACE_AREA': 1.0, 'CNT_PAYMENT': 0.77713634300754275, 'NFLAG_INSURED_ON_APPROVAL': 0.59701870538745339}


In [17]:
tt_data = previous_data[previous_data['NAME_CONTRACT_STATUS']!='Canceled']
num_dict = {}
for col in num_variables:
    num_dict[col] = tt_data[col].count()/len(tt_data)
print(num_dict)

{'SK_ID_PREV': 1.0, 'SK_ID_CURR': 1.0, 'AMT_ANNUITY': 0.95093415663696224, 'AMT_APPLICATION': 1.0, 'AMT_CREDIT': 0.99999926139028505, 'AMT_DOWN_PAYMENT': 0.57148227890641445, 'AMT_GOODS_PRICE': 0.94095184633963491, 'DAYS_DECISION': 1.0, 'SELLERPLACE_AREA': 1.0, 'CNT_PAYMENT': 0.95093784968553696, 'NFLAG_INSURED_ON_APPROVAL': 0.73650393863630492}


In [18]:
def fill_with0(origin_data, fillmid_cols):
    for i in fillmid_cols:
        origin_data[i].fillna(0,inplace=True)
    return
fill_with0(previous_data, list(num_dict.keys()))

In [19]:
previous_data = pd.get_dummies(previous_data)
previous_data.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'AMT_ANNUITY', 'AMT_APPLICATION',
       'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'DAYS_DECISION',
       'SELLERPLACE_AREA', 'CNT_PAYMENT',
       ...
       'PRODUCT_COMBINATION_Cash X-Sell: low',
       'PRODUCT_COMBINATION_Cash X-Sell: middle',
       'PRODUCT_COMBINATION_POS household with interest',
       'PRODUCT_COMBINATION_POS household without interest',
       'PRODUCT_COMBINATION_POS industry with interest',
       'PRODUCT_COMBINATION_POS industry without interest',
       'PRODUCT_COMBINATION_POS mobile with interest',
       'PRODUCT_COMBINATION_POS mobile without interest',
       'PRODUCT_COMBINATION_POS other with interest',
       'PRODUCT_COMBINATION_POS others without interest'],
      dtype='object', length=145)

In [20]:
sum_features = ['NFLAG_INSURED_ON_APPROVAL']
for col in previous_data.columns:
    if col.startswith('NAME_CONTRACT_TYPE') or col.startswith('NAME_CASH_LOAN_PURPOSE') or col.startswith('NAME_CONTRACT_STATUS') \
    or col.startswith('NAME_PAYMENT_TYPE') or col.startswith('CODE_REJECT_REASON') or col.startswith('NAME_TYPE_SUITE') \
    or col.startswith('NAME_CLIENT_TYPE') or col.startswith('NAME_GOODS_CATEGORY') or col.startswith('NAME_PORTFOLIO') \
    or col.startswith('NAME_PRODUCT_TYPE') or col.startswith('CHANNEL_TYPE') or col.startswith('NAME_SELLER_INDUSTRY') \
    or col.startswith('NAME_YIELD_GROUP') or col.startswith('PRODUCT_COMBINATION'):
        sum_features.append(col)
minmax_features = []
for col in previous_data.columns:
    if col not in ['SK_ID_CURR','SK_ID_BUREAU'] and col not in sum_features:
        minmax_features.append(col)
sum_features.append('SK_ID_CURR')
minmax_features.append('SK_ID_CURR')
print(sum_features)
print(minmax_features)

['NFLAG_INSURED_ON_APPROVAL', 'NAME_CONTRACT_TYPE_Cash loans', 'NAME_CONTRACT_TYPE_Consumer loans', 'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_CONTRACT_TYPE_XNA', 'NAME_CASH_LOAN_PURPOSE_Building a house or an annex', 'NAME_CASH_LOAN_PURPOSE_Business development', 'NAME_CASH_LOAN_PURPOSE_Buying a garage', 'NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land', 'NAME_CASH_LOAN_PURPOSE_Buying a home', 'NAME_CASH_LOAN_PURPOSE_Buying a new car', 'NAME_CASH_LOAN_PURPOSE_Buying a used car', 'NAME_CASH_LOAN_PURPOSE_Car repairs', 'NAME_CASH_LOAN_PURPOSE_Education', 'NAME_CASH_LOAN_PURPOSE_Everyday expenses', 'NAME_CASH_LOAN_PURPOSE_Furniture', 'NAME_CASH_LOAN_PURPOSE_Gasification / water supply', 'NAME_CASH_LOAN_PURPOSE_Hobby', 'NAME_CASH_LOAN_PURPOSE_Journey', 'NAME_CASH_LOAN_PURPOSE_Medicine', 'NAME_CASH_LOAN_PURPOSE_Money for a third person', 'NAME_CASH_LOAN_PURPOSE_Other', 'NAME_CASH_LOAN_PURPOSE_Payments on other loans', 'NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment', 'NAME_CAS

In [21]:
previous_sum = previous_data[sum_features].groupby(by=['SK_ID_CURR']).agg('sum')
previous_sum.reset_index(inplace=True)

In [22]:
previous_min = previous_data[previous_data['NAME_CONTRACT_STATUS_Canceled']==0][minmax_features].groupby(by=['SK_ID_CURR']).agg('min')
previous_min.columns=['SK_ID_PREV_MIN','AMT_ANNUITY_MIN','AMT_APPLICATION_MIN','AMT_CREDIT_MIN','AMT_DOWN_PAYMENT_MIN',
                     'AMT_GOODS_PRICE_MIN','DAYS_DECISION_MIN','SELLERPLACE_AREA_MIN','CNT_PAYMENT_MIN']
previous_min.reset_index(inplace=True)

In [23]:
previous_max = previous_data[previous_data['NAME_CONTRACT_STATUS_Canceled']==0][minmax_features].groupby(by=['SK_ID_CURR']).agg('max')
previous_max.columns=['SK_ID_PREV_MAX','AMT_ANNUITY_MAX','AMT_APPLICATION_MAX','AMT_CREDIT_MAX','AMT_DOWN_PAYMENT_MAX',
                     'AMT_GOODS_PRICE_MAX','DAYS_DECISION_MAX','SELLERPLACE_AREA_MAX','CNT_PAYMENT_MAX']
previous_max.reset_index(inplace=True)

In [24]:
previous_sum = previous_sum.merge(previous_min, on='SK_ID_CURR', how='left')
previous_sum = previous_sum.merge(previous_max, on='SK_ID_CURR', how='left')

In [410]:
previous_sum.describe()

Unnamed: 0,SK_ID_CURR,NFLAG_INSURED_ON_APPROVAL,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,NAME_CASH_LOAN_PURPOSE_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_Business development,NAME_CASH_LOAN_PURPOSE_Buying a garage,NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land,...,CNT_PAYMENT_MIN,SK_ID_PREV_MAX,AMT_ANNUITY_MAX,AMT_APPLICATION_MAX,AMT_CREDIT_MAX,AMT_DOWN_PAYMENT_MAX,AMT_GOODS_PRICE_MAX,DAYS_DECISION_MAX,SELLERPLACE_AREA_MAX,CNT_PAYMENT_MAX
count,338857.0,338857.0,338857.0,338857.0,338857.0,338857.0,338857.0,338857.0,338857.0,338857.0,...,338602.0,338602.0,338602.0,338602.0,338602.0,338602.0,338602.0,338602.0,338602.0,338602.0
mean,278149.909581,0.978649,2.206102,2.151796,0.570046,0.001021,0.007947,0.001257,0.000401,0.001573,...,6.072802,2331918.0,23712.249652,370304.0,409200.5,11064.04,370332.0,-544.780108,974.4344,23.797373
std,102879.193103,1.321227,3.297706,1.820743,1.024132,0.034939,0.130419,0.044382,0.028947,0.057955,...,7.098769,461450.6,19171.717738,415856.4,451482.9,27882.06,415864.2,542.642901,15686.19,16.523802
min,100001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1000001.0,0.0,0.0,0.0,0.0,0.0,-2922.0,-1.0,0.0
25%,189061.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2075677.0,10417.73625,98950.5,104926.5,0.0,98950.5,-671.0,50.0,12.0
50%,278221.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,2479420.0,17909.505,202500.0,225000.0,4995.0,202500.0,-370.0,159.0,18.0
75%,367302.0,1.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,10.0,2703941.0,31452.01875,454500.0,541629.0,13500.0,454500.0,-205.0,1200.0,36.0
max,456255.0,23.0,66.0,48.0,31.0,3.0,13.0,5.0,8.0,7.0,...,72.0,2845382.0,418058.145,6905160.0,6905160.0,3060045.0,6905160.0,-1.0,4000000.0,84.0


In [25]:
previous_sum.columns = list(map(lambda x: 'PRE'+x if x!='SK_ID_CURR' else x, list(previous_sum.columns)))

# 百分比特征

In [26]:
bureau_sum['AMT_OVERDUE_PERCENT'] = bureau_sum['AMT_CREDIT_MAX_OVERDUE'] / bureau_sum['AMT_CREDIT_SUM']
bureau_sum['AMT_DEBT_PERCENT'] = bureau_sum['AMT_CREDIT_SUM_DEBT'] / bureau_sum['AMT_CREDIT_SUM']
bureau_sum['AMT_OVERDUESUM_PERCENT'] = bureau_sum['AMT_CREDIT_SUM_OVERDUE'] / bureau_sum['AMT_CREDIT_SUM']
previous_sum['AMT_ANUCREDIT_PERCENT'] = previous_sum['PREAMT_ANNUITY_MAX'] / previous_sum['PREAMT_CREDIT_MAX']

train_data = train_data.merge(bureau_sum, on='SK_ID_CURR', how='left')
train_data = train_data.merge(previous_sum, on='SK_ID_CURR', how='left')
test_data = test_data.merge(bureau_sum, on='SK_ID_CURR', how='left')
test_data = test_data.merge(previous_sum, on='SK_ID_CURR', how='left')
train_data['HAS_BUREAU'] = train_data['CNT_CREDIT_PROLONG'].notnull().astype(int)
train_data['HAS_PREVIOUS1'] = train_data['PRENFLAG_INSURED_ON_APPROVAL'].notnull().astype(int)
train_data['HAS_PREVIOUS2'] = train_data['PRESK_ID_PREV_MAX'].notnull().astype(int)
test_data['HAS_BUREAU'] = test_data['CNT_CREDIT_PROLONG'].notnull().astype(int)
test_data['HAS_PREVIOUS1'] = test_data['PRENFLAG_INSURED_ON_APPROVAL'].notnull().astype(int)
test_data['HAS_PREVIOUS2'] = test_data['PRESK_ID_PREV_MAX'].notnull().astype(int)

# application文件+bureau文件交叉新特征

In [27]:
train_data['BUREAU_MAXOV_CREDIT_PERCENT'] = train_data['AMT_CREDIT_MAX_OVERDUE'] / train_data['AMT_CREDIT']
train_data['BUREAU_MAXOV_INCOME_PERCENT'] = train_data['AMT_CREDIT_MAX_OVERDUE'] / train_data['AMT_INCOME_TOTAL']
train_data['BUREAU_SUMOV_CREDIT_PERCENT'] = train_data['AMT_CREDIT_SUM_OVERDUE'] / train_data['AMT_CREDIT']
train_data['BUREAU_SUMOV_INCOME_PERCENT'] = train_data['AMT_CREDIT_SUM_OVERDUE'] / train_data['AMT_INCOME_TOTAL']
train_data['BUREAU_MAXOV_ANNUITY_PERCENT'] = train_data['AMT_CREDIT_MAX_OVERDUE'] / train_data['AMT_ANNUITY']
train_data['BUREAU_SUMOV_ANNUITY_PERCENT'] = train_data['AMT_CREDIT_SUM_OVERDUE'] / train_data['AMT_ANNUITY']
train_data['PRE_ANNUITY_PERCENT'] = train_data['PREAMT_ANNUITY_MAX'] / train_data['AMT_ANNUITY']
train_data['PRE_CREDIT_PERCENT'] = train_data['PREAMT_CREDIT_MAX'] / train_data['AMT_CREDIT']
test_data['BUREAU_MAXOV_CREDIT_PERCENT'] = test_data['AMT_CREDIT_MAX_OVERDUE'] / test_data['AMT_CREDIT']
test_data['BUREAU_MAXOV_INCOME_PERCENT'] = test_data['AMT_CREDIT_MAX_OVERDUE'] / test_data['AMT_INCOME_TOTAL']
test_data['BUREAU_SUMOV_CREDIT_PERCENT'] = test_data['AMT_CREDIT_SUM_OVERDUE'] / test_data['AMT_CREDIT']
test_data['BUREAU_SUMOV_INCOME_PERCENT'] = test_data['AMT_CREDIT_SUM_OVERDUE'] / test_data['AMT_INCOME_TOTAL']
test_data['BUREAU_MAXOV_ANNUITY_PERCENT'] = test_data['AMT_CREDIT_MAX_OVERDUE'] / test_data['AMT_ANNUITY']
test_data['BUREAU_SUMOV_ANNUITY_PERCENT'] = test_data['AMT_CREDIT_SUM_OVERDUE'] / test_data['AMT_ANNUITY']
test_data['PRE_ANNUITY_PERCENT'] = test_data['PREAMT_ANNUITY_MAX'] / test_data['AMT_ANNUITY']
test_data['PRE_CREDIT_PERCENT'] = test_data['PREAMT_CREDIT_MAX'] / test_data['AMT_CREDIT']

# drop掉50%以上缺失字段+label encoding

In [346]:
train_data.dtypes.value_counts()

float64    266
int64       41
object      16
int32        3
dtype: int64

In [28]:
num_variables = []
cat_variables = []
for col in train_data.columns:
    if train_data[col].dtype=='object':
        cat_variables.append(col)
    else:
        num_variables.append(col)

In [29]:
train_data['EXT_SOURCE_1_NEW'] = (train_data['EXT_SOURCE_1'].isnull()).astype(int)
zero_cols = ['OWN_CAR_AGE','APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 
             'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 
             'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'EXT_SOURCE_1']
fill_with0(train_data, zero_cols)

In [30]:
num_dict = {}
for col in num_variables:
    num_dict[col] = train_data[col].count()/len(train_data)
for k, v in num_dict.items():
    if v < 0.5:
        train_data.drop(labels=k,axis=1,inplace=True)
        test_data.drop(labels=k,axis=1,inplace=True)

In [31]:
lb_cols = []
for col in cat_variables:
    if train_data[col].nunique() <= 2:
        lb_cols.append(col)
print(lb_cols)

['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']


In [32]:
# Create a label encoder objectcat_variables
le = preprocessing.LabelEncoder()
le_count = 0

# Iterate through the columns
for col in lb_cols:
    # Train on the training data
    train_data[col] = train_data[col].fillna('NAN')
    test_data[col] = test_data[col].fillna('NAN')
    le.fit(train_data[col])
    # Transform both training and testing data
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])
    # Keep track of how many columns were label encoded
    le_count += 1
            
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


# 独热编码

In [33]:
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [34]:
train_target = train_data['TARGET']
train_data, test_data = train_data.align(test_data, join = 'inner', axis = 1)
print(train_data.shape, test_data.shape)

(307511, 418) (48744, 418)


# 连续变量补缺

In [35]:
pop_item = []
for k, v in num_dict.items():
    if v < 0.5:
        pop_item.append(k)
for i in pop_item:
    num_dict.pop(i)
num_dict.pop('TARGET')

1.0

In [36]:
def fill_withmid(origin_data, fillmid_cols):
    for i in fillmid_cols:
        mid_value = origin_data[i].quantile(.5)
        origin_data[i].fillna(mid_value,inplace=True)
    return
fill_withmid(train_data, list(num_dict.keys()))
fill_withmid(test_data, list(num_dict.keys()))

In [37]:
#特殊列，新建分类变量
train_data['DAYS_EMPLOYED_NEW'] = (train_data['DAYS_EMPLOYED']==365243).astype(int)
train_data.loc[train_data['DAYS_EMPLOYED']==365243,'DAYS_EMPLOYED'] = -20000

In [38]:
test_data['DAYS_EMPLOYED_NEW'] = (test_data['DAYS_EMPLOYED']==365243).astype(int)
test_data.loc[test_data['DAYS_EMPLOYED']==365243,'DAYS_EMPLOYED'] = -20000

# application文件新特征

In [39]:
train_data['GOODS_CREDIT_PERCENT'] = train_data['AMT_GOODS_PRICE'] / train_data['AMT_CREDIT']
train_data['CREDIT_INCOME_PERCENT'] = train_data['AMT_CREDIT'] / train_data['AMT_INCOME_TOTAL']
train_data['ANNUITY_INCOME_PERCENT'] = train_data['AMT_ANNUITY'] / train_data['AMT_INCOME_TOTAL']
train_data['CREDIT_TERM'] = train_data['AMT_ANNUITY'] / train_data['AMT_CREDIT']
train_data['DAYS_EMPLOYED_PERCENT'] = train_data['DAYS_EMPLOYED'] / train_data['DAYS_BIRTH']

test_data['GOODS_CREDIT_PERCENT'] = test_data['AMT_GOODS_PRICE'] / test_data['AMT_CREDIT']
test_data['CREDIT_INCOME_PERCENT'] = test_data['AMT_CREDIT'] / test_data['AMT_INCOME_TOTAL']
test_data['ANNUITY_INCOME_PERCENT'] = test_data['AMT_ANNUITY'] / test_data['AMT_INCOME_TOTAL']
test_data['CREDIT_TERM'] = test_data['AMT_ANNUITY'] / test_data['AMT_CREDIT']
test_data['DAYS_EMPLOYED_PERCENT'] = test_data['DAYS_EMPLOYED'] / test_data['DAYS_BIRTH']

# 连续变量与TARGET的相关系数

In [40]:
train_num = train_data[list(num_dict.keys())].copy()
train_num['TARGET'] = train_target
train_corr = train_num.corr()

In [41]:
train_corr = train_corr['TARGET'].sort_values()
print('Most Positive Correlations:\n', train_corr.tail(15))
print('\nMost Negative Correlations:\n', train_corr.head(15))

Most Positive Correlations:
 DAYS_ID_PUBLISH                              0.051457
PREDAYS_DECISION_MIN                         0.052380
AMT_DEBT_PERCENT                             0.052563
DAYS_LAST_PHONE_CHANGE                       0.055218
PRECODE_REJECT_REASON_SCOFR                  0.055622
REGION_RATING_CLIENT                         0.058899
CREDIT_ACTIVE_Active                         0.060544
REGION_RATING_CLIENT_W_CITY                  0.060893
PRENAME_PRODUCT_TYPE_walk-in                 0.062785
PRENAME_CONTRACT_STATUS_Refused              0.064756
DAYS_EMPLOYED                                0.064810
DAYS_CREDIT_MIN                              0.067388
DAYS_BIRTH                                   0.078239
TARGET                                       1.000000
PRENAME_GOODS_CATEGORY_House Construction         NaN
Name: TARGET, dtype: float64

Most Negative Correlations:
 EXT_SOURCE_2                 -0.160295
EXT_SOURCE_3                 -0.155892
EXT_SOURCE_1            

# Polynomial features(随机森林筛选重要特征进行poly化)

In [44]:
#部分百分比inf，转为1
inf_features = ['AMT_OVERDUE_PERCENT','AMT_DEBT_PERCENT','AMT_OVERDUESUM_PERCENT','AMT_ANUCREDIT_PERCENT']
for col in inf_features:
    train_data[col] = train_data[col].apply(lambda x:1 if x==np.inf else x)
    test_data[col] = test_data[col].apply(lambda x:1 if x==np.inf else x)
    train_data[col] = train_data[col].apply(lambda x:0 if x==-np.inf else x)
    test_data[col] = test_data[col].apply(lambda x:0 if x==-np.inf else x)

In [45]:
forest = ExtraTreesClassifier(n_estimators=200,
                              random_state=0,n_jobs=6)

forest.fit(train_data, train_target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=6,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [46]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(train_data.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, train_data.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature EXT_SOURCE_2 (0.016848)
2. feature EXT_SOURCE_3 (0.014173)
3. feature DAYS_BIRTH (0.007208)
4. feature GOODS_CREDIT_PERCENT (0.007044)
5. feature AMT_DEBT_PERCENT (0.006878)
6. feature PREDAYS_DECISION_MIN (0.006557)
7. feature ANNUITY_INCOME_PERCENT (0.006552)
8. feature DAYS_CREDIT_MIN (0.006534)
9. feature DAYS_ID_PUBLISH (0.006534)
10. feature DAYS_EMPLOYED_PERCENT (0.006528)
11. feature EXT_SOURCE_1 (0.006478)
12. feature DAYS_LAST_PHONE_CHANGE (0.006387)
13. feature CREDIT_INCOME_PERCENT (0.006300)
14. feature HOUR_APPR_PROCESS_START (0.006206)
15. feature DAYS_REGISTRATION (0.006184)
16. feature CREDIT_ACTIVE_Active (0.006177)
17. feature PREDAYS_DECISION_MAX (0.006175)
18. feature PRESK_ID_PREV_MAX (0.006154)
19. feature CREDIT_TERM (0.006134)
20. feature SK_ID_CURR (0.006121)
21. feature PRESK_ID_PREV_MIN (0.006100)
22. feature AMT_INCOME_TOTAL (0.006013)
23. feature DAYS_EMPLOYED (0.005971)
24. feature REGION_POPULATION_RELATIVE (0.005960)
25. feat

In [47]:
pf = preprocessing.PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
train_poly = train_data[train_data.columns[indices[0:36]]]
test_poly = test_data[test_data.columns[indices[0:36]]]
train_poly = pf.fit_transform(train_poly)
test_poly = pf.fit_transform(test_poly)
print('Polynomial Features shape: ', train_poly.shape)

Polynomial Features shape:  (307511, 9138)


In [101]:
pf = preprocessing.PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
train_poly = train_data[train_data.columns[indices[64:90]]]
test_poly = test_data[test_data.columns[indices[64:90]]]
train_poly = pf.fit_transform(train_poly)
test_poly = pf.fit_transform(test_poly)
print('Polynomial Features shape: ', train_poly.shape)

Polynomial Features shape:  (307511, 3653)


# 对Poly特征进行PCA降维

In [48]:
scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))
train_poly = scaler.fit_transform(train_poly)
test_poly = scaler.transform(test_poly)
pca = PCA(n_components=60)
pca.fit(train_poly)

PCA(copy=True, iterated_power='auto', n_components=60, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [49]:
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())

[ 0.19402269  0.10744072  0.06227941  0.05669899  0.0460492   0.04342429
  0.03802893  0.03527774  0.03383176  0.02899593  0.02487743  0.02251755
  0.02167339  0.02035304  0.01986879  0.0193089   0.01642291  0.01545098
  0.01308231  0.01002473  0.0077318   0.00741018  0.00587858  0.00541452
  0.00427232  0.00390582  0.0032108   0.00271843  0.00266911  0.00250358
  0.00213718  0.00208478  0.00200188  0.00190601  0.00177579  0.00164895
  0.00162048  0.00156899  0.00154786  0.00150242  0.00142405  0.00137358
  0.00136267  0.00132891  0.00128644  0.0012592   0.00124653  0.00123084
  0.00122136  0.00117698  0.00114538  0.00112406  0.00109547  0.00107569
  0.00106444  0.0010346   0.00100016  0.00098226  0.00096504  0.00095088]
0.916487732769


In [50]:
train_poly = pca.transform(train_poly)
test_poly = pca.transform(test_poly)

In [51]:
# Create a dataframe of the features 
train_poly= pd.DataFrame(train_poly)
# Add in the target
train_poly['TARGET'] = train_target
# Find the correlations with the target
poly_corrs = train_poly.corr()['TARGET'].sort_values()

# Put test features into dataframe
test_poly= pd.DataFrame(test_poly)

# Merge polynomial features into training dataframe

train_data_poly = train_data.merge(train_poly, left_index=True, right_index=True, how = 'left')

# Merge polnomial features into testing dataframe
test_poly['SK_ID_CURR'] = test_data['SK_ID_CURR']
test_data_poly = test_data.merge(test_poly, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes
train_data_poly, test_data_poly = train_data_poly.align(test_data_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', train_data_poly.shape)
print('Testing data with polynomial features shape:  ', test_data_poly.shape)

Training data with polynomial features shape:  (307511, 484)
Testing data with polynomial features shape:   (48744, 484)


In [107]:
# Create a dataframe of the features 
train_poly= pd.DataFrame(train_poly)
# Add in the target
train_poly['TARGET'] = train_target
# Find the correlations with the target
poly_corrs = train_poly.corr()['TARGET'].sort_values()

# Put test features into dataframe
test_poly= pd.DataFrame(test_poly)

# Merge polynomial features into training dataframe

train_data_poly = train_data_poly.merge(train_poly, left_index=True, right_index=True, how = 'left')

# Merge polnomial features into testing dataframe
test_poly['SK_ID_CURR'] = test_data['SK_ID_CURR']
test_data_poly = test_data_poly.merge(test_poly, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes
train_data_poly, test_data_poly = train_data_poly.align(test_data_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', train_data_poly.shape)
print('Testing data with polynomial features shape:  ', test_data_poly.shape)

Training data with polynomial features shape:  (307511, 438)
Testing data with polynomial features shape:   (48744, 438)


# 贝叶斯优化选择参数

In [98]:
def base_select(num_leaves,learning_rate,reg_alpha,reg_lambda,subsample,min_split_gain,
                min_child_weight,min_child_samples,
                early_stopping_rounds,features=train_data_poly,n_folds = 5):
    
    num_leaves = num_leaves * 1000
    min_split_gain = min_split_gain / 10
    min_child_samples = min_child_samples * 100
    reg_alpha = reg_alpha * 10
    reg_lambda = reg_lambda * 10
    subsample = subsample * 10
    early_stopping_rounds = early_stopping_rounds * 1000
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(labels = ['SK_ID_CURR', 'TARGET'], axis = 1)
    
    # Convert to np arrays
    features = np.array(features)
    
    # Create the kfold object
    k_fold = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 30)
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    cat_indices = 'auto'
    best_iterations = 0
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features, labels):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', num_leaves = int(num_leaves),
                                   class_weight = 'balanced', learning_rate = learning_rate, 
                                   min_split_gain = min_split_gain, min_child_weight = min_child_weight,
                                   min_child_samples = int(min_child_samples),
                                   reg_alpha = reg_alpha, reg_lambda = reg_lambda, 
                                   subsample = subsample, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels)],
                  eval_names = ['valid'], categorical_feature = cat_indices,
                  early_stopping_rounds = int(early_stopping_rounds), verbose = False)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        best_iterations += best_iteration / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
    
    print('best iter: %s' % best_iterations)
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    return valid_auc

In [99]:
bys_selector = BayesianOptimization(
    base_select,
    {'num_leaves': (0.01, 0.05),
     'learning_rate': (0.01, 0.05),
     'min_split_gain': (0.05, 0.25),
     'min_child_weight': (0.01, 0.05),
     'min_child_samples': (0.05, 0.25),
     'reg_alpha': (0.01, 0.05),
     'reg_lambda': (0.01, 0.05),
     'subsample': (0.05, 0.1),
     'early_stopping_rounds': (0.05, 0.25)
    }
)

In [100]:
train_data_poly['TARGET'] = train_target

In [101]:
bys_selector.maximize(init_points=2, n_iter=15)

|   iter    |  target   | early_... | learni... | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
best iter: 644.5999999999999
| [0m 1       [0m | [0m 0.7774  [0m | [0m 0.1638  [0m | [0m 0.02352 [0m | [0m 0.2419  [0m | [0m 0.01152 [0m | [0m 0.1108  [0m | [0m 0.04868 [0m | [0m 0.01774 [0m | [0m 0.03176 [0m | [0m 0.07079 [0m |
best iter: 462.4
| [0m 2       [0m | [0m 0.7767  [0m | [0m 0.1644  [0m | [0m 0.04075 [0m | [0m 0.1305  [0m | [0m 0.0226  [0m | [0m 0.1587  [0m | [0m 0.03245 [0m | [0m 0.04501 [0m | [0m 0.03064 [0m | [0m 0.08067 [0m |
best iter: 480.00000000000006
| [0m 3       [0m | [0m 0.7768  [0m | [0m 0.06421 [0m | [0m 0.03111 [0m | [0m 0.05346 [0m | [0m 0.04324 [0m | [0m 0.2336  [0m | [0m 0.04049 [0m | [0m 0.04089 [0m | [0m 0.02771 [0m | [0m 0.076

In [112]:
bys_params = bys_selector.max['params']
print(bys_params)

{'early_stopping_rounds': 0.16383309025308296, 'learning_rate': 0.023521951382605329, 'min_child_samples': 0.24188836046430912, 'min_child_weight': 0.011516188141620649, 'min_split_gain': 0.11075382554890222, 'num_leaves': 0.048683995401874958, 'reg_alpha': 0.017744359191618569, 'reg_lambda': 0.031757957360611766, 'subsample': 0.070789632920536347}


# Lightgbm（StratifiedKFold应对不平衡数据，split后再做balance）

In [116]:
def lgbmmodel(features, test_features, n_folds = 5, **kw):
    
    kw['num_leaves'] = int(kw['num_leaves'] * 1000)
    kw['min_split_gain'] = kw['min_split_gain'] / 10
    kw['min_child_samples'] = int(kw['min_child_samples'] * 100)
    kw['reg_alpha'] = kw['reg_alpha'] * 10
    kw['reg_lambda'] = kw['reg_lambda'] * 10
    kw['subsample'] = kw['subsample'] * 10
    kw['early_stopping_rounds'] = kw['early_stopping_rounds'] * 1000
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(labels = ['SK_ID_CURR', 'TARGET'], axis = 1)
    test_features = test_features.drop(labels = ['SK_ID_CURR'], axis = 1)
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 30)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    cat_indices = 'auto'
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features, labels):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', num_leaves = int(kw['num_leaves']),
                                   class_weight = 'balanced', learning_rate = kw['learning_rate'], 
                                   min_split_gain = kw['min_split_gain'], min_child_weight = kw['min_child_weight'],
                                   min_child_samples = int(kw['min_child_samples']),
                                   reg_alpha = kw['reg_alpha'], reg_lambda = kw['reg_lambda'], 
                                   subsample = kw['subsample'], n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = kw['early_stopping_rounds'], verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [117]:
train_data_poly['TARGET'] = train_target

In [118]:
submission, feature_importances, metrics = lgbmmodel(train_data_poly, test_data_poly, **bys_params)
print('Baseline metrics')
print(metrics)

Training Data Shape:  (307511, 483)
Testing Data Shape:  (48744, 483)
Training until validation scores don't improve for 163.83309025308296 rounds.
[200]	valid's auc: 0.7662	valid's binary_logloss: 0.556314	train's auc: 0.80943	train's binary_logloss: 0.546284
[400]	valid's auc: 0.773386	valid's binary_logloss: 0.52965	train's auc: 0.844763	train's binary_logloss: 0.510441
[600]	valid's auc: 0.774226	valid's binary_logloss: 0.511482	train's auc: 0.870148	train's binary_logloss: 0.484063
Early stopping, best iteration is:
[613]	valid's auc: 0.774297	valid's binary_logloss: 0.510433	train's auc: 0.871573	train's binary_logloss: 0.482499
Training until validation scores don't improve for 163.83309025308296 rounds.
[200]	valid's auc: 0.771994	valid's binary_logloss: 0.557262	train's auc: 0.808364	train's binary_logloss: 0.547089
[400]	valid's auc: 0.779487	valid's binary_logloss: 0.530054	train's auc: 0.844132	train's binary_logloss: 0.51128
[600]	valid's auc: 0.780382	valid's binary_loglo

In [119]:
submission.to_csv('E:/pnk/kaggle_credit/lgbm_result8.csv')

In [89]:
print(feature_importances.sort_values(by='importance'))

                                               feature  importance
104                       CREDIT_TYPE_Interbank credit         0.0
112                       CREDIT_TYPE_Real estate loan         0.0
110                   CREDIT_TYPE_Mobile operator loan         0.0
382                         ORGANIZATION_TYPE_Religion         0.0
107     CREDIT_TYPE_Loan for the purchase of equipment         0.0
106  CREDIT_TYPE_Loan for purchase of shares (margi...         0.0
292                    NAME_TYPE_SUITE_Group of people         0.0
101              CREDIT_TYPE_Cash loan (non-earmarked)         0.0
98                          CREDIT_CURRENCY_currency 4         0.0
96                          CREDIT_CURRENCY_currency 2         0.0
92                              CREDIT_ACTIVE_Bad debt         0.0
297                       NAME_INCOME_TYPE_Businessman         0.0
299                         NAME_INCOME_TYPE_Pensioner         0.0
301                           NAME_INCOME_TYPE_Student        

In [110]:
def lgbmmodel_final(features, test_features, **kw):
    
    kw['num_leaves'] = int(kw['num_leaves'] * 1000)
    kw['min_split_gain'] = kw['min_split_gain'] / 10
    kw['min_child_samples'] = int(kw['min_child_samples'] * 100)
    kw['reg_alpha'] = kw['reg_alpha'] * 10
    kw['reg_lambda'] = kw['reg_lambda'] * 10
    kw['subsample'] = kw['subsample'] * 10
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(labels = ['SK_ID_CURR', 'TARGET'], axis = 1)
    test_features = test_features.drop(labels = ['SK_ID_CURR'], axis = 1)
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    print(kw)
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    cat_indices = 'auto'
    
    model = lgb.LGBMClassifier(n_estimators = 645, class_weight = 'balanced', n_jobs = -1, random_state = 50, **kw)
        
    model.fit(features, labels, categorical_feature = cat_indices)
    
    test_predictions = model.predict_proba(test_features)[:, 1]
    
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    return submission
submission = lgbmmodel_final(train_data_poly, test_data_poly, **bys_params)

Training Data Shape:  (307511, 483)
Testing Data Shape:  (48744, 483)
{'learning_rate': 0.023521951382605329, 'min_child_samples': 24, 'min_child_weight': 0.011516188141620649, 'min_split_gain': 0.011075382554890222, 'num_leaves': 48, 'reg_alpha': 0.17744359191618569, 'reg_lambda': 0.31757957360611766, 'subsample': 0.70789632920536349}
