In [0]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [0]:
import os 
os.chdir('drive/My Drive/Home_Credit_Default_Risk')

# Preprocessing and feature engineering

Here in this notebook we will do some simple preprocessing and feature engineering.
* onehot encoding for categorical features.
* numerical aggregation for other numeric columns.
* some important feature engineering

In [0]:
import pandas as pd
import numpy as np


In [0]:
def onehotenc(df):
  original = list(df.columns)
  cat_col =[]
  for col in df:
    if df[col].dtype == 'object':
      cat_col.append(col)
  df = pd.get_dummies(df,columns=cat_col,dummy_na = True)
  new_col = [c for c in df.columns if c not in original]
  return df,new_col

## application_train_test

In [0]:
train = pd.read_csv('application_train.csv')
test_df = pd.read_csv('application_test.csv')

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

train = imputer.fit_transform(train)
test = imputer.transform(test)
df = train.append(test_df).reset_index()


In [0]:
## remove rows with CODE_GENDER = 'XNA'
df = df[df['CODE_GENDER']!='XNA']

Zero-one encoding for binary features.

In [0]:
for bin_feat in ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','EMERGENCYSTATE_MODE']:
  df[bin_feat],_ = pd.factorize(df[bin_feat])

onehot encoding for categorical features.

In [0]:
df,_ = onehotenc(df)

In [0]:
df['DAYS_EMPLOYED'].replace({365243.000000:np.nan},inplace=True)

Design some new features

In [0]:
## percentage of employment
df['emp_per'] = df['DAYS_EMPLOYED']/df['DAYS_BIRTH']
## percentage of credit
df['income_credit_per'] = df['AMT_INCOME_TOTAL']/df['AMT_CREDIT']
## income per person
df['income_per_person'] = df['AMT_INCOME_TOTAL']/df['CNT_FAM_MEMBERS']
## percentage of annuity to income
df['annuity_income_per'] = df['AMT_ANNUITY']/df['AMT_INCOME_TOTAL']
## payment rate based on annuity and credit
df['payment_rate'] = df['AMT_ANNUITY']/df['AMT_CREDIT']


In [0]:
import gc
del test_df
gc.collect()

26

## bureau_and_balance
Apply onehot encoding to categorical features and numerical aggregation for numerical features

In [0]:
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')

onehot encoding

In [0]:
bureau,b_cat = onehotenc(bureau)
bureau_balance,bb_cat = onehotenc(bureau_balance)

In [0]:
# numerical aggregation for bureau_balance
bb_aggragation ={'MONTHS_BALANCE':['min','max','size']}
for col in bb_cat:
  bb_aggragation[col] =['mean']

bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggragation)
bb_agg.columns = pd.Index([e[0]+'_'+e[1] for e in bb_agg.columns.tolist()])
bureau = bureau.join(bb_agg,how='left',on='SK_ID_BUREAU')
bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
del bureau_balance,bb_agg
gc.collect()

0

In [0]:
num_aggr = {'DAYS_CREDIT':['min','max','mean'],
            'CREDIT_DAY_OVERDUE':['min','max','mean'],
            'DAYS_CREDIT_ENDDATE':['min','max','mean'],
            'DAYS_ENDDATE_FACT':['min','max','mean'],
            'AMT_CREDIT_MAX_OVERDUE':['mean'],
            'CNT_CREDIT_PROLONG':['sum'] ,
            'AMT_CREDIT_SUM':['max','min','mean','sum'],
            'AMT_CREDIT_SUM_DEBT':['max','min','mean','sum'],
            'AMT_CREDIT_SUM_LIMIT':['max','min','mean','sum'],
            'AMT_CREDIT_SUM_OVERDUE':['mean'],
            'DAYS_CREDIT_UPDATE':['mean'],
            'AMT_ANNUITY':['max','mean'],
            'MONTHS_BALANCE_min':['min'] ,
            'MONTHS_BALANCE_max':['max'] ,
            'MONTHS_BALANCE_size':['mean','sum']}
cat_aggr = {}
for cat in b_cat:
  cat_aggr[cat] = ['mean']
for cat in bb_cat:
  cat_aggr[cat+'_mean'] = ['mean']

bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggr,**cat_aggr})
bureau_agg.columns = pd.Index(['buro_'+e[0]+'_'+e[1] for e in bureau_agg.columns.tolist()])



There are two main features here the status of the credit(active or closed).

In [0]:
## value  1 means the credit is active 
active_credit = bureau[bureau['CREDIT_ACTIVE_Active']==1]
active_agg = active_credit.groupby('SK_ID_CURR').agg(num_aggr)
active_agg.columns = pd.Index(['active_'+e[0]+'_'+e[1] for e in active_agg.columns.tolist()])
bureau_agg = bureau_agg.join(active_agg,how='left',on='SK_ID_CURR')
del active_credit,active_agg
gc.collect()


0

In [0]:
## value  1 means the credit is closed 
closed_credit = bureau[bureau['CREDIT_ACTIVE_Closed']==1]
closed_agg = closed_credit.groupby('SK_ID_CURR').agg(num_aggr)
closed_agg.columns = pd.Index(['closed_'+e[0]+'_'+e[1] for e in closed_agg.columns.tolist()])
bureau_agg = bureau_agg.join(closed_agg,how='left',on='SK_ID_CURR')


del bureau,closed_agg,closed_credit
gc.collect()

0

## previous_applications

In [0]:
prev_appl = pd.read_csv('previous_application.csv')
prev_appl.shape

(1670214, 37)

In [0]:
prev,cat_col = onehotenc(prev_appl)

In [0]:
prev['DAYS_FIRST_DRAWING'].replace({365243.000000:np.nan},inplace=True)

In [0]:
prev['DAYS_FIRST_DUE'].replace({365243.000000:np.nan},inplace=True)

In [0]:
prev['DAYS_LAST_DUE_1ST_VERSION'].replace({365243.000000:np.nan},inplace=True)

In [0]:
prev['DAYS_LAST_DUE'].replace({365243.000000:np.nan},inplace=True)

In [0]:
prev['DAYS_TERMINATION'].replace({365243.000000:np.nan},inplace=True)

In [0]:
num_aggr = { 'AMT_ANNUITY': ['min', 'max', 'mean'],
            'AMT_APPLICATION': ['min', 'max', 'mean'],
            'AMT_CREDIT': ['min', 'max', 'mean'],
            'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
            'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
            'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'RATE_INTEREST_PRIMARY': ['min', 'max', 'mean'],
            'RATE_INTEREST_PRIVILEGED': ['min', 'max', 'mean'],
            'DAYS_DECISION': ['min', 'max', 'mean'],
            'CNT_PAYMENT': ['min', 'max', 'mean'] }
cat_aggr ={}
for cat in cat_col:
  cat_aggr[cat] = ['mean']
prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggr,**cat_aggr})
prev_agg.columns = pd.Index(['prev_'+e[0]+'_'+e[1] for e in prev_agg.columns.tolist()])


Here status of the application is important wether it is approved or refused.

In [0]:
#NAME_CONTRACT_STATUS_Refused
refused = prev[prev['NAME_CONTRACT_STATUS_Refused']==1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggr)
refused_agg.columns = pd.Index(['refused_'+e[0]+'_'+e[1] for e in refused_agg.columns.tolist()])
prev_agg = prev_agg.join(refused_agg,how='left',on='SK_ID_CURR')
del refused,refused_agg
gc.collect()

#NAME_CONTRACT_STATUS_Approved
approved = prev[prev['NAME_CONTRACT_STATUS_Approved']==1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggr)
approved_agg.columns = pd.Index(['approved_'+e[0]+'_'+e[1] for e in approved_agg.columns.tolist()])
prev_agg = prev_agg.join(approved_agg,how='left',on='SK_ID_CURR')
del approved,approved_agg
gc.collect()

0

## POS_CASH_balance

In [0]:
pos_cash = pd.read_csv('POS_CASH_balance.csv')
pos_cash.shape

(10001358, 8)

In [0]:
pos,cat_col = onehotenc(pos_cash)

In [0]:
aggr = {'MONTHS_BALANCE':['min','max','mean'],
        'CNT_INSTALMENT':['sum'],
        'CNT_INSTALMENT_FUTURE':['sum'],
        'SK_DPD':['min','max','mean'],
        'SK_DPD_DEF':['min','max','mean'] }
cat_aggr = {}
for cat in cat_col:
  cat_aggr[cat]=['mean']

pos_aggr = pos.groupby('SK_ID_CURR').agg({**aggr,**cat_aggr})
pos_aggr.columns = pd.Index(['pos_'+e[0]+'_'+e[1] for e in pos_aggr.columns.tolist()])


## installments_payments

In [0]:
installments = pd.read_csv('installments_payments.csv')
installments.shape

(13605401, 8)

There are some important factores to be considered as below:
* wether customer pays installment on time, before due date or after due date
* wether customer pays full amount of installment or less then the amount of installment
* percentage of payment to the installment amount.



In [0]:
installments['before_due_payment'] = (installments['DAYS_INSTALMENT']-installments['DAYS_ENTRY_PAYMENT']).apply(lambda x:x if x>0 else 0)
installments['after_due_payment'] = (installments['DAYS_ENTRY_PAYMENT']-installments['DAYS_INSTALMENT']).apply(lambda x:x if x>0 else 0)

installments['paymen_diff'] = installments['AMT_INSTALMENT']-installments['AMT_PAYMENT']
installments['payment_perc'] = installments['AMT_PAYMENT']/installments['DAYS_INSTALMENT']

installments.columns.tolist()

['SK_ID_PREV',
 'SK_ID_CURR',
 'NUM_INSTALMENT_VERSION',
 'NUM_INSTALMENT_NUMBER',
 'DAYS_INSTALMENT',
 'DAYS_ENTRY_PAYMENT',
 'AMT_INSTALMENT',
 'AMT_PAYMENT',
 'before_due_payment',
 'after_due_payment',
 'paymen_diff',
 'payment_perc']

In [0]:
aggr = {'NUM_INSTALMENT_VERSION':['nunique'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min','max', 'mean', 'sum'],
        'before_due_payment': ['max', 'mean', 'sum'],
        'after_due_payment': ['max', 'mean', 'sum'],
        'paymen_diff': ['max', 'mean', 'sum'],
        'payment_perc': ['max', 'mean', 'sum']}

ins_agg = installments.groupby('SK_ID_CURR').agg(aggr)
ins_agg.columns = pd.Index(['ins_'+e[0]+'_'+e[1] for e in ins_agg.columns.tolist()])

## credit_card_balance

In [0]:
credit_card = pd.read_csv('credit_card_balance.csv')
credit_card.shape

(3840312, 23)

In [0]:
credit_card,cat_col = onehotenc(credit_card)

In [0]:
aggr = {'MONTHS_BALANCE':['min','max','mean','sum'],
 'AMT_BALANCE':['min','max','mean','sum'],
 'AMT_CREDIT_LIMIT_ACTUAL':['min','max','mean','sum'],
 'AMT_DRAWINGS_ATM_CURRENT':['min','max','mean','sum'],
 'AMT_DRAWINGS_CURRENT':['min','max','mean','sum'],
 'AMT_DRAWINGS_OTHER_CURRENT':['min','max','mean','sum'],
 'AMT_DRAWINGS_POS_CURRENT':['min','max','mean','sum'],
 'AMT_INST_MIN_REGULARITY':['min','max','mean','sum'],
 'AMT_PAYMENT_CURRENT':['min','max','mean','sum'],
 'AMT_PAYMENT_TOTAL_CURRENT':['min','max','mean','sum'],
 'AMT_RECEIVABLE_PRINCIPAL':['min','max','mean','sum'],
 'AMT_RECIVABLE':['min','max','mean','sum'],
 'AMT_TOTAL_RECEIVABLE':['min','max','mean','sum'],
 'CNT_DRAWINGS_ATM_CURRENT':['sum'],
 'CNT_DRAWINGS_CURRENT':['sum'],
 'CNT_DRAWINGS_OTHER_CURRENT':['sum'],
 'CNT_DRAWINGS_POS_CURRENT':['sum'],
 'CNT_INSTALMENT_MATURE_CUM':['sum'],
 'SK_DPD':['min','max','mean','sum'],
 'SK_DPD_DEF':['min','max','mean','sum']}

cc_agg = credit_card.groupby('SK_ID_CURR').agg(aggr)
cc_agg.columns = pd.Index(['cc_'+e[0]+'_'+e[1] for e in cc_agg.columns.tolist()])


## combine all the dataframe
Combine and save all new data frame.

In [0]:
#df,bureau_agg,prev_agg,pos_aggr,ins_agg,cc_agg
df = df.join(bureau_agg, how='left', on='SK_ID_CURR')
df = df.join(prev_agg, how='left', on='SK_ID_CURR')
df = df.join(pos_aggr, how='left', on='SK_ID_CURR')
df = df.join(ins_agg, how='left', on='SK_ID_CURR')
df = df.join(cc_agg, how='left', on='SK_ID_CURR')

In [0]:
df.to_csv('home_features.csv')

# Model tunning

## Import libraries

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import gc

In [0]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import pickle

## Load feature engineered data

In [0]:
data = pd.read_csv('home_features.csv')

In [0]:
train = data[data['TARGET'].notnull()]
test = data[data['TARGET'].isnull()]
test_id = test['SK_ID_CURR']
del data
gc.collect()
print('Train shape: ',train.shape)
print('Test shape: ',test.shape)

Train shape:  (307507, 759)
Test shape:  (48744, 759)


## Drop columns with null value geater than 75%

In [0]:
missing = (train.isnull().sum()/train.isnull().count()*100).sort_values(ascending = False)
missing.head()

refused_RATE_INTEREST_PRIVILEGED_min     100.0
refused_RATE_INTEREST_PRIVILEGED_mean    100.0
refused_RATE_INTEREST_PRIVILEGED_max     100.0
refused_RATE_INTEREST_PRIMARY_mean       100.0
refused_RATE_INTEREST_PRIMARY_max        100.0
dtype: float64

In [0]:
train_miss = missing.index[missing>75]

In [0]:
train = train.drop(columns=train_miss)
test = test.drop(columns=train_miss)

print('Train shape: ',train.shape)
print('Test shape: ',test.shape)

In [0]:
train.drop(columns=['SK_ID_CURR','index','Unnamed: 0'],inplace=True)
test.drop(columns=['SK_ID_CURR','index','Unnamed: 0','TARGET'],inplace=True)

In [0]:
y = train['TARGET']
x = train.drop(columns=['TARGET'])

## Train test split, impute missing value and scale data

In [0]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.2,random_state =10)

In [0]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)
test = imputer.transform(test)

scaler = MinMaxScaler(feature_range = (0, 1))
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
test_scale = scaler.transform(test)

In [0]:
x_train_scale = pd.DataFrame(data=x_train_scale,columns=x.columns.tolist())
x_test_scale = pd.DataFrame(data=x_test_scale,columns=x.columns.tolist())
test_scale = pd.DataFrame(data=test_scale,columns=x.columns.tolist())
with open('train_test.pkl','wb') as f:
  pickle.dump((x_train_scale,x_test_scale,y_train,y_test),f)
#x_train_scale,x_test_scale,y_train,y_test = pickle.load(open('train_test.pkl','rb'))

In [0]:
with open('test_data.pkl','wb') as f:
  pickle.dump(test_scale,f)

In [0]:
x_train_scale,x_test_scale,y_train,y_test = pickle.load(open('train_test.pkl','rb'))

In [0]:
x_train_scale = x_train_scale.append(x_test_scale)
y_train = y_train.append(y_test)

In [0]:
test_scale = pickle.load(open('test_data.pkl','rb'))

In [0]:
x_train_scale.shape,x_test_scale.shape,test_scale.shape

((307507, 709), (61502, 709), (48744, 709))

Feature selection

In [0]:
from sklearn.linear_model import Ridge,LinearRegression,Lasso
from tqdm import tqdm
def evaluate_metric(model, x_cv, y_cv):
    return roc_auc_score(y_cv, model.predict(x_cv))

def forward_feature_selection(x_train, x_cv, y_train, y_cv, n):
    feature_set = []
    metric_list = [] # Choose appropriate metric based on business problem
    score = 0
    for feature in tqdm(x_train.columns):
        if feature not in feature_set:
            model = Ridge()
            if len(feature_set) == 0:
              f_set =[]
              f_set.append(feature)
            else:
              f_set = feature_set.copy()
              f_set.append(feature)
            model.fit(x_train[f_set], y_train)
            temp = evaluate_metric(model, x_cv[f_set], y_cv)
            if score < temp:
              feature_set.append(feature)
              score =temp

    return feature_set

f=forward_feature_selection(x_train_scale, x_test_scale, y_train, y_test, 287)

100%|██████████| 709/709 [21:25<00:00,  3.92s/it]


In [0]:
print(len(f))

428


## Model tunning and selection.
Here we will try some models as below
* Logistic regression
* SGD Classifier 
* Random forest
* LGBM Classifier

## Logistic Regression

In [0]:
param = {'penalty':['l1','l2'],
        'C':[0.1,0.001,0.0001,0.00001]}
        
log_reg = LogisticRegression()
clf = GridSearchCV(log_reg,param_grid=param,n_jobs=-1,verbose=1)
clf.fit(x_train_scale,y_train)

In [0]:
clf.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [0]:
log_reg = LogisticRegression(C=0.1,penalty='l2')
log_reg.fit(x_train_scale,y_train)
y_pred = log_reg.predict_proba(x_test_scale)[:,1]
score = roc_auc_score(y_test,y_pred)



In [0]:
print(score)

0.7640211457290654


## SGD Classifier

In [0]:
"""param = {
    'loss':['hinge','log'],
    'penalty':['l1','l2','elasticnet'],
    'alpha':[0.1,0.001,0.0001],
    'fit_intercept':[True,False],
    'learning_rate':['optimal','adaptive']
}
sgd = SGDClassifier(eta0=0.1)
clf = GridSearchCV(sgd,param_grid=param,cv=5,n_jobs=-1,verbose=1)
clf.fit(x_train_scale,y_train)"""

In [0]:
"""print(clf.best_score_)
clf.best_params_"""

0.9193309079083758


{'alpha': 0.0001,
 'fit_intercept': False,
 'learning_rate': 'adaptive',
 'loss': 'log',
 'penalty': 'l2'}

#### Fit model on best parameters

In [0]:
scores =[]
sgd = SGDClassifier(loss='log',penalty='l2',learning_rate='adaptive',fit_intercept=False,alpha=0.0001,eta0=0.1,verbose=0)
for i in range(5):
  x_tr,x_te, y_tr,y_te = train_test_split(x_train_scale,y_train,test_size=0.1,random_state=i*10)
  print('Fold '+str(i))
  sgd.fit(x_tr,y_tr)
  y_pred = sgd.predict_proba(x_te)[:,1]
  score = roc_auc_score(y_te,y_pred)
  print(str(i) +') score = ',score)
  scores.append(score)

Fold 0
0) score =  0.7774451537193686
Fold 1
1) score =  0.7650878288691845
Fold 2
2) score =  0.7683897800149332
Fold 3
3) score =  0.7624985291445495
Fold 4
4) score =  0.7652513751052232


In [0]:
sgd.fit(x_train_scale,y_train)

y_pred =sgd.predict_proba(x_test_scale)[:,1]
score = roc_auc_score(y_test,y_pred)

print('Auroc score: ',score)

Auroc score:  0.7624007447132439


## Random Forest

Here we will use random forest to elemenate some features which are less important and see if removal of those features improves score or not. Take less data points for faster execution.

In [0]:
x_tr,x_te,y_tr,y_te = train_test_split(x_train_scale,y_train,test_size=0.9)
x_tr.shape

(24600, 709)

In [0]:
rnd = RandomForestClassifier(n_estimators=100,random_state=10,n_jobs=-1)
rnd.fit(x_train_scale,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

In [0]:
features = x.columns.tolist()
importance = rnd.feature_importances_

feat_imp = pd.DataFrame({'features':features,'importance':importance}).sort_values(by='importance')

#### Remove feature with zero importance.

In [0]:
feat_imp = feat_imp[feat_imp['importance']!=0]

In [0]:
new_feat = feat_imp['features'].tolist()
import pickle
with open('imp_feat.pkl','wb') as f:
  pickle.dump(new_feat,f)
x_new = x[new_feat]

In [0]:
import pickle
new_feat = pickle.load(open('imp_feat.pkl','rb'))
x_new = x_train_scale[new_feat]
x_new_te = x_test_scale[new_feat]

In [0]:
sgd = SGDClassifier(loss='log',penalty='l2',learning_rate='adaptive',fit_intercept=False,alpha=0.0001,eta0=0.1,verbose=0)

In [0]:
sgd.fit(x_new,y_train)

y_pred =sgd.predict_proba(x_new_te)[:,1]
score = roc_auc_score(y_test,y_pred)

print('Auroc score: ',score)

Auroc score:  0.7624274382336401


## LGBMClassifier
Lightgbm is faster than XGBoost so we are using Lightgbm. Here Bayesian Optimization is used for hyperparameter tunning.

https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a
https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
https://datascience.stackexchange.com/questions/49567/lightgbm-vs-xgboost-vs-catboost

In [0]:
"""#https://github.com/hyperopt/hyperopt/issues/253#issuecomment-298960310

import lightgbm as lgb
from hyperopt import STATUS_OK
from hyperopt import Trials
from hyperopt import tpe
from hyperopt import hp
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

N_FOLDS = 5 
train_set = lgb.Dataset(x_train_scale, y_train)
def objective(params, n_folds = N_FOLDS):
  """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
  
  # Perform n_fold cross validation with hyperparameters
  # Use early stopping and evalute based on ROC AUC
  cv_results = lgb.cv(params, train_set, nfold = n_folds, num_boost_round = 30, 
                      early_stopping_rounds = 100, metrics = 'auc', seed = 50,verbose_eval=True)

  # Extract the best score
  best_score = max(cv_results['auc-mean'])
  
  # Loss must be minimized
  loss = 1 - best_score
  
  # Dictionary with information for evaluation
  return {'loss': loss, 'params': params, 'status': STATUS_OK}

space = {
    'num_leaves': sample(scope.int(hp.quniform('num_leaves', 30, 40, 1))),
    'max_depth':sample(scope.int(hp.quniform('max_depth', 5,10,1))),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.02)),
    'subsample': hp.uniform('subsample', 0.7,1.0),
    'min_child_weight': hp.uniform('min_child_weight', 20, 50),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.1),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'boosting_type':['goss']
}


# Algorithm
tpe_algorithm = tpe.suggest

# Trials object to track progress
bayes_trials = Trials()

from hyperopt import fmin

MAX_EVALS = 15

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials)"""

### Model training 
Model is trained using 5 fold cross validation. Parameter used here are found using above hyperparameter tunning.

In [0]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold

folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
sub_preds = np.zeros(test_scale.shape[0])
preds = np.zeros(x_train_scale.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train_scale,y_train)):
  train_x,train_y = x_train_scale.iloc[train_idx],y_train.iloc[train_idx]
  valid_x,valid_y = x_train_scale.iloc[valid_idx],y_train.iloc[valid_idx]
  clf = LGBMClassifier(boosting_type ='goss',
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,)
  clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)
  preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
  sub_preds += clf.predict_proba(test_scale, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
  print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, preds[valid_idx])))
  del clf, train_x, train_y, valid_x, valid_y
  gc.collect()
print('Full AUC score %.6f' % roc_auc_score(y_train, preds))

Training until validation scores don't improve for 200 rounds.
[200]	training's binary_logloss: 0.234988	training's auc: 0.799879	valid_1's binary_logloss: 0.241091	valid_1's auc: 0.778138
[400]	training's binary_logloss: 0.224369	training's auc: 0.824819	valid_1's binary_logloss: 0.237154	valid_1's auc: 0.787019
[600]	training's binary_logloss: 0.21703	training's auc: 0.842698	valid_1's binary_logloss: 0.235951	valid_1's auc: 0.790071
[800]	training's binary_logloss: 0.210845	training's auc: 0.857781	valid_1's binary_logloss: 0.235503	valid_1's auc: 0.791184
[1000]	training's binary_logloss: 0.205273	training's auc: 0.870665	valid_1's binary_logloss: 0.235236	valid_1's auc: 0.791945
[1200]	training's binary_logloss: 0.199961	training's auc: 0.882501	valid_1's binary_logloss: 0.235053	valid_1's auc: 0.792394
[1400]	training's binary_logloss: 0.195054	training's auc: 0.892983	valid_1's binary_logloss: 0.234929	valid_1's auc: 0.792657
[1600]	training's binary_logloss: 0.190335	training's

In [0]:
#f = pickle.load(open("imp_feat.pkl",'rb'))
x_train_imp = x_train_scale[f]
test_imp = test_scale[f]

In [0]:
x_train_imp.shape,test_imp.shape

((307507, 428), (48744, 428))

In [0]:
clf = LGBMClassifier()
clf.fit(x_train_imp,y_train)

features = x_train_imp.columns.tolist()
importance = clf.feature_importances_

feat_imp = pd.DataFrame({'features':features,'importance':importance}).sort_values(by='importance')
print(feat_imp.shape)
new_feat = feat_imp[feat_imp['importance']!=0]
print(new_feat.shape)


(428, 2)
(284, 2)


In [0]:
y_train.value_counts()

0.0    282682
1.0     24825
Name: TARGET, dtype: int64

In [0]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold

new_feat_ = new_feat['features'].tolist()
param = {
                'objective': 'binary',
                'num_leaves': 32,
                'learning_rate': 0.04,
                'colsample_bytree': 0.2,
                'max_depth': 8,
                'reg_alpha': 0.04,
                'reg_lambda': 0.073,
                'min_split_gain': 0.1,
                'min_child_weight': 120,
                'top_rate': 0.35,
                'other_rate': 0.1,
                'metric': 'auc',
                'n_estimators': 10000,
                'boosting_type': 'goss',
         'feature_fraction':0.9, 
            'bagging_fraction':0.9,
            'lambda_l1':0.1, 
            'lambda_l2':0.1,
         'silent':-1,
            'verbose':-1
            }
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
sub_preds = np.zeros(test_imp.shape[0])
preds = np.zeros(x_train_imp.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train_imp[new_feat_],y_train)):
  train_x,train_y = x_train_imp[new_feat_].iloc[train_idx],y_train.iloc[train_idx]
  valid_x,valid_y = x_train_imp[new_feat_].iloc[valid_idx],y_train.iloc[valid_idx]
  clf = LGBMClassifier(**param)
  clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)
  preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
  sub_preds += clf.predict_proba(test_imp[new_feat_], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
  print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, preds[valid_idx])))
  del clf, train_x, train_y, valid_x, valid_y
  gc.collect()
print('Full AUC score %.6f' % roc_auc_score(y_train, preds))
# 0.78725
# 0.78613
# 0.78717
# 0.79048

Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.811034	valid_1's auc: 0.779007
[400]	training's auc: 0.833283	valid_1's auc: 0.784962
[600]	training's auc: 0.849592	valid_1's auc: 0.786549
[800]	training's auc: 0.862988	valid_1's auc: 0.787458
[1000]	training's auc: 0.87496	valid_1's auc: 0.787957
Early stopping, best iteration is:
[950]	training's auc: 0.872164	valid_1's auc: 0.787994
Fold  1 AUC : 0.787994
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.810472	valid_1's auc: 0.782659
[400]	training's auc: 0.833144	valid_1's auc: 0.788616
[600]	training's auc: 0.850015	valid_1's auc: 0.790368
[800]	training's auc: 0.864017	valid_1's auc: 0.790892
[1000]	training's auc: 0.8757	valid_1's auc: 0.791009
[1200]	training's auc: 0.886197	valid_1's auc: 0.790257
Early stopping, best iteration is:
[1014]	training's auc: 0.876544	valid_1's auc: 0.791083
Fold  2 AUC : 0.791083
Training until validation scores don't imp

In [0]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold

new_feat_ = new_feat['features'].tolist()
param = {
    'boosting_type': 'goss',
    'n_estimators': 10000,
    'learning_rate': 0.007134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 1,
    'min_child_weight': 120,
    'top_rate': 0.35,
    'other_rate': 0.1,
    'metric': 'auc',
    'is_unbalance': False,
    'feature_fraction':0.9, 
    'bagging_fraction':0.9,
    'lambda_l1':0.1, 
    'lambda_l2':0.1,
    'silent':-1,
    'verbose':-1
}
folds = KFold(n_splits= 6, shuffle=True, random_state=1001)
sub_preds = np.zeros(test_imp.shape[0])
preds = np.zeros(x_train_imp.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train_imp[new_feat_],y_train)):
  train_x,train_y = x_train_imp[new_feat_].iloc[train_idx],y_train.iloc[train_idx]
  valid_x,valid_y = x_train_imp[new_feat_].iloc[valid_idx],y_train.iloc[valid_idx]
  clf = LGBMClassifier(**param)
  clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 400, early_stopping_rounds= 100)
  preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
  sub_preds += clf.predict_proba(test_imp[new_feat_], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
  print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, preds[valid_idx])))
  del clf, train_x, train_y, valid_x, valid_y
  gc.collect()
print('Full AUC score %.6f' % roc_auc_score(y_train, preds))
# 0.78725
# 0.78613
# 0.78717
# 0.79048

Training until validation scores don't improve for 100 rounds.
[400]	training's auc: 0.792931	valid_1's auc: 0.770092
[800]	training's auc: 0.816004	valid_1's auc: 0.780644
[1200]	training's auc: 0.829199	valid_1's auc: 0.784615
[1600]	training's auc: 0.839974	valid_1's auc: 0.786941
[2000]	training's auc: 0.849289	valid_1's auc: 0.788275
[2400]	training's auc: 0.857431	valid_1's auc: 0.78908
[2800]	training's auc: 0.865045	valid_1's auc: 0.789571
[3200]	training's auc: 0.871854	valid_1's auc: 0.789816
Early stopping, best iteration is:
[3464]	training's auc: 0.876116	valid_1's auc: 0.789982
Fold  1 AUC : 0.789982
Training until validation scores don't improve for 100 rounds.
[400]	training's auc: 0.792986	valid_1's auc: 0.767106
[800]	training's auc: 0.815899	valid_1's auc: 0.778983
[1200]	training's auc: 0.829479	valid_1's auc: 0.783734
[1600]	training's auc: 0.840295	valid_1's auc: 0.786311
[2000]	training's auc: 0.849504	valid_1's auc: 0.787703
[2400]	training's auc: 0.857611	valid

In [0]:
def nueral_net():
  model = Sequential()
  model.add(Dense(units=512,kernel_initializer='normal',imput_dim=x_train.shape[1])
  model.add(PReLU())
  model.add(BatchNormalization())
  model.add(Dropout(.8))
  model.add(Dense(units=800,kernel_initializer='normal')
  model.add(PReLU())
  model.add(BatchNormalization())
  model.add(Dropout(.9))
  model.add(Dense(units=512,kernel_initializer='normal')
  model.add(PReLU())
  model.add(BatchNormalization())
  model.add(Dropout(.8))
  model.add(Dense(units=128,kernel_initializer='normal')
  model.add(PReLU())
  model.add(BatchNormalization())
  model.add(Dropout(.8))
  opt = Adam(lr=001)
  model.add(Dense(1,kernel_initializer='normal',activation='sigmoid'))
  model.compile(loss='binary_crossentropy',optimizer=opt)
  
  return model



In [0]:
test_id = pd.read_csv('submission3.csv')['SK_ID_CURR']

In [0]:
test_id.head()

0    100001
1    100005
2    100013
3    100028
4    100038
Name: SK_ID_CURR, dtype: int64

In [0]:
pred_df = pd.DataFrame({'SK_ID_CURR':test_id,'TARGET':sub_preds})

In [0]:
pred_df.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.035274
1,100005,0.110644
2,100013,0.030602
3,100028,0.041858
4,100038,0.164968


In [0]:
pred_df.to_csv('feat_sel_new_param_submission.csv',index=False)