In [56]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
from IPython.core.display import display, HTML
from sklearn.preprocessing import MinMaxScaler, RobustScaler

display(HTML("<style>.container { width: 100% !important; }</style>"))

In [57]:
train = pd.read_csv('./open/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 


test = pd.read_csv('./open/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

submit = pd.read_csv('./open/sample_submission.csv')

In [58]:
data=pd.concat([train, test], axis=0)
data.shape

(36457, 19)

In [59]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)
#     elif col in ['phone',  'email', 'work_phone', 'FLAG_MOBIL']:
#         object_col.append(col)

In [60]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

OneHotEncoder()

In [61]:
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [62]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [63]:
def minmax(df):
    df = np.array(df).reshape(-1, 1)
    result = MinMaxScaler().fit_transform(df)
    return result

In [64]:
def days_to_age(x):
    return (x*-1)/365

In [65]:
train['income_total'] = train['income_total']/10000
train['income_total_dev'] = (train['income_total'] - train['income_total'].mean())**2
train['income_total_log'] = train['income_total'].apply(np.log)
train['income_total_minmax'] = train['income_total'].apply(minmax)

In [66]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: x if x < 0 else 0).apply(lambda x: np.log1p(x*-1))
train['DAYS_EMPLOYED_one'] = train['DAYS_EMPLOYED']
train.loc[train['DAYS_EMPLOYED_one'] > 0,'DAYS_EMPLOYED']=0
train.loc[train['DAYS_EMPLOYED_one'] < 0,'DAYS_EMPLOYED']=1

In [67]:
train['begin_month'] =- train['begin_month']
# train.loc[train['begin_month'] < 0,'begin_month']=0
# train.loc[train['begin_month'] > 0,'begin_month']=1

In [68]:
train['year_BIRTH'] = train['DAYS_BIRTH'].apply(days_to_age)

In [69]:
train.loc[train['child_num'] >= 3,'child_num']=3
train.loc[train['family_size'] >= 5,'child_num']=5
train['diff_child'] = train['family_size'] - train['child_num']

In [70]:
test['income_total'] = test['income_total']/10000
test['income_total_dev'] = (test['income_total'] - test['income_total'].mean())**2
test['income_total_log'] = test['income_total'].apply(np.log)
test['income_total_minmax'] = test['income_total'].apply(minmax)

In [71]:
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].map(lambda x: x if x < 0 else 0).apply(lambda x: np.log1p(x*-1))
test['DAYS_EMPLOYED_one'] = test['DAYS_EMPLOYED']
test.loc[test['DAYS_EMPLOYED_one'] > 0,'DAYS_EMPLOYED']=0
test.loc[test['DAYS_EMPLOYED_one'] < 0,'DAYS_EMPLOYED']=1

In [72]:
test['year_BIRTH'] = test['DAYS_BIRTH'].apply(days_to_age)

In [73]:
test.loc[test['child_num'] >= 3,'child_num']=3
test.loc[test['family_size'] >= 5,'child_num']=5
test['diff_child'] = test['family_size'] - test['child_num']

In [74]:
test['begin_month'] =- test['begin_month']
# train.loc[train['begin_month'] < 0,'begin_month']=0
# train.loc[train['begin_month'] > 0,'begin_month']=1

In [75]:
#minus 변경하고
#구간화 함수
def make_bin(df, variable, n):
    data = df
    #data[variable] =- data[variable]
    count, bin_dividers = np.histogram(data[variable], bins=n)
    bin_names=[str(i) for i in range(n)]
    data['%s_bin' % variable]=pd.cut(x=data[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)

In [76]:
#make_bin(train, 'income_total', n=7)
make_bin(train, 'income_total_log', n=25)
make_bin(train, 'year_BIRTH', n=30)
make_bin(train, 'begin_month', n=6)
make_bin(train, 'DAYS_EMPLOYED', n=20)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [77]:
#make_bin(train, 'income_total', n=7)
make_bin(test, 'income_total_log', n=25)
make_bin(test, 'year_BIRTH', n=30)
make_bin(test, 'begin_month', n=6)
make_bin(test, 'DAYS_EMPLOYED', n=20)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [78]:
enc = OneHotEncoder()
object_col = ['begin_month_bin', 'income_total_log_bin', 'year_BIRTH_bin',  'DAYS_EMPLOYED_bin']
enc.fit(train.loc[:,object_col])

OneHotEncoder()

In [79]:
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)
train = train.drop(['FLAG_MOBIL'], axis=1)

In [80]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)
test = test.drop(['FLAG_MOBIL'], axis=1)

In [81]:
train.shape, test.shape

((26457, 123), (10000, 122))

In [82]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [83]:
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from tensorflow.keras.utils import to_categorical
random.seed(42)
lgb_models={}
outcomes=[]
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=50,
           verbose=100)
    logloss=log_loss(to_categorical(y_valid), lgb.predict_proba(X_valid))
    outcomes.append(logloss)
    print(f"FOLD {fold} : logloss:{logloss}")
    
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')
    
mean_outcome=np.mean(outcomes)
    
print("Mean:{}".format(mean_outcome))

Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.649707	valid_1's multi_logloss: 0.748941
[200]	training's multi_logloss: 0.560811	valid_1's multi_logloss: 0.731279
[300]	training's multi_logloss: 0.49637	valid_1's multi_logloss: 0.727575
[400]	training's multi_logloss: 0.442834	valid_1's multi_logloss: 0.727192
Early stopping, best iteration is:
[357]	training's multi_logloss: 0.463597	valid_1's multi_logloss: 0.725636
FOLD 0 : logloss:0.7256160293583117


Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.644464	valid_1's multi_logloss: 0.759414
[200]	training's multi_logloss: 0.556346	valid_1's multi_logloss: 0.744981
Early stopping, best iteration is:
[243]	training's multi_logloss: 0.527403	valid_1's multi_logloss: 0.743225
FOLD 1 : logloss:0.7432241057722012


Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.644691	valid_1's multi_logloss: 0.7549

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train.drop(['credit'],axis=1), train['credit'], 
                                                    stratify=train['credit'], test_size=0.2,
                                                    random_state = 10086)

In [119]:
import lightgbm as lgbm
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score, make_scorer, log_loss
from sklearn.model_selection import cross_validate

In [136]:
#목적함수 생성
X = np.array(train.drop(['credit'],axis=1))
y = np.array(train['credit'])
def lgbm_cv(learning_rate, num_leaves, max_depth, min_child_weight, colsample_bytree, feature_fraction, bagging_fraction, lambda_l1, lambda_l2):
    model = lgbm.LGBMClassifier(learning_rate=learning_rate,
                                n_estimators = 300,
                                #boosting = 'dart',
                                num_leaves = int(round(num_leaves)),
                                max_depth = int(round(max_depth)),
                                min_child_weight = int(round(min_child_weight)),
                                colsample_bytree = colsample_bytree,
                                feature_fraction = max(min(feature_fraction, 1), 0),
                                bagging_fraction = max(min(bagging_fraction, 1), 0),
                                lambda_l1 = max(lambda_l1, 0),
                                lambda_l2 = max(lambda_l2, 0)
                               )
    scoring = {'roc_auc_score': make_scorer(roc_auc_score)}
    result = cross_validate(model, X, y, cv=5, scoring=scoring)
    auc_score = result["test_roc_auc_score"].mean()
    return   auc_score

In [137]:
# 입력값의 탐색 대상 구간
pbounds = {'learning_rate' : (0.0001, 0.05),
           'num_leaves': (300, 600),
           'max_depth': (2, 25),
           'min_child_weight': (30, 100),
           'colsample_bytree': (0, 0.99),
           'feature_fraction': (0.0001, 0.99),
           'bagging_fraction': (0.0001, 0.99),
           'lambda_l1' : (0, 0.99),
           'lambda_l2' : (0, 0.99),
          }

In [138]:
#객체 생성
lgbmBO = BayesianOptimization(f = lgbm_cv, pbounds = pbounds, verbose = 2, random_state = 0 )

In [139]:
# 반복적으로 베이지안 최적화 수행
# acq='ei'사용
# xi=0.01 로 exploration의 강도를 조금 높임
lgbmBO.maximize(init_points=5, n_iter = 10, acq='ei', xi=0.01)

|   iter    |  target   | baggin... | colsam... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------


ValueError: multi_class must be in ('ovo', 'ovr')