In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials 

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
file_id = '1JUo8pNfbv3jtRM3fq4hzT9Rh1khUuuuO' # url에서 file id만 copy/paste
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train.csv')

- 하이퍼 파라미터 : Dictionary 형태
  - 개별 하이퍼 파라미터 : Tuple 형태

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

cust_df = pd.read_csv("./train.csv",encoding='latin-1')
print(cust_df['TARGET'].value_counts())
unsatisfied_cnt = cust_df[cust_df['TARGET'] == 1]['TARGET'].count()
total_cnt = cust_df['TARGET'].count()
print('unsatisfied 비율은 {0:.2f}'.format((unsatisfied_cnt / total_cnt)))

0    73012
1     3008
Name: TARGET, dtype: int64
unsatisfied 비율은 0.04


In [6]:
# var3 피처 값 대체 및 ID 피처 드롭
cust_df['var3'].replace(-999999, 2, inplace=True)
cust_df.drop('ID',axis=1 , inplace=True)

# 피처 세트와 레이블 세트분리. 레이블 컬럼은 DataFrame의 맨 마지막에 위치해 컬럼 위치 -1로 분리
X_features = cust_df.iloc[:, :-1]
y_labels = cust_df.iloc[:, -1]
print('피처 데이터 shape:{0}'.format(X_features.shape))

피처 데이터 shape:(76020, 369)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape))

print(' 학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)
print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt)

학습 세트 Shape:(60816, 369), 테스트 세트 Shape:(15204, 369)
 학습 세트 레이블 값 분포 비율
0    0.960964
1    0.039036
Name: TARGET, dtype: float64

 테스트 세트 레이블 값 분포 비율
0    0.9583
1    0.0417
Name: TARGET, dtype: float64


In [0]:
bayes_params = {
    'num_leaves' : (24, 45),
    'colsample_bytree' : (0.5, 1),
    'reg_alpha' : (0, 0.5),
    'min_split_gain' : (0.001, 0.1),
}

In [0]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

def lgb_roc_eval(num_leaves, colsample_bytree, reg_alpha, min_split_gain):
  params = {
      'n_estimator':200,
      'learning_rate':0.02,
      'num_leaves':int(round(num_leaves)), # 정수형으로 변경
      'colsample_bytree':colsample_bytree,
      'reg_alpha':reg_alpha,
      'min_split_gain':min_split_gain,
  }
  lgb_model = LGBMClassifier(**params)
  lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=30, eval_metric='auc')
  best_iter = lgb_model.best_iteration_
  print('best_iter:', best_iter)
  valid_proba = lgb_model.predict_proba(X_test, num_iteration=best_iter)[:, 1]
  roc_preds = roc_auc_score(y_test, valid_proba)
  print('roc_auc:', roc_preds)
  return roc_preds

In [11]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/b5/26/9842333adbb8f17bcb3d699400a8b1ccde0af0b6de8d07224e183728acdf/bayesian_optimization-1.1.0-py3-none-any.whl
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.1.0


In [0]:
from bayes_opt import BayesianOptimization
BO_lgb = BayesianOptimization(lgb_roc_eval, bayes_params, random_state=0)

In [19]:
BO_lgb.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... | min_sp... | num_le... | reg_alpha |
-------------------------------------------------------------------------
[1]	valid_0's binary_logloss: 0.17236	valid_0's auc: 0.76775
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's binary_logloss: 0.171529	valid_0's auc: 0.770113
[3]	valid_0's binary_logloss: 0.169943	valid_0's auc: 0.810523
[4]	valid_0's binary_logloss: 0.168451	valid_0's auc: 0.827977
[5]	valid_0's binary_logloss: 0.167643	valid_0's auc: 0.824547
[6]	valid_0's binary_logloss: 0.16638	valid_0's auc: 0.829328
[7]	valid_0's binary_logloss: 0.165203	valid_0's auc: 0.831982
[8]	valid_0's binary_logloss: 0.164061	valid_0's auc: 0.832976
[9]	valid_0's binary_logloss: 0.163041	valid_0's auc: 0.834787
[10]	valid_0's binary_logloss: 0.162054	valid_0's auc: 0.835366
[11]	valid_0's binary_logloss: 0.161441	valid_0's auc: 0.834736
[12]	valid_0's binary_logloss: 0.160521	valid_0's auc: 0.835803
[13]	valid_0's binary_logloss: 0.1

In [20]:
BO_lgb.res

[{'params': {'colsample_bytree': 0.7744067519636624,
   'min_split_gain': 0.07180374727086954,
   'num_leaves': 36.65803089750452,
   'reg_alpha': 0.27244159149844843},
  'target': 0.8465893467628267},
 {'params': {'colsample_bytree': 0.7118273996694524,
   'min_split_gain': 0.06494351719359896,
   'num_leaves': 33.18933143651654,
   'reg_alpha': 0.4458865003910399},
  'target': 0.8457719071858039},
 {'params': {'colsample_bytree': 0.9818313802505146,
   'min_split_gain': 0.038960710363751996,
   'num_leaves': 40.626225799735955,
   'reg_alpha': 0.26444745987645224},
  'target': 0.8432748788076272},
 {'params': {'colsample_bytree': 0.7840222805469661,
   'min_split_gain': 0.09263406719097345,
   'num_leaves': 25.491757222155627,
   'reg_alpha': 0.043564649850770354},
  'target': 0.8453703864082673},
 {'params': {'colsample_bytree': 0.5101091987201629,
   'min_split_gain': 0.08342936470924586,
   'num_leaves': 40.341291769946864,
   'reg_alpha': 0.4350060741234096},
  'target': 0.843834

In [21]:
BO_lgb.max

{'params': {'colsample_bytree': 0.7744067519636624,
  'min_split_gain': 0.07180374727086954,
  'num_leaves': 36.65803089750452,
  'reg_alpha': 0.27244159149844843},
 'target': 0.8465893467628267}

In [24]:
max_params = BO_lgb.max['params']

max_params['colsample_bytree'] = int(round(max_params['colsample_bytree']))
max_params['min_split_gain'] = int(round(max_params['min_split_gain']))
max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['reg_alpha'] = int(round(max_params['reg_alpha']))

lgbm_clf = LGBMClassifier(n_estimators=1000, learning_rate=0.02, **max_params)

evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=evals,
                verbose=True)

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1],average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	valid_0's binary_logloss: 0.171588	valid_0's auc: 0.816678
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.169988	valid_0's auc: 0.819345
[3]	valid_0's binary_logloss: 0.168501	valid_0's auc: 0.819523
[4]	valid_0's binary_logloss: 0.167137	valid_0's auc: 0.820486
[5]	valid_0's binary_logloss: 0.165917	valid_0's auc: 0.820723
[6]	valid_0's binary_logloss: 0.164772	valid_0's auc: 0.821523
[7]	valid_0's binary_logloss: 0.163679	valid_0's auc: 0.821541
[8]	valid_0's binary_logloss: 0.162688	valid_0's auc: 0.824082
[9]	valid_0's binary_logloss: 0.161739	valid_0's auc: 0.824223
[10]	valid_0's binary_logloss: 0.16084	valid_0's auc: 0.824258
[11]	valid_0's binary_logloss: 0.160012	valid_0's auc: 0.827164
[12]	valid_0's binary_logloss: 0.159248	valid_0's auc: 0.828898
[13]	valid_0's binary_logloss: 0.15851	valid_0's auc: 0.828966
[14]	valid_0's binary_logloss: 0.157804	valid_0's auc: 0.830847
[15]	valid_0's binary_logloss: 0.157103	valid_0's au