<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Analysis with LightGBM

### train, val, test data로 분류

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [2]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [3]:
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

In [4]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import make_scorer

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [5]:
# First and last dates of the training window. Both are looked up by exact match
# in df_o.index further down, and the end date is included in the training slice.
TRAIN_START_DATE = datetime.date(2022, 3, 2)
TRAIN_END_DATE = datetime.date(2023, 3, 31)

In [6]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    """Serialize an object to ``path`` with pickle.

    Parameters
    ----------
    path : str or path-like
        Destination file; it is created or overwritten.
    filename : object
        The object to pickle. Despite its name this is the payload (a list,
        dict, model, ...), not a file name.

    Returns
    -------
    None
    """
    # The context manager closes the file even when pickle.dump raises.
    with open(path, "wb") as open_file:
        pickle.dump(filename, open_file)

# read list, dictionary from pickle
def load_from_pickle(path):
    """Load and return the object pickled at ``path``.

    Parameters
    ----------
    path : str or path-like
        File previously written with pickle.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code; only load files you produced yourself.
    """
    # The context manager closes the file even when pickle.load raises.
    with open(path, "rb") as open_file:
        return pickle.load(open_file)

In [7]:
import csv

# write list, dictionary to csv
# path = './xxx/', my_dict = filename

def save_dict_to_csv(path, my_dict):
    """Write a dictionary to a CSV file at ``path``.

    A two-row DataFrame is built from the dictionary's keys and values (rows
    labelled 'parameter' and 'value'), transposed, and written with the
    ``to_csv`` defaults.

    Parameters
    ----------
    path : destination passed straight to ``DataFrame.to_csv``.
    my_dict : dict whose keys and values are written.
    """
    fields=my_dict.keys()
    values = my_dict.values()
    pd.DataFrame([fields, values], index=['parameter','value']).T.to_csv(path)
# Earlier alternative, kept for reference:
#     df = pd.DataFrame.from_dict(my_dict, orient='index') 
#     df.to_csv (path, index=False, header=True)  
    
def save_list_to_csv(path, my_list):
    """Write ``my_list`` to ``path`` as a one-column CSV.

    The single column is headed 'columns'; the row index is not written.
    """
    pd.DataFrame(my_list, columns=['columns']).to_csv(path, index=False, header=True)
    
# def load_dict_from_csv(path):
#     df = pd.read_csv(path, header=None)
#     my_dict = df.to_dict()
#     return my_dict

In [8]:
def predict_p(test_target, y_predict_list):
    """Score predictions against the true labels.

    Returns a list ``[precision, recall, f1, roc_auc]`` computed, in that order,
    from ``test_target`` (true labels) and ``y_predict_list`` (predictions).
    """
    scorers = (precision_score, recall_score, f1_score, roc_auc_score)
    return [scorer(test_target, y_predict_list) for scorer in scorers]

In [9]:
def make_df_from_estimator(estimator, num):
    """Turn a dictionary into a one-column DataFrame for run number ``num``.

    The dictionary keys become the index (named 'parameter') and the values
    fill a single column named ``iter_<num>``.
    """
    frame = pd.DataFrame.from_dict(estimator, orient='index')
    frame.columns = [f'iter_{num}']
    return frame.rename_axis('parameter')

In [10]:
def calc_results(lgbm, model, train_scaled, val_scaled, test_scaled, train_target, val_target, test_target):
    """Collect search and evaluation metrics for one fitted model.

    Parameters
    ----------
    lgbm : fitted search object; its ``best_score_`` and ``best_index_`` are recorded.
    model : fitted estimator providing ``predict`` and ``score``
        (the caller passes the search object's ``best_estimator_``).
    train_scaled, val_scaled, test_scaled : feature matrices of the three splits.
    train_target, val_target, test_target : true labels of the three splits.

    Returns
    -------
    dict
        Keys, in insertion order: 'best_score', 'best_index', 'acc_train',
        'acc_val', 'acc_test', then per split '<split>_precision' followed by
        '<split>_tn', '<split>_fp', '<split>_fn', '<split>_tp'. For the test
        split 'recall', 'f1score' and 'roc' sit between the precision and the
        confusion-matrix entries.
    """
    splits = (
        ('train', train_scaled, train_target),
        ('val', val_scaled, val_target),
        ('test', test_scaled, test_target),
    )
    # Predict every split up front, in train / val / test order.
    predicted = {name: model.predict(features) for name, features, _ in splits}

    result_dict = {
        'best_score': lgbm.best_score_,
        'best_index': lgbm.best_index_,
    }
    for name, features, target in splits:
        result_dict[f'acc_{name}'] = model.score(features, target)

    # Confusion-matrix cell -> (row, column) position; rows are true labels.
    cell_positions = (('tn', (0, 0)), ('fp', (0, 1)), ('fn', (1, 0)), ('tp', (1, 1)))
    for name, _, target in splits:
        labels = predicted[name]
        result_dict[f'{name}_precision'] = precision_score(target, labels)
        if name == 'test':
            # Recall, F1 and ROC-AUC are reported for the test split only.
            result_dict['recall'] = recall_score(target, labels)
            result_dict['f1score'] = f1_score(target, labels)
            result_dict['roc'] = roc_auc_score(target, labels)
        cm = confusion_matrix(target, labels)
        for cell, position in cell_positions:
            result_dict[f'{name}_{cell}'] = cm[position]
    return result_dict

In [11]:
def save_best_results(com_name, model, scaler, columns, df_base, df_params_search):
    """Persist the best model and everything needed to reuse it.

    All files are written under ``{directory_for_model}{com_name}/``;
    ``directory_for_model`` is a notebook-level global.

    Parameters
    ----------
    com_name : str
        Company sub-directory name.
    model : fitted estimator, saved with both joblib and pickle.
    scaler : fitted scaler, saved with both joblib and pickle.
    columns : list
        Column names used for training; saved as pickle and CSV.
    df_base : DataFrame of parameters/results, saved as 'best_result.pkl'.
    df_params_search : DataFrame of the search space, saved as 'best_params.pkl'.

    Returns
    -------
    None
    """
    target_dir = f'{directory_for_model}{com_name}'
    joblib.dump(model, f'{target_dir}/best_model.pkl')  # estimator (joblib)
    joblib.dump(scaler, f'{target_dir}/best_scaler.pkl')  # scaler (joblib)
    save_to_pickle(f'{target_dir}/best_model_p.pkl', model)  # estimator (pickle)
    save_to_pickle(f'{target_dir}/best_scaler_p.pkl', scaler)  # scaler (pickle)
    # Save the columns that were passed in; the previous code ignored the
    # `columns` argument and wrote the global `new_col` instead.
    save_to_pickle(f'{target_dir}/best_columns.pkl', columns)
    save_list_to_csv(f'{target_dir}/best_columns.csv', columns)
    df_base.to_pickle(f'{target_dir}/best_result.pkl')
    df_params_search.to_pickle(f'{target_dir}/best_params.pkl')
    return

In [12]:
def add_dict(*dicts):
    """Merge any number of dictionaries into a new one.

    Keys keep the position of their first appearance; when a key occurs in
    several dictionaries the value from the last one wins. With no arguments
    an empty dict is returned.
    """
    merged = {}
    for mapping in dicts:
        for key, value in zip(mapping.keys(), mapping.values()):
            merged[key] = value
    return merged

In [13]:
def df_combine_sorted(df_base_l, df_params_l):
    """Interleave search-space rows with the parameter rows of a results table.

    ``df_base_l`` is split at its 'best_score' row. The rows before it are
    concatenated with ``df_params_l`` and sorted by index label; the rows from
    'best_score' onward are appended afterwards in their original order.

    Parameters
    ----------
    df_base_l : DataFrame whose index contains the label 'best_score'.
    df_params_l : DataFrame of rows to merge into the sorted upper part.

    Returns
    -------
    DataFrame
        Sorted upper part followed by the unsorted result rows.

    Raises
    ------
    ValueError
        If 'best_score' is not in ``df_base_l.index``.
    """
    split_at = list(df_base_l.index).index('best_score')
    # Slice the argument itself; the previous code read the notebook global
    # `df_base` here and silently ignored `df_base_l`.
    settings_part = df_base_l.iloc[:split_at, :]
    results_part = df_base_l.iloc[split_at:, :]
    settings_sorted = pd.concat([settings_part, df_params_l], axis=0).sort_index(axis=0)
    return pd.concat([settings_sorted, results_part], axis=0)

In [14]:
def list_std(lgbmbs_cv_results, num):
    """Rank the search candidates of one run by mean test score.

    Parameters
    ----------
    lgbmbs_cv_results : dict-like
        A ``cv_results_`` mapping containing 'mean_test_score',
        'std_test_score' and 'rank_test_score'.
    num : int
        Run number; used to build the top-level column label ``iter<num>``.

    Returns
    -------
    DataFrame
        Rows ordered by descending mean test score, with two-level columns
        ``(iter<num>, best_index)``, ``(iter<num>, mean_test_score)`` and
        ``(iter<num>, std_test_score)``. 'best_index' is the candidate's
        original row position in ``cv_results_``.
    """
    score_columns = ['mean_test_score', 'std_test_score', 'rank_test_score']
    ranked = (
        pd.DataFrame(lgbmbs_cv_results)
        .loc[:, score_columns]
        .sort_values(by='mean_test_score', ascending=False)
        .reset_index()  # keep the original row position as a column
        .drop(columns='rank_test_score')
    )
    ranked.columns = ['best_index', 'mean_test_score', 'std_test_score']
    # Label with the `num` argument; the previous code interpolated the global
    # name `iter` (the built-in function unless the notebook rebinds it) and
    # relied on the deprecated groupby(axis=1) to add this column level.
    return pd.concat({f'iter{num}': ranked}, axis=1)

In [15]:
import sys, os

# Two directory levels above the notebook's working directory.
module_path = os.path.abspath(os.path.join('../..'))
# Folder of the shared lists/codes module. Built with os.path.join so the path
# is valid on any OS, not only with Windows-style backslash separators.
common_data_path = os.path.join(module_path, 'data', 'base_data', 'common_data')
if common_data_path not in sys.path:
    sys.path.append(common_data_path)

import common_data as cd

In [16]:
def df_style(df_s, row_index_l):
#     xin = [list(df_s.index).index(x) for x in row_index_l]
    xin = []
    for x in list(df_s.index):
        try:
            if x in row_index_l:
                xin.append(list(df_s.index).index(x))
        except:
            pass
    return df_s.reset_index('parameter').style.apply(
        lambda x: ['background-color: yellow' if row in xin else '' for row in range(len(df_s.index))], axis=0)

In [17]:
code = cd.code_all # all company codes
# Overridden immediately: this run is restricted to a single company.
# The second element of the value list is used below as the file/directory name.
code = {'005380': ['현대차', 'hyunmotor']}

In [18]:
# Column groups of the combined data set. Suffix _1 / _2 distinguishes two
# parallel sets of the same investor, price and index columns.
col_inv1 = ['retail_1', 'foreigner_1', 'institution_1', 'financial_1', 'invtrust_1', 'pension_1', 
#             'privequity_1', 'bank_1', 'insurance_1', 'financeetc_1', 'corporateetc_1', 
            'privequity_1',  'insurance_1', 'corporateetc_1', # bank_1 and financeetc_1 are excluded
            'foreigneretc_1']
col_inv2 = ['retail_2', 'foreigner_2', 'institution_2', 'financial_2', 'invtrust_2', 'pension_2',
#             'privequity_2', 'bank_2', 'insurance_2', 'financeetc_2', 'corporateetc_2', 
            'privequity_2', 'insurance_2', 'corporateetc_2', # bank_2 and financeetc_2 are excluded
            'foreigneretc_2']
col_his1 = ['open_1', 'high_1', 'low_1', 'close_1', 'vol_1']
col_his2 = ['open_2', 'high_2', 'low_2', 'close_2', 'vol_2']
# The last five entries (cr_00 ... cr_20) are split off from the model inputs
# further down; cr_00 is the column used as the target.
col_cr = ['weekday', 'cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20']
col_common1 = ["dji_cr", "dji_f_cr", "dxy_cr", "ixic_f_cr", "bond_kor_10_cr", "bond_kor_2_cr", "kosdaq_cr", "kospi_cr", 
         "krw_cr", "ixic_cr", "spx_f_cr", "sox_cr", "spx_cr", "bond_usa_10_cr", "bond_usa_2_cr", "bond_usa_3m_cr", 
         "vix_cr", "wti_cr", "spsy_cr", "spny_cr", "spxhc_cr", "splrcd_cr", "splrci_cr", "splrcu_cr", "splrcs_cr",
         "splrct_cr", "splrcl_cr", "splrcm_cr", "ixbk_cr", "ixfn_cr", "ixid_cr", "ixis_cr", "ixk_cr", "ixtr_cr",
         "ixut_cr", "nbi_cr", "bkx_cr"]
col_common2 = ["dji_cr_2", "dji_f_cr_2", "dxy_cr_2", "ixic_f_cr_2", "bond_kor_10_cr_2", "bond_kor_2_cr_2", "kosdaq_cr_2", "kospi_cr_2",
         "krw_cr_2", "ixic_cr_2", "spx_f_cr_2", "sox_cr_2", "spx_cr_2", "bond_usa_10_cr_2", "bond_usa_2_cr_2", "bond_usa_3m_cr_2",
         "vix_cr_2", "wti_cr_2", "spsy_cr_2", "spny_cr_2", "spxhc_cr_2", "splrcd_cr_2", "splrci_cr_2", "splrcu_cr_2",
         "splrcs_cr_2", "splrct_cr_2", "splrcl_cr_2", "splrcm_cr_2", "ixbk_cr_2", "ixfn_cr_2", "ixid_cr_2",
         "ixis_cr_2", "ixk_cr_2", "ixtr_cr_2", "ixut_cr_2", "nbi_cr_2", "bkx_cr_2"]
# Columns removed from the feature set below.
col_futures = ['ixic_f_cr', 'ixic_f_cr_2', 'spx_f_cr', 'spx_f_cr_2', 'dji_f_cr', 'dji_f_cr_2',
           'wti_cr','wti_cr_2', 'dxy_cr', 'dxy_cr_2', 'bond_usa_10_cr', 'bond_usa_10_cr_2' ]
# Full column order; col_cr must stay last because later cells slice the final five columns.
column_o = col_inv1 + col_common1 + col_his1 + col_inv2 + col_common2 + col_his2 + col_cr

col_except_futures = [item for item in column_o if item not in col_futures]

# Working column list; replaced later by the feature-importance selection.
new_col = col_except_futures.copy()

# bank and financeetc are not used because they contain many missing values.
# df.drop(['bank_1', 'bank_2', 'financeetc_1', 'financeetc_2'], axis=1, inplace=True)   

# col_futures: excluded because the futures data is not yet final at the end of the trading day.

In [19]:
# Create the initial empty DataFrames: each has an empty index named 'parameter' and no columns.
# df_base collects parameters and results, df_params the search spaces, df_std the candidate rankings.
df_base =   pd.DataFrame(pd.Series([],dtype=pd.StringDtype(), name='parameter')).set_index('parameter')
df_params = pd.DataFrame(pd.Series([],dtype=pd.StringDtype(), name='parameter')).set_index('parameter')
df_std =    pd.DataFrame(pd.Series([],dtype=pd.StringDtype(), name='parameter')).set_index('parameter')
# Run counter, incremented once per search run. NOTE: this name shadows the built-in iter().
iter = 0

In [20]:
# English short name of the first (only) company in `code`; used in file and directory names.
com_name = list(code.values())[0][1]

directory_for_ml = '../../data/data_for_ml/predict/'
directory_for_model = '../../data/data_for_ml/model/model/'
fname = f'df_{com_name}_combine.pkl'
f_name = directory_for_ml + fname
df_o = pd.read_pickle(f_name) 
df_o = df_o.iloc[:-1] # when loading from /predict/, drop the last row (the prediction row)

In [21]:
len(df_o)

343

In [22]:
# Row positions in df_o that bound the training and validation windows.
df_o_dates = list(df_o.index)
train_start_index = df_o_dates.index(TRAIN_START_DATE)
train_end_index = df_o_dates.index(TRAIN_END_DATE)
val_end_index = df_o_dates.index(datetime.date(2023, 4, 28))  # last validation date

In [23]:
# Check that df_o has as many rows in the training window as there are days
# on which both the Korean and the US market were open.
base_data_directory = '../../data/base_data/stock_market_holydays/'
OPENING_DAYS_KOR = pd.read_pickle(base_data_directory+'opening_days_kor.pkl') # Korean market opening days
OPENING_DAYS_USA = pd.read_pickle(base_data_directory+'opening_days_usa.pkl') # US market opening days

# Days open in both markets, in Korean-calendar order. The US days are put in
# a set once instead of rebuilding a list for every membership test.
usa_opening_days = set(OPENING_DAYS_USA)
opening_day_filter = [item for item in list(OPENING_DAYS_KOR) if item in usa_opening_days]

sttn = opening_day_filter.index(TRAIN_START_DATE)
endn = opening_day_filter.index(TRAIN_END_DATE)
# sttn = list(OPENING_DAYS_KOR).index(TRAIN_START_DATE)
# endn = list(OPENING_DAYS_KOR).index(TRAIN_END_DATE)
len_filter = endn - sttn

len_train = train_end_index - train_start_index

# Compare the two window lengths. The previous test (len_train != 0) was always
# true, so the warning printed even when the calendars agreed.
if len_train != len_filter:
    print("Length of common_data and company data is DIFFERENT.")
    # Use the computed window bounds (end date inclusive) rather than hard-coded positions.
    expected_days = set(opening_day_filter[sttn:endn + 1])
    actual_days = set(list(df_o.iloc[train_start_index:train_end_index + 1].index))
    diff_date = expected_days - actual_days
    print("different date is : ", diff_date)

Length of common_data and company data is DIFFERENT.
different date is :  {datetime.date(2023, 3, 30), datetime.date(2022, 5, 31), datetime.date(2022, 11, 14), datetime.date(2022, 4, 18), datetime.date(2022, 6, 21), datetime.date(2023, 2, 21), datetime.date(2022, 12, 27), datetime.date(2022, 7, 5), datetime.date(2023, 1, 17), datetime.date(2022, 9, 6), datetime.date(2022, 11, 25)}


In [24]:
# Chronological split: training rows (end date inclusive), then validation rows
# up to and including val_end_index, then every remaining row as the test set.
df_o_train = df_o.iloc[train_start_index:train_end_index+1]
df_o_val = df_o.iloc[train_end_index+1:val_end_index+1]
df_o_test = df_o.iloc[val_end_index+1:]

## 아래는 기존 자료 이용시 사용

In [None]:
#### Read the search space saved as best_params.pkl by a previous run
params_search_o = pd.read_pickle(f'{directory_for_model}{com_name}/best_params.pkl')
# Display the first column as a {parameter: value} dict.
list(params_search_o.to_dict().values())[0]

In [80]:
#### When the saved file holds several iter columns, use the example below
params_search_o = pd.read_pickle(f'{directory_for_model}{com_name}/params_bs_df_0622_2018.pkl')
list(params_search_o.to_dict().values())[2] # e.g. index 2 selects the third column

{'boost_from_average': [True, False],
 'boosting_type': ['gbdt', 'dart'],
 'colsample_bytree': (0.005, 1.0, 'uniform'),
 'cv': 5,
 'force_col_wise': ['true'],
 'importance_type': nan,
 'iterations': 50,
 'learning_rate': (0.0001, 0.004, 'log-uniform'),
 'max_bin': (100, 1000),
 'max_depth': (0, 50),
 'metric': ['binary_logloss'],
 'min_child_samples': (5, 50),
 'min_child_weight': (5, 15),
 'n_estimators': (700, 10000),
 'num_col': 98,
 'num_leaves': (3, 20),
 'objective': ['binary'],
 'reg_alpha': (0, 4.0, 'uniform'),
 'reg_lambda': (0, 4.0, 'uniform'),
 'scale_pos_weight': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
        1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
 'scoring': 'precision',
 'second_best_index': None,
 'subsample': (0.005, 1.0, 'uniform'),
 'subsample_for_bin': (100000, 1200000)}

### 여기까지

In [25]:
# When iterating, restart from here (with the new columns selected by feature importance).
# new_col always ends with cr_00, cr_05, cr_10, cr_15, cr_20: the last five columns are
# split off from the inputs and the fifth-from-last one (cr_00) is the target.

df = df_o_train[new_col]
train_input = df.iloc[:, :-5]
train_target = df.iloc[:, -5]

df = df_o_val[new_col]
val_input = df.iloc[:, :-5]
val_target = df.iloc[:, -5]

df = df_o_test[new_col]
test_input = df.iloc[:, :-5]
test_target = df.iloc[:, -5]

In [26]:
# Standardize the features. The scaler is fitted on the training inputs only and
# then applied unchanged to the validation and test inputs.
scaler = None  # discard any scaler left over from an earlier run of this cell
scaler = StandardScaler()
scaler.fit(train_input)
train_scaled = scaler.transform(train_input)
val_scaled = scaler.transform(val_input)
test_scaled = scaler.transform(test_input)

In [347]:
# Search space handed to BayesSearchCV: tuples are ranges (optionally with a prior),
# lists are categorical choices; single-element lists pin a parameter to one value.
search_space = {
    'boosting_type' : ['gbdt', 'dart', ],# ['gbdt', 'dart', 'goss'],
    'learning_rate': (0.0001, 0.004, 'log-uniform'),
    'num_leaves': (3, 20),      
    'max_depth': (0, 50), #[-1], #(0, 50),
    'min_child_samples': (5, 40),
    'min_child_weight': (3.0, 6.0), # changing this range changes precision even when the same value ends up selected as best; always vary it between runs
#     'min_split_loss': (0, 5), 
    'max_bin': (100, 1200), # score rose to 0.75 when this was searched; default: 255
#     'max_cat_threshold' : (1, 100),
    'subsample': (0.005, 1.0, 'uniform'), # == bagging_fraction
#     'subsample_freq': (1, 10), # raised the score a lot when applied to mobis; default: 0
    'colsample_bytree': (0.5, 1.0, 'uniform'),
    'subsample_for_bin': (100000, 700000),
    'scale_pos_weight' : (0.2, 2.0, 'uniform'),
    'n_estimators': (700, 5000),
    'reg_alpha' : (0.0, 2.0, 'uniform'), # =='lambda_l1': [0, 5], # default 0
    'reg_lambda' : (0.0, 4.0, 'uniform'), # == lambda_l2': [0, 5], #default 0    

    'force_col_wise': ['true'], 
    'importance_type': ['split'], # ['gain'], default: split
    'boost_from_average' : [True, False],
    'objective': ['binary'],
    'metric': ['binary_logloss'],
}

params_space = search_space.copy() # for Bayesian Optimization

## BayesSearchCV

In [348]:
# Settings of the search itself (not of the model); they are also recorded in the results table.
param_grid = {
    'cv' : 5,  # usually 5 or 10, default: 3
#     'cv' : None,
#     'scoring' : None,
#     'scoring' : 'f1',
#     'scoring' : 'roc_auc',
    'scoring' : 'precision',
#     'scoring' : 'average_precision',
#     'scoring' : 'accuracy',
    'num_col' : len(new_col),
    'iterations' : 50,  # default : 50, number of parameter settings.
    # The number of parameter settings that are tried is given by n_iter.
    'second_best_index' : None  # recorded to show whether a second-best index was used for this run
             }

In [349]:
# Output directory for models and result tables (same value as set when the data was loaded).
# Alternative location for test runs:
# directory_for_model = '../../data/data_for_ml/model/model/0. test/'
directory_for_model = '../../data/data_for_ml/model/model/'

In [350]:
# Create the output directory if it does not exist yet.
# if not os.path.exists(com_name):
#     os.makedirs(com_name)
if not os.path.exists(directory_for_model+com_name):
    os.makedirs(directory_for_model+com_name)

# Count this run. NOTE: re-running the cell increments the counter again,
# so result columns are numbered by execution, not by cell position.
iter = iter + 1

lgbm = None
lgbmBS = None
gsbs = 'bs'  # tag used in the output file names

lgbmBS = BayesSearchCV( estimator = lightgbm.LGBMClassifier(verbose=0, random_state=42),
                       search_spaces = params_space,
                       scoring = param_grid['scoring'],
                       cv = StratifiedKFold( n_splits=param_grid['cv'],
                                             shuffle=True,
                                             random_state=42 ),
                       n_jobs = -1, # choose the number of parallel jobs automatically
                       n_iter = param_grid['iterations'],   
                       verbose = 0, refit = True, random_state = 42 
                      )

# NOTE: this message is printed before the fit starts, not after it.
print("*** after lgbm BS ******")
lgbmBS.fit(train_scaled, train_target, eval_metric = 'logloss') 

# Timestamp 'MMDD_HHMM' appended to the names of the files saved for this run.
stamp = datetime.datetime.today().isoformat() # creation date and time
dt = re.sub(r'[-:T]', '', stamp[5:16])
dt = f'{dt[:4]}_{dt[4:]}'

df_estimator = make_df_from_estimator(lgbmBS.best_estimator_.get_params(), iter)
df_estimator.sort_index(inplace=True) # alphabetical order for easier reading

# Dictionary of metrics (accuracy, precision, ...) for the train, validation and test splits.
result_dict = calc_results(lgbmBS, lgbmBS.best_estimator_, 
                           train_scaled, val_scaled, test_scaled,  
                           train_target, val_target, test_target,
                          )

df_grid = make_df_from_estimator(param_grid, iter) # search settings
df_grid.sort_index(inplace=True) # alphabetical order for easier reading
df_result = make_df_from_estimator(result_dict, iter)  # dict to DataFrame
df_concat = pd.concat([df_grid, df_estimator, df_result])

# Add this run as a new column of the accumulated results table.
df_base = pd.merge(df_base,df_concat, how='outer', left_index=True, right_index=True)

# NOTE: df_result is reused here for the candidate ranking of this run.
df_result =  list_std(lgbmBS.cv_results_, iter)
df_std = pd.concat([df_std, df_result], axis=1, join='outer')

# val_test = df_concat.loc['acc_val'].iloc[0]
# acc_test = df_concat.loc['acc_test'].iloc[0]
# precision = df_concat.loc['test_precision'].iloc[0]
# f1score = df_concat.loc['f1score'].iloc[0]

# Record the search settings together with the search space used for this run.
params_search = add_dict(param_grid, params_space)
df_params_search = make_df_from_estimator(params_search, iter)
df_params = pd.merge(df_params,df_params_search, how='outer', left_index=True, right_index=True)
  
print("******* No.{} BS Process is Done! ********".format(iter))
    
print("**** End of BayesSearchCV Process ****")

*** after lgbm BS ******
******* No.41 BS Process is Done! ********
**** End of BayesSearchCV Process ****


In [351]:
# Combine the results table and the search-space table into one table for display and saving.
df_one = df_combine_sorted(df_base, df_params)

In [352]:
# Show a slice of the combined table with the three precision rows highlighted.
row_index =  ['train_precision', 'val_precision', 'test_precision']
df_style(df_one.iloc[52:80, :], row_index)

Unnamed: 0,parameter,iter_4,iter_5,iter_6,iter_7,iter_8,iter_12,iter_13,iter_14,iter_15,iter_16,iter_17,iter_18,iter_19,iter_20,iter_21,iter_22,iter_23,iter_24,iter_25,iter_26,iter_27,iter_28,iter_29,iter_30,iter_31,iter_32,iter_33,iter_34,iter_35,iter_36,iter_37,iter_38,iter_39,iter_40,iter_41
0,subsample_for_bin,"(100000, 1200000)","(100000, 1200000)","(100000, 1200000)","(100000, 1200000)","(100000, 1200000)","(100000, 1200000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 500000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)","(100000, 700000)"
1,subsample_freq,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,verbose,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,best_score,0.663485,0.742308,0.684524,0.663540,0.655761,0.706548,0.644250,0.950000,0.663579,0.654286,0.666667,0.636738,0.866667,0.866667,0.950000,0.933333,0.622450,0.622450,0.697222,0.685368,0.717778,0.628011,0.647681,0.788889,0.853333,0.880000,0.689744,0.680635,0.666436,0.675714,0.659394,0.669264,0.666288,0.674729,0.843333
4,best_index,30.000000,29.000000,26.000000,17.000000,8.000000,31.000000,31.000000,1.000000,24.000000,31.000000,32.000000,15.000000,23.000000,23.000000,1.000000,1.000000,8.000000,8.000000,38.000000,47.000000,45.000000,15.000000,29.000000,49.000000,12.000000,18.000000,46.000000,46.000000,46.000000,38.000000,46.000000,24.000000,26.000000,35.000000,48.000000
5,acc_train,0.693798,0.651163,0.713178,0.674419,0.887597,0.647287,0.751938,0.596899,0.794574,0.798450,0.817829,0.848837,0.604651,0.604651,0.596899,0.596899,0.953488,0.953488,0.674419,0.693798,0.732558,0.829457,0.961240,0.655039,0.682171,0.658915,0.717054,0.658915,0.848837,0.662791,0.748062,0.763566,0.790698,0.810078,0.624031
6,acc_val,0.368421,0.473684,0.526316,0.473684,0.526316,0.421053,0.368421,0.473684,0.473684,0.526316,0.473684,0.526316,0.526316,0.526316,0.473684,0.473684,0.473684,0.473684,0.578947,0.473684,0.526316,0.526316,0.578947,0.473684,0.526316,0.473684,0.526316,0.473684,0.526316,0.473684,0.526316,0.368421,0.578947,0.421053,0.473684
7,acc_test,0.617647,0.529412,0.558824,0.588235,0.647059,0.529412,0.647059,0.500000,0.647059,0.705882,0.529412,0.676471,0.558824,0.558824,0.500000,0.500000,0.676471,0.676471,0.529412,0.647059,0.588235,0.676471,0.588235,0.470588,0.529412,0.558824,0.588235,0.529412,0.647059,0.617647,0.617647,0.676471,0.617647,0.705882,0.529412
8,train_precision,0.786667,1.000000,0.863636,0.737500,0.934579,0.816327,0.768519,1.000000,0.985915,0.972973,1.000000,0.936842,1.000000,1.000000,0.950000,1.000000,0.974138,0.974138,0.952381,0.890909,0.981818,0.897959,0.974576,1.000000,1.000000,1.000000,0.929825,0.972222,0.956044,0.948718,0.938462,0.906667,0.947368,0.929412,0.962963
9,train_tn,120.000000,136.000000,127.000000,115.000000,129.000000,127.000000,111.000000,136.000000,135.000000,134.000000,136.000000,130.000000,136.000000,136.000000,135.000000,136.000000,133.000000,133.000000,134.000000,130.000000,135.000000,126.000000,133.000000,136.000000,136.000000,136.000000,132.000000,135.000000,132.000000,134.000000,132.000000,129.000000,132.000000,130.000000,135.000000


In [353]:
# Show the first 60 rows of the combined table with selected parameter rows highlighted.
row_index = ['n_estimators', 'reg_alpha', 'max_bin', 'subsample_for_bin']
df_style(df_one.head(60), row_index)

Unnamed: 0,parameter,iter_4,iter_5,iter_6,iter_7,iter_8,iter_12,iter_13,iter_14,iter_15,iter_16,iter_17,iter_18,iter_19,iter_20,iter_21,iter_22,iter_23,iter_24,iter_25,iter_26,iter_27,iter_28,iter_29,iter_30,iter_31,iter_32,iter_33,iter_34,iter_35,iter_36,iter_37,iter_38,iter_39,iter_40,iter_41
0,boost_from_average,False,False,True,True,True,False,True,True,True,False,True,False,True,True,True,True,True,True,True,False,False,False,True,True,True,False,True,False,True,True,False,True,True,False,False
1,boost_from_average,"[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]","[True, False]"
2,boosting_type,"['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']","['gbdt', 'dart']"
3,boosting_type,gbdt,gbdt,dart,dart,gbdt,dart,dart,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,gbdt,dart,gbdt,gbdt,gbdt,dart,gbdt,gbdt,dart,gbdt,gbdt,gbdt,dart,gbdt,dart,dart
4,class_weight,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,colsample_bytree,"(0.005, 1.0, 'uniform')","(0.005, 1.0, 'uniform')","(0.005, 1.0, 'uniform')","(0.005, 1.0, 'uniform')","(0.005, 1.0, 'uniform')","(0.005, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')","(0.5, 1.0, 'uniform')"
6,colsample_bytree,0.005000,0.005000,0.304977,0.045747,0.872294,0.142809,0.617856,0.651705,0.967086,0.687899,0.500000,0.500000,0.672617,0.672617,0.651705,0.651705,0.935826,0.935826,0.675415,0.824246,0.898273,0.500000,0.811199,0.713134,0.500000,0.650121,0.637458,0.715444,1.000000,0.500000,0.500000,0.643531,0.500000,0.554109,0.766259
7,cv,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
8,cv,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
9,force_col_wise,['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true'],['true']


In [354]:
df_std.iloc[:5]

Unnamed: 0_level_0,iter4,iter4,iter4,iter5,iter5,iter5,iter6,iter6,iter6,iter7,...,iter38,iter39,iter39,iter39,iter40,iter40,iter40,iter41,iter41,iter41
Unnamed: 0_level_1,best_index,mean_test_score,std_test_score,best_index,mean_test_score,std_test_score,best_index,mean_test_score,std_test_score,best_index,...,std_test_score,best_index,mean_test_score,std_test_score,best_index,mean_test_score,std_test_score,best_index,mean_test_score,std_test_score
0,30,0.663485,0.176798,29,0.742308,0.232557,26,0.684524,0.14705,17,...,0.095645,26,0.666288,0.068123,35,0.674729,0.084541,48,0.843333,0.134825
1,27,0.658403,0.063272,5,0.656093,0.069097,22,0.66697,0.120102,16,...,0.040777,27,0.666143,0.095355,30,0.668859,0.110546,45,0.821429,0.186263
2,43,0.641697,0.053001,37,0.655458,0.098958,24,0.662116,0.125689,20,...,0.074221,23,0.660119,0.085025,16,0.65507,0.026231,46,0.721825,0.235704
3,22,0.63981,0.060881,42,0.654343,0.097449,5,0.652283,0.065146,35,...,0.086031,3,0.655357,0.061076,13,0.651554,0.013666,38,0.656278,0.028529
4,44,0.638951,0.059839,44,0.652381,0.19726,41,0.651172,0.125687,28,...,0.0546,34,0.651762,0.059423,49,0.645253,0.065069,36,0.647914,0.065354


In [355]:
# Save the combined table (search space, parameters and results) as CSV and pickle.
df_one.to_csv(f'{directory_for_model}{com_name}/params_results_{gsbs}_df_{dt}.csv')
df_one.to_pickle(f'{directory_for_model}{com_name}/params_results_{gsbs}_df_{dt}.pkl')

# Save the per-run candidate rankings (mean and std of the CV test score) as CSV and pickle.
df_std.to_csv(f'{directory_for_model}{com_name}/std_{gsbs}_df_{dt}.csv')
df_std.to_pickle(f'{directory_for_model}{com_name}/std_{gsbs}_df_{dt}.pkl')

In [356]:
# The files below supersede the df_one files saved above.
# Save parameters and results.
df_base.to_csv(f'{directory_for_model}{com_name}/lgbm_{gsbs}_df_{dt}.csv')
df_base.to_pickle(f'{directory_for_model}{com_name}/lgbm_{gsbs}_df_{dt}.pkl')

# Save search settings and search spaces.
df_params.to_csv(f'{directory_for_model}{com_name}/params_{gsbs}_df_{dt}.csv')
df_params.to_pickle(f'{directory_for_model}{com_name}/params_{gsbs}_df_{dt}.pkl')

In [None]:

'''
# save model
# 필요시 저장. 가장 좋은 결과만 저장하고자할 때는 밑의 save_best_results만 진행하면 됨.
joblib.dump(lgbmBS.best_estimator_, f'{directory_for_model}{com_name}/estimator_{gsbs}_{dt}_v{iter}.pkl') # bayessearchcv 저장
save_to_pickle(f'{directory_for_model}{com_name}/model_{gsbs}_{dt}_v{iter}_p.pkl', lgbmBS.best_estimator_)
joblib.dump(scaler, f'{directory_for_model}{com_name}/scaler_{gsbs}_{dt}_v{iter}.pkl') # scaler 저장
save_to_pickle(f'{directory_for_model}{com_name}/columns_{gsbs}_{dt}_{len(new_col)}_{round(precision*100):2d}%_{round(f1score*100):2d}%_ver{iter}.pkl', new_col)
save_list_to_csv(f'{directory_for_model}{com_name}/columns_{gsbs}_{dt}_{len(new_col)}_{round(precision*100):2d}%_{round(f1score*100):2d}%_ver{iter}.csv', new_col)
'''

In [61]:
# Save the final best result: model, scaler, columns, results table and this run's search space.
save_best_results(com_name, lgbmBS.best_estimator_, scaler, new_col, df_base, df_params_search)

In [None]:
print(lgbmBS.best_estimator_)

In [None]:
# Gain-based feature importance of the best estimator, labelled with the
# training input column names and sorted from most to least important.
model = lgbmBS.best_estimator_
feature_df = pd.DataFrame(model.booster_.feature_importance(importance_type='gain'), 
                      index=train_input.columns, columns=['importance']).sort_values(by='importance', 
                                                                              ascending=False)

In [None]:
len(feature_df)

In [None]:
feature_df.tail(70)

In [None]:
# Keep the 35 most important features and re-append the five cr_* columns, which
# must stay last because the input/target split slices the final five columns.
new_col = list(feature_df.index[:35]) +  ['cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20']

In [53]:
# After comparing the per-candidate scores in .cv_results_, pick a robust candidate
# by its row position, pin the search space to that candidate's parameters, and
# then run the BayesSearchCV cell again.
second_best_index =  47  # row position in lgbmBS.cv_results_ chosen from the comparison

df_cv_param = pd.DataFrame(lgbmBS.cv_results_)
# Look the chosen candidate up once instead of repeating the .iloc lookup per parameter.
second_best_row = df_cv_param.iloc[second_best_index, :]

# Every entry is a single-element list, so the follow-up search has one choice per parameter.
# min_child_weight, min_split_loss, max_cat_threshold and subsample_freq are not pinned here.
search_space_secondbest = {
    'boosting_type': [second_best_row['param_boosting_type']],
    'learning_rate': [second_best_row['param_learning_rate']],
    'num_leaves': [second_best_row['param_num_leaves']],
    'max_depth': [second_best_row['param_max_depth']],
    'min_child_samples': [second_best_row['param_min_child_samples']],
    'max_bin': [100],  # fixed by hand instead of taken from the chosen candidate
    'subsample': [second_best_row['param_subsample']],
    'colsample_bytree': [second_best_row['param_colsample_bytree']],
    'subsample_for_bin': [second_best_row['param_subsample_for_bin']],
    'scale_pos_weight': [second_best_row['param_scale_pos_weight']],
    'n_estimators': [second_best_row['param_n_estimators']],
    'reg_alpha': [second_best_row['param_reg_alpha']],
    'reg_lambda': [second_best_row['param_reg_lambda']],
    'force_col_wise': [second_best_row['param_force_col_wise']],
    # The literal had 'importance_type' twice; only the later entry took effect,
    # so the dead ['split'] entry is removed and the effective one is kept.
    'importance_type': [second_best_row['param_importance_type']],
    'boost_from_average': [second_best_row['param_boost_from_average']],
    'objective': [second_best_row['param_objective']],
    'metric': [second_best_row['param_metric']],
}

params_space = search_space_secondbest.copy() # for Bayesian Optimization

# Same search settings as before, but only two iterations; the chosen index is recorded.
param_grid = {
    'cv' : param_grid['cv'],
    'scoring' : param_grid['scoring'],
    'num_col' :  param_grid['num_col'],
    'iterations' :  2,
    'second_best_index' : second_best_index, 
             }

In [51]:
df_cv_param.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_boost_from_average', 'param_boosting_type',
       'param_colsample_bytree', 'param_force_col_wise', 'param_learning_rate',
       'param_max_bin', 'param_max_depth', 'param_metric',
       'param_min_child_samples', 'param_min_child_weight',
       'param_n_estimators', 'param_num_leaves', 'param_objective',
       'param_reg_alpha', 'param_reg_lambda', 'param_scale_pos_weight',
       'param_subsample', 'param_subsample_for_bin', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [45]:
os.listdir(f'{directory_for_model}{com_name}')

['best_columns.csv',
 'best_columns.pkl',
 'best_model.pkl',
 'best_model_p.pkl',
 'best_params.pkl',
 'best_result.pkl',
 'best_scaler.pkl',
 'best_scaler_p.pkl',
 'lgbm_bs_df_0622_2018.csv',
 'lgbm_bs_df_0622_2018.pkl',
 'params_bs_df_0512_1431.csv',
 'params_bs_df_0512_1431.pkl',
 'params_bs_df_0622_2018.csv',
 'params_bs_df_0622_2018.pkl',
 'params_results_bs_df_0512_1431.csv',
 'params_results_bs_df_0512_1431.pkl',
 'params_results_bs_df_0622_2018.csv',
 'params_results_bs_df_0622_2018.pkl',
 'std_bs_df_0622_2018.csv',
 'std_bs_df_0622_2018.pkl']

In [46]:
'params_bs_df_0621_1429.pkl'

'params_bs_df_0621_1429.pkl'

In [103]:
# Reload a previously saved combined parameters/results table for inspection.
dff = pd.read_pickle(f'{directory_for_model}{com_name}/params_results_bs_df_0622_2018.pkl')

In [106]:
dff.head(50)

Unnamed: 0_level_0,iter_1,iter_2,iter_3,iter_4
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
boost_from_average,True,True,False,False
boost_from_average,"[True, False]","[True, False]","[True, False]",[False]
boosting_type,"[gbdt, dart]","[gbdt, dart]","[gbdt, dart]",[gbdt]
boosting_type,dart,dart,gbdt,gbdt
class_weight,,,,
colsample_bytree,"(0.005, 1.0, uniform)","(0.005, 1.0, uniform)","(0.005, 1.0, uniform)",[0.005]
colsample_bytree,0.997121,0.396634,0.005,0.005
cv,5,5,5,5
cv,5,5,5,5
force_col_wise,[true],[true],[true],[true]
