<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feature selection using three models
### SGDClassifier, DecisionTreeClassifier, LogisticRegression

### Select the 10 most important features for the next analysis,
### and collect the result data (accuracy, precision, confusion matrix, etc.).

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
import pickle
def list_to_pickle(filename, listname):
    """Serialize ``listname`` into the file ``filename`` using pickle.

    Parameters
    ----------
    filename : path of the file to create or overwrite (opened in binary
        write mode).
    listname : the object to store; it is handed to ``pickle.dump`` as is.

    Raises
    ------
    Whatever ``open`` or ``pickle.dump`` raise (e.g. ``OSError`` or a
    pickling error for objects that cannot be pickled).
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, "wb") as open_file:
        pickle.dump(listname, open_file)

def list_from_pickle(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Parameters
    ----------
    filename : path of an existing pickle file (opened in binary read mode).

    Returns
    -------
    The object reconstructed by ``pickle.load``.

    Raises
    ------
    Whatever ``open`` or ``pickle.load`` raise (e.g. ``FileNotFoundError``,
    ``pickle.UnpicklingError``, ``EOFError``).
    """
    # SECURITY: unpickling can execute arbitrary code. Only call this on files
    # produced by this project, never on files from an untrusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, "rb") as open_file:
        return pickle.load(open_file)

In [4]:
# Convert a confusion matrix into a flat list
def matrix_to_list(confu_matrix):
    """Flatten a 2x2 confusion matrix into ``[tn, fp, fn, tp]``.

    The four cells are read by position -- [0, 0], [0, 1], [1, 0], [1, 1] --
    which is the tn / fp / fn / tp layout of a binary
    ``sklearn.metrics.confusion_matrix`` result.

    Parameters
    ----------
    confu_matrix : object supporting ``m[row, col]`` indexing (e.g. a NumPy
        array) with at least two rows and two columns.

    Returns
    -------
    list with the four cell values in the order tn, fp, fn, tp.
    """
    cell_positions = ((0, 0), (0, 1), (1, 0), (1, 1))
    return [confu_matrix[row, col] for row, col in cell_positions]

In [5]:
def predict_p(test_target, y_predict_list):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    test_target : true labels.
    y_predict_list : predicted labels for the same samples.

    Returns
    -------
    list ``[precision, recall, f1, roc_auc]`` in that order.

    Note: the callers in this notebook pass the output of ``.predict()``, so
    the ROC AUC here is computed from hard class labels rather than from
    probabilities or decision scores.
    """
    metric_functions = (precision_score, recall_score, f1_score, roc_auc_score)
    return [metric(test_target, y_predict_list) for metric in metric_functions]

In [6]:
# Stock universe: ticker code -> [Korean company name, short alias].
# The alias is interpolated into the pickle file name (df_<alias>_sel.pkl)
# and used as the row label of the result tables, so it must stay unchanged.
code = {
    '005930': ['삼성전자', 'sec'],
    '373220': ['LG에너지솔루션', 'lgenergy'],
    '000660': ['SK하이닉스', 'skhinix'],
    '207940': ['삼성바이오로직스', 'ssbio'],
    '006400': ['삼성SDI', 'sdi'],
    '051910': ['LG화학', 'lgchemical'],
    '005935': ['삼성전자우', 'secpre'],
    '005380': ['현대차', 'hyunmotor'],
    '035420': ['NAVER', 'naver'],
    '000270': ['기아', 'kia'],
    '035720': ['카카오', 'kakao'],
    '005490': ['POSCO홀딩스', 'poscoholding'],
    '105560': ['KB금융', 'kbbank'],
    '028260': ['삼성물산', 'sscnt'],
    '068270': ['셀트리온', 'celltrion'],
    '012330': ['현대모비스', 'mobis'],
    '055550': ['신한지주', 'shgroup'],
    '066570': ['LG전자', 'lgelec'],
    '003670': ['포스코케미칼', 'poscochemical'],
    '096770': ['SK이노베이션', 'skinnovation'],
    '033780': ['KT&G', 'ktng'],
}

# Reduced two-stock mapping for a quick run (uncomment to use):
# code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy']}

In [7]:
# Input data for the analysis
stock_name = 'sec'
directory_for_ml = '../data/data_for_ml/'

# Number of best-ranked features kept per stock.
TOP_N_FEATURES = 10

# Initialisation of the per-model accumulators (one row appended per stock):
#   *_accuracy     -> [name, train accuracy, validation accuracy, test accuracy]
#   *_test         -> [name, precision, recall, f1, roc_auc] on the test split
#   *_confu_matrix -> [name, tn, fp, fn, tp] on the test split
lr_accuracy = []
lr_test = []
lr_confu_matrix = []
sc_accuracy = []
sc_test = []
sc_confu_matrix = []
dt_accuracy = []
dt_test = []
dt_confu_matrix = []

# Selected feature names of every stock, keyed by alias. `new_columns` is
# overwritten on each iteration and therefore only holds the last stock.
top_features = {}


def _first_position_ranks(ordered_names):
    """Map each name to its 1-based position in ``ordered_names`` (first occurrence wins)."""
    ranks = {}
    for position, name in enumerate(ordered_names, start=1):
        ranks.setdefault(name, position)
    return ranks


for key, val in code.items():
    fname = f'df_{val[1]}_sel.pkl'
    f_name = directory_for_ml + fname
    df = pd.read_pickle(f_name)

    # Split by row position, without shuffling: the first 80% of the rows are
    # used for train + validation, the remaining 20% for test.
    split_ratio = 0.8
    split_n = int(len(df)*split_ratio)

    # NOTE(review): the last five columns are excluded from the features and
    # the fourth-from-last column is used as the label -- confirm against the
    # column layout of the pickled frames.
    data = df.iloc[:split_n, :-5]
    target = df.iloc[:split_n, -4]
    test_input = df.iloc[split_n:, :-5]
    test_target = df.iloc[split_n:, -4]

    train_input, val_input, train_target, val_target = train_test_split(
        data, target, random_state=42, test_size=0.2, stratify=target)

    # The scaler is fitted on the training split only and then applied to the
    # validation and test splits.
    ss = StandardScaler()
    ss.fit(train_input)
    train_scaled = ss.transform(train_input)
    val_scaled = ss.transform(val_input)
    test_scaled = ss.transform(test_input)

    # ********** feature selection with logistic regression

    lr = LogisticRegression(C=20, max_iter=1000)  # max_iter defaults to 100
    lr.fit(train_scaled, train_target)

    train_score_lr = lr.score(train_scaled, train_target)
    val_score_lr = lr.score(val_scaled, val_target)
    test_score_lr = lr.score(test_scaled, test_target)

    # Predict once and reuse the result for the metrics and the confusion matrix.
    lr_test_pred = lr.predict(test_scaled)
    lr_accuracy.append([val[1], train_score_lr, val_score_lr, test_score_lr])
    lr_test.append([val[1]] + predict_p(test_target, lr_test_pred))
    cm = matrix_to_list(confusion_matrix(test_target, lr_test_pred))
    lr_confu_matrix.append([val[1]] + cm)

    # Importance = absolute value of the coefficient, largest first.
    df_sel_lr = pd.DataFrame(lr.coef_[0, :], index=data.columns, columns=['importance_LR'])
    df_sel_lr['importance_LR'] = df_sel_lr['importance_LR'].abs()
    df_sel_lr = df_sel_lr.sort_values(by='importance_LR', ascending=False)

    # ********** feature selection with a decision tree, via feature_importances_
    # (the tree is trained on the unscaled inputs)

    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(train_input, train_target)

    train_score_dt = dt.score(train_input, train_target)
    val_score_dt = dt.score(val_input, val_target)
    test_score_dt = dt.score(test_input, test_target)

    dt_test_pred = dt.predict(test_input)
    dt_accuracy.append([val[1], train_score_dt, val_score_dt, test_score_dt])
    dt_test.append([val[1]] + predict_p(test_target, dt_test_pred))
    cm = matrix_to_list(confusion_matrix(test_target, dt_test_pred))
    dt_confu_matrix.append([val[1]] + cm)

    df_sel_dt = pd.DataFrame(dt.feature_importances_, index=data.columns, columns=['importance']).sort_values(by='importance', ascending=False)

    # ********** feature selection with SGDClassifier

    sc = SGDClassifier(loss='log_loss', max_iter=2000, random_state=42)
    sc.fit(train_scaled, train_target)

    train_score_sc = sc.score(train_scaled, train_target)
    val_score_sc = sc.score(val_scaled, val_target)
    test_score_sc = sc.score(test_scaled, test_target)

    sc_test_pred = sc.predict(test_scaled)
    sc_accuracy.append([val[1], train_score_sc, val_score_sc, test_score_sc])
    sc_test.append([val[1]] + predict_p(test_target, sc_test_pred))
    cm = matrix_to_list(confusion_matrix(test_target, sc_test_pred))
    sc_confu_matrix.append([val[1]] + cm)

    df_sel_sc = pd.DataFrame(sc.coef_[0, :], index=data.columns, columns=['importance_SC'])
    df_sel_sc['importance_SC'] = df_sel_sc['importance_SC'].abs()
    df_sel_sc = df_sel_sc.sort_values(by='importance_SC', ascending=False)

    # ********** combine the three rankings
    # Look-up tables built once per stock instead of scanning the sorted index
    # for every feature.
    rank_sc = _first_position_ranks(df_sel_sc.index)
    rank_lr = _first_position_ranks(df_sel_lr.index)
    rank_dt = _first_position_ranks(df_sel_dt.index)

    # Ranks start at 1 and a lower combined score means a more important
    # feature. The decision-tree rank gets half the weight of the other two.
    select = {}
    for name in data.columns:
        select[name] = rank_sc[name]*2 + rank_lr[name]*2 + rank_dt[name]*1

    # ------ top 10 important list  -----------------
    sorted_sel = sorted(select.items(), key = lambda items : items[1])
    new_columns = np.array(sorted_sel[:TOP_N_FEATURES])[:, 0]
    top_features[val[1]] = new_columns
    # -----------------------------------------------

# Collect the results of all three models. This runs once, after the loop,
# because only the fully populated accumulators are of interest.

df_lr_acc = pd.DataFrame(lr_accuracy, columns=['name', 'lr_train', 'lr_val', 'lr_test']).set_index('name')
df_sc_acc = pd.DataFrame(sc_accuracy, columns=['name', 'sc_train', 'sc_val', 'sc_test']).set_index('name')
df_dt_acc = pd.DataFrame(dt_accuracy, columns=['name', 'dt_train', 'dt_val', 'dt_test']).set_index('name')

df_lr_test = pd.DataFrame(lr_test, columns=['name', 'lr_pre', 'lr_recall', 'lr_f1', 'lr_roc']).set_index('name')
df_sc_test = pd.DataFrame(sc_test, columns=['name', 'sc_pre', 'sc_recall', 'sc_f1', 'sc_roc']).set_index('name')
df_dt_test = pd.DataFrame(dt_test, columns=['name', 'dt_pre', 'dt_recall', 'dt_f1', 'dt_roc']).set_index('name')

df_lr_cm = pd.DataFrame(lr_confu_matrix, columns=['name', 'lr_tn', 'lr_fp', 'lr_fn', 'lr_tp']).set_index('name')
df_sc_cm = pd.DataFrame(sc_confu_matrix, columns=['name', 'sc_tn', 'sc_fp', 'sc_fn', 'sc_tp']).set_index('name')
df_dt_cm = pd.DataFrame(dt_confu_matrix, columns=['name', 'dt_tn', 'dt_fp', 'dt_fn', 'dt_tp']).set_index('name')

# One wide table, indexed by stock alias, with every metric of every model.
dfs = [df_lr_acc, df_sc_acc, df_dt_acc, df_lr_test, df_sc_test, df_dt_test, df_lr_cm, df_sc_cm, df_dt_cm ]
df_merged = reduce(lambda  left,right: pd.merge(left,right, how='left', left_index=True, right_index=True), dfs)

# Column groups per model, selected by name prefix.
lr_col = [ x for x in df_merged.columns if x.startswith('lr')]
sc_col = [ x for x in df_merged.columns if x.startswith('sc')]
dt_col = [ x for x in df_merged.columns if x.startswith('dt')]

df_lr = df_merged[lr_col]
df_sc = df_merged[sc_col]
df_dt = df_merged[dt_col]

In [12]:
# Minimum value a model must reach on each metric for a stock to be kept.
# A threshold of 0. never excludes a row that has a valid (non-NaN) score.
acc_train = 0.
acc_val = 0.6
acc_test = 0.6
precision = 0.6
# Deliberately NOT named `f1_score`: that name is the sklearn.metrics function
# predict_p() calls, and rebinding it to a float here makes the training cell
# fail with "'float' object is not callable" when it is run again.
f1_threshold = 0.
recall = 0.


def _positive_share(frame, prefix):
    """Return (fn + tp) / (tn + fp + fn + tp) for the model with column prefix ``prefix``."""
    positives = frame[f'{prefix}_fn'] + frame[f'{prefix}_tp']
    total = (frame[f'{prefix}_tn'] + frame[f'{prefix}_fp']
             + frame[f'{prefix}_fn'] + frame[f'{prefix}_tp'])
    return positives / total


def _meets_thresholds(frame, prefix):
    """Boolean mask of the rows where model ``prefix`` reaches every threshold above."""
    return ((frame[f'{prefix}_train'] >= acc_train) &
            (frame[f'{prefix}_val'] >= acc_val) &
            (frame[f'{prefix}_test'] >= acc_test) &
            (frame[f'{prefix}_pre'] >= precision) &
            (frame[f'{prefix}_recall'] >= recall) &
            (frame[f'{prefix}_f1'] >= f1_threshold))


# *_ratio: share of actual positives among the test samples, per stock.
dt_ratio = _positive_share(df_merged, 'dt')
dt_sel = _meets_thresholds(df_merged, 'dt')

sc_ratio = _positive_share(df_merged, 'sc')
sc_sel = _meets_thresholds(df_merged, 'sc')

lr_ratio = _positive_share(df_merged, 'lr')
lr_sel = _meets_thresholds(df_merged, 'lr')

# Keep only the stocks for which all three models pass every threshold.
df_sel = df_merged[sc_sel & dt_sel & lr_sel]



In [13]:
# Rows of df_merged for which all three models (SGD, decision tree, logistic
# regression) pass every threshold.
df_sel

Unnamed: 0_level_0,lr_train,lr_val,lr_test,sc_train,sc_val,sc_test,dt_train,dt_val,dt_test,lr_pre,...,lr_fn,lr_tp,sc_tn,sc_fp,sc_fn,sc_tp,dt_tn,dt_fp,dt_fn,dt_tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sec,0.953488,0.790698,0.851852,0.924419,0.790698,0.833333,1.0,0.744186,0.759259,0.736842,...,3,14,32,5,4,13,29,8,5,12
lgenergy,0.930556,0.702703,0.76087,0.895833,0.648649,0.717391,1.0,0.756757,0.717391,0.736842,...,6,14,21,5,8,12,22,4,9,11
skhinix,0.945783,0.785714,0.865385,0.873494,0.857143,0.846154,1.0,0.880952,0.865385,0.8,...,3,16,28,5,3,16,29,4,3,16
sdi,0.897436,0.825,0.755102,0.865385,0.825,0.734694,1.0,0.75,0.755102,0.615385,...,2,16,22,9,4,14,20,11,1,17
secpre,1.0,0.866667,0.894737,1.0,0.8,0.789474,1.0,0.733333,0.736842,0.857143,...,1,6,10,2,2,5,10,2,3,4
naver,0.947712,0.820513,0.770833,0.934641,0.846154,0.770833,1.0,0.871795,0.75,0.666667,...,4,14,21,9,2,16,24,6,6,12
kia,0.923611,0.675676,0.782609,0.875,0.702703,0.782609,1.0,0.72973,0.717391,0.9,...,9,9,26,2,8,10,22,6,7,11
poscoholding,0.911392,0.775,0.74,0.892405,0.775,0.74,1.0,0.725,0.7,0.733333,...,9,11,25,5,8,12,23,7,8,12
kbbank,0.908397,0.878788,0.785714,0.89313,0.787879,0.785714,1.0,0.818182,0.642857,0.777778,...,5,14,20,3,6,13,18,5,10,9
shgroup,0.957983,0.766667,0.842105,0.915966,0.733333,0.842105,1.0,0.733333,0.815789,0.909091,...,5,10,20,3,3,12,21,2,5,10


In [14]:
# Logistic-regression columns (names starting with 'lr') of the selected stocks.
df_sel[lr_col]

Unnamed: 0_level_0,lr_train,lr_val,lr_test,lr_pre,lr_recall,lr_f1,lr_roc,lr_tn,lr_fp,lr_fn,lr_tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sec,0.953488,0.790698,0.851852,0.736842,0.823529,0.777778,0.844197,32,5,3,14
lgenergy,0.930556,0.702703,0.76087,0.736842,0.7,0.717949,0.753846,21,5,6,14
skhinix,0.945783,0.785714,0.865385,0.8,0.842105,0.820513,0.860447,29,4,3,16
sdi,0.897436,0.825,0.755102,0.615385,0.888889,0.727273,0.783154,21,10,2,16
secpre,1.0,0.866667,0.894737,0.857143,0.857143,0.857143,0.886905,11,1,1,6
naver,0.947712,0.820513,0.770833,0.666667,0.777778,0.717949,0.772222,23,7,4,14
kia,0.923611,0.675676,0.782609,0.9,0.5,0.642857,0.732143,27,1,9,9
poscoholding,0.911392,0.775,0.74,0.733333,0.55,0.628571,0.708333,26,4,9,11
kbbank,0.908397,0.878788,0.785714,0.777778,0.736842,0.756757,0.781465,19,4,5,14
shgroup,0.957983,0.766667,0.842105,0.909091,0.666667,0.769231,0.811594,22,1,5,10


In [15]:
# Decision-tree columns (names starting with 'dt') of the selected stocks.
df_sel[dt_col]

Unnamed: 0_level_0,dt_train,dt_val,dt_test,dt_pre,dt_recall,dt_f1,dt_roc,dt_tn,dt_fp,dt_fn,dt_tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sec,1.0,0.744186,0.759259,0.6,0.705882,0.648649,0.744833,29,8,5,12
lgenergy,1.0,0.756757,0.717391,0.733333,0.55,0.628571,0.698077,22,4,9,11
skhinix,1.0,0.880952,0.865385,0.8,0.842105,0.820513,0.860447,29,4,3,16
sdi,1.0,0.75,0.755102,0.607143,0.944444,0.73913,0.794803,20,11,1,17
secpre,1.0,0.733333,0.736842,0.666667,0.571429,0.615385,0.702381,10,2,3,4
naver,1.0,0.871795,0.75,0.666667,0.666667,0.666667,0.733333,24,6,6,12
kia,1.0,0.72973,0.717391,0.647059,0.611111,0.628571,0.698413,22,6,7,11
poscoholding,1.0,0.725,0.7,0.631579,0.6,0.615385,0.683333,23,7,8,12
kbbank,1.0,0.818182,0.642857,0.642857,0.473684,0.545455,0.628146,18,5,10,9
shgroup,1.0,0.733333,0.815789,0.833333,0.666667,0.740741,0.789855,21,2,5,10


In [16]:
# SGDClassifier columns (names starting with 'sc') of the selected stocks.
df_sel[sc_col]

Unnamed: 0_level_0,sc_train,sc_val,sc_test,sc_pre,sc_recall,sc_f1,sc_roc,sc_tn,sc_fp,sc_fn,sc_tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sec,0.924419,0.790698,0.833333,0.722222,0.764706,0.742857,0.814785,32,5,4,13
lgenergy,0.895833,0.648649,0.717391,0.705882,0.6,0.648649,0.703846,21,5,8,12
skhinix,0.873494,0.857143,0.846154,0.761905,0.842105,0.8,0.845295,28,5,3,16
sdi,0.865385,0.825,0.734694,0.608696,0.777778,0.682927,0.743728,22,9,4,14
secpre,1.0,0.8,0.789474,0.714286,0.714286,0.714286,0.77381,10,2,2,5
naver,0.934641,0.846154,0.770833,0.64,0.888889,0.744186,0.794444,21,9,2,16
kia,0.875,0.702703,0.782609,0.833333,0.555556,0.666667,0.742063,26,2,8,10
poscoholding,0.892405,0.775,0.74,0.705882,0.6,0.648649,0.716667,25,5,8,12
kbbank,0.89313,0.787879,0.785714,0.8125,0.684211,0.742857,0.776888,20,3,6,13
shgroup,0.915966,0.733333,0.842105,0.8,0.8,0.8,0.834783,20,3,3,12


In [17]:
# Top-ranked feature names. NOTE: new_columns is reassigned on every iteration
# of the per-stock loop, so this shows the selection for the last entry of
# `code` only.
new_columns

array(['low', 'open', 'insurance', 'vol', 'privequity', 'dxy_cr',
       'financeetc', 'ixic_f_cr', 'ixic_cr', 'kospi_cr'], dtype='<U11')