<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature selection을 위한 결정 트리

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
import pickle
def list_to_pickle(filename, listname):
    """Serialize ``listname`` to ``filename`` using pickle.

    Args:
        filename: Path of the file to create or overwrite (opened in "wb" mode).
        listname: Object to serialize.

    Returns:
        None.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the manual open()/close() pairing did not guarantee.
    with open(filename, "wb") as open_file:
        pickle.dump(listname, open_file)

def list_from_pickle(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of an existing pickle file (opened in "rb" mode).

    Returns:
        The deserialized object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files from a trusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, "rb") as open_file:
        return pickle.load(open_file)

In [24]:
# Load the dataset prepared for analysis.
stock_name = 'sec'
directory_for_ml = '../data/data_for_ml/'
fname = f'df_{stock_name}_sel.pkl'
# Directory string already ends with a separator, so plain concatenation is used.
f_name = f'{directory_for_ml}{fname}'
df = pd.read_pickle(f_name)

In [25]:
# Row position separating the first 80% of df (train + validation)
# from the remaining 20% (test).
split_n = int(0.8 * len(df))

In [26]:
# Features: every column except the last five.
data, test_input = df.iloc[:split_n, :-5], df.iloc[split_n:, :-5]
# Label: the fourth column counted from the end.
target, test_target = df.iloc[:split_n, -4], df.iloc[split_n:, -4]

In [27]:
# Hold out 20% of the pre-split rows for validation, stratified on the label.
train_input, val_input, train_target, val_target = train_test_split(
    data, target, test_size=0.2, stratify=target, random_state=42
)

# Fit the scaler on the training rows only, then apply it to every split.
ss = StandardScaler().fit(train_input)
train_scaled, val_scaled, test_scaled = (
    ss.transform(part) for part in (train_input, val_input, test_input)
)

## Logistic Regression을 이용한 feature selection

In [28]:
# max_iter is raised from the default of 100.
lr = LogisticRegression(C=20, max_iter=1000).fit(train_scaled, train_target)

# Accuracy on each split, all using the standardized features.
train_score_lr, val_score_lr, test_score_lr = (
    lr.score(features, labels)
    for features, labels in (
        (train_scaled, train_target),
        (val_scaled, val_target),
        (test_scaled, test_target),
    )
)

# Rank features by the absolute value of their fitted coefficient, largest first.
df_sel_lr = (
    pd.DataFrame(lr.coef_[0], index=data.columns, columns=['importance_LR'])
    .abs()
    .sort_values(by='importance_LR', ascending=False)
)

## 결정트리를 사용하여 feature selection하기
### feature_importances_ 이용

In [29]:
# Decision tree with default hyperparameters (apart from the seed),
# trained on the unscaled features.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=train_input, y=train_target)

In [30]:
# Accuracy of the tree on each split (unscaled features).
train_score_dt, val_score_dt, test_score_dt = (
    dt.score(features, labels)
    for features, labels in (
        (train_input, train_target),
        (val_input, val_target),
        (test_input, test_target),
    )
)

In [31]:
# dt.feature_importances_

In [32]:
# Tree-based importances, most important feature first.
df_sel_dt = (
    pd.DataFrame(dt.feature_importances_, index=data.columns, columns=['importance'])
    .sort_values(by='importance', ascending=False)
)

### SGDClassifier를 이용한 feature selection

In [33]:
# SGD-trained classifier using the log loss, fitted on the standardized features.
sc = SGDClassifier(random_state=42, max_iter=2000, loss='log_loss')
sc.fit(X=train_scaled, y=train_target)

In [34]:
# Accuracy of the SGD classifier on each split (standardized features).
train_score_sc, val_score_sc, test_score_sc = (
    sc.score(features, labels)
    for features, labels in (
        (train_scaled, train_target),
        (val_scaled, val_target),
        (test_scaled, test_target),
    )
)

In [35]:
# train_score_sc, val_score_sc

In [36]:
# Rank features by the absolute value of their SGD coefficient, largest first.
df_sel_sc = (
    pd.DataFrame(sc.coef_[0], index=data.columns, columns=['importance_SC'])
    .abs()
    .sort_values(by='importance_SC', ascending=False)
)

In [37]:
# Combine the three importance orderings into one score per feature.
# Each ordering contributes the feature's 1-based position (1 = most important),
# so a lower combined score means a more important feature. The decision-tree
# position carries half the weight of the other two (x1 versus x2).
# Convert each ordering to a list once instead of on every loop iteration.
order_sc = list(df_sel_sc.index)
order_lr = list(df_sel_lr.index)
order_dt = list(df_sel_dt.index)

select = {}
for name in data.columns:
    # Assign directly rather than through a temporary named `sum`, which would
    # shadow the builtin for the rest of the kernel session.
    select[name] = (
        (order_sc.index(name) + 1) * 2
        + (order_lr.index(name) + 1) * 2
        + (order_dt.index(name) + 1) * 1
    )

In [38]:
# (feature, combined score) pairs ordered by ascending score.
sorted_sel = sorted(select.items(), key=lambda pair: pair[1])

In [39]:
# Names of the 10 features with the lowest combined rank score
# (sorted_sel is in ascending score order, so its first 10 entries are the best ranked).
# Column 0 of the array built from the (name, score) pairs holds the feature names.
# NOTE(review): presumably the mixed str/int pairs are coerced to a string-typed
# array by np.array — confirm before relying on the dtype of new_columns.
new_columns = np.array(sorted_sel[:10])[:, 0]

In [40]:
# new_columns

In [41]:
# One report line per model: accuracy on the train / validation / test splits.
# NOTE: the "linear regression" label refers to the LogisticRegression model `lr`.
for _report_fmt, _split_scores in (
    ("linear regression train accuracy: {:.4f}, val accuracy: {:.4f}, test accuracy: {:.4f}",
     (train_score_lr, val_score_lr, test_score_lr)),
    ("sgd classifier    train accuracy: {:.4f}, val accuracy: {:.4f}, test accuracy: {:.4f}",
     (train_score_sc, val_score_sc, test_score_sc)),
    ("decision tree     train accuracy: {:.4f}, val accuracy: {:.4f}, test accuracy: {:.4f}",
     (train_score_dt, val_score_dt, test_score_dt)),
):
    print(_report_fmt.format(*_split_scores))

linear regression train accuracy: 0.9535, val accuracy: 0.7907, test accuracy: 0.8519
sgd classifier    train accuracy: 0.9244, val accuracy: 0.7907, test accuracy: 0.8333
decision tree     train accuracy: 1.0000, val accuracy: 0.7442, test accuracy: 0.7593


In [42]:
def predict_p(test_target, y_predict_list):
    """Print precision, recall, F1, ROC AUC and the confusion matrix.

    Every metric is called as ``metric(test_target, y_predict_list)`` and its
    result is printed after a short label; nothing is returned.

    Args:
        test_target: Ground-truth labels.
        y_predict_list: Predictions, passed unchanged to each metric.
    """
    labelled_metrics = (
        ("정밀도", precision_score),  # precision
        ("재현율", recall_score),  # recall
        ("f1_acore", f1_score),
        ("roc_auc_score", roc_auc_score),
        ("confusion matrix", confusion_matrix),
    )
    for label, metric in labelled_metrics:
        print(label, metric(test_target, y_predict_list))

In [43]:
# Print the test-set metrics for each fitted model.
# The tree was trained on unscaled features; the other two use the standardized ones.
for _banner, _model, _uses_scaled in (
    ("********** Decision Tree ***********", dt, False),
    ("********** Linear Regressor ***********", lr, True),
    ("********** SGD Regressor ***********", sc, True),
):
    print(_banner)
    _features = np.array(test_scaled) if _uses_scaled else test_input
    y_predict_list = _model.predict(_features)
    predict_p(test_target, y_predict_list)

********** Decision Tree ***********
정밀도 0.6
재현율 0.7058823529411765
f1_acore 0.6486486486486486
roc_auc_score 0.7448330683624802
confusion matrix [[29  8]
 [ 5 12]]
********** Linear Regressor ***********
정밀도 0.7368421052631579
재현율 0.8235294117647058
f1_acore 0.7777777777777778
roc_auc_score 0.8441971383147854
confusion matrix [[32  5]
 [ 3 14]]
********** SGD Regressor ***********
정밀도 0.7222222222222222
재현율 0.7647058823529411
f1_acore 0.7428571428571428
roc_auc_score 0.814785373608903
confusion matrix [[32  5]
 [ 4 13]]
