<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature selection을 위한 결정 트리

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />원본 노트북(rickiepark/hg-mldl 5-1)을 구글 코랩에서 실행하기</a>
  </td>
</table>

## 결정 트리를 사용하여 feature selection 하기
### `feature_importances_` 이용

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# Load the prepared per-stock table used as input for the analysis.
stock_name = 'sec'
directory_for_ml = '../data/data_for_ml/'
fname = f'df_{stock_name}_sel.pkl'
# The directory string already ends with a separator, so plain concatenation is enough.
f_name = f'{directory_for_ml}{fname}'
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(f_name)

In [3]:
# Work on the first 240 rows only.
sample_rows = slice(None, 240)
# Features: every column except the last five.
# NOTE(review): presumably the trailing columns hold labels/auxiliary values — confirm
# against the code that builds the pickle.
data = df.iloc[sample_rows, slice(None, -5)]
# Label: the fourth column counted from the end.
target = df.iloc[sample_rows, -4]

In [4]:
# Summary statistics for every feature column, used to eyeball scale and outliers.
data.describe(include='all')

Unnamed: 0,retail,foreigner,institution,financial,invtrust,pension,privequity,bank,insurance,financeetc,...,kospi_cr,krw_cr,sox_cr,vix_cr,wti_cr,open,high,low,vol,weekday
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-4.08896,-1.643905,-9.204509,-1.979598,-0.879832,1.437446,2.027869,16.771095,-1.952861,-3.11028,...,-0.085,0.030167,-0.103417,0.10275,-0.030792,-0.000795,-0.000929,-0.000828,0.051242,2.083333
std,50.203231,16.6893,162.906738,22.548362,20.665609,25.55394,40.098488,298.445621,21.989742,26.006436,...,1.167759,0.661384,2.718119,6.653607,2.972412,0.01521,0.012852,0.013261,0.352367,1.375721
min,-759.655574,-170.16129,-2475.512605,-321.409524,-121.291667,-105.085714,-55.940789,-789.666667,-183.30303,-249.0,...,-3.52,-1.79,-6.24,-13.09,-9.87,-0.037139,-0.032841,-0.030498,-0.655365,0.0
25%,-1.599927,-1.148336,-1.798263,-1.944746,-1.439958,-1.366766,-1.484406,-2.020421,-1.640175,-1.697913,...,-0.8725,-0.4025,-2.0825,-4.2375,-1.97,-0.010528,-0.010114,-0.01028,-0.177909,1.0
50%,-0.747867,-0.494372,-0.867549,-0.911871,-0.729971,-0.439956,-0.711318,-1.0,-0.774582,-1.0,...,-0.01,0.09,-0.195,-1.31,0.245,-0.002812,-0.002231,-0.001576,-0.010794,2.0
75%,0.16946,0.440255,0.109888,0.041446,0.306093,0.579879,0.260288,-0.24375,0.087066,0.018983,...,0.705,0.4725,1.8,3.4425,2.0025,0.007499,0.006612,0.006766,0.199856,3.0
max,97.771324,84.403557,472.778281,108.223973,246.691489,347.889908,605.428571,4467.423453,183.05,100.0,...,2.74,1.78,10.21,24.38,8.35,0.057361,0.037234,0.052124,2.001584,4.0


In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [6]:
# Confirm the (rows, columns) size of each partition.
train_shape, test_shape = train_input.shape, test_input.shape
print(train_shape, test_shape)

(192, 35) (48, 35)


## 결정 트리

In [9]:
# Fit a shallow decision tree on the training partition.
# NOTE(review): the outputs recorded further down (train score 1.0, 15 non-zero feature
# importances) cannot come from this configuration — a depth-3 tree has at most 7 split
# nodes, hence at most 7 features with non-zero importance. The cell counters also skip
# (6 -> 9 -> 12). Restart the kernel and run all cells to refresh the outputs.
dt = DecisionTreeClassifier(
    max_depth=3,      # cap the depth of the tree
    random_state=42,  # fixed seed for repeatable fits
)
dt.fit(X=train_input, y=train_target)

In [12]:
# Mean accuracy on the rows the tree was fitted to versus the held-out rows;
# a large gap between the two indicates overfitting.
train_score = dt.score(train_input, train_target)
test_score = dt.score(test_input, test_target)
print(f'train score: {train_score} \n test score: {test_score}')

train score: 1.0 
 test score: 0.7291666666666666


In [13]:
# Importance of each feature in the fitted tree, one value per column of `data`.
dt.feature_importances_

array([0.        , 0.        , 0.01890756, 0.        , 0.03601441,
       0.        , 0.        , 0.04726891, 0.10534974, 0.02205882,
       0.02299752, 0.        , 0.        , 0.0226266 , 0.        ,
       0.02016807, 0.        , 0.        , 0.        , 0.0249809 ,
       0.        , 0.03833752, 0.        , 0.        , 0.05778348,
       0.41200326, 0.        , 0.        , 0.        , 0.02083829,
       0.02239054, 0.12827437, 0.        , 0.        , 0.        ])

In [14]:
# Label each importance with its feature name and rank from most to least important.
df_sel = (
    pd.DataFrame({'importance': dt.feature_importances_}, index=data.columns)
    .sort_values('importance', ascending=False)
)

In [15]:
# Display the ranked importance table.
df_sel

Unnamed: 0,importance
kospi_cr,0.412003
high,0.128274
insurance,0.10535
kosdaq_cr,0.057783
bank,0.047269
bond_usa_10_cr,0.038338
invtrust,0.036014
bond_kor_2_cr,0.024981
corporateetc,0.022998
dji_f_cr,0.022627


In [16]:
# Names of the ten highest-ranked features (df_sel is sorted by descending importance).
df_sel.head(10).index

Index(['kospi_cr', 'high', 'insurance', 'kosdaq_cr', 'bank', 'bond_usa_10_cr',
       'invtrust', 'bond_kor_2_cr', 'corporateetc', 'dji_f_cr'],
      dtype='object')