# 예측이 가능한 종목 추리기

## 가장 좋은 결과를 낼 수 있는 feature항목 추출
## 모든 feature를 사용한 결과와, 선택 추출된 feature만 사용한 결과 정확도에 차이가 남
#### 로지스틱 회귀의 coef_ 값에서 영향력이 높은 feature를 선택하고, 최적의 feature 개수를 결정

### 데이터 준비하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow import keras
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score

In [3]:
def select_features(df, coef, criteria):
    """Pick the feature columns whose coefficient magnitude exceeds a threshold.

    Intended to be used after fitting a logistic regression: every column
    whose ``|coef|`` is strictly greater than ``criteria`` is kept.

    Args:
        df: DataFrame whose columns line up with the second axis of ``coef``.
        coef: 2-D coefficient array (indexed as ``[row, feature]``), such as
            the ``coef_`` attribute of a fitted model.
        criteria: Threshold compared against the absolute coefficients.

    Returns:
        The column labels of ``df`` at the selected feature positions.
    """
    influential = np.abs(coef) > criteria
    # Second element of the index tuple = positions along the feature axis.
    feature_positions = np.nonzero(influential)[1]
    return df.columns[feature_positions]

In [4]:
def get_scores(data, target):
    """Fit a logistic regression on a stratified 80/20 split and score it.

    The features are standardized with a ``StandardScaler`` that is fitted on
    the training portion only and then applied to both portions.

    Args:
        data: Feature table passed to ``train_test_split``.
        target: Labels; also used as the stratification key.

    Returns:
        Tuple ``(train_score, test_score, coef, intercept)`` where the scores
        come from ``LogisticRegression.score`` on the training and test
        portions and ``coef`` / ``intercept`` are the fitted model's
        ``coef_`` / ``intercept_``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, stratify=target, random_state=42
    )

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # max_iter is raised far above the default (100).
    classifier = LogisticRegression(C=20, max_iter=4000)
    classifier.fit(x_train_std, y_train)

    return (
        classifier.score(x_train_std, y_train),
        classifier.score(x_test_std, y_test),
        classifier.coef_,
        classifier.intercept_,
    )

In [5]:
def find_best_result(data, target):
    """Drop the weakest features round by round and keep the best-scoring subset.

    A baseline model is fitted on all columns of ``data``. In every round the
    smallest absolute coefficient of the latest fit becomes the threshold,
    only the columns whose absolute coefficient is strictly greater are kept
    (so ties at the minimum are removed together), and the model is refitted
    on the reduced feature set. The subset with the highest test score seen
    so far is remembered.

    Args:
        data: DataFrame of candidate features.
        target: Labels aligned with ``data``.

    Returns:
        Tuple ``(train_score_list, test_score_list, data_columns, data_coef)``:
        the train and test scores of every reduction round (the baseline fit
        on all columns is not included in the lists), the columns of the
        best-scoring subset, and the coefficients fitted on that subset.
        If no round runs or none scores above 0, the full column set and the
        baseline coefficients are returned.
    """
    train_score_list = []
    test_score_list = []

    train_score, test_score, coef, intercept = get_scores(data, target)

    # Start from the full feature set so the returned names are always bound,
    # even for single-column input (the original raised UnboundLocalError).
    best_test_score = 0
    data_columns = data.columns
    data_coef = coef

    for _ in range(len(data.columns) - 1):
        criteria = np.abs(coef).min()
        sel_col = select_features(data, coef, criteria)
        if len(sel_col) == 0:
            # All remaining coefficients tie at the minimum: nothing left to fit.
            break

        # Narrow the function's own input; the original indexed a global `df`,
        # which only worked when the caller happened to define one.
        data = data[sel_col]
        train_score, test_score, coef, intercept = get_scores(data, target)

        if test_score > best_test_score:
            best_test_score = test_score
            data_columns = sel_col
            data_coef = coef

        train_score_list.append(train_score)
        test_score_list.append(test_score)

    return train_score_list, test_score_list, data_columns, data_coef

In [6]:
def model_fn(inp_num, a_layer=None):
    """Assemble an uncompiled ``Sequential`` network ending in one sigmoid unit.

    Layer stack: Dense(12, sigmoid) -> Dense(6, sigmoid) -> optional
    ``a_layer`` -> Dense(1, sigmoid).

    Args:
        inp_num: Number of input features (used as the first layer's
            ``input_shape``).
        a_layer: Optional extra layer inserted just before the output layer.

    Returns:
        The assembled model; compiling is left to the caller.
    """
    stack = [
        Dense(12, activation='sigmoid', input_shape=(inp_num,)),
        Dense(6, activation='sigmoid'),
    ]
    if a_layer:
        stack.append(a_layer)
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential()
    for layer in stack:
        network.add(layer)
    return network

In [7]:
# Mapping of stock code -> [Korean company name, short English alias].
# The alias (second element) is what the loading loop inserts into the
# pickle file name 'df_<alias>_sel.pkl'; the aliases are therefore part of
# the on-disk file names and must not be "corrected".
# Keys are six-digit strings (presumably KRX ticker codes - TODO confirm).
code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'], 
        '000660' : ['SK하이닉스', 'skhinix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코케미칼', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng']}

# Smaller two-stock mapping, left commented out (presumably for quick trial runs):
# code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy']}

In [8]:
# Load the prepared dataset of every stock and collect, per stock, the results
# of three models: logistic regression with feature selection, an SGD
# classifier, and a small dense neural network.
directory_for_ml = '../data/data_for_ml/'
logi_accuracy = []   # [file name, best train score, best test score]
sgd_accuracy = []    # [file name, best mean cross-validation score]
deep_accuracy = []   # [file name, [loss, accuracy], precision, precision, recall, f1, roc_auc]
confu_matrix = []    # [file name, confusion matrix on the test split]
for key, val in code.items():
    f_name = 'df_{}_{}.pkl'.format(val[1], 'sel')
    fname = directory_for_ml + f_name
    # NOTE: unpickling executes arbitrary code; only load files you produced yourself.
    df = pd.read_pickle(fname)

    # Features are all columns except the last five; the label is the
    # fourth column from the end.
    data = df.iloc[:, :-5]
    target = df.iloc[:, -4]

    # --- Logistic regression: gather the feature-selection results ---
    train_score_list, test_score_list, data_columns, data_coef = find_best_result(data, target)
    logi_accuracy.append([f_name, max(train_score_list), max(test_score_list)])

    # --- SGDClassifier: accuracy using only the selected columns ---
    data_new = data[data_columns]
    train_input, test_input, train_target, test_target \
        = train_test_split(data_new, target, random_state=42, test_size=0.2, stratify=target)

    ss = StandardScaler()
    ss.fit(train_input)
    train_scaled = ss.transform(train_input)
    test_scaled = ss.transform(test_input)

    sgd_value = []
    # `n_iter` instead of `iter` so the builtin is not shadowed.
    for n_iter in range(5, 50):
        # NOTE(review): recent scikit-learn releases renamed loss='log' to
        # 'log_loss'; switch the literal when upgrading - confirm against the
        # installed version.
        sc = SGDClassifier(loss='log', max_iter=n_iter, random_state=42)
        scores = cross_validate(sc, X=train_scaled, y=train_target, n_jobs=-1)
        sgd_value.append(scores['test_score'].mean())

    sgd_accuracy.append([f_name, max(sgd_value)])

    # --- Neural network ---
    # Drop the Keras state left over from the previous stock's model. This
    # replaces a no-op `try: model = None / except: pass` block.
    keras.backend.clear_session()

    model = model_fn(len(data_new.columns))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # The checkpoint file name is fixed, so each stock overwrites the previous one.
    checkpoint_cb = ModelCheckpoint(filepath='best_model.h5', save_best_only=True)
    earlystopping_cb = EarlyStopping(patience=100, monitor='val_loss', mode='min', restore_best_weights=True)

    # NOTE(review): the test split doubles as validation data for early
    # stopping / checkpointing and is also what the metrics below are computed
    # on, so those metrics are optimistic. A separate validation split would
    # give an unbiased estimate.
    history = model.fit(train_scaled, train_target, epochs=2000, verbose=0,
                        callbacks=[checkpoint_cb, earlystopping_cb],
                        validation_data=(test_scaled, test_target))

    y_predict = model.predict(test_scaled)
    # Threshold the sigmoid output at 0.5 to obtain class labels.
    y_predict_list = [1 if i > 0.5 else 0 for i in y_predict[:, 0]]

    # Precision: of the samples predicted positive (TP + FP), the share that
    # is truly positive (TP). Argument order matters for all metrics below:
    # (actual values, predicted values).
    # NOTE(review): precision is appended twice; kept as-is so the layout of
    # each `deep_accuracy` entry does not change.
    deep_accuracy.append([f_name, model.evaluate(test_scaled, test_target),
                          precision_score(test_target, y_predict_list),
                          precision_score(test_target, y_predict_list),
                          recall_score(test_target, y_predict_list),
                          f1_score(test_target, y_predict_list),
                          roc_auc_score(test_target, y_predict_list)
                         ])

    confu_matrix.append([f_name, confusion_matrix(test_target, y_predict_list)])




In [9]:
confusion_matrix(test_target, y_predict_list)

array([[8, 5],
       [4, 8]], dtype=int64)

In [10]:
logi_accuracy

[['df_sec_sel.pkl', 0.8778625954198473, 0.9090909090909091],
 ['df_lgenergy_sel.pkl', 0.8, 0.6451612903225806],
 ['df_skhinix_sel.pkl', 0.8984375, 0.84375],
 ['df_ssbio_sel.pkl', 0.8272727272727273, 0.7142857142857143],
 ['df_sdi_sel.pkl', 0.8582677165354331, 0.65625],
 ['df_lgchemical_sel.pkl', 0.8916666666666667, 0.8333333333333334],
 ['df_secpre_sel.pkl', 1.0, 0.8],
 ['df_hyunmotor_sel.pkl', 0.8166666666666667, 0.7],
 ['df_naver_sel.pkl', 0.940677966101695, 0.8333333333333334],
 ['df_kia_sel.pkl', 0.8333333333333334, 0.7407407407407407],
 ['df_kakao_sel.pkl', 0.9137931034482759, 0.7586206896551724],
 ['df_poscoholding_sel.pkl', 0.8333333333333334, 0.7741935483870968],
 ['df_kbbank_sel.pkl', 0.8125, 0.76],
 ['df_sscnt_sel.pkl', 0.9302325581395349, 0.7727272727272727],
 ['df_celltrion_sel.pkl', 0.8125, 0.8],
 ['df_mobis_sel.pkl', 0.9230769230769231, 0.782608695652174],
 ['df_shgroup_sel.pkl', 0.8837209302325582, 0.6818181818181818],
 ['df_lgelec_sel.pkl', 0.898989898989899, 0.72],
 ['

In [11]:
sgd_accuracy

[['df_sec_sel.pkl', 0.754985754985755],
 ['df_lgenergy_sel.pkl', 0.625],
 ['df_skhinix_sel.pkl', 0.7504615384615384],
 ['df_ssbio_sel.pkl', 0.7],
 ['df_sdi_sel.pkl', 0.7396923076923076],
 ['df_lgchemical_sel.pkl', 0.7333333333333333],
 ['df_secpre_sel.pkl', 0.8964285714285716],
 ['df_hyunmotor_sel.pkl', 0.7166666666666667],
 ['df_naver_sel.pkl', 0.7967391304347825],
 ['df_kia_sel.pkl', 0.7415584415584415],
 ['df_kakao_sel.pkl', 0.7594202898550725],
 ['df_poscoholding_sel.pkl', 0.6833333333333333],
 ['df_kbbank_sel.pkl', 0.6757894736842105],
 ['df_sscnt_sel.pkl', 0.7320261437908497],
 ['df_celltrion_sel.pkl', 0.55],
 ['df_mobis_sel.pkl', 0.7029239766081872],
 ['df_shgroup_sel.pkl', 0.6509803921568627],
 ['df_lgelec_sel.pkl', 0.7078947368421054],
 ['df_poscochemical_sel.pkl', 0.6399999999999999],
 ['df_skinnovation_sel.pkl', 0.6666666666666667],
 ['df_ktng_sel.pkl', 0.5878947368421052]]

In [12]:
deep_accuracy

[['df_sec_sel.pkl',
  [0.33678552508354187, 0.9090909361839294],
  0.8823529411764706,
  0.8823529411764706,
  0.9375,
  0.9090909090909091,
  0.9099264705882353],
 ['df_lgenergy_sel.pkl',
  [0.6701975464820862, 0.6451612710952759],
  0.6875,
  0.6875,
  0.6470588235294118,
  0.6666666666666667,
  0.6449579831932772],
 ['df_skhinix_sel.pkl',
  [0.3298147916793823, 0.84375],
  0.8461538461538461,
  0.8461538461538461,
  0.7857142857142857,
  0.8148148148148148,
  0.8373015873015871],
 ['df_ssbio_sel.pkl',
  [0.6280768513679504, 0.6785714030265808],
  0.7,
  0.7,
  0.5384615384615384,
  0.608695652173913,
  0.6692307692307693],
 ['df_sdi_sel.pkl',
  [0.6762300729751587, 0.59375],
  0.5714285714285714,
  0.5714285714285714,
  0.75,
  0.6486486486486486,
  0.59375],
 ['df_lgchemical_sel.pkl',
  [0.5036324858665466, 0.7333333492279053],
  0.6470588235294118,
  0.6470588235294118,
  0.8461538461538461,
  0.7333333333333334,
  0.746606334841629],
 ['df_secpre_sel.pkl',
  [0.5195909142494202, 

1. 정밀도, f1-score, 
2. confusion matrix ((1,1), (2,2), 두개가 큰 비중이면 good, (2,1)은 진을 부로 구분, (1,2)는 부를 진으로 결정하는 항목) "ㄴ" 형태
    (2,2) -> (1,1) -> (2,1)로 확인. (1,2)가 크면 모델 제외
    
<img src="https://raw.githubusercontent.com/fasthill/My-gist/main/data/picture/confusion_matrix.png" width="800"/> <br>

수수료: 주식거래수수료 0.015%, 유관기관수수료 0.0036%, 증권거래세 0.08%, 농어촌특별세 0.15%
수수료: (0.015% + 0.0036%) × 2 (사고팔 때) = 0.0372%, 세금: 0.08% + 0.15% (팔 때) = 0.23%
전체 지출 비율: 0.0372% + 0.23% = 0.2672%