# 예측이 가능한 종목 추리기

## 가장 좋은 결과를 낼 수 있는 feature항목 추출
## 모든 feature를 사용한 결과와, 선택 추출된 feature만 사용한 결과 정확도에 차이가 남
#### 로지스틱 회귀의 coef_ 값에서 영향력이 높은 feature를 선택하고, 최적의 feature 개수를 결정

### 데이터 준비하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow import keras
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score

In [3]:
# After a logistic-regression fit, pick the feature columns whose |coef_|
# is strictly greater than the given threshold (criteria).
def select_features(df, coef, criteria):
    """Return the columns of ``df`` whose coefficient magnitude exceeds ``criteria``.

    Args:
        df: DataFrame whose columns line up with the second axis of ``coef``.
        coef: 2-D coefficient array (e.g. ``LogisticRegression.coef_``).
        criteria: Threshold; a column is kept only if ``|coef| > criteria``.

    Returns:
        The selected entries of ``df.columns``.
    """
    strong = np.abs(coef) > criteria
    # Axis 1 of the mask indexes features, so keep the column positions only.
    col_positions = np.nonzero(strong)[1]
    return df.columns[col_positions]

In [4]:
def get_scores(data, target):
    """Fit a standardized logistic regression and report its accuracy.

    The data is split 80/20 (stratified on ``target``, ``random_state=42``),
    standardized with statistics from the training part only, and fitted
    with ``LogisticRegression(C=20, max_iter=4000)``.

    Args:
        data: Feature table.
        target: Class labels aligned with the rows of ``data``.

    Returns:
        Tuple ``(train_score, test_score, coef_, intercept_)`` where the
        scores are mean accuracies from ``LogisticRegression.score``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42, stratify=target
    )

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # max_iter is raised well above the default of 100.
    clf = LogisticRegression(C=20, max_iter=4000)
    clf.fit(x_train_std, y_train)

    return (
        clf.score(x_train_std, y_train),
        clf.score(x_test_std, y_test),
        clf.coef_,
        clf.intercept_,
    )

In [5]:
def find_best_result(data, target):
    """Backward-eliminate features and report the best-scoring subset.

    Starting from all columns of ``data``, each round drops the feature(s)
    with the smallest absolute logistic-regression coefficient, refits via
    ``get_scores`` and records the scores. The subset with the highest test
    score seen in any round is returned.

    Args:
        data: Feature DataFrame (must already be free of NaN/inf).
        target: Class labels aligned with the rows of ``data``.

    Returns:
        Tuple ``(train_score_list, test_score_list, data_columns, data_coef)``:
        per-round train scores, per-round test scores, the columns of the
        best round, and that round's coefficients. The all-features baseline
        fit is not included in the score lists; its columns/coefficients are
        returned only if no elimination round produced a positive test score.
    """
    train_score_list = []
    test_score_list = []

    # Baseline fit on every feature: provides the first coefficients to rank
    # and keeps the returned columns/coef defined even if no round runs.
    _, _, coef, _ = get_scores(data, target)
    best_test_score = 0
    data_columns = data.columns
    data_coef = coef

    for _ in range(len(data.columns) - 1):
        # select_features keeps |coef| strictly above the minimum, so the
        # weakest feature (and anything tied with it) is dropped.
        criteria = np.abs(coef).min()
        sel_col = select_features(data, coef, criteria)
        if len(sel_col) == 0:
            # Ties at the minimum removed every remaining feature.
            break

        # Narrow the argument itself; the original indexed a global ``df``,
        # which bypassed the caller's cleaned data.
        data = data[sel_col]
        train_score, test_score, coef, _ = get_scores(data, target)

        if test_score > best_test_score:
            best_test_score = test_score
            data_columns = sel_col
            data_coef = coef

        train_score_list.append(train_score)
        test_score_list.append(test_score)

    return train_score_list, test_score_list, data_columns, data_coef

In [6]:
def model_fn(inp_num, a_layer=None):
    """Build a small sigmoid-activated Sequential network with one output unit.

    Layer stack: Dense(12) -> Dense(6) -> optional ``a_layer`` -> Dense(1),
    all with sigmoid activation. The model is returned without being
    compiled.

    Args:
        inp_num: Number of input features (used as the input shape).
        a_layer: Optional extra layer inserted just before the output layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    stack = [
        Dense(12, activation='sigmoid', input_shape=(inp_num,)),
        Dense(6, activation='sigmoid'),
    ]
    if a_layer:
        stack.append(a_layer)
    stack.append(Dense(1, activation='sigmoid'))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [7]:
# Convert (name, confusion matrix) pairs into flat rows.
def matrix_to_list(matrix):
    """Flatten named 2x2 confusion matrices into ``[name, tn, fp, fn, tp]`` rows.

    Args:
        matrix: Iterable of entries where ``entry[0]`` is a name and
            ``entry[1]`` is a 2x2 array indexed as ``[actual, predicted]``.

    Returns:
        A list with one ``[name, tn, fp, fn, tp]`` row per entry, in input
        order; an empty list for empty input.
    """
    m_list = []
    # Iterate the argument; the original looped over a global ``confu_matrix``
    # and silently ignored ``matrix``.
    for entry in matrix:
        name, cm = entry[0], entry[1]
        m_list.append([name, cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]])
    return m_list

In [8]:
# Stocks to analyze. Each key is a six-digit code string (presumably the KRX
# ticker -- confirm); each value is [Korean company name, short ASCII key].
# The short key (val[1]) is what the loading loop uses to build the pickle
# file name and to label the result rows.
code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'], 
        '000660' : ['SK하이닉스', 'skhinix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코케미칼', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng']}

# code = {'005380' : ['현대차', 'hyunmotor'], '005930' : ['삼성전자', 'sec']}
# code = {'373220' : ['LG에너지솔루션', 'lgenergy']}

In [9]:
# Load the per-stock analysis data, train three models per stock
# (logistic regression with feature elimination, SGD classifier, small
# neural network) and collect their scores into one merged table.
directory_for_ml = '../data/data_for_ml/expand_date/'
logi_accuracy = []
sgd_accuracy = []
deep_accuracy = []
confu_matrix = []
for key, val in code.items():
    f_name = 'df_{}_{}.pkl'.format(val[1], 'sel')
    fname = directory_for_ml + f_name
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(fname)

    # bank / financeetc columns have too many missing values to be usable.
    df.drop(['bank_1', 'bank_2', 'financeetc_1', 'financeetc_2'], axis=1, inplace=True)

    # Features: everything except the last five columns.
    # Target: the fourth column from the end.
    data = df.iloc[:, :-5]
    target = df.iloc[:, -4]

    # Imputation returns a NumPy array, so remember the column names to
    # rebuild a DataFrame afterwards.
    col_name = data.columns
    # Turn +/-inf into NaN so the mean imputer treats them as missing.
    data = data.replace([np.inf, -np.inf], np.nan)
    imputer = SimpleImputer(strategy='mean')
    data = imputer.fit_transform(data)

    data = pd.DataFrame(data, columns=col_name)

    # Sanity check: report the stock if anything is still missing.
    if data.isnull().values.any():
        print("com_name", val[1])

    print("vom---", val[1])

    # --- Logistic regression: best scores over the elimination rounds ---
    train_score_list, test_score_list, data_columns, data_coef = find_best_result(data, target)
    logi_accuracy.append([val[1], max(train_score_list), max(test_score_list)])

    # --- SGD classifier, using only the selected feature columns ---
    data_new = data[data_columns]
    train_input, test_input, train_target, test_target \
        = train_test_split(data_new, target, random_state=42, test_size=0.2, stratify=target)

    ss = StandardScaler()
    ss.fit(train_input)
    train_scaled = ss.transform(train_input)
    test_scaled = ss.transform(test_input)

    # Try a range of iteration caps and keep the best mean CV accuracy.
    sgd_value = []
    for n_iter in range(5, 50):
        # NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and
        # 1.3 removed the old spelling; switch when upgrading.
        sc = SGDClassifier(loss='log', max_iter=n_iter, random_state=42)
        scores = cross_validate(sc, X=train_scaled, y=train_target, n_jobs=-1)
        sgd_value.append(scores['test_score'].mean())

    sgd_accuracy.append([val[1], max(sgd_value)])

    # --- Neural network ---
    model = model_fn(len(data_new.columns))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # The checkpoint file is overwritten for every stock.
    checkpoint_cb = ModelCheckpoint(filepath='best_model.h5', save_best_only=True)
    earlystopping_cb = EarlyStopping(patience=100, monitor='val_loss', mode='min', restore_best_weights=True)

    # NOTE(review): the test split doubles as validation data, so early
    # stopping sees it and the test metrics below are optimistic.
    history = model.fit(train_scaled, train_target, epochs=2000, verbose=0,
                        callbacks=[checkpoint_cb, earlystopping_cb],
                        validation_data=(test_scaled, test_target))

    # Threshold the sigmoid output at 0.5 to get hard class labels.
    y_predict = model.predict(np.array(test_scaled), verbose=0)
    y_predict_list = [1 if i > 0.5 else 0 for i in y_predict[:, 0]]

    # Metric arguments are order-sensitive: (actual, predicted).
    # Precision = TP / (TP + FP): of the predicted positives, how many are real.
    # zero_division=0 keeps the same 0.0 result when nothing is predicted
    # positive, without emitting an UndefinedMetricWarning per stock.
    # NOTE(review): roc_auc_score receives hard labels here, not scores.
    score = model.evaluate(test_scaled, test_target, verbose=0)
    deep_accuracy.append([val[1],
                          score[0], score[1],
                          precision_score(test_target, y_predict_list, zero_division=0),
                          recall_score(test_target, y_predict_list, zero_division=0),
                          f1_score(test_target, y_predict_list, zero_division=0),
                          roc_auc_score(test_target, y_predict_list)
                         ])

    confu_matrix.append([val[1], confusion_matrix(test_target, y_predict_list)])

# Build the result tables once, after all stocks are processed (they used to
# be rebuilt on every loop iteration with identical final content).
df_logi = pd.DataFrame(logi_accuracy, columns=['name', 'train_max', 'test_max']).set_index('name')
df_sgd = pd.DataFrame(sgd_accuracy, columns=['name', 'sgd_accuracy']).set_index('name')
df_deep = pd.DataFrame(
    deep_accuracy,
    columns=['name', 'val_loss', 'val_accuracy', 'precision', 'recall', 'f1_score', 'roc_auc_score'],
).set_index('name')
df_confu_matrix = pd.DataFrame(matrix_to_list(confu_matrix), columns=['name', 'tn', 'fp', 'fn', 'tp']).set_index('name')

# Left-join everything on the stock name index.
dfs = [df_logi, df_sgd, df_deep, df_confu_matrix]
df_merged = reduce(lambda left, right: pd.merge(left, right, how='left', left_index=True, right_index=True), dfs)

vom--- sec
vom--- lgenergy
vom--- skhinix
vom--- ssbio
vom--- sdi
vom--- lgchemical
vom--- secpre
vom--- hyunmotor


  _warn_prf(average, modifier, msg_start, len(result))


vom--- naver
vom--- kia
vom--- kakao
vom--- poscoholding


  _warn_prf(average, modifier, msg_start, len(result))


vom--- kbbank
vom--- sscnt


  _warn_prf(average, modifier, msg_start, len(result))


vom--- celltrion
vom--- mobis
vom--- shgroup
vom--- lgelec
vom--- poscochemical
vom--- skinnovation


  _warn_prf(average, modifier, msg_start, len(result))


vom--- ktng


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
df_merged

Unnamed: 0_level_0,train_max,test_max,sgd_accuracy,val_loss,val_accuracy,precision,recall,f1_score,roc_auc_score,tn,fp,fn,tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
sec,0.865116,0.703704,0.711628,0.572289,0.703704,0.571429,0.235294,0.333333,0.577107,34,3,13,4
lgenergy,0.885057,0.659091,0.632605,0.631958,0.659091,0.666667,0.421053,0.516129,0.630526,21,4,11,8
skhinix,0.825,0.745098,0.715,0.596475,0.686275,0.625,0.277778,0.384615,0.593434,30,3,13,5
ssbio,0.976562,0.625,0.656308,0.671446,0.625,0.666667,0.153846,0.25,0.550607,18,1,11,2
sdi,0.803191,0.723404,0.675391,0.620137,0.744681,0.764706,0.619048,0.684211,0.732601,22,4,8,13
lgchemical,0.794444,0.666667,0.644444,0.607923,0.622222,0.533333,0.444444,0.484848,0.592593,20,7,10,8
secpre,1.0,0.692308,0.775556,0.472034,0.923077,1.0,0.75,0.857143,0.875,9,0,1,3
hyunmotor,0.877095,0.666667,0.68127,0.645382,0.644444,0.0,0.0,0.0,0.5,29,0,16,0
naver,0.832402,0.666667,0.692857,0.60599,0.666667,0.6,0.352941,0.444444,0.605042,24,4,11,6
kia,0.868263,0.785714,0.670766,0.529637,0.714286,0.666667,0.285714,0.4,0.607143,26,2,10,4


In [None]:
# Minimum acceptable value for each metric column of df_merged.
test_max = 0.7
sgd_accuracy = 0.7
val_accuracy = 0.7
precision = 0.7
fi_score = 0.7  # threshold applied to the 'f1_score' column
# Exclusive bounds on the share of actual positives in the test split.
ratio_min = 0.4
ratio_max = 0.6

# Share of actual positives: (fn + tp) over all four confusion-matrix cells.
ratio = (df_merged['fn'] + df_merged['tp']) / (
    df_merged['tn'] + df_merged['fp'] + df_merged['fn'] + df_merged['tp']
)

# Boolean row mask: every metric meets its threshold and the class balance
# lies strictly between the two ratio bounds.
df_sel = (
    df_merged['test_max'].ge(test_max)
    & df_merged['sgd_accuracy'].ge(sgd_accuracy)
    & df_merged['val_accuracy'].ge(val_accuracy)
    & df_merged['precision'].ge(precision)
    & df_merged['f1_score'].ge(fi_score)
    & ratio.gt(ratio_min)
    & ratio.lt(ratio_max)
)

In [None]:
df_merged[df_sel]

1. 정밀도, f1-score, 
2. confusion matrix: (1,1)=TN, (2,2)=TP 두 항목의 비중이 크면 좋음. (1,2)=FP는 틀린 것을 맞다고 예측한 항목, (2,1)=FN은 맞는 것을 틀리다고 예측한 항목. 따라서
    (2,2) -> (1,2) -> (1,1) 순서로 확인하고, <br>
    (1,2)가 크면 모델 제외 (정밀도(precision = TP / (TP + FP))가 높아야 함. 낮으면 손해를 보게 됨.) <br>
    재현율(recall = TP / (TP + FN))은 낮아도 손해를 끼치지는 않음.
    
<img src="https://raw.githubusercontent.com/fasthill/My-gist/main/data/picture/confusion_matrix.png" width="800"/> <br>

수수료: 주식거래수수료 0.015%, 유관기관수수료 0.0036%, 증권거래세 0.08%, 농어촌특별세 0.15%
수수료: (0.015 + 0.0036) × 2 = 0.0372% (살 때와 팔 때 각각 부과), 세금: 0.08 + 0.15 = 0.23% (팔 때만 부과)
전체 지출 금액율: 0.0372 + 0.23 = 0.2672%