<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 6 month analysis
### SGDClassifier, DecisionTree, LogisticRegression

### get 6 month delay analysis

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce

In [12]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [13]:
# Convert a 2x2 confusion matrix into a flat list.
def matrix_to_list(confu_matrix):
    """Flatten a 2x2 confusion matrix into ``[tn, fp, fn, tp]``.

    ``confu_matrix`` must support two-axis indexing (``m[row, col]``).
    The cells are read row by row: ``[0, 0]`` (tn), ``[0, 1]`` (fp),
    ``[1, 0]`` (fn), ``[1, 1]`` (tp).
    """
    return [confu_matrix[row, col] for row in (0, 1) for col in (0, 1)]

In [14]:
def predict_p(test_target, y_predict_list):
    """Collect test-set classification metrics for one set of predictions.

    Returns ``[precision, recall, f1, roc_auc]`` in that order. Each value
    comes from the matching ``sklearn.metrics`` function called with its
    default arguments on ``(test_target, y_predict_list)``.
    """
    scorers = (precision_score, recall_score, f1_score, roc_auc_score)
    return [scorer(test_target, y_predict_list) for scorer in scorers]

In [15]:
# Stock ticker code -> [Korean display name, short English key].
# The short key is what the pickle file names and the result dictionary use.
code = {
    ticker: [korean_name, short_key]
    for ticker, korean_name, short_key in (
        ('005930', '삼성전자', 'sec'),
        ('373220', 'LG에너지솔루션', 'lgenergy'),
        ('000660', 'SK하이닉스', 'skhinix'),
        ('207940', '삼성바이오로직스', 'ssbio'),
        ('006400', '삼성SDI', 'sdi'),
        ('051910', 'LG화학', 'lgchemical'),
        ('005935', '삼성전자우', 'secpre'),
        ('005380', '현대차', 'hyunmotor'),
        ('035420', 'NAVER', 'naver'),
        ('000270', '기아', 'kia'),
        ('035720', '카카오', 'kakao'),
        ('005490', 'POSCO홀딩스', 'poscoholding'),
        ('105560', 'KB금융', 'kbbank'),
        ('028260', '삼성물산', 'sscnt'),
        ('068270', '셀트리온', 'celltrion'),
        ('012330', '현대모비스', 'mobis'),
        ('055550', '신한지주', 'shgroup'),
        ('066570', 'LG전자', 'lgelec'),
        ('003670', '포스코케미칼', 'poscochemical'),
        ('096770', 'SK이노베이션', 'skinnovation'),
        ('033780', 'KT&G', 'ktng'),
    )
}

# code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'], '000660' : ['SK하이닉스', 'skhinix']}

In [21]:
def _evaluate_classifier(model, prefix, date, train_xy, val_xy, test_xy):
    """Score one fitted classifier and package the results as single-row frames.

    Returns ``(accuracy, metrics, confusion)``: three DataFrames, each with one
    row indexed by ``date`` and columns prefixed with ``prefix``
    (``<prefix>_train/_val/_test``, ``<prefix>_pre/_recall/_f1/_roc``,
    ``<prefix>_tn/_fp/_fn/_tp``).
    """
    train_x, train_y = train_xy
    val_x, val_y = val_xy
    test_x, test_y = test_xy
    # Predict once and reuse the result for both the metrics and the confusion matrix.
    test_pred = model.predict(test_x)

    def one_row(values, suffixes):
        columns = ['date'] + [f'{prefix}_{suffix}' for suffix in suffixes]
        return pd.DataFrame([[date] + values], columns=columns).set_index('date')

    accuracy = one_row(
        [model.score(train_x, train_y), model.score(val_x, val_y), model.score(test_x, test_y)],
        ['train', 'val', 'test'])
    metrics = one_row(predict_p(test_y, test_pred), ['pre', 'recall', 'f1', 'roc'])
    confusion = one_row(matrix_to_list(confusion_matrix(test_y, test_pred)),
                        ['tn', 'fp', 'fn', 'tp'])
    return accuracy, metrics, confusion


def _rank_by_importance(importances, feature_names):
    """Map each feature name to its 1-based rank, largest absolute importance first."""
    ordered = pd.Series(np.abs(importances), index=feature_names).sort_values(ascending=False)
    ranks = {}
    for position, name in enumerate(ordered.index, start=1):
        ranks.setdefault(name, position)  # first occurrence wins if a name is duplicated
    return ranks


def get_analysis_data(date, data, target, test_inputm, test_target):
    """Fit three classifiers on one data window and return their scores as one row.

    ``data``/``target`` are split 80/20 (stratified, ``random_state=42``) into
    train and validation sets. LogisticRegression and SGDClassifier are fitted
    on standardized features, DecisionTreeClassifier on the raw features. All
    three are then evaluated on the held-out test set.

    Parameters
    ----------
    date : label used as the index of the returned row.
    data : DataFrame of features for training/validation (``data.columns`` is used).
    target : labels aligned with ``data``.
    test_inputm : test features with the same columns as ``data``. The spelling
        is kept for backward compatibility with existing callers.
    test_target : labels aligned with ``test_inputm``.

    Returns
    -------
    DataFrame with a single row indexed by ``date`` and 33 columns: for each of
    ``lr``, ``sc`` and ``dt`` the train/val/test accuracy, the test
    precision/recall/f1/roc, and the test confusion-matrix cells tn/fp/fn/tp.
    """
    # Bug fix: the body used to read a notebook-global named ``test_input`` instead of
    # this parameter, so it only worked when the caller happened to define that global.
    test_input = test_inputm

    train_input, val_input, train_target, val_target = train_test_split(
        data, target, random_state=42, test_size=0.2, stratify=target)

    # The scaler is fitted on the training split only and then applied to the other splits.
    ss = StandardScaler()
    ss.fit(train_input)
    train_scaled = ss.transform(train_input)
    val_scaled = ss.transform(val_input)
    test_scaled = ss.transform(test_input)

    scaled_splits = ((train_scaled, train_target), (val_scaled, val_target),
                     (test_scaled, test_target))
    raw_splits = ((train_input, train_target), (val_input, val_target),
                  (test_input, test_target))

    # ---- logistic regression (scaled features) ----
    lr = LogisticRegression(C=20, max_iter=1000)  # max_iter default is 100
    lr.fit(train_scaled, train_target)
    df_lr_acc, df_lr_test, df_lr_cm = _evaluate_classifier(lr, 'lr', date, *scaled_splits)

    # ---- decision tree (raw features) ----
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(train_input, train_target)
    df_dt_acc, df_dt_test, df_dt_cm = _evaluate_classifier(dt, 'dt', date, *raw_splits)

    # ---- SGD classifier with logistic loss (scaled features) ----
    sc = SGDClassifier(loss='log_loss', max_iter=2000, random_state=42)
    sc.fit(train_scaled, train_target)
    df_sc_acc, df_sc_test, df_sc_cm = _evaluate_classifier(sc, 'sc', date, *scaled_splits)

    # ---- combined feature ranking ----
    # Each model ranks the features (1 = most important) by |coefficient| or by
    # feature_importances_. The ranks are summed with weights 2 (sc), 2 (lr) and
    # 1 (dt), so the tree counts half as much; a lower total means more important.
    lr_rank = _rank_by_importance(lr.coef_[0, :], data.columns)
    sc_rank = _rank_by_importance(sc.coef_[0, :], data.columns)
    dt_rank = _rank_by_importance(dt.feature_importances_, data.columns)
    select = {name: sc_rank[name] * 2 + lr_rank[name] * 2 + dt_rank[name] * 1
              for name in data.columns}
    sorted_sel = sorted(select.items(), key=lambda item: item[1])
    # NOTE(review): the top-10 feature list is computed but not returned or used;
    # kept because the notebook appears to be building towards feature selection.
    new_columns = [name for name, _ in sorted_sel[:10]]

    # ---- merge all single-row result frames on the shared date index ----
    dfs = [df_lr_acc, df_sc_acc, df_dt_acc,
           df_lr_test, df_sc_test, df_dt_test,
           df_lr_cm, df_sc_cm, df_dt_cm]
    df_merged = reduce(
        lambda left, right: pd.merge(left, right, how='left', left_index=True, right_index=True),
        dfs)

    return df_merged

In [34]:
# Load the per-stock data and run the rolling-window analysis for every stock.
stock_name = 'sec'
directory_for_ml = '../data/data_for_ml/'

WINDOW_SIZE = 150  # rows in one analysis window
WINDOW_STEP = 15   # rows the window start advances between two analyses
split_ratio = 0.8  # leading share of a window used for train/validation; the rest is the test set

df_stock_6mon = {}

date_ = []
for key, val in code.items():
    fname = f'df_{val[1]}_sel.pkl'
    f_name = directory_for_ml + fname
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    df = pd.read_pickle(f_name)

    # Series too short to yield a window (e.g. secpre) produce no iterations and are skipped.
    # NOTE(review): the range stops before len(df) - WINDOW_SIZE, so a series of exactly
    # WINDOW_SIZE rows is skipped too and no window ever ends on the very last row.
    # Confirm whether that is intended before changing the bound.
    window_results = []
    for st in range(0, len(df) - WINDOW_SIZE, WINDOW_STEP):
        new_df = df.iloc[st:st + WINDOW_SIZE, :]

        split_n = int(len(new_df) * split_ratio)

        # All but the last five columns are the model inputs; the fourth column
        # from the end is the label.
        data = new_df.iloc[:split_n, :-5]
        target = new_df.iloc[:split_n, -4]
        test_input = new_df.iloc[split_n:, :-5]
        test_target = new_df.iloc[split_n:, -4]

        date = new_df.index[-1]  # the window is labelled by its last index value

        df_temp = get_analysis_data(date, data, target, test_input, test_target)
        window_results.append(df_temp)

    # Concatenate once per stock instead of growing a frame inside the loop.
    df_append = pd.concat(window_results, axis=0) if window_results else None
    if df_append is not None:
        df_stock_6mon[val[1]] = df_append

# Result columns grouped by classifier. They are taken from a stored result, not from
# the last loop iteration, whose df_append is None when the last stock was skipped.
result_columns = next(iter(df_stock_6mon.values())).columns if df_stock_6mon else []
lr_col = [x for x in result_columns if x.startswith('lr')]
sc_col = [x for x in result_columns if x.startswith('sc')]
dt_col = [x for x in result_columns if x.startswith('dt')]

In [24]:
# Validation accuracy, test accuracy and test precision for each classifier.
col_sel = [f'{model}_{metric}'
           for model in ('lr', 'sc', 'dt')
           for metric in ('val', 'test', 'pre')]

In [32]:
# Rolling-window results for Samsung Electronics ('sec'), selected columns only.
df_stock_6mon['sec'].loc[:, col_sel]

Unnamed: 0_level_0,lr_val,lr_test,lr_pre,sc_val,sc_test,sc_pre,dt_val,dt_test,dt_pre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-08-30,0.916667,0.633333,0.428571,0.916667,0.633333,0.428571,0.833333,0.666667,0.5
2022-09-23,0.958333,0.833333,0.8,0.916667,0.866667,0.833333,0.75,0.8,0.75
2022-10-18,0.791667,0.833333,0.857143,0.791667,0.9,0.888889,0.833333,0.833333,0.727273
2022-11-08,0.75,0.633333,0.636364,0.75,0.633333,0.636364,0.708333,0.6,0.583333
2022-12-01,0.791667,0.666667,0.571429,0.708333,0.7,0.6,0.75,0.6,0.5
2022-12-22,0.875,0.833333,1.0,0.875,0.8,1.0,0.791667,0.733333,0.6
2023-01-18,0.833333,0.733333,0.555556,0.75,0.733333,0.555556,0.791667,0.8,0.666667
2023-02-10,0.75,0.933333,0.916667,0.75,0.966667,0.923077,0.666667,0.833333,0.769231


In [20]:
# Rolling-window results for LG Energy Solution ('lgenergy'), selected columns only.
df_stock_6mon['lgenergy'].loc[:, col_sel]

Unnamed: 0_level_0,lr_val,lr_test,lr_pre,sc_val,sc_test,sc_pre,dt_val,dt_test,dt_pre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-28,0.75,0.6,0.7,0.625,0.733333,0.833333,0.708333,0.7,0.769231
2022-11-19,0.708333,0.6,0.714286,0.75,0.666667,0.75,0.833333,0.766667,0.789474
2022-12-15,0.708333,0.766667,0.7,0.666667,0.733333,0.666667,0.666667,0.633333,0.5
2023-01-11,0.708333,0.833333,0.714286,0.708333,0.833333,0.714286,0.666667,0.7,0.428571
2023-02-04,0.708333,0.7,0.6,0.708333,0.633333,0.5,0.666667,0.566667,0.4
2023-02-28,0.666667,0.666667,0.75,0.666667,0.666667,0.75,0.833333,0.7,0.692308


In [36]:
# Rolling-window results for SK hynix (key 'skhinix'), selected columns only.
df_stock_6mon['skhinix'].loc[:, col_sel]

Unnamed: 0_level_0,lr_val,lr_test,lr_pre,sc_val,sc_test,sc_pre,dt_val,dt_test,dt_pre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-09-07,0.916667,0.7,0.571429,0.916667,0.666667,0.5,0.791667,0.8,0.7
2022-09-30,0.75,0.833333,0.714286,0.791667,0.8,0.625,0.875,0.9,0.857143
2022-10-25,0.833333,0.866667,0.875,0.75,0.866667,0.875,0.875,0.866667,0.875
2022-11-17,0.708333,0.6,0.727273,0.708333,0.633333,0.8,0.958333,0.6,0.727273
2022-12-10,0.875,0.666667,0.75,0.833333,0.6,0.5,0.791667,0.766667,0.777778
2023-01-05,0.875,0.766667,0.571429,0.791667,0.766667,0.545455,0.833333,0.833333,0.714286
2023-02-01,0.708333,0.866667,0.769231,0.666667,0.766667,0.7,0.833333,0.8,0.727273
2023-02-24,0.75,0.966667,1.0,0.791667,0.9,0.866667,0.666667,0.9,0.923077


In [35]:
# Rolling-window results for Kia ('kia'), selected columns only.
df_stock_6mon['kia'].loc[:, col_sel]

Unnamed: 0_level_0,lr_val,lr_test,lr_pre,sc_val,sc_test,sc_pre,dt_val,dt_test,dt_pre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-12,0.708333,0.633333,0.375,0.625,0.666667,0.444444,0.75,0.8,0.714286
2022-11-05,0.75,0.8,0.7,0.708333,0.766667,0.666667,0.833333,0.766667,0.615385
2022-12-02,0.625,0.766667,0.692308,0.708333,0.7,0.588235,0.666667,0.766667,0.692308
2023-01-04,0.75,0.833333,0.857143,0.791667,0.833333,0.857143,0.625,0.733333,0.75
2023-02-02,0.791667,0.8,0.642857,0.75,0.8,0.642857,0.75,0.733333,0.571429
2023-03-01,0.708333,0.666667,0.583333,0.708333,0.633333,0.533333,0.625,0.666667,0.583333


In [39]:
# Print the selected result columns for every stock listed in `code`.
for key, val in code.items():
    print("**** {} ****".format(val[1]))
    # Stocks for which the analysis loop produced no window have no entry in
    # df_stock_6mon. Look them up explicitly instead of hiding every error
    # behind a bare `except`.
    stock_result = df_stock_6mon.get(val[1])
    if stock_result is None:
        print("(no analysis result - skipped)")
        continue
    print(stock_result[col_sel])

**** sec ****
              lr_val   lr_test    lr_pre    sc_val   sc_test    sc_pre  \
date                                                                     
2022-08-30  0.916667  0.633333  0.428571  0.916667  0.633333  0.428571   
2022-09-23  0.958333  0.833333  0.800000  0.916667  0.866667  0.833333   
2022-10-18  0.791667  0.833333  0.857143  0.791667  0.900000  0.888889   
2022-11-08  0.750000  0.633333  0.636364  0.750000  0.633333  0.636364   
2022-12-01  0.791667  0.666667  0.571429  0.708333  0.700000  0.600000   
2022-12-22  0.875000  0.833333  1.000000  0.875000  0.800000  1.000000   
2023-01-18  0.833333  0.733333  0.555556  0.750000  0.733333  0.555556   
2023-02-10  0.750000  0.933333  0.916667  0.750000  0.966667  0.923077   

              dt_val   dt_test    dt_pre  
date                                      
2022-08-30  0.833333  0.666667  0.500000  
2022-09-23  0.750000  0.800000  0.750000  
2022-10-18  0.833333  0.833333  0.727273  
2022-11-08  0.708333  0.600000