<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testing with real world data

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [2]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    open_file = open(path, "wb")
    pickle.dump(filename, open_file)
    open_file.close()

# read list, dictionary from pickle
def load_from_pickle(path):
    open_file = open(path, "rb")
    loaded_file = pickle.load(open_file)
    open_file.close()
    return loaded_file

In [5]:
import csv

# write list, dictionary to csv
# path = './xxx/', my_dict = filename

def save_dict_to_csv(path, my_dict):
    df = pd.DataFrame.from_dict(my_dict, orient='index') 
    df.to_csv (path, index=False, header=True)  
    
def save_list_to_csv(path, my_list):
    df = pd.DataFrame(my_list, columns=['columns'])
    df.to_csv (path, index=False, header=True) 

In [6]:
def to_df(date, com_name, precision, y_predict, weight):
    dict_temp = {}
    dict_temp['date'] = date
    dict_temp[f'{com_name}_precision'] = f'{precision:.2f}'
    dict_temp[f'{com_name}_predict'] = f'{y_predict[0]}'
    dict_temp[f'{com_name}_yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'{com_name}_no'] = f'{weight[0,0]:.2f}'
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('date', inplace=True)
    return df_t

In [7]:
def to_df_todays(date, com_name, result, y_predict, weight, cr):
    precision = result.loc['precision'].iloc[-1]
    tn = result.loc['tn'].iloc[-1]
    fp = result.loc['fp'].iloc[-1]
    fn = result.loc['fn'].iloc[-1]
    tp = result.loc['tp'].iloc[-1]
    
    dict_temp = {}
    dict_temp['name'] = com_name
    dict_temp[f'precision'] = f'{precision:.2f}'
    dict_temp[f'predict'] = f'{y_predict[0]}'
    dict_temp[f'yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'no'] = f'{weight[0,0]:.2f}'
    dict_temp[f'tn'] = f'{tn:.1f}'
    dict_temp[f'fp'] = f'{fp:.1f}'
    dict_temp[f'fn'] = f'{fn:.1f}'
    dict_temp[f'tp'] = f'{tp:.1f}'
    if ((y_predict[0] == 1) & (cr > 0)):
        result = 'right'
    elif ((y_predict[0] == 1) & (cr <= 0)):
        result = 'wrong'
    else:
        result = 'draw'
    dict_temp[f'result'] = result
    
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('name', inplace=True)
    return df_t

In [8]:
# get stock market opening days
base_data_directory = './data/base_data/stock_market_holydays/'
opening_days_kor = pd.read_pickle(base_data_directory+'opening_days_kor.pkl') # 한국 개장일 데이터 
opening_days_usa = pd.read_pickle(base_data_directory+'opening_days_usa.pkl') # 미국 개장일 데이터 

In [9]:
import sys, os

module_path = os.path.abspath(os.path.join('.')) # 현재 폴더로 이동
if module_path+"\\data\\base_data\\common_data" not in sys.path:
    sys.path.append(module_path+"\\data\\base_data\\common_data") #  공통으로 사용하는 각종 리스트, 코드 등 
    
import common_data as cd

In [10]:
# cd.code_all

In [20]:
code = cd.code_all # 전체 회사 코드

code_good = {'005930': ['삼성전자', 'sec'], '035420': ['NAVER', 'naver'],
             '035720': ['카카오', 'kakao'], '012330': ['현대모비스', 'mobis'],
             '051910': ['LG화학', 'lgchemical'], '005935': ['삼성전자우', 'secpre'],
             '373220': ['LG에너지솔루션', 'lgenergy'],
            }

code_good = {'005930': ['삼성전자', 'sec'], '035420': ['NAVER', 'naver'],
             '035720': ['카카오', 'kakao'], '012330': ['현대모비스', 'mobis'],
             '051910': ['LG화학', 'lgchemical'],'005935': ['삼성전자우', 'secpre'],
             '000270': ['기아','kia'], '373220': ['LG에너지솔루션', 'lgenergy'],
             '005380': ['현대차', 'hyunmotor'], '000660': ['SK하이닉스', 'skhynix'],
             '006400': ['삼성SDI', 'sdi'],
            }

code_mid = {'373220': ['LG에너지솔루션', 'lgenergy'], '207940': ['삼성바이오로직스', 'ssbio'],
            '000270': ['기아', 'kia'], '028260': ['삼성물산', 'sscnt'],
            }

code_bad = {'051910': ['LG화학', 'lgchemical'], '033780': ['KT&G', 'ktng'],
            '005490': ['POSCO홀딩스', 'poscoholding'], '068270': ['셀트리온', 'celltrion'],
            '066570': ['LG전자', 'lgelec'],  '096770': ['SK이노베이션', 'skinnovation'],
            '030200': ['KT', 'kt'], '003550': ['LG', 'lg'],
           }

code_good = {'005930': ['삼성전자', 'sec'], '035420': ['NAVER', 'naver'],
             '005380': ['현대차', 'hyunmotor'], '035720': ['카카오', 'kakao'],
             '000660': ['SK하이닉스', 'skhynix'], '006400': ['삼성SDI', 'sdi'],
             '005935': ['삼성전자우', 'secpre'], '105560': ['KB금융', 'kbbank'],
             '012330': ['현대모비스', 'mobis'],  '055550': ['신한지주', 'shgroup'],
             '003670' : ['포스코퓨처엠', 'poscochemical'], '034730': ['SK', 'sk'], 
             '032830': ['삼성생명', 'sslife'],
            }

In [21]:
directory_for_predict = './data/data_for_ml/predict/'
directory_for_data = './data/company_pkl/'
directory_for_common = './data/common_pkl/'

In [22]:
df_dji = pd.read_pickle(directory_for_common+'dji.pkl')
df_sec = pd.read_pickle(directory_for_data+'sec_investors.pkl')
df_common = pd.read_pickle(directory_for_predict+'0_df_common.pkl')
df_company = pd.read_pickle(directory_for_predict+'df_sec_company.pkl')
df_combine = pd.read_pickle(directory_for_predict+'df_sec_combine.pkl')

In [23]:
if not os.path.exists(directory_for_predict+ 'prediction/prediction_list.pkl'):
    os.makedirs(directory_for_predict+'prediction')
    prediction_list=pd.DataFrame()
    fname_p = 'prediction_list.pkl'
    path_p = directory_for_predict+'prediction/' + fname_p
    prediction_list.to_pickle(path_p)

In [24]:
prediction_list = pd.read_pickle(directory_for_predict+ 'prediction/prediction_list.pkl')

In [25]:
prediction_date = datetime.date.today()
# prediction_date = datetime.date(2023, 5, 10) # 예측을 필요로 하는 일자

In [26]:
df_base = pd.DataFrame()
df_todays = pd.DataFrame()

if prediction_date not in list(opening_days_kor):
    print(f'오늘 {prediction_date}은 휴장일입니다.')
else:
    print(f'오늘 {prediction_date}은 개장일입니다.')

for key, val in code_good.items():
 
    com_name = val[1]
    
    fname = f'df_{com_name}_combine.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name) 
    com_fname = f'{com_name}_historical.pkl'  # 실제와 예측을 비교하기 위하여 실제데이터을 불러 옴
    f_com_name = directory_for_data + com_fname
    com_data = pd.read_pickle(f_com_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns
    
    prediction_row = current_data[current_data.index == prediction_date]
        
    if(len(prediction_row) == 0):
        # 데이터 최종 기록일 확인
        print(f"미국 dji   마지막 일자 : {df_dji['date'].iloc[-1].isoformat()} (거래일자)")
        lf1_index = list(opening_days_usa).index(df_dji['date'].iloc[-1]) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     미국 다음 개장일은 {list(opening_days_usa)[lf1_index]} 입니다.")
        print(f"한국 주식  마지막 일자 : {df_sec['date'].iloc[-1].isoformat()[:10]} (거래일자)")
        lf1_index = list(opening_days_kor).index(df_sec['date'].iloc[-1].date()) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
#         lf1_index = l_index + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     한국 다음 개장일은 {list(opening_days_kor)[lf1_index]} 입니다.")
        print(f"df_common  마지막 일자 : {df_common.index[-1].isoformat()} (예측일자)")
        print(f"df_company 마지막 일자 : {df_company.index[-1].isoformat()} (예측일자)")
        print(f"df_combine 마지막 일자 : {df_combine.index[-1].isoformat()} (예측일자)")  
        raise Exception(f"예측을 위한 최근 데이터가 준비가 되어 있지 않음. 혹은 한국, 미국 주식 휴장 등. 예측 당일 최신자료로 진행하도록...")
    
    com_row = com_data[com_data['date']  == prediction_date]
#     com_row = com_data[com_data['date'].apply(lambda x: x.date())  == prediction_date]
#     com_data['date'].apply(lambda x: x.date()) <  prediction_date

    try:
        cr = com_row['close_cr'].values[0] # 실제의 등락을 확인
    except:
        cr = -1  # 예측 당일 아침 실제 결과가 없을시 임시 지정
        
        #**************************************************************

    # locate the model data directory
    directory_model_data = f'./data/data_for_ml/model\model/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'
    result_pkl = directory_model_data + 'best_result.pkl'
    
    # load result data
    result = load_from_pickle(result_pkl)[:-5] 
    precision = result.loc['precision'].iloc[-1]
    
    # load columns data
    real_columns = load_from_pickle(columns_pkl)[:-5] # column 읽기. target columns 5개는 제외
    real_data_df = prediction_row[real_columns] # select necessary columns
    
    # scale the data
    scaler = joblib.load(scaler_pkl) # scaler 읽기
#     scaler = load_from_pickle(scaler_p_pkl) # scaler 읽기
    real_scaled = scaler.transform(real_data_df)
    
    # apply the scaled real_data to the model
    model = joblib.load(model_pkl) # model 읽기
#     model = load_from_pickle(model_p_pkl) # model made with pickle 읽기

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    df_temp = to_df(prediction_row.index[-1], com_name, precision, y_predict, weight)
    df_base = pd.concat([df_base, df_temp],axis=1)
    df_temp_todays = to_df_todays(prediction_row.index[-1], com_name, result, y_predict, weight, cr)
    df_todays = pd.concat([df_todays, df_temp_todays],axis=0)
    
#     print(f'**date: {prediction_row.index[-1].date()}, {precision:.2f}, {com_name}, 예측: {y_predict}, 가능성:{weight}')
df_todays

오늘 2023-05-12은 개장일입니다.


Unnamed: 0_level_0,precision,predict,yes,no,tn,fp,fn,tp,result
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sec,0.89,0,0.18,0.82,24.0,1.0,27.0,8.0,draw
naver,0.8,0,0.19,0.81,32.0,2.0,18.0,8.0,draw
hyunmotor,0.86,0,0.28,0.72,29.0,1.0,25.0,6.0,draw
kakao,0.83,0,0.38,0.62,36.0,1.0,19.0,5.0,draw
skhynix,1.0,0,0.46,0.54,41.0,0.0,20.0,2.0,draw
sdi,0.8,0,0.44,0.56,34.0,1.0,24.0,4.0,draw
secpre,0.86,0,0.33,0.67,35.0,1.0,21.0,6.0,draw
kbbank,0.88,0,0.21,0.79,34.0,1.0,21.0,7.0,draw
mobis,0.85,0,0.33,0.67,27.0,2.0,23.0,11.0,draw
shgroup,0.83,0,0.48,0.52,31.0,1.0,26.0,5.0,draw


In [27]:
prediction_date in list(opening_days_kor)

True

In [28]:
# save current prediction data

prediction_list = pd.concat([prediction_list, df_base], axis=0)
prediction_list = prediction_list[~prediction_list.index.duplicated(keep='last')]

directory_for_predict = './data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
fname_c = 'prediction_list.csv'
path_p = directory_for_predict+'prediction/' + fname_p
path_c = directory_for_predict+'prediction/' + fname_c
prediction_list.to_pickle(path_p)
prediction_list.to_csv(path_c)

In [29]:
# 결과를 회사별로 확인하기
directory_for_predict = './data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
predict_list = pd.read_pickle(directory_for_predict+'prediction/' + fname_p)

for i, (key, val) in enumerate(code_good.items()):
    print("***", i, key, val)
    k =  i * 4
    globals()[f'{val[1]}_df'] = predict_list.iloc[:, k:k+4]

*** 0 005930 ['삼성전자', 'sec']
*** 1 035420 ['NAVER', 'naver']
*** 2 005380 ['현대차', 'hyunmotor']
*** 3 035720 ['카카오', 'kakao']
*** 4 000660 ['SK하이닉스', 'skhynix']
*** 5 006400 ['삼성SDI', 'sdi']
*** 6 005935 ['삼성전자우', 'secpre']
*** 7 105560 ['KB금융', 'kbbank']
*** 8 012330 ['현대모비스', 'mobis']
*** 9 055550 ['신한지주', 'shgroup']
*** 10 003670 ['포스코퓨처엠', 'poscochemical']
*** 11 034730 ['SK', 'sk']
*** 12 032830 ['삼성생명', 'sslife']


In [30]:
secpre_df

Unnamed: 0_level_0,kia_precision,kia_predict,kia_yes,kia_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,1.0,0.0,0.23,0.77
2023-04-04,1.0,0.0,0.17,0.83
2023-04-05,1.0,0.0,0.14,0.86
2023-04-06,1.0,0.0,0.08,0.92
2023-04-07,1.0,0.0,0.14,0.86
2023-04-24,,,,
2023-04-27,,,,
2023-04-28,,,,
2023-05-02,,,,
2023-05-08,,,,
