<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## calculate prediction accuracy

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [3]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [4]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    open_file = open(path, "wb")
    pickle.dump(filename, open_file)
    open_file.close()

# read list, dictionary from pickle
def load_from_pickle(path):
    open_file = open(path, "rb")
    loaded_file = pickle.load(open_file)
    open_file.close()
    return loaded_file

In [5]:
import csv

# write list, dictionary to csv
# path = './xxx/', my_dict = filename

def save_dict_to_csv(path, my_dict):
    df = pd.DataFrame.from_dict(my_dict, orient='index') 
    df.to_csv (path, index=False, header=True)  
    
def save_list_to_csv(path, my_list):
    df = pd.DataFrame(my_list, columns=['columns'])
    df.to_csv (path, index=False, header=True) 

In [6]:
def to_df(date, com_name, precision, y_predict, weight):
    dict_temp = {}
    dict_temp['date'] = date
    dict_temp[f'{com_name}_precision'] = f'{precision:.2f}'
    dict_temp[f'{com_name}_predict'] = f'{y_predict[0]}'
    dict_temp[f'{com_name}_yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'{com_name}_no'] = f'{weight[0,0]:.2f}'
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('date', inplace=True)
    return df_t

In [7]:
def to_df_todays(date, com_name, result, y_predict, weight, cr, yes_no):
    if yes_no:
        precision = result.loc['test_precision'].iloc[-1] 
        tn = result.loc['test_tn'].iloc[-1]
        fp = result.loc['test_fp'].iloc[-1]
        fn = result.loc['test_fn'].iloc[-1]
        tp = result.loc['test_tp'].iloc[-1]
    else:
        precision = result.loc['precision'].iloc[-1]
        tn = result.loc['tn'].iloc[-1]
        fp = result.loc['fp'].iloc[-1]
        fn = result.loc['fn'].iloc[-1]
        tp = result.loc['tp'].iloc[-1]
    
    dict_temp = {}
    dict_temp['name'] = com_name
    dict_temp[f'precision'] = f'{precision:.2f}'
    dict_temp[f'predict'] = f'{y_predict[0]}'
    dict_temp[f'yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'no'] = f'{weight[0,0]:.2f}'
    dict_temp[f'tn'] = f'{tn:.1f}'
    dict_temp[f'fp'] = f'{fp:.1f}'
    dict_temp[f'fn'] = f'{fn:.1f}'
    dict_temp[f'tp'] = f'{tp:.1f}'
    if ((y_predict[0] == 1) & (cr > 0)):
        result = 'right'
    elif ((y_predict[0] == 1) & (cr <= 0)):
        result = 'wrong'
    else:
        result = 'draw'
    dict_temp[f'result'] = result
    
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('name', inplace=True)
    return df_t

In [8]:
def is_new_format(f_name): #5월 16일 이전 생성 데이터 확인
    mon = int(f_name[11:13])
    day = int(f_name[13:15])
    if (mon < 5):
        return False
    elif (mon == 5) & (day < 16) :
        return False
    else:
        return True

In [9]:
def find_filename(dir): # find a filename in a directory
    p = re.compile('lgbm_bs_df_(.{9}).*csv')
    dir_list = os.listdir(dir)
    for fname in dir_list:
        aa = p.search(fname)
        if aa is None:
            continue
        else:
            return aa.group()

In [11]:
import sys, os

module_path = os.path.abspath(os.path.join('.')) # 현재 폴더로 이동
if module_path+"\\data\\base_data\\common_data" not in sys.path:
    sys.path.append(module_path+"\\data\\base_data\\common_data") #  공통으로 사용하는 각종 리스트, 코드 등 
    
import common_data as cd

In [5]:
# get stock market opening days
base_data_directory = './data/base_data/stock_market_holydays/'
OPENING_DAYS_KOR = pd.read_pickle(base_data_directory+'opening_days_kor.pkl') # 한국 개장일 데이터 
OPENING_DAYS_USA = pd.read_pickle(base_data_directory+'opening_days_usa.pkl') # 미국 개장일 데이터 

In [6]:
TRAIN_START_DATE = datetime.date(2022, 3, 2)
TRAIN_END_DATE = datetime.date(2023, 3, 31)

In [7]:
def find_next_date(current_date):
    current_index = list(OPENING_DAYS_KOR).index(current_date)
    next_date =  OPENING_DAYS_KOR.iloc[current_index+1]
    return next_date

In [19]:
list(OPENING_DAYS_KOR)

[datetime.date(2020, 1, 2),
 datetime.date(2020, 1, 3),
 datetime.date(2020, 1, 6),
 datetime.date(2020, 1, 7),
 datetime.date(2020, 1, 8),
 datetime.date(2020, 1, 9),
 datetime.date(2020, 1, 10),
 datetime.date(2020, 1, 13),
 datetime.date(2020, 1, 14),
 datetime.date(2020, 1, 15),
 datetime.date(2020, 1, 16),
 datetime.date(2020, 1, 17),
 datetime.date(2020, 1, 20),
 datetime.date(2020, 1, 21),
 datetime.date(2020, 1, 22),
 datetime.date(2020, 1, 23),
 datetime.date(2020, 1, 28),
 datetime.date(2020, 1, 29),
 datetime.date(2020, 1, 30),
 datetime.date(2020, 1, 31),
 datetime.date(2020, 2, 3),
 datetime.date(2020, 2, 4),
 datetime.date(2020, 2, 5),
 datetime.date(2020, 2, 6),
 datetime.date(2020, 2, 7),
 datetime.date(2020, 2, 10),
 datetime.date(2020, 2, 11),
 datetime.date(2020, 2, 12),
 datetime.date(2020, 2, 13),
 datetime.date(2020, 2, 14),
 datetime.date(2020, 2, 17),
 datetime.date(2020, 2, 18),
 datetime.date(2020, 2, 19),
 datetime.date(2020, 2, 20),
 datetime.date(2020, 2, 2

In [10]:
next_date = TRAIN_END_DATE
next_date = find_next_date(next_date)
next_date = find_next_date(next_date)

In [11]:
next_date

datetime.date(2023, 4, 4)

In [22]:
OPENING_DAYS_KOR.iloc[800:810]

800    2023-03-30
801    2023-03-31
802    2023-04-03
803    2023-04-04
804    2023-04-05
805    2023-04-06
806    2023-04-07
807    2023-04-10
808    2023-04-11
809    2023-04-12
Name: date, dtype: object

In [None]:
train_start_index = list(df_o.index).index(TRAIN_START_DATE)
train_end_index = list(df_o.index).index(TRAIN_END_DATE)
df_o_train = df_o.iloc[train_start_index:train_end_index+1]
df_o_val = df_o.iloc[train_end_index+1:val_end_index+1]

In [12]:
TRAIN_END_DATE

datetime.date(2023, 3, 31)

In [12]:
# cd.code_all

In [13]:
code = cd.code_all # 전체 회사 코드

code_mid = {'373220': ['LG에너지솔루션', 'lgenergy'], '207940': ['삼성바이오로직스', 'ssbio'],
            '000270': ['기아', 'kia'], '028260': ['삼성물산', 'sscnt'],
            '015760': ['한국전력', 'koreaelec'], '034020': ['두산에너빌리티', 'doosanener'],
            '051900': ['LG생활건강', 'lglife'], '259960': ['크래프톤', 'crafton'],
            '361610': ['SK아이이테크놀로지', 'skietech'], '086280': ['현대글로비스', 'glovis'],
            '302440': ['SK바이오사이언스', 'skbio'],
            }

code_bad = {'051910': ['LG화학', 'lgchemical'], '033780': ['KT&G', 'ktng'],
            '005490': ['POSCO홀딩스', 'poscoholding'], '068270': ['셀트리온', 'celltrion'],
            '066570': ['LG전자', 'lgelec'],  '096770': ['SK이노베이션', 'skinnovation'],
            '030200': ['KT', 'kt'], '003550': ['LG', 'lg'],
            '329180': ['현대중공업', 'hhi'], '003490': ['대한항공', 'koreanair'],
            '036570': ['엔씨소프트', 'ncsoft'], '009830': ['한화솔루션', 'hanhwasol'],
            '090430': ['아모레퍼시픽', 'amore'], '011170': ['롯데케미칼', 'lottechem'],
            '138040': ['메리츠금융지주', 'meritz'], '011070': ['LG이노텍', 'lginnotek'],
           }

code_good = {'005930': ['삼성전자', 'sec'], '035420': ['NAVER', 'naver'],
             '005380': ['현대차', 'hyunmotor'], '035720': ['카카오', 'kakao'],
             '000660': ['SK하이닉스', 'skhynix'], '006400': ['삼성SDI', 'sdi'],
             '005935': ['삼성전자우', 'secpre'], '105560': ['KB금융', 'kbbank'],
             '012330': ['현대모비스', 'mobis'],  '055550': ['신한지주', 'shgroup'],
             '003670': ['포스코퓨처엠', 'poscochemical'], '034730': ['SK', 'sk'], 
             '032830': ['삼성생명', 'sslife'], '086790': ['하나금융지주', 'hana'],
             '009150': ['삼성전기', 'sselec'], '017670': ['SK텔레콤', 'sktelecom'],
             '011200': ['HMM', 'hmm'], '000810': ['삼성화재', 'ssfire'], 
             '010950': ['S-Oil', 'soil'], '018260': ['삼성에스디에스', 'sds'],
             '316140': ['우리금융지주', 'woorifg'], '024110': ['기업은행', 'ibk'], 
             '377300': ['카카오페이', 'kakaopay'], '028050': ['삼성엔지니어링', 'ssengineering'],
            }

In [14]:
directory_for_predict = './data/data_for_ml/predict/'
directory_for_data = './data/company_pkl/'
directory_for_common = './data/common_pkl/'

In [15]:
df_dji = pd.read_pickle(directory_for_common+'dji.pkl')
df_sec = pd.read_pickle(directory_for_data+'sec_investors.pkl')
df_common = pd.read_pickle(directory_for_predict+'0_df_common.pkl')
df_company = pd.read_pickle(directory_for_predict+'df_sec_company.pkl')
df_combine = pd.read_pickle(directory_for_predict+'df_sec_combine.pkl')

In [16]:
if not os.path.exists(directory_for_predict+ 'prediction/prediction_list.pkl'):
    os.makedirs(directory_for_predict+'prediction')
    prediction_list=pd.DataFrame()
    fname_p = 'prediction_list.pkl'
    path_p = directory_for_predict+'prediction/' + fname_p
    prediction_list.to_pickle(path_p)

In [17]:
prediction_list = pd.read_pickle(directory_for_predict+ 'prediction/prediction_list.pkl')

In [18]:
prediction_date = datetime.date.today()
# prediction_date = datetime.date(2023, 5, 10) # 예측을 필요로 하는 일자

In [19]:
df_base = pd.DataFrame()
df_todays = pd.DataFrame()

if prediction_date not in list(opening_days_kor):
    print(f'오늘 {prediction_date}은 휴장일입니다.')
else:
    print(f'오늘 {prediction_date}은 개장일입니다.')

for key, val in code_good.items():
 
    com_name = val[1]
    
    fname = f'df_{com_name}_combine.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name) 
    com_fname = f'{com_name}_historical.pkl'  # 실제와 예측을 비교하기 위하여 실제데이터을 불러 옴
    f_com_name = directory_for_data + com_fname
    com_data = pd.read_pickle(f_com_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns
    
    prediction_row = current_data[current_data.index == prediction_date]
        
    if(len(prediction_row) == 0):
        # 데이터 최종 기록일 확인
        print(f"미국 dji   마지막 일자 : {df_dji['date'].iloc[-1].isoformat()} (거래일자)")
        lf1_index = list(opening_days_usa).index(df_dji['date'].iloc[-1]) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     미국 다음 개장일은 {list(opening_days_usa)[lf1_index]} 입니다.")
        print(f"한국 주식  마지막 일자 : {df_sec['date'].iloc[-1].isoformat()[:10]} (거래일자)")
        lf1_index = list(opening_days_kor).index(df_sec['date'].iloc[-1].date()) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
#         lf1_index = l_index + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     한국 다음 개장일은 {list(opening_days_kor)[lf1_index]} 입니다.")
        print(f"df_common  마지막 일자 : {df_common.index[-1].isoformat()} (예측일자)")
        print(f"df_company 마지막 일자 : {df_company.index[-1].isoformat()} (예측일자)")
        print(f"df_combine 마지막 일자 : {df_combine.index[-1].isoformat()} (예측일자)")  
        raise Exception(f"예측을 위한 최근 데이터가 준비가 되어 있지 않음. 혹은 한국, 미국 주식 휴장 등. 예측 당일 최신자료로 진행하도록...")
    
    com_row = com_data[com_data['date']  == prediction_date]
#     com_row = com_data[com_data['date'].apply(lambda x: x.date())  == prediction_date]
#     com_data['date'].apply(lambda x: x.date()) <  prediction_date

    try:
        cr = com_row['close_cr'].values[0] # 실제의 등락을 확인
    except:
        cr = -1  # 예측 당일 아침 실제 결과가 없을시 임시 지정
        
        #**************************************************************

    # locate the model data directory
    directory_model_data = f'./data/data_for_ml/model/model/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'
    result_pkl = directory_model_data + 'best_result.pkl'
    
    # load result data
#     result = load_from_pickle(result_pkl)[:-5] 
    result = load_from_pickle(result_pkl)
    
    yes_no = is_new_format(find_filename(directory_model_data))
    if yes_no:
        precision = result.loc['test_precision'].iloc[-1]
    else:
        precision = result.loc['precision'].iloc[-1]
    
    # load columns data
    real_columns = load_from_pickle(columns_pkl)[:-5] # column 읽기. target columns 5개는 제외
    real_data_df = prediction_row[real_columns] # select necessary columns
    
    # scale the data
    scaler = joblib.load(scaler_pkl) # scaler 읽기
#     scaler = load_from_pickle(scaler_p_pkl) # scaler 읽기
    real_scaled = scaler.transform(real_data_df)
    
    # apply the scaled real_data to the model
    model = joblib.load(model_pkl) # model 읽기
#     model = load_from_pickle(model_p_pkl) # model made with pickle 읽기

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    df_temp = to_df(prediction_row.index[-1], com_name, precision, y_predict, weight)
    df_base = pd.concat([df_base, df_temp],axis=1)
    df_temp_todays = to_df_todays(prediction_row.index[-1], com_name, result, y_predict, weight, cr, yes_no)
    df_todays = pd.concat([df_todays, df_temp_todays],axis=0)
    
#     print(f'**date: {prediction_row.index[-1].date()}, {precision:.2f}, {com_name}, 예측: {y_predict}, 가능성:{weight}')
df_todays

오늘 2023-06-07은 개장일입니다.


Unnamed: 0_level_0,precision,predict,yes,no,tn,fp,fn,tp,result
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sec,1.0,1,0.51,0.49,7.0,0.0,2.0,3.0,wrong
naver,1.0,0,0.49,0.51,5.0,0.0,7.0,1.0,draw
hyunmotor,0.91,0,0.43,0.57,28.0,1.0,24.0,10.0,draw
kakao,1.0,0,0.45,0.55,40.0,0.0,21.0,2.0,draw
skhynix,1.0,0,0.47,0.53,41.0,0.0,20.0,2.0,draw
sdi,0.8,0,0.45,0.55,34.0,1.0,24.0,4.0,draw
secpre,0.86,0,0.32,0.68,35.0,1.0,21.0,6.0,draw
kbbank,0.88,0,0.35,0.65,34.0,1.0,21.0,7.0,draw
mobis,0.85,0,0.5,0.5,27.0,2.0,23.0,11.0,draw
shgroup,0.83,1,0.51,0.49,31.0,1.0,26.0,5.0,wrong


In [20]:
prediction_row

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,ixtr_cr_2,ixut_cr_2,nbi_cr_2,bkx_cr_2,open_2,high_2,low_2,close_2,vol_2,weekday
2023-06-07,-5.298855,-16.564945,-1.712737,-12.795455,-1.414444,-0.915344,-1.05,-1.0,0.5,-1.0,...,-0.379711,-0.30247,0.235194,1.817265,0.0194,0.022569,0.026738,0.038938,-0.134605,0


In [21]:
result.tail()

Unnamed: 0_level_0,iter_1,iter_2,iter_3,iter_4,iter_5
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
roc,0.5,0.5,0.5625,0.535714,0.5625
test_tn,14.0,14.0,14.0,8.0,14.0
test_fp,0.0,0.0,0.0,6.0,0.0
test_fn,8.0,8.0,7.0,4.0,7.0
test_tp,0.0,0.0,1.0,4.0,1.0


In [22]:
load_from_pickle(result_pkl)

Unnamed: 0_level_0,iter_1,iter_2,iter_3,iter_4,iter_5
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cv,5,5,5,5,5
iterations,50,100,100,100,100
num_col,98,98,98,98,98
scoring,precision,precision,precision,precision,precision
boost_from_average,True,True,False,True,True
boosting_type,dart,dart,gbdt,gbdt,dart
class_weight,,,,,
colsample_bytree,0.590492,0.548382,0.667053,0.724132,0.521626
force_col_wise,true,true,true,true,true
importance_type,split,split,split,split,split


In [23]:
prediction_date in list(opening_days_kor)

True

In [24]:
# save current prediction data

prediction_list = pd.concat([prediction_list, df_base], axis=0)
prediction_list = prediction_list[~prediction_list.index.duplicated(keep='last')]

directory_for_predict = './data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
fname_c = 'prediction_list.csv'
path_p = directory_for_predict+'prediction/' + fname_p
path_c = directory_for_predict+'prediction/' + fname_c
prediction_list.to_pickle(path_p)
prediction_list.to_csv(path_c)

In [25]:
# 결과를 회사별로 확인하기
directory_for_predict = './data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
predict_list = pd.read_pickle(directory_for_predict+'prediction/' + fname_p)

for i, (key, val) in enumerate(code_good.items()):
    print("***", i, key, val)
    k =  i * 4
    globals()[f'{val[1]}_df'] = predict_list.iloc[:, k:k+4]

*** 0 005930 ['삼성전자', 'sec']
*** 1 035420 ['NAVER', 'naver']
*** 2 005380 ['현대차', 'hyunmotor']
*** 3 035720 ['카카오', 'kakao']
*** 4 000660 ['SK하이닉스', 'skhynix']
*** 5 006400 ['삼성SDI', 'sdi']
*** 6 005935 ['삼성전자우', 'secpre']
*** 7 105560 ['KB금융', 'kbbank']
*** 8 012330 ['현대모비스', 'mobis']
*** 9 055550 ['신한지주', 'shgroup']
*** 10 003670 ['포스코퓨처엠', 'poscochemical']
*** 11 034730 ['SK', 'sk']
*** 12 032830 ['삼성생명', 'sslife']
*** 13 086790 ['하나금융지주', 'hana']
*** 14 009150 ['삼성전기', 'sselec']
*** 15 017670 ['SK텔레콤', 'sktelecom']
*** 16 011200 ['HMM', 'hmm']
*** 17 000810 ['삼성화재', 'ssfire']
*** 18 010950 ['S-Oil', 'soil']
*** 19 018260 ['삼성에스디에스', 'sds']
*** 20 316140 ['우리금융지주', 'woorifg']
*** 21 024110 ['기업은행', 'ibk']
*** 22 377300 ['카카오페이', 'kakaopay']
*** 23 028050 ['삼성엔지니어링', 'ssengineering']


In [26]:
secpre_df

Unnamed: 0_level_0,kia_precision,kia_predict,kia_yes,kia_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,1.0,0.0,0.23,0.77
2023-04-04,1.0,0.0,0.17,0.83
2023-04-05,1.0,0.0,0.14,0.86
2023-04-06,1.0,0.0,0.08,0.92
2023-04-07,1.0,0.0,0.14,0.86
2023-04-24,,,,
2023-04-27,,,,
2023-04-28,,,,
2023-05-02,,,,
2023-05-08,,,,


In [None]:
def calc_results(lgbm, model, train_scaled, val_scaled, test_scaled, train_target, val_target, test_target):
# model = lgbmgs.best_estimator_  # 최적의 파라미터로 모델 생성
    y_predict_train = model.predict(train_scaled)
    y_predict_val = model.predict(val_scaled)
    y_predict_test = model.predict(test_scaled)
    result_dict= {}
    result_dict['best_score'] = lgbm.best_score_ 
    result_dict['best_index'] = lgbm.best_index_
    result_dict['acc_train'] = model.score(train_scaled, train_target)
    result_dict['acc_val'] = model.score(val_scaled, val_target)
    result_dict['acc_test'] = model.score(test_scaled, test_target)
    result_dict['train_precision'] = precision_score(train_target, y_predict_train)
    cm = confusion_matrix(train_target, y_predict_train)
    result_dict['train_tn'] = cm[0,0]
    result_dict['train_fp'] = cm[0,1]
    result_dict['train_fn'] = cm[1,0]
    result_dict['train_tp'] = cm[1,1]
    result_dict['val_precision'] = precision_score(val_target, y_predict_val)
    cm = confusion_matrix(val_target, y_predict_val)
    result_dict['val_tn'] = cm[0,0]
    result_dict['val_fp'] = cm[0,1]
    result_dict['val_fn'] = cm[1,0]
    result_dict['val_tp'] = cm[1,1]
    result_dict['test_precision'] = precision_score(test_target, y_predict_test)
    result_dict['recall'] = recall_score(test_target, y_predict_test)
    result_dict['f1score'] = f1_score(test_target, y_predict_test)
    result_dict['roc'] = roc_auc_score(test_target, y_predict_test)
    cm = confusion_matrix(test_target, y_predict_test)
    result_dict['test_tn'] = cm[0,0]
    result_dict['test_fp'] = cm[0,1]
    result_dict['test_fn'] = cm[1,0]
    result_dict['test_tp'] = cm[1,1]
#     result_dict['precision_neg'] = cm[0,0] / (cm[0,0] + cm[1,0])
    return result_dict

In [None]:
df_base = pd.DataFrame()
df_todays = pd.DataFrame()

if prediction_date not in list(opening_days_kor):
    print(f'오늘 {prediction_date}은 휴장일입니다.')
else:
    print(f'오늘 {prediction_date}은 개장일입니다.')

for key, val in code_good.items():
 
    com_name = val[1]
    
    fname = f'df_{com_name}_combine.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name) 
    com_fname = f'{com_name}_historical.pkl'  # 실제와 예측을 비교하기 위하여 실제데이터을 불러 옴
    f_com_name = directory_for_data + com_fname
    com_data = pd.read_pickle(f_com_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns
    
    prediction_row = current_data[current_data.index == prediction_date]
        
    if(len(prediction_row) == 0):
        # 데이터 최종 기록일 확인
        print(f"미국 dji   마지막 일자 : {df_dji['date'].iloc[-1].isoformat()} (거래일자)")
        lf1_index = list(opening_days_usa).index(df_dji['date'].iloc[-1]) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     미국 다음 개장일은 {list(opening_days_usa)[lf1_index]} 입니다.")
        print(f"한국 주식  마지막 일자 : {df_sec['date'].iloc[-1].isoformat()[:10]} (거래일자)")
        lf1_index = list(opening_days_kor).index(df_sec['date'].iloc[-1].date()) + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
#         lf1_index = l_index + 1 # 현재 개장일 이후에 오는 개장일 날짜 index (+1 index)
        print(f"     한국 다음 개장일은 {list(opening_days_kor)[lf1_index]} 입니다.")
        print(f"df_common  마지막 일자 : {df_common.index[-1].isoformat()} (예측일자)")
        print(f"df_company 마지막 일자 : {df_company.index[-1].isoformat()} (예측일자)")
        print(f"df_combine 마지막 일자 : {df_combine.index[-1].isoformat()} (예측일자)")  
        raise Exception(f"예측을 위한 최근 데이터가 준비가 되어 있지 않음. 혹은 한국, 미국 주식 휴장 등. 예측 당일 최신자료로 진행하도록...")
    
    com_row = com_data[com_data['date']  == prediction_date]
#     com_row = com_data[com_data['date'].apply(lambda x: x.date())  == prediction_date]
#     com_data['date'].apply(lambda x: x.date()) <  prediction_date

    try:
        cr = com_row['close_cr'].values[0] # 실제의 등락을 확인
    except:
        cr = -1  # 예측 당일 아침 실제 결과가 없을시 임시 지정
        
        #**************************************************************

    # locate the model data directory
    directory_model_data = f'./data/data_for_ml/model/model/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'
    result_pkl = directory_model_data + 'best_result.pkl'
    
    # load result data
#     result = load_from_pickle(result_pkl)[:-5] 
    result = load_from_pickle(result_pkl)
    
    yes_no = is_new_format(find_filename(directory_model_data))
    if yes_no:
        precision = result.loc['test_precision'].iloc[-1]
    else:
        precision = result.loc['precision'].iloc[-1]
    
    # load columns data
    real_columns = load_from_pickle(columns_pkl)[:-5] # column 읽기. target columns 5개는 제외
    real_data_df = prediction_row[real_columns] # select necessary columns
    
    # scale the data
    scaler = joblib.load(scaler_pkl) # scaler 읽기
#     scaler = load_from_pickle(scaler_p_pkl) # scaler 읽기
    real_scaled = scaler.transform(real_data_df)
    
    # apply the scaled real_data to the model
    model = joblib.load(model_pkl) # model 읽기
#     model = load_from_pickle(model_p_pkl) # model made with pickle 읽기

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    df_temp = to_df(prediction_row.index[-1], com_name, precision, y_predict, weight)
    df_base = pd.concat([df_base, df_temp],axis=1)
    df_temp_todays = to_df_todays(prediction_row.index[-1], com_name, result, y_predict, weight, cr, yes_no)
    df_todays = pd.concat([df_todays, df_temp_todays],axis=0)
    
#     print(f'**date: {prediction_row.index[-1].date()}, {precision:.2f}, {com_name}, 예측: {y_predict}, 가능성:{weight}')
df_todays

In [None]:
# read selected columns, scaler and model to be appllied

# locate the model data directory
directory_model_data = f'./data/data_for_ml/model/model/{com_name}/'

# get the model data filepath
columns_pkl = directory_model_data + 'best_columns.pkl' # save with pickle.dump 
scaler_pkl = directory_model_data + 'best_scaler.pkl'  # one saved with joblib.dump
scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl' # one saved with pickle.dump
model_pkl = directory_model_data + 'best_model.pkl'  # one saved with joblib.dump
model_p_pkl = directory_model_data + 'best_model_p.pkl' # one saved with pickle.dump
result_pkl = directory_model_data + 'best_result.pkl'

# load result data
result = load_from_pickle(result_pkl) # with pickle.load

yes_no = is_new_format(find_filename(directory_model_data))
if yes_no:
    precision = result.loc['test_precision'].iloc[-1]
else:
    precision = result.loc['precision'].iloc[-1]

# load columns data
real_columns = load_from_pickle(columns_pkl)[:-5] # column 읽기. target columns 5개는 제외
# scale the data
scaler = joblib.load(scaler_pkl) # scaler 읽기
# apply the scaled real_data to the model
model = joblib.load(model_pkl) # model 읽기

choose dates to be predicted from companu_combined.
1. model fit에 사용된 데이터 날짜 확인(언제부터 언제까지)
2. test data 취득 (가장 최근 날짜 제외. 취득 날짜 에러 방지.)
   start_date = 
    end_date = 
    
3. 한개의 회사 자료로 테스트 후.
4. code_company_good dict 이용하여 for loop

# read each row one by one
for i_row in len(df_tobetested):
    prediction_row = ----------
    real_data_df = prediction_row[real_columns] # select necessary columns
    real_scaled = scaler.transform(real_data_df)

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)
    if y_predict == 1: # 예측이 True일 경우
        find value of cr_.5 # 
        if True: # 실제가 True
            sum_count  # 일치하는 갯수 합계
        else:
            sum_count: # 일치하지 않는 갯수 합계
        make dataframe 
            'no, predict, actual, weight(proba), 전일 최종가, 최저가, 최고가, 변화율(cr)' 
    else:
        continue