<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testing with real world data

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [2]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    open_file = open(path, "wb")
    pickle.dump(filename, open_file)
    open_file.close()

# read list, dictionary from pickle
def load_from_pickle(path):
    open_file = open(path, "rb")
    loaded_file = pickle.load(open_file)
    open_file.close()
    return loaded_file

In [5]:
import csv

# write list, dictionary to csv
# path = './xxx/', my_dict = filename

def save_dict_to_csv(path, my_dict):
    df = pd.DataFrame.from_dict(my_dict, orient='index') 
    df.to_csv (path, index=False, header=True)  
    
def save_list_to_csv(path, my_list):
    df = pd.DataFrame(my_list, columns=['columns'])
    df.to_csv (path, index=False, header=True) 

In [6]:
def to_df(date, com_name, precision, y_predict, weight):
    dict_temp = {}
    dict_temp['date'] = date
    dict_temp[f'{com_name}_precision'] = f'{precision:.2f}'
    dict_temp[f'{com_name}_predict'] = f'{y_predict[0]}'
    dict_temp[f'{com_name}_yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'{com_name}_no'] = f'{weight[0,0]:.2f}'
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('date', inplace=True)
    return df_t

In [7]:
def to_df_todays(date, com_name, result, y_predict, weight, cr):
    precision = result.loc['precision'].iloc[-1]
    tn = result.loc['tn'].iloc[-1]
    fp = result.loc['fp'].iloc[-1]
    fn = result.loc['fn'].iloc[-1]
    tp = result.loc['tp'].iloc[-1]
    
    dict_temp = {}
    dict_temp['name'] = com_name
    dict_temp[f'precision'] = f'{precision:.2f}'
    dict_temp[f'predict'] = f'{y_predict[0]}'
    dict_temp[f'yes'] = f'{weight[0,1]:.2f}'
    dict_temp[f'no'] = f'{weight[0,0]:.2f}'
    dict_temp[f'tn'] = f'{tn:.1f}'
    dict_temp[f'fp'] = f'{fp:.1f}'
    dict_temp[f'fn'] = f'{fn:.1f}'
    dict_temp[f'tp'] = f'{tp:.1f}'
    if ((y_predict[0] == 1) & (cr > 0)):
        result = 'right'
    elif ((y_predict[0] == 1) & (cr <= 0)):
        result = 'wrong'
    else:
        result = 'draw'
    dict_temp[f'result'] = result
    
    df_t = pd.DataFrame.from_dict(dict_temp, orient='index').T
    df_t.set_index('name', inplace=True)
    return df_t

In [8]:
code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'], 
        '000660' : ['SK하이닉스', 'skhinix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코퓨처엠', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng'], '030200' : ['KT', 'kt']}

code_good = {'005930' : ['삼성전자', 'sec'], '035420' : ['NAVER', 'naver'],
             '035720' : ['카카오', 'kakao'], '012330' : ['현대모비스', 'mobis'],
             '051910' : ['LG화학', 'lgchemical'], '005935' : ['삼성전자우', 'secpre'],
             '373220' : ['LG에너지솔루션', 'lgenergy'],
            }

code_good = {'005930' : ['삼성전자', 'sec'], '035420' : ['NAVER', 'naver'],
             '035720' : ['카카오', 'kakao'], '012330' : ['현대모비스', 'mobis'],
             '051910' : ['LG화학', 'lgchemical'],'005935' : ['삼성전자우', 'secpre'],
             '000270' : ['기아','kia'], '373220' : ['LG에너지솔루션', 'lgenergy'],
             '005380' : ['현대차', 'hyunmotor'], '000660' : ['SK하이닉스', 'skhinix'],
             '006400' : ['삼성SDI', 'sdi'],
            }

code_mid = {'105560' : ['KB금융', 'kbbank'],
            '003670' : ['포스코퓨처엠', 'poscochemical'],
            }

code_bad = { '030200' : ['KT', 'kt'],
             '033780' : ['KT&G', 'ktng'], '066570' : ['LG전자', 'lgelec'], 
             '005490' : ['POSCO홀딩스', 'poscoholding'], '055550' : ['신한지주', 'shgroup'],
             '096770' : ['SK이노베이션', 'skinnovation'],
             '207940' : ['삼성바이오로직스', 'ssbio'], '028260' : ['삼성물산', 'sscnt'],
             '068270' : ['셀트리온', 'celltrion'],
           }
# code_good = {'005930' : ['삼성전자', 'sec']}

In [9]:
directory_for_predict = '../data/data_for_ml/predict/'
directory_for_data = '../data/company_pkl/'

In [10]:
if not os.path.exists(directory_for_predict+ 'prediction/prediction_list.pkl'):
    os.makedirs(directory_for_predict+'prediction')
    prediction_list=pd.DataFrame()
    fname_p = 'prediction_list.pkl'
    path_p = directory_for_predict+'prediction/' + fname_p
    prediction_list.to_pickle(path_p)

In [11]:
prediction_list = pd.read_pickle(directory_for_predict+ 'prediction/prediction_list.pkl')

In [12]:
prediction_date = datetime.date.today()
# prediction_date = datetime.date(2023, 4, 4) # 예측을 필요로 하는 일자

In [16]:
df_base = pd.DataFrame()
df_todays = pd.DataFrame()

for key, val in code_good.items():
 
    com_name = val[1]
    
    fname = f'df_{com_name}_sel.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name) 
    com_fname = f'{com_name}_historical.pkl'  # 실제와 예측을 비교하기 위하여 실제데이터을 불러 옴
    f_com_name = directory_for_data + com_fname
    com_data = pd.read_pickle(f_com_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns
    
    prediction_row = current_data[current_data.index == prediction_date.isoformat()]
    com_row = com_data[com_data['date'] == prediction_date.isoformat()]
    try:
        cr = com_row['close_cr'].values[0] # 실제의 등락을 확인
    except:
        cr = -1.0 # 예측 당일 아침 실제 결과가 없을시 임시 지정

    # locate the model data directory
    directory_model_data = f'../machine_learning/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'
    result_pkl = directory_model_data + 'best_result.pkl'
    
    # load result data
    result = load_from_pickle(result_pkl)[:-5] 
    precision = result.loc['precision'].iloc[-1]
    
    # load columns data
    real_columns = load_from_pickle(columns_pkl)[:-5] # column 읽기. target columns 5개는 제외
    real_data_df = prediction_row[real_columns] # select necessary columns
    
    # scale the data
    scaler = joblib.load(scaler_pkl) # scaler 읽기
#     scaler = load_from_pickle(scaler_p_pkl) # scaler 읽기
    real_scaled = scaler.transform(real_data_df)
    
    # apply the scaled real_data to the model
    model = joblib.load(model_pkl) # model 읽기
#     model = load_from_pickle(model_p_pkl) # model made with pickle 읽기

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    df_temp = to_df(prediction_row.index[-1].date(), com_name, precision, y_predict, weight)
    df_base = pd.concat([df_base, df_temp],axis=1)
    df_temp_todays = to_df_todays(prediction_row.index[-1].date(), com_name, result, y_predict, weight, cr)
    df_todays = pd.concat([df_todays, df_temp_todays],axis=0)
    
#     print(f'**date: {prediction_row.index[-1].date()}, {precision:.2f}, {com_name}, 예측: {y_predict}, 가능성:{weight}')
df_todays

ValueError: Found array with 0 sample(s) (shape=(0, 51)) while a minimum of 1 is required by StandardScaler.

In [18]:
current_data

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,ixtr_cr_2,ixut_cr_2,nbi_cr_2,bkx_cr_2,open_2,high_2,low_2,close_2,vol_2,weekday
2022-01-13,-0.728308,-1.134320,-3.800544,-6.280593,-1.379315,1.202381,8.502452,-3.342857,-0.199498,-3.892308,...,0.388204,1.357070,-0.381551,1.397590,0.017926,0.019206,0.019455,0.011538,0.124455,2
2022-01-14,-2.922569,-0.961471,-2.248559,-1.318007,0.232121,1.968292,-1.305245,-1.000000,0.878336,-1.271277,...,0.171763,-0.681729,-3.743265,0.696059,0.011480,0.003797,-0.001282,-0.012674,0.047903,3
2022-01-17,-0.141878,-51.541050,0.494157,1.782079,0.050840,-0.501553,-2.065171,1.000000,-0.748316,54.666667,...,-0.541227,-0.778962,-1.356175,-0.122211,-0.022642,-0.018844,-0.019084,-0.020279,-0.100512,4
2022-01-19,-8.379546,-1.116349,-0.293620,-0.431429,0.985260,-0.638852,-2.502804,-1.000000,0.671470,-0.823529,...,-2.901290,-3.069136,-2.962608,-2.703978,-0.001287,-0.003841,-0.006485,-0.003881,-0.052837,1
2022-01-20,-0.353206,-0.647862,-0.316408,-12.128503,-0.288602,1.911987,-3.847015,-1.000000,-0.384173,-7.476190,...,-3.169371,-3.408924,-4.860017,-5.458870,-0.014175,-0.011568,-0.010403,-0.015484,0.193324,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-31,5.045599,-6.505242,-1.378964,-1.455990,-0.755787,-161.913669,-2.436101,10.841270,-0.542801,3.500000,...,1.057836,1.854493,0.600471,0.843254,0.020833,0.012719,0.016103,0.004769,0.368456,3
2023-04-03,0.823596,0.504240,-1.996055,-1.952853,-1.884406,-0.857066,-0.317107,0.147453,-1.742882,-1.046154,...,2.048021,2.039977,0.770444,-0.340135,0.024000,0.020734,0.024116,0.020734,0.285894,4
2023-04-04,-1.106890,-0.814171,-4.739488,-3.959295,-3.598117,0.343447,0.037441,-1.591121,-6.087108,14.666667,...,0.670550,1.813086,1.938160,0.332018,0.004710,0.004710,-0.001585,-0.001582,-0.238065,0
2023-04-05,-4.669072,1.004010,-1.191640,-1.446173,-1.103679,-4.333411,-0.980275,-2.604743,-2.743607,-0.420804,...,-3.203092,-0.547517,-0.234654,-2.510967,-0.009375,-0.003125,-0.014129,-0.006250,-0.216077,1


In [76]:
# save current prediction data

prediction_list = pd.concat([prediction_list, df_base], axis=0)
prediction_list = prediction_list[~prediction_list.index.duplicated(keep='last')]

directory_for_predict = '../data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
fname_c = 'prediction_list.csv'
path_p = directory_for_predict+'prediction/' + fname_p
path_c = directory_for_predict+'prediction/' + fname_c
prediction_list.to_pickle(path_p)
prediction_list.to_csv(path_c)

In [77]:
# 결과를 회사별로 확인하기
directory_for_predict = '../data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
predict_list = pd.read_pickle(directory_for_predict+'prediction/' + fname_p)

for i, (key, val) in enumerate(code_good.items()):
    print("***", i, key, val)
    k =  i * 4
    globals()[f'{val[1]}_df'] = predict_list.iloc[:, k:k+4]

*** 0 005930 ['삼성전자', 'sec']
*** 1 035420 ['NAVER', 'naver']
*** 2 035720 ['카카오', 'kakao']
*** 3 012330 ['현대모비스', 'mobis']
*** 4 051910 ['LG화학', 'lgchemical']
*** 5 005935 ['삼성전자우', 'secpre']
*** 6 000270 ['기아', 'kia']
*** 7 373220 ['LG에너지솔루션', 'lgenergy']
*** 8 005380 ['현대차', 'hyunmotor']
*** 9 000660 ['SK하이닉스', 'skhinix']
*** 10 006400 ['삼성SDI', 'sdi']


In [78]:
kia_df

Unnamed: 0_level_0,lgenergy_precision,lgenergy_predict,lgenergy_yes,lgenergy_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,0.89,1,0.49,0.51
2023-04-04,0.83,0,0.5,0.5
2023-04-05,0.83,0,0.48,0.52
2023-04-06,0.83,0,0.48,0.52
