<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testing with real-world data

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [56]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [57]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [58]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    """Serialize an object to ``path`` with pickle.

    Args:
        path: Destination file path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
        filename: The object to pickle. Despite its name, this is the payload
            (e.g. a list or dict), not a file name.

    Returns:
        None.
    """
    # The context manager closes the handle even when pickle.dump raises.
    with open(path, "wb") as open_file:
        pickle.dump(filename, open_file)

# read list, dictionary from pickle
def load_from_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Args:
        path: Path of the pickle file; opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
    """
    # The context manager closes the handle even when pickle.load raises.
    with open(path, "rb") as open_file:
        return pickle.load(open_file)

In [59]:
import csv

# write list, dictionary to csv
# path = './xxx/', my_dict = filename

def save_dict_to_csv(path, my_dict):
    """Write the values of ``my_dict`` to a CSV file at ``path``.

    Each dict entry becomes one row. The dict keys become the frame's index,
    and because the file is written with ``index=False`` the keys themselves
    are NOT stored in the CSV; only the values and a header row are.

    Args:
        path: Destination CSV path.
        my_dict: Mapping whose values are written row by row.
    """
    pd.DataFrame.from_dict(my_dict, orient='index').to_csv(path, index=False, header=True)
    
def save_list_to_csv(path, my_list):
    """Write ``my_list`` to a CSV file at ``path`` as a single column.

    The column is labelled ``columns`` in the header row; the row index is
    not written.

    Args:
        path: Destination CSV path.
        my_list: Items to store, one per row.
    """
    single_column = pd.DataFrame(my_list, columns=['columns'])
    single_column.to_csv(path, header=True, index=False)

In [60]:
def to_df(date, com_name, precision, y_predict, weight):
    """Build a one-row frame holding one company's prediction, indexed by date.

    Args:
        date: Value used as the single index entry (index name ``date``).
        com_name: Short company name; used as the prefix of every column.
        precision: Number stored as a string with two decimals.
        y_predict: Sequence whose first element is the predicted label.
        weight: 2-D array-like read as ``weight[0, 1]`` (stored under
            ``<com_name>_yes``) and ``weight[0, 0]`` (stored under
            ``<com_name>_no``), each formatted with two decimals.

    Returns:
        A DataFrame with columns ``<com_name>_precision``,
        ``<com_name>_predict``, ``<com_name>_yes`` and ``<com_name>_no``;
        every cell is a string.
    """
    record = {
        'date': date,
        f'{com_name}_precision': f'{precision:.2f}',
        f'{com_name}_predict': f'{y_predict[0]}',
        f'{com_name}_yes': f'{weight[0,1]:.2f}',
        f'{com_name}_no': f'{weight[0,0]:.2f}',
    }
    # Keys become the index first; transposing turns them into columns of one row.
    single_row = pd.DataFrame.from_dict(record, orient='index').T
    return single_row.set_index('date')

In [61]:
def to_df_todays(date, com_name, result, y_predict, weight):
    """Build a one-row summary of a company's prediction and stored metrics.

    Args:
        date: Accepted for signature parity with ``to_df``; not used here.
        com_name: Short company name; becomes the single index entry
            (index name ``name``).
        result: Object supporting ``result.loc[label].iloc[-1]`` for the
            labels ``precision``, ``tn``, ``fp``, ``fn`` and ``tp``; the last
            entry of each labelled row is reported.
        y_predict: Sequence whose first element is the predicted label.
        weight: 2-D array-like read as ``weight[0, 1]`` (``yes``) and
            ``weight[0, 0]`` (``no``).

    Returns:
        A DataFrame with columns ``precision``, ``predict``, ``yes``, ``no``,
        ``tn``, ``fp``, ``fn``, ``tp``; every cell is a string.
    """
    count_labels = ('tn', 'fp', 'fn', 'tp')
    # Last entry of each labelled metric row.
    precision, tn, fp, fn, tp = (
        result.loc[label].iloc[-1] for label in ('precision',) + count_labels
    )

    summary = {
        'name': com_name,
        'precision': f'{precision:.2f}',
        'predict': f'{y_predict[0]}',
        'yes': f'{weight[0,1]:.2f}',
        'no': f'{weight[0,0]:.2f}',
    }
    # Confusion-matrix counts are stored with one decimal.
    summary.update(
        (label, f'{count:.1f}') for label, count in zip(count_labels, (tn, fp, fn, tp))
    )

    single_row = pd.DataFrame.from_dict(summary, orient='index').T
    return single_row.set_index('name')

In [70]:
# Every tracked company: ticker code -> [display name, short name].
# The short name is used in this notebook to build data/model file paths and
# column prefixes.
code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'],
        '000660' : ['SK하이닉스', 'skhinix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코퓨처엠', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng'], '030200' : ['KT', 'kt']}

# The three groups below partition `code`. Only code_good is iterated by the
# prediction cells in this notebook.
# An earlier, shorter code_good assignment used to precede this one; it was
# immediately overwritten and therefore had no effect, so it has been removed.
code_good = {'005930' : ['삼성전자', 'sec'], '035420' : ['NAVER', 'naver'],
             '035720' : ['카카오', 'kakao'], '012330' : ['현대모비스', 'mobis'],
             '051910' : ['LG화학', 'lgchemical'],'005935' : ['삼성전자우', 'secpre'],
             '000270' : ['기아','kia'], '373220' : ['LG에너지솔루션', 'lgenergy'],
             '005380' : ['현대차', 'hyunmotor'], '000660' : ['SK하이닉스', 'skhinix'],
             '006400' : ['삼성SDI', 'sdi'],
            }

code_mid = {'105560' : ['KB금융', 'kbbank'],
            '003670' : ['포스코퓨처엠', 'poscochemical'],
            }

code_bad = { '030200' : ['KT', 'kt'],
             '033780' : ['KT&G', 'ktng'], '066570' : ['LG전자', 'lgelec'],
             '005490' : ['POSCO홀딩스', 'poscoholding'], '055550' : ['신한지주', 'shgroup'],
             '096770' : ['SK이노베이션', 'skinnovation'],
             '207940' : ['삼성바이오로직스', 'ssbio'], '028260' : ['삼성물산', 'sscnt'],
             '068270' : ['셀트리온', 'celltrion'],
           }

In [71]:
directory_for_predict = '../data/data_for_ml/predict/'

# First run only: when no stored prediction history exists yet, create the
# 'prediction' folder and seed it with an empty DataFrame pickle.
if not os.path.exists(directory_for_predict+ 'prediction/prediction_list.pkl'):
    # exist_ok=True: the folder may already exist while the pickle is missing
    # (e.g. the file was deleted); without it os.makedirs raises FileExistsError.
    os.makedirs(directory_for_predict+'prediction', exist_ok=True)
    prediction_list=pd.DataFrame()
    fname_p = 'prediction_list.pkl'
    path_p = directory_for_predict+'prediction/' + fname_p
    prediction_list.to_pickle(path_p)

In [72]:
# Load the stored prediction history (the previous cell seeds an empty frame
# when the pickle does not exist yet).
prediction_list = pd.read_pickle(directory_for_predict+ 'prediction/prediction_list.pkl')

In [73]:
# Score the most recent row of every company in code_good with its stored
# scaler and model.
# The per-company frames are collected in lists and concatenated once after
# the loop, instead of re-concatenating a growing DataFrame on every iteration.
base_frames = []    # one frame per company, joined side by side into df_base
todays_frames = []  # one summary row per company, stacked into df_todays

for key, val in code_good.items():

    com_name = val[1]

    fname = f'df_{com_name}_sel.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns

    last_row = current_data.iloc[[-1]] # get the last row data == previous date data
    last_date = last_row.index[-1].date()

    # locate the model data directory
    directory_model_data = f'../machine_learning/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'  # presumably the pickle-saved twin of the scaler; not loaded here
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'  # presumably the pickle-saved twin of the model; not loaded here
    result_pkl = directory_model_data + 'best_result.pkl'

    # NOTE: pickle/joblib loading can execute arbitrary code; these artefacts
    # must come from a trusted source.

    # load result data
    # NOTE(review): the same [:-5] slice used for the column list is applied to
    # the result object - confirm that dropping its last five entries is intended.
    result = load_from_pickle(result_pkl)[:-5]
    precision = result.loc['precision'].iloc[-1]

    # load columns data
    real_columns = load_from_pickle(columns_pkl)[:-5] # load the column names; the 5 target columns are excluded
    real_data_df = last_row[real_columns] # select necessary columns

    # scale the data
    scaler = joblib.load(scaler_pkl) # load the scaler
    real_scaled = scaler.transform(real_data_df)

    # apply the scaled real_data to the model
    model = joblib.load(model_pkl) # load the model

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    df_temp = to_df(last_date, com_name, precision, y_predict, weight)
    base_frames.append(df_temp)
    df_temp_todays = to_df_todays(last_date, com_name, result, y_predict, weight)
    todays_frames.append(df_temp_todays)

# pd.concat rejects an empty list, so an empty code_good falls back to the
# empty frames the incremental version produced.
df_base = pd.concat(base_frames, axis=1) if base_frames else pd.DataFrame()
df_todays = pd.concat(todays_frames, axis=0) if todays_frames else pd.DataFrame()
df_todays

Unnamed: 0_level_0,precision,predict,yes,no,tn,fp,fn,tp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sec,1.0,0,0.34,0.66,22.0,0.0,22.0,12.0
naver,1.0,0,0.44,0.56,23.0,0.0,16.0,8.0
kakao,1.0,0,0.32,0.68,22.0,0.0,19.0,4.0
mobis,1.0,0,0.38,0.62,27.0,0.0,25.0,6.0
lgchemical,1.0,0,0.45,0.55,23.0,0.0,17.0,7.0
secpre,1.0,0,0.1,0.9,26.0,0.0,27.0,5.0
kia,1.0,0,0.15,0.85,29.0,0.0,27.0,1.0
lgenergy,0.83,0,0.48,0.52,15.0,3.0,13.0,15.0
hyunmotor,0.82,0,0.37,0.63,26.0,3.0,15.0,14.0
skhinix,1.0,0,0.45,0.55,34.0,0.0,20.0,4.0


In [74]:
# Merge the current predictions into the stored history and persist the result
# as both a pickle and a CSV file.

directory_for_predict = '../data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
fname_c = 'prediction_list.csv'
path_p = f'{directory_for_predict}prediction/{fname_p}'
path_c = f'{directory_for_predict}prediction/{fname_c}'

# Append the new rows; when an index value occurs more than once, only its
# last row is kept.
prediction_list = pd.concat([prediction_list, df_base], axis=0)
prediction_list = prediction_list.loc[~prediction_list.index.duplicated(keep='last')]

prediction_list.to_pickle(path_p)
prediction_list.to_csv(path_c)

In [75]:
# Inspect the stored predictions per company.
directory_for_predict = '../data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
predict_list = pd.read_pickle(directory_for_predict+'prediction/' + fname_p)

# Bind one '<short name>_df' variable per company in code_good.
# Columns are selected by their '<short name>_<field>' labels rather than by
# position: the stored column order is not guaranteed to match the current
# iteration order of code_good, so a positional slice of four columns can
# silently bind another company's data to the variable.
for i, (key, val) in enumerate(code_good.items()):
    print("***", i, key, val)
    wanted = [f'{val[1]}_{field}' for field in ('precision', 'predict', 'yes', 'no')]
    # Companies without stored columns yet get an empty frame instead of a KeyError.
    present = [col for col in wanted if col in predict_list.columns]
    globals()[f'{val[1]}_df'] = predict_list.loc[:, present]

*** 0 005930 ['삼성전자', 'sec']
*** 1 035420 ['NAVER', 'naver']
*** 2 035720 ['카카오', 'kakao']
*** 3 012330 ['현대모비스', 'mobis']
*** 4 051910 ['LG화학', 'lgchemical']
*** 5 005935 ['삼성전자우', 'secpre']
*** 6 000270 ['기아', 'kia']
*** 7 373220 ['LG에너지솔루션', 'lgenergy']
*** 8 005380 ['현대차', 'hyunmotor']
*** 9 000660 ['SK하이닉스', 'skhinix']
*** 10 006400 ['삼성SDI', 'sdi']


In [77]:
sdi_df

Unnamed: 0_level_0,sdi_precision,sdi_predict,sdi_yes,sdi_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,,,,
2023-04-04,,,,
2023-04-05,0.8,0.0,0.22,0.78


In [26]:
hyunmotor_df

Unnamed: 0_level_0,hyunmotor_precision,hyunmotor_predict,hyunmotor_yes,hyunmotor_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,,,,
2023-04-04,,,,
2023-04-05,0.82,0.0,0.37,0.63


In [30]:
from pandas_datareader import data as pdr
# from datetime import datetime
import yfinance as yf
# Patch pandas_datareader so that pdr.get_data_yahoo(...) is served by yfinance.
# NOTE(review): pdr_override() may not be available in every yfinance release -
# confirm against the installed version.
yf.pdr_override()

In [48]:
# Download window: a fixed start date and an end date two days past today.
startdate = datetime.datetime(year=2021, month=12, day=25)
enddate = datetime.timedelta(days=2) + datetime.date.today()

In [49]:
# Download the price history of Yahoo symbol '003550.KS' from startdate to enddate.
# NOTE(review): the variable is named `kospi`, but '003550.KS' looks like a
# single-stock symbol rather than the KOSPI index - confirm the intended ticker.
kospi = pdr.get_data_yahoo('003550.KS', start=startdate, end=enddate)


[*********************100%***********************]  1 of 1 completed


In [42]:
kospi

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-27,82500.0,82900.0,82200.0,82600.0,76912.375000,243474
2021-12-28,82800.0,83800.0,82300.0,83700.0,77936.625000,373939
2021-12-29,82300.0,82400.0,81500.0,81800.0,78803.664062,399291
2021-12-30,81400.0,81800.0,80700.0,80900.0,77936.632812,357548
2022-01-04,81000.0,82500.0,80600.0,82400.0,79381.687500,363062
...,...,...,...,...,...,...
2023-03-30,83300.0,83800.0,81200.0,81600.0,81600.000000,308791
2023-03-31,81700.0,83300.0,81300.0,82800.0,82800.000000,311357
2023-04-03,82600.0,83400.0,82000.0,83200.0,83200.000000,229308
2023-04-04,83400.0,84900.0,82900.0,84600.0,84600.000000,215582


In [47]:
enddate

datetime.date(2023, 4, 7)

In [50]:
# Download the price history of Yahoo symbol 'NQ=F' for the same date window.
ixic_future = pdr.get_data_yahoo('NQ=F', start=startdate, end=enddate)

[*********************100%***********************]  1 of 1 completed


In [51]:
ixic_future

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-27,16304.25,16580.750000,16304.25,16560.00,16560.00,351308
2021-12-28,16565.75,16659.500000,16452.25,16488.00,16488.00,454633
2021-12-29,16513.50,16564.250000,16387.50,16490.50,16490.50,354985
2021-12-30,16484.50,16567.500000,16411.25,16430.25,16430.25,304259
2021-12-31,16431.00,16464.000000,16313.75,16320.75,16320.75,363720
...,...,...,...,...,...,...
2023-03-30,12962.50,13111.750000,12931.50,13082.00,13082.00,583151
2023-03-31,13084.25,13311.500000,13057.50,13301.75,13301.75,624955
2023-04-03,13262.00,13278.750000,13154.00,13270.00,13270.00,542896
2023-04-04,13259.00,13348.750000,13168.25,13219.00,13219.00,542896


In [52]:
# Move the end of the window back to today (it was today + 2 days before),
# presumably to compare which last rows the download returns.
enddate = datetime.date.today()

In [53]:
# Repeat the 'NQ=F' download with the updated end date; this overwrites ixic_future.
ixic_future = pdr.get_data_yahoo('NQ=F', start=startdate, end=enddate)

[*********************100%***********************]  1 of 1 completed


In [54]:
ixic_future

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-27,16304.25,16580.75,16304.25,16560.00,16560.00,351308
2021-12-28,16565.75,16659.50,16452.25,16488.00,16488.00,454633
2021-12-29,16513.50,16564.25,16387.50,16490.50,16490.50,354985
2021-12-30,16484.50,16567.50,16411.25,16430.25,16430.25,304259
2021-12-31,16431.00,16464.00,16313.75,16320.75,16320.75,363720
...,...,...,...,...,...,...
2023-03-29,12745.50,12994.00,12742.00,12965.00,12965.00,566316
2023-03-30,12962.50,13111.75,12931.50,13082.00,13082.00,583151
2023-03-31,13084.25,13311.50,13057.50,13301.75,13301.75,624955
2023-04-03,13262.00,13278.75,13154.00,13270.00,13270.00,542896
