<a href="https://colab.research.google.com/github/fasthill/ML-DL-study-alone/blob/main/5-1%20%EA%B2%B0%EC%A0%95%20%ED%8A%B8%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testing with real world data

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-1.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import datetime
import os, re

In [2]:
from xgboost import XGBClassifier
import xgboost
from lightgbm import LGBMClassifier
import lightgbm
import joblib

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
import pickle

# write list, dictionary to pickle
def save_to_pickle(path, filename):
    """Serialize an object to a pickle file.

    Parameters
    ----------
    path : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at that location is overwritten.
    filename : object
        The object to pickle. Despite its name this is the data itself
        (for example a list or a dictionary), not a file name; the parameter
        name is kept so existing callers keep working.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickle.dump raises
    # (for example on an unpicklable object), which the manual close() did not.
    with open(path, "wb") as open_file:
        pickle.dump(filename, open_file)

# read list, dictionary from pickle
def load_from_pickle(path):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    path : str or path-like
        Pickle file to read; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    # The context manager closes the handle even when pickle.load raises
    # (for example on a truncated or corrupt file).
    with open(path, "rb") as open_file:
        return pickle.load(open_file)

In [5]:
import csv

# Write a dictionary or a list to a CSV file.
# `path` is the destination CSV file; `my_dict` / `my_list` is the object to write.

def save_dict_to_csv(path, my_dict):
    """Write the values of a dictionary to a CSV file.

    Every dictionary entry becomes one row (``orient='index'``). The keys
    themselves are not written because the index is dropped; a header row
    with the column labels is written.

    Parameters
    ----------
    path : str or path-like
        Destination CSV file.
    my_dict : dict
        Mapping whose values form the rows.
    """
    rows = pd.DataFrame.from_dict(my_dict, orient='index')
    rows.to_csv(path, header=True, index=False)
    
def save_list_to_csv(path, my_list):
    """Write a list to a single-column CSV file.

    The column is labelled ``columns``; the row index is not written.

    Parameters
    ----------
    path : str or path-like
        Destination CSV file.
    my_list : list
        Items to write, one per row.
    """
    single_column = pd.DataFrame(my_list, columns=['columns'])
    single_column.to_csv(path, header=True, index=False)

In [6]:
def to_df(date, com_name, precision, y_predict, weight):
    """Build a one-row prediction frame for one company, indexed by date.

    Column labels are prefixed with ``com_name`` so that frames of several
    companies can be placed side by side. All values are stored as
    formatted strings.

    Parameters
    ----------
    date : object
        Value used as the single index entry (index name ``date``).
    com_name : str
        Prefix for the column labels.
    precision : float
        Written with two decimals to ``<com_name>_precision``.
    y_predict : sequence
        Only the first element is used, written to ``<com_name>_predict``.
    weight : 2-D indexable (e.g. numpy array)
        ``weight[0, 1]`` goes to ``<com_name>_yes`` and ``weight[0, 0]`` to
        ``<com_name>_no``, both with two decimals.

    Returns
    -------
    pandas.DataFrame
        One row, four columns, indexed by ``date``.
    """
    record = {
        'date': date,
        f'{com_name}_precision': f'{precision:.2f}',
        f'{com_name}_predict': f'{y_predict[0]}',
        f'{com_name}_yes': f'{weight[0,1]:.2f}',
        f'{com_name}_no': f'{weight[0,0]:.2f}',
    }
    # Keys become the index first, then the transpose turns them into columns.
    single_row = pd.DataFrame.from_dict(record, orient='index').T
    return single_row.set_index('date')

In [7]:
def to_df_todays(date, com_name, precision, y_predict, weight):
    """Build a one-row prediction frame for one company, indexed by its name.

    Unlike ``to_df`` the column labels are not prefixed, so frames of several
    companies can be stacked as rows. All values are stored as formatted
    strings.

    Parameters
    ----------
    date : object
        Accepted for a signature parallel to ``to_df``; not used here.
    com_name : str
        Value used as the single index entry (index name ``name``).
    precision : float
        Written with two decimals to ``precision``.
    y_predict : sequence
        Only the first element is used, written to ``predict``.
    weight : 2-D indexable (e.g. numpy array)
        ``weight[0, 1]`` goes to ``yes`` and ``weight[0, 0]`` to ``no``,
        both with two decimals.

    Returns
    -------
    pandas.DataFrame
        One row, four columns, indexed by ``name``.
    """
    record = {
        'name': com_name,
        'precision': f'{precision:.2f}',
        'predict': f'{y_predict[0]}',
        'yes': f'{weight[0,1]:.2f}',
        'no': f'{weight[0,0]:.2f}',
    }
    # Keys become the index first, then the transpose turns them into columns.
    single_row = pd.DataFrame.from_dict(record, orient='index').T
    return single_row.set_index('name')

In [8]:
# Master table: stock code -> [company name (Korean), short alias].
# The short alias is used later to build file and folder names, so the
# strings must stay exactly as they are.
code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'],
        '000660' : ['SK하이닉스', 'skhinix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코퓨처엠', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng'], '030200' : ['KT', 'kt']}

# The three groups below are subsets of `code`. They are derived by key
# instead of retyping the values, so a name or alias only has to be
# maintained in one place. Each value is copied, so the group dictionaries
# do not share list objects with `code`. Key order is the iteration order.
#
# An earlier, shorter `code_good` (without kia and hyunmotor) used to be
# assigned first and immediately overwritten; that dead assignment is gone.

code_good = {key: list(code[key]) for key in (
    '005930',  # sec
    '035420',  # naver
    '035720',  # kakao
    '012330',  # mobis
    '051910',  # lgchemical
    '005935',  # secpre
    '000270',  # kia
    '373220',  # lgenergy
    '005380',  # hyunmotor
)}

code_mid = {key: list(code[key]) for key in (
    '105560',  # kbbank
    '003670',  # poscochemical
    '006400',  # sdi
)}

code_bad = {key: list(code[key]) for key in (
    '030200',  # kt
    '033780',  # ktng
    '066570',  # lgelec
    '005490',  # poscoholding
    '055550',  # shgroup
    '000660',  # skhinix
    '096770',  # skinnovation
    '207940',  # ssbio
    '028260',  # sscnt
    '068270',  # celltrion
)}

In [9]:
# Folder that holds the per-company input pickles and the prediction log.
directory_for_predict = '../data/data_for_ml/predict/'

# First run only: write an empty prediction log so that it can be read back
# unconditionally afterwards.
# The guard tests for the pickle file, not the folder, so the folder may
# already exist without the file. exist_ok=True keeps os.makedirs from
# raising FileExistsError in that situation.
if not os.path.exists(directory_for_predict + 'prediction/prediction_list.pkl'):
    os.makedirs(directory_for_predict + 'prediction', exist_ok=True)
    prediction_list = pd.DataFrame()
    fname_p = 'prediction_list.pkl'
    path_p = directory_for_predict + 'prediction/' + fname_p
    prediction_list.to_pickle(path_p)

In [10]:
# Load the accumulated prediction log from disk.
prediction_list = pd.read_pickle(f'{directory_for_predict}prediction/prediction_list.pkl')

In [11]:
# Predict with the stored best model of every company in `code_good`.
#
# One single-row frame per company is collected in each list and the lists
# are concatenated once after the loop. Growing a DataFrame with pd.concat
# inside the loop copies all earlier data on every iteration and depends on
# how pandas treats the initial empty frame.
history_frames = []  # rows indexed by date, columns prefixed with the company alias
todays_frames = []   # rows indexed by company alias

for key, val in code_good.items():

    com_name = val[1]  # short alias, used in file and folder names

    # NOTE: pd.read_pickle, joblib.load and pickle.load can execute arbitrary
    # code while loading; only load files produced by this project.
    fname = f'df_{com_name}_sel.pkl'
    f_name = directory_for_predict + fname
    df_o = pd.read_pickle(f_name)

    current_data = df_o.loc[:, 'retail_1':'weekday'] # select columns except targets columns

    # Double brackets keep the last row as a one-row DataFrame (not a Series).
    last_row = current_data.iloc[[-1]]
    # Assumes a datetime-like index (``.date()`` is called on its last entry).
    last_date = last_row.index[-1].date()

    # locate the model data directory
    directory_model_data = f'../machine_learning/{com_name}/'

    # get the model data filepath
    columns_pkl = directory_model_data + 'best_columns.pkl'
    scaler_pkl = directory_model_data + 'best_scaler.pkl'
    scaler_p_pkl = directory_model_data + 'best_scaler_p.pkl'  # pickle-made scaler, not loaded here
    model_pkl = directory_model_data + 'best_model.pkl'
    model_p_pkl = directory_model_data + 'best_model_p.pkl'    # pickle-made model, not loaded here
    result_pkl = directory_model_data + 'best_result.pkl'

    # load result data
    # NOTE(review): the last 5 entries of the stored result are dropped before
    # 'precision' is read - presumably mirroring the 5 target columns dropped
    # from the column list below; confirm.
    result = load_from_pickle(result_pkl)[:-5]
    precision = result.loc['precision'].iloc[-1]

    # load columns data: the last 5 entries (target columns) are excluded
    real_columns = load_from_pickle(columns_pkl)[:-5]
    real_data_df = last_row[real_columns] # select necessary columns

    # scale the data with the stored scaler (joblib version)
    scaler = joblib.load(scaler_pkl)
    real_scaled = scaler.transform(real_data_df)

    # apply the scaled real_data to the stored model (joblib version)
    model = joblib.load(model_pkl)

    y_predict = model.predict(real_scaled)
    weight = model.predict_proba(real_scaled)

    history_frames.append(to_df(last_date, com_name, precision, y_predict, weight))
    todays_frames.append(to_df_todays(last_date, com_name, precision, y_predict, weight))

# pd.concat raises ValueError on an empty list, so fall back to empty frames.
df_base = pd.concat(history_frames, axis=1) if history_frames else pd.DataFrame()
df_todays = pd.concat(todays_frames, axis=0) if todays_frames else pd.DataFrame()

df_todays

Unnamed: 0_level_0,precision,predict,yes,no
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sec,1.0,0,0.34,0.66
naver,1.0,0,0.44,0.56
kakao,1.0,0,0.32,0.68
mobis,1.0,0,0.38,0.62
lgchemical,1.0,0,0.45,0.55
secpre,1.0,0,0.1,0.9
kia,1.0,0,0.15,0.85
lgenergy,0.83,0,0.48,0.52
hyunmotor,0.82,0,0.37,0.63


In [12]:
# Append the current predictions to the log and store it as pickle and CSV.

directory_for_predict = '../data/data_for_ml/predict/'
fname_p, fname_c = 'prediction_list.pkl', 'prediction_list.csv'
path_p = f'{directory_for_predict}prediction/{fname_p}'
path_c = f'{directory_for_predict}prediction/{fname_c}'

prediction_list = pd.concat([prediction_list, df_base], axis=0)
# When an index label occurs more than once, only its last row is kept.
prediction_list = prediction_list.loc[~prediction_list.index.duplicated(keep='last')]

prediction_list.to_pickle(path_p)
prediction_list.to_csv(path_c)

NameError: name 'path_c' is not defined

In [20]:
# Read the stored prediction log back from disk.
directory_for_predict = '../data/data_for_ml/predict/'
fname_p = 'prediction_list.pkl'
predict_list = pd.read_pickle(f'{directory_for_predict}prediction/{fname_p}')

In [21]:
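# Scratch cell: each line takes a 4-column slice of a frame named `aa`.
# NOTE(review): `aa` is not defined in the visible cells - presumably the prediction log; confirm.
# sec_df = aa.iloc[:, :4]
# sec_df = aa.iloc[:, 4:8]
# sec_df = aa.iloc[:, 8:12]
# sec_df = aa.iloc[:, 12:16]
# sec_df = aa.iloc[:, 16:20]
# sec_df = aa.iloc[:, 8:12]
# sec_df = aa.iloc[:, 12:16]
# sec_df = aa.iloc[:, 16:20]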

Unnamed: 0_level_0,sec_precision,sec_predict,sec_yes,sec_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-03,0.93,0,0.52,0.48
2023-04-04,1.0,0,0.55,0.45
2023-04-05,1.0,0,0.34,0.66


In [22]:
from pandas_datareader import data as pdr
# from datetime import datetime
import yfinance as yf
# Called once before `pdr.get_data_yahoo` is used further down; presumably this
# routes pandas_datareader's Yahoo download through yfinance.
# NOTE(review): confirm the installed yfinance version still provides pdr_override.
yf.pdr_override()

In [23]:
# Date window for the price download.
# The end used to be pinned to datetime.datetime(2023,3,23); it now follows the current day.
startdate = datetime.datetime(year=2021, month=12, day=25)
enddate = datetime.date.today()

In [None]:
# Download the price history for ticker '003550.KS' between startdate and enddate.
# NOTE(review): the variable is named `kospi`, but '003550.KS' looks like a
# single-stock ticker rather than the KOSPI index symbol - confirm which was intended.
kospi = pdr.get_data_yahoo('003550.KS', start=startdate, end=enddate)
