## 당일 예측을 위한 데이터 취득 -> 사용후에 차기 분석용으로 사용

### feature합치기-- 하루전과 이틀전의 데이터와 비교
#### investors, historical(매일 거래량 등) and 분류항목 합치기

In [2]:
import datetime, time
from datetime import date

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

In [3]:
# index column : date
def find_previous_date(df_inv, df_his, date_current, opening_days): # 현재일에서 바로 직전 1~3거래일을 찾음.
    i = 0
    date2 = 0
    while True:
        i += 1
        date2 = date_current - datetime.timedelta(days = i)
        
#         try:
#             (df_inv.loc[date2]['retail'] != 0) & (df_his.loc[date2]['vol'] != '') # 2022년 01. 01.년 이전 날짜로 들어오면 return
#             return 0
#         except :

#         if (df_inv.loc[date2]['retail'] != 0) & (df_his.loc[date2]['vol'] != '') :  
#             # investors와 historical 두 개 데이터에 같이 없어야 함
#             break
        if ~opening_days.isin([date2]).any():
            break
        if i >= 30:  # 현재일 대비 최대 30일전까지 거래가 있을 경우 계산 (현실적으로 최대 10일 이상 폐장되는 경우 없음)
            break

    return date2

In [4]:
# 거래일과 거래일 바로 이전일의 변화율을 계산함.
def find_ratio(df_o, date_current):
    df_o_trans = df_o.transpose()
    print("**", df_o_trans.tail())
    df_o_trans.columns = ['before', 'after']
    df_o_trans['before'][df_o_trans['before'] == 0] = 1e-20 # 0으로 나누는 것을 예외로 두기 위하여 작은 숫자로 대체
    df_o_trans[date_current] = (df_o_trans['after'] - df_o_trans['before']) / df_o_trans['before']
    df_o_trans[date_current] = df_o_trans[date_current].apply(lambda x: np.inf if x > 1e+10 else -np.inf if x < -1e+10 else x)
    # -inf, +inf로 대체함.
    return df_o_trans.transpose()

In [5]:
# 변화율(historical, investors), weekday를 계산하고 합하여 return함
def combine_data(df_o, df_inv, df_his, opening_days):
    # df_inv : df_investor, df_his: df_historical
    investor_rate = pd.DataFrame()
    historical_rate = pd.DataFrame()
    date_weekday = pd.DataFrame()
    current_previous_date = {} # 현재날짜와 하루 이전 날짜 보관
    
    # 최초의 0이 아닌(최초 거래가 이루어진 일자) 날짜로 부터 일주일 후부터 기준.
    # 최초 2일전까지의 날짜를 확보하기 위함. 현재일의 전날과 전전날의 이틀간 데이터 변화율 계산
    for k, i in enumerate(df_inv.iloc[:, 1]):  
        if i != 0: 
            skip_num = k + 10 # 8: 전일, 9: 전전일, 10: -3일까지 확보 
            break

    for date_current in df_o.index[skip_num:]: 
        # 최초 2일전까지의 날짜를 확보하기 위함. 현재일의 전날과 전전날의 이틀간 데이터 변화율 계산
        # 거래 전일(-1일) 날짜구하기

        # 거래가 없는날은 skip : opening_days: 2023년 말까지 개장일 리스트
        if ~opening_days.isin([date_current]).any():
            continue
            
        date_previous_c = find_previous_date(df_inv, df_his, date_current, opening_days)
        current_previous_date[date_current] = date_previous_c

        # 거래 전날 요일 구하기
        date_temp = {'date': date_current, 'weekday' : date_previous_c.weekday()}
        df_temp = pd.DataFrame(date_temp, index=[0]).set_index('date')
        date_weekday = pd.concat([date_weekday, df_temp], axis=0)
        date_previous_1 = find_previous_date(df_inv, df_his, date_previous_c, opening_days)
        # 거래 전날, 전전날을 확인하고 변화정도 계산하기(find_ratio)
        df_inv_comp_1 = df_inv.loc[[date_previous_1, date_previous_c]]
        df_his_comp_1 = df_his.loc[[date_previous_1, date_previous_c]]
    
    
        # 거래 전전일(-2일) 날짜 구하기
        date_previous_2 = find_previous_date(df_inv, df_his, date_previous_1, opening_days)
        # 거래 전전날(-2일째) 요일 구하기  -- 2일전 요일은 의미 없을 것 같아서 추가하지 않음.
#         date_temp_2 = {'date': date_current, 'weekday' : date_previous_2.weekday()}
#         df_temp_2 = pd.DataFrame(date_temp_2, index=[0]).set_index('date')
#         date_weekday = pd.concat([date_weekday, df_temp_2], axis=0)
        # 거래 전날(-1일), 전전날(-3일)을 확인하고 변화정도 계산하기(find_ratio)
        df_inv_comp_2 = df_inv.loc[[date_previous_2, date_previous_c]]
        df_his_comp_2 = df_his.loc[[date_previous_2, date_previous_c]]
            
        # 전날 -전전날 ratio, 전날 - 전전전날 ratio, column확대
        df_inv_concat_2 = pd.concat([find_ratio(df_inv_comp_1, date_current).iloc[[-1]],
                                 find_ratio(df_inv_comp_2, date_current).iloc[[-1]]], axis=1)
        df_his_concat_2 = pd.concat([find_ratio(df_his_comp_1, date_current).iloc[[-1]],
                                 find_ratio(df_his_comp_2, date_current).iloc[[-1]]], axis=1)
        
        investor_rate = pd.concat([investor_rate,  df_inv_concat_2], axis=0)
        historical_rate = pd.concat([historical_rate, df_his_concat_2], axis=0)

#     total = pd.concat([investor_rate, historical_rate, date_weekday, df_o ], axis=1)
    total = pd.concat([investor_rate, historical_rate, date_weekday], axis=1)

    return total, current_previous_date

In [23]:
def df_previous_days():
    base_data_directory = './data/base_data/stock_market_holydays/'
    opening_days = pd.read_pickle(base_data_directory+'opening_days.pkl') # 한국 개장일 데이터 
    df = pd.DataFrame(opening_days)
    df['date_1'] = df['date'].shift(1)
    df['date_2'] = df['date'].shift(2)
    df['date_3'] = df['date'].shift(3)
    c_p1_dict = df.set_index('date').to_dict()['date_1'] # date로 date_p 찾기
    c_p2_dict = df.set_index('date').to_dict()['date_2'] # date로 date_p 찾기
    p1_c_dict = df.set_index('date_1').to_dict()['date'] # date_p로 date 찾기
    p2_c_dict = df.set_index('date_2').to_dict()['date'] # date_p로 date 찾기
    return c_p1_dict, c_p2_dict, p1_c_dict, p2_c_dict

In [6]:
code = {'005930': ['삼성전자', 'sec'], '005380': ['현대차', 'hyunmotor'],
                 '035420': ['NAVER', 'naver'], '033780': ['KT&G', 'ktng']}
# code = {'005930': ['삼성전자', 'sec']}

code = {'005930' : ['삼성전자', 'sec'], '373220' : ['LG에너지솔루션', 'lgenergy'], 
        '000660' : ['SK하이닉스', 'skhynix'], '207940' : ['삼성바이오로직스', 'ssbio'],
        '006400' : ['삼성SDI', 'sdi'], '051910' : ['LG화학', 'lgchemical'],
        '005935' : ['삼성전자우', 'secpre'], '005380' : ['현대차', 'hyunmotor'],
        '035420' : ['NAVER', 'naver'], '000270' : ['기아','kia'],
        '035720' : ['카카오', 'kakao'], '005490' : ['POSCO홀딩스', 'poscoholding'],
        '105560' : ['KB금융', 'kbbank'], '028260' : ['삼성물산', 'sscnt'],
        '068270' : ['셀트리온', 'celltrion'], '012330' : ['현대모비스', 'mobis'],
        '055550' : ['신한지주', 'shgroup'], '066570' : ['LG전자', 'lgelec'],
        '003670' : ['포스코퓨처엠', 'poscochemical'], '096770' : ['SK이노베이션', 'skinnovation'],
        '033780' : ['KT&G', 'ktng'], '030200' : ['KT', 'kt']}
# code = {'005380' : ['현대차', 'hyunmotor']}

In [7]:
# hist_column = [ 'date', 'open', 'high', 'low', 'close', 'close_cr', 'vol']
hist_column_m = [ 'date', 'open', 'high', 'low', 'close', 'vol'] # close_cr 제외하고 사용. divided by zero 회피용

In [8]:
# investor.pkl, historical.pkl. 읽기

directory_for_predict = './data/data_for_ml/predict/'
pkl_directory = './data/company_pkl/'
base_data_directory = './data/base_data/stock_market_holydays/'
opening_days = pd.read_pickle(base_data_directory+'opening_days.pkl') # 한국 개장일 데이터 

for key, val in code.items():
 
    pkl_name= '{}_historical.pkl'.format(val[1])
    df_historical_temp = pd.read_pickle(pkl_directory + pkl_name)
    df_historical_temp = df_historical_temp[hist_column_m]

    # close_cr 행을 없앰(변동이 없는 경우가 빌생하여 divided zero error 발생), close_cr은 target column에서 재 계산하여사용
    df_historical_temp['date'] = df_historical_temp['date'].dt.date # change to datetime
    
    pkl_name= '{}_investors.pkl'.format(val[1])
    df_investors_temp = pd.read_pickle(pkl_directory + pkl_name)
    df_investors_temp['date'] = df_investors_temp['date'].dt.date # change to datetime
    
    # ******** 시작 일자, 마지막 일자  지정 ***********
    start_date = datetime.date(2022, 1, 1) # 2022년 01월 01일 자료 있음. 추후 이전날짜 추가시 수정 필요
    end_date = df_investors_temp['date'].iloc[-1]  # 투자자별 자료가 있는 마지막 날짜
    today = datetime.date.today()
    if end_date != today :
        end_date = end_date + datetime.timedelta(days = 1) # 분석대상전일에 할 경우
    
    date_range_ts = pd.date_range(start=start_date, end=end_date)
    df_base = pd.DataFrame(pd.Series(date_range_ts, name='date'))
    df_base['date'] = df_base['date'].dt.date

    df_combined_temp, cp_date = combine_data(df_base.set_index('date'), 
                                             df_investors_temp.set_index('date'), 
                                             df_historical_temp.set_index('date'),
                                             opening_days)
    # cp_date : 한국시장에서 거래일과 거래일의 하루전 거래일을 1:1 묶어 놓은 df

    # column nama change according to the newly added columns
    column_name_change = ['retail_1', 'foreigner_1', 'institution_1', 'financial_1', 'invtrust_1', 'pension_1', 
          'privequity_1', 'bank_1', 'insurance_1', 'financeetc_1', 'corporateetc_1', 'foreigneretc_1', 
          'retail_2', 'foreigner_2', 'institution_2', 'financial_2', 'invtrust_2', 'pension_2', 
          'privequity_2', 'bank_2', 'insurance_2', 'financeetc_2', 'corporateetc_2', 'foreigneretc_2', 
          'open_1', 'high_1', 'low_1', 'close_1', 'vol_1', 
          'open_2', 'high_2', 'low_2', 'close_2', 'vol_2', 'weekday' ]
    
    df_combined_temp.columns = column_name_change
    
    df_combined_temp['temp'] = df_combined_temp['close_1'].shift(-1) # 현재날짜 증감을 확인하기 위하여 임시 컬럼 추가
    
    # 마지막 row의 temp는 None이기 때문에 0으로 처리하여 진행 (예측시 사용하지 않아 무관)
    # None을 0으로 변환
    ctemp = df_combined_temp['temp'].copy() 
    ctemp[-1] = 0
    df_combined_temp['temp'] = ctemp 
    # ------------------------------------
    
#     print("before com name", val[1])
#     print("before df_combined_temp", df_combined_temp.tail())
#     a = input("stop, df_combined_temp")
#     df_combined_temp = df_combined_temp.iloc[:-1]
#     print("after df_combined_temp", df_combined_temp.tail())

    min_rate = 0.0 # +로 끝난 상황을 알기 위함
    df_combined_temp['cr_00'] = df_combined_temp['temp'].map(lambda x : 1 if x > min_rate else 0)
    min_rate = 0.005 # 수수료등 비용 0.2672% 이상 확인하기 위함, 0.5% 상승 마감
    df_combined_temp['cr_05'] = df_combined_temp['temp'].map(lambda x : 1 if x >= min_rate else 0)
    min_rate = 0.010 # 1.0% 상승 마감
    df_combined_temp['cr_10'] = df_combined_temp['temp'].map(lambda x : 1 if x >= min_rate else 0)
    min_rate = 0.015 # 1.5% 상승 마감
    df_combined_temp['cr_15'] = df_combined_temp['temp'].map(lambda x : 1 if x >= min_rate else 0)
    min_rate = 0.020 # 2.0% 상승 마감
    df_combined_temp['cr_20'] = df_combined_temp['temp'].map(lambda x : 1 if x >= min_rate else 0)
    
    df_combined_temp.drop(columns='temp', inplace=True) # 사용후 삭제
    
    column_selected = column_name_change
    column_selected.extend(['cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20'])  # 아래 class column 이 변경에 따라 수정해야 함
    
    globals()['df_{}_combined'.format(val[1])] = df_combined_temp.copy()
    globals()['df_{}_sel'.format(val[1])] = df_combined_temp[column_selected]

** date          2022-01-08  2022-01-09
bank                 0.0         0.0
insurance            0.0         0.0
financeetc           0.0         0.0
corporateetc         0.0         0.0
foreigneretc         0.0         0.0
** date          2022-01-02  2022-01-09
bank                 0.0         0.0
insurance            0.0         0.0
financeetc           0.0         0.0
corporateetc         0.0         0.0
foreigneretc         0.0         0.0
** date  2022-01-08 2022-01-09
open                       
high                       
low                        
close                      
vol                        


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [10]:
opening_days

1       2020-01-02
2       2020-01-03
3       2020-01-06
4       2020-01-07
5       2020-01-08
           ...    
1036    2023-12-21
1037    2023-12-22
1039    2023-12-26
1040    2023-12-27
1041    2023-12-28
Name: date, Length: 996, dtype: object

In [11]:
df = pd.DataFrame(opening_days)

In [16]:
df['date_1'] = df['date'].shift(1)
df['date_2'] = df['date'].shift(2)
df['date_3'] = df['date'].shift(3)

In [17]:
df

Unnamed: 0,date,date_1,date_2
1,2020-01-02,,
2,2020-01-03,2020-01-02,
3,2020-01-06,2020-01-03,2020-01-02
4,2020-01-07,2020-01-06,2020-01-03
5,2020-01-08,2020-01-07,2020-01-06
...,...,...,...
1036,2023-12-21,2023-12-20,2023-12-19
1037,2023-12-22,2023-12-21,2023-12-20
1039,2023-12-26,2023-12-22,2023-12-21
1040,2023-12-27,2023-12-26,2023-12-22


In [21]:
c_p_dict = df.set_index('date').to_dict()['date_1'] # date_c로 date_p 찾기

In [22]:
c_p_dict

{datetime.date(2020, 1, 2): nan,
 datetime.date(2020, 1, 3): datetime.date(2020, 1, 2),
 datetime.date(2020, 1, 6): datetime.date(2020, 1, 3),
 datetime.date(2020, 1, 7): datetime.date(2020, 1, 6),
 datetime.date(2020, 1, 8): datetime.date(2020, 1, 7),
 datetime.date(2020, 1, 9): datetime.date(2020, 1, 8),
 datetime.date(2020, 1, 10): datetime.date(2020, 1, 9),
 datetime.date(2020, 1, 13): datetime.date(2020, 1, 10),
 datetime.date(2020, 1, 14): datetime.date(2020, 1, 13),
 datetime.date(2020, 1, 15): datetime.date(2020, 1, 14),
 datetime.date(2020, 1, 16): datetime.date(2020, 1, 15),
 datetime.date(2020, 1, 17): datetime.date(2020, 1, 16),
 datetime.date(2020, 1, 20): datetime.date(2020, 1, 17),
 datetime.date(2020, 1, 21): datetime.date(2020, 1, 20),
 datetime.date(2020, 1, 22): datetime.date(2020, 1, 21),
 datetime.date(2020, 1, 23): datetime.date(2020, 1, 22),
 datetime.date(2020, 1, 25): datetime.date(2020, 1, 23),
 datetime.date(2020, 1, 26): datetime.date(2020, 1, 25),
 datetime

In [26]:
date_range_ts


DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10',
               ...
               '2023-04-09', '2023-04-10', '2023-04-11', '2023-04-12',
               '2023-04-13', '2023-04-14', '2023-04-15', '2023-04-16',
               '2023-04-17', '2023-04-18'],
              dtype='datetime64[ns]', length=473, freq='D')

In [25]:
cp_date

{datetime.date(2022, 2, 7): datetime.date(2022, 2, 4),
 datetime.date(2022, 2, 8): datetime.date(2022, 2, 7),
 datetime.date(2022, 2, 9): datetime.date(2022, 2, 8),
 datetime.date(2022, 2, 10): datetime.date(2022, 2, 9),
 datetime.date(2022, 2, 11): datetime.date(2022, 2, 10),
 datetime.date(2022, 2, 14): datetime.date(2022, 2, 11),
 datetime.date(2022, 2, 15): datetime.date(2022, 2, 14),
 datetime.date(2022, 2, 16): datetime.date(2022, 2, 15),
 datetime.date(2022, 2, 17): datetime.date(2022, 2, 16),
 datetime.date(2022, 2, 18): datetime.date(2022, 2, 17),
 datetime.date(2022, 2, 21): datetime.date(2022, 2, 18),
 datetime.date(2022, 2, 22): datetime.date(2022, 2, 21),
 datetime.date(2022, 2, 23): datetime.date(2022, 2, 22),
 datetime.date(2022, 2, 24): datetime.date(2022, 2, 23),
 datetime.date(2022, 2, 25): datetime.date(2022, 2, 24),
 datetime.date(2022, 2, 28): datetime.date(2022, 2, 25),
 datetime.date(2022, 3, 2): datetime.date(2022, 2, 28),
 datetime.date(2022, 3, 3): datetime.da

In [15]:
df_combined_temp[column_selected]

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,high_2,low_2,close_2,vol_2,weekday,cr_00,cr_05,cr_10,cr_15,cr_20
2022-02-07,0.178357,-0.317851,0.534176,-0.209934,0.872374,0.303383,2.780304,1.145027,-0.151999,-12.035129,...,0.045549,0.069663,0.12,-0.506798,4,1,1,1,1,1
2022-02-08,-0.229554,0.055307,-0.318702,4.043373,-0.097632,-0.494596,-0.626622,-0.927434,-0.780098,-0.541596,...,0.105954,0.15873,0.148847,-0.25845,0,0,0,0,0,0
2022-02-09,-0.737929,-1.421156,-0.372163,-0.636168,1.254492,-0.413341,-0.938408,3.043902,-2.874169,-0.362037,...,0.142574,0.10084,0.075397,0.383017,1,0,0,0,0,0
2022-02-10,-1.757433,-1.363425,-1.507353,-1.474661,-0.903429,-0.547200,-28.824967,-0.814234,0.594180,0.296807,...,0.016423,-0.011742,-0.067518,0.066542,2,0,0,0,0,0
2022-02-11,5.876703,-17.224223,1.779904,6.603999,-2.502642,0.315610,0.696584,1.902597,-0.690500,-0.744264,...,-0.121317,-0.100191,-0.124539,-0.22262,3,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-13,-2.053600,-1.499162,-0.079132,-0.859865,1.018194,-3.274806,0.190904,-1.333333,-2.328754,-1.008333,...,0.020067,0.006849,-0.010067,-0.091205,2,1,1,1,1,0
2023-04-14,-1.284426,-1.049908,-1.813450,-2.124257,-6.134900,-1.138851,-1.370782,-0.629630,-0.881996,-1.000000,...,-0.022838,-0.035235,-0.018033,-0.53663,3,0,0,0,0,0
2023-04-17,-3.102010,-16.933062,-1.308012,-0.716002,-1.294752,-0.120660,-6.965269,15.550000,0.936556,-inf,...,-0.009836,-0.003401,0.00678,-0.376044,4,0,0,0,0,0
2023-04-18,-0.453251,-0.608432,0.271266,-6.232375,-1.237396,0.330766,-0.789400,0.519637,3.691108,-1.167585,...,-0.010017,0.015652,-0.013356,-0.286204,0,0,0,0,0,0


In [13]:
df_investors_temp.tail()

Unnamed: 0,date,retail,foreigner,institution,financial,invtrust,pension,privequity,bank,insurance,financeetc,corporateetc,foreigneretc
1191,2023-04-06,-14633.0,13208.0,1657.0,-12510.0,-1811.0,16294.0,226.0,-18.0,-525.0,0.0,-6.0,-226.0
1192,2023-04-07,3844.0,2989.0,-6836.0,-2354.0,-3506.0,-288.0,-65.0,0.0,-744.0,121.0,191.0,-189.0
1193,2023-04-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1194,2023-04-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1195,2023-04-10,-20377.0,3791.0,12407.0,7817.0,2720.0,2878.0,-120.0,-932.0,130.0,-86.0,4736.0,-556.0


In [14]:
df_combined_temp.tail()

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,high_2,low_2,close_2,vol_2,weekday,cr_00,cr_05,cr_10,cr_15,cr_20
2023-04-05,0.13057,-4.227236,-1.000929,-0.785412,-0.727766,-1.18099,-0.812021,-1.0,-3.530516,-1.257143,...,0.008653,0.007697,-0.003794,-0.2324,1,1,1,1,1,0
2023-04-06,1.419788,0.740609,-831.266667,-5.646632,2.467463,-3.14131,-0.229431,-inf,-0.456401,-3.333333,...,0.009164,0.009325,0.016885,-0.035018,2,0,0,0,0,0
2023-04-07,-0.561032,-0.372154,-0.86695,-3.78992,-1.693604,3.119848,-0.803819,-0.1,-2.791809,-1.0,...,0.01555,0.007638,0.007073,0.323218,3,0,0,0,0,0
2023-04-10,-1.262694,-0.773698,-5.125528,-0.811831,0.935947,-1.017675,-1.287611,-1.0,0.417143,inf,...,-0.003739,0.003261,-0.00857,-0.148364,4,1,0,0,0,0
2023-04-11,-6.300989,0.268317,-2.81495,-4.320731,-1.775813,-10.993056,0.846154,-inf,-1.174731,-1.710744,...,-0.004752,-0.002707,0.004322,-0.283388,0,0,0,0,0,0


In [15]:
df_hyunmotor_sel.tail()

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,high_2,low_2,close_2,vol_2,weekday,cr_00,cr_05,cr_10,cr_15,cr_20
2023-04-05,0.13057,-4.227236,-1.000929,-0.785412,-0.727766,-1.18099,-0.812021,-1.0,-3.530516,-1.257143,...,0.008653,0.007697,-0.003794,-0.2324,1,1,1,1,1,0
2023-04-06,1.419788,0.740609,-831.266667,-5.646632,2.467463,-3.14131,-0.229431,-inf,-0.456401,-3.333333,...,0.009164,0.009325,0.016885,-0.035018,2,0,0,0,0,0
2023-04-07,-0.561032,-0.372154,-0.86695,-3.78992,-1.693604,3.119848,-0.803819,-0.1,-2.791809,-1.0,...,0.01555,0.007638,0.007073,0.323218,3,0,0,0,0,0
2023-04-10,-1.262694,-0.773698,-5.125528,-0.811831,0.935947,-1.017675,-1.287611,-1.0,0.417143,inf,...,-0.003739,0.003261,-0.00857,-0.148364,4,1,0,0,0,0
2023-04-11,-6.300989,0.268317,-2.81495,-4.320731,-1.775813,-10.993056,0.846154,-inf,-1.174731,-1.710744,...,-0.004752,-0.002707,0.004322,-0.283388,0,0,0,0,0,0


## 주요 지표 읽기

### Read and Combine data 

In [36]:
# 매일 변화가 있는 항목 추출; 금리변동과 같이 주기적을 발생하는 항목들(cpi, bok_rate, fed_rate 등)은 제외
common_pkl = [ "dji.pkl", "dji_future.pkl", "dxy_future.pkl", 
               "ixic_future.pkl", "kor_10yr_bond.pkl",
              "kor_2yr_bond.pkl", "kosdaq.pkl", "kospi.pkl", "krw_rate.pkl", "nas.pkl",
              "snp_future.pkl", "sox.pkl", "spx.pkl", "us_10yr_bond.pkl", "us_2yr_bond.pkl",
              "us_3mon_bond.pkl", "vix.pkl", "wti_future.pkl",
              'spsy.pkl', 'spny.pkl', 'spxhc.pkl', 'splrcd.pkl', 'splrci.pkl', 'splrcu.pkl', 'splrcs.pkl', 
              'splrct.pkl', 'splrcl.pkl', 'splrcm.pkl', 'ixbk.pkl', 'ixfn.pkl', 'ixid.pkl', 'ixis.pkl', 
              'ixk.pkl', 'ixtr.pkl', 'ixut.pkl', 'nbi.pkl', 'bkx.pkl' 
             ]
common_pkl = [ "dji.pkl" ]

In [17]:
# dict로 만들어 pkl내에 있는 df의 column 이름과 매칭시키기 위하여 구성
common_col_name = {'dji.pkl':'dji', 'dji_future.pkl':'dji_f', 'dxy_future.pkl':'dxy', 
                   'ixic_future.pkl':'ixic_f', 'kor_10yr_bond.pkl':'bond_kor_10',
                   'kor_2yr_bond.pkl':'bond_kor_2', 'kosdaq.pkl':'kosdaq', 'kospi.pkl':'kospi',
                   'krw_rate.pkl':'krw', 'nas.pkl':'ixic', 'snp_future.pkl':'spx_f',
                   'sox.pkl':'sox', 'spx.pkl':'spx', 'us_10yr_bond.pkl':'bond_usa_10',
                   'us_2yr_bond.pkl':'bond_usa_2', 'us_3mon_bond.pkl':'bond_usa_3m',
                   'vix.pkl':'vix', 'wti_future.pkl':'wti',
                   'spsy.pkl':'spsy', 'spny.pkl':'spny', 'spxhc.pkl':'spxhc', 'splrcd.pkl':'splrcd', 
                   'splrci.pkl':'splrci', 'splrcu.pkl':'splrcu', 'splrcs.pkl':'splrcs', 
                   'splrct.pkl':'splrct', 'splrcl.pkl':'splrcl', 'splrcm.pkl':'splrcm', 
                   'ixbk.pkl':'ixbk', 'ixfn.pkl':'ixfn', 'ixid.pkl':'ixid', 'ixis.pkl':'ixis', 
                   'ixk.pkl':'ixk', 'ixtr.pkl':'ixtr', 'ixut.pkl':'ixut', 'nbi.pkl':'nbi', 'bkx.pkl':'bkx'
                   }

In [18]:
end_date = datetime.datetime.today().date() + datetime.timedelta(days = 1)
start_date = '20220101'  # 2022년 01월 01일 자료 있음. 추후 이전날짜 추가시 수정 필요
date_range_ts = pd.date_range(start=start_date, end=end_date)

end_date 2023-04-11


In [98]:
pkl_common_directory = './data/common_pkl/'
df_base = pd.DataFrame(pd.Series(date_range_ts, name='date')).set_index('date')
# df_base_t = pd.DataFrame(pd.Series(date_range_ts, name='date'))['date'].dt.date
# df_base = df_base_t.set_index('date', inplace=True)

date_compare = pd.DataFrame.from_dict(cp_date, orient='index')
date_compare.reset_index(inplace=True)
date_compare.columns = ['date_c', 'date_p'] #거래일과 바로 이전 거래일
c_p_dict = date_compare.set_index('date_c').to_dict()['date_p'] # date_c로 date_p 찾기
p_c_dict = date_compare.set_index('date_p').to_dict()['date_c'] # date_p로 date_c 찾기

for index_name in common_pkl:
    key_name = index_name[:-4]
    col_name = common_col_name[index_name]
    df_temp = pd.read_pickle(pkl_common_directory + index_name)
    df_temp['date'] = df_temp['date'].dt.date # date type으로 변환
    df_temp['temp'] = df_temp[col_name].shift(2)  # 거래일과 이전(-1) 이전(-1) 거래일 데이터와 비교하기 위하여 -2 shift
    df_temp[f'{col_name}_cr_2'] = (df_temp[col_name] - df_temp['temp'])/df_temp['temp']*100 # 이전 거래일 데이터는 "_cr"
    df_new = df_temp[df_temp['date'].isin(date_compare['date_p'])] # 이전 거래일과 비교날짜가 있는 row만 선택
    # 예: 2023-04-10('date_c'), 2023-04-07('date_p') 일 경우, 2023년 4월 7일 데이터는 2023년 4월 10일 row에 작성해야 함.
    # 위 예와 같이 하기 위해 'date_p'에 있는 date로만 df재구성(df_new)
    
    add_c= df_new['date'].apply(lambda x : p_c_dict[x])  # 하루전 날짜와 매칭한 column 생성 (add_c)
    # 예: 2023년 4월 7일 데이터는 2023년 4월 10일 row에 기입
    df_new.insert(len(df_new.columns)-1, 'date_c', value=add_c)  #SettingWithCopyWarning: error 회피
    df_new.set_index('date_c', inplace=True)
    
    df_base = df_base.merge(df_new.iloc[:, [-3, -1]], \
                            how='left', left_index=True, right_index=True)

In [101]:
for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])] \
    = globals()['df_{}_sel'.format(val[1])].merge(df_base, how='left', left_index=True, right_index=True)

In [102]:
df_hyunmotor_sel.tail(10)

Unnamed: 0,retail_1,foreigner_1,institution_1,financial_1,invtrust_1,pension_1,privequity_1,bank_1,insurance_1,financeetc_1,...,close_2,vol_2,weekday,cr_00,cr_05,cr_10,cr_15,cr_20,dji_cr,dji_cr_2
2023-03-29,3.64112,0.014773,-0.516134,-0.088446,2.625404,-0.019847,-0.957306,-inf,-0.678486,-1.0,...,-0.003376,-0.475888,1,1,1,1,1,0,-0.12%,0.486144
2023-03-30,4.553668,0.92487,-5.945383,0.358916,-1.339667,-1.710821,-17.709091,-1.264706,-1.138599,inf,...,0.024418,0.441051,2,1,0,0,0,0,1.00%,0.880361
2023-03-31,-0.256597,0.478951,-1.09543,-1.679274,0.487762,-0.878338,-0.605005,-1.222222,0.645161,-1.0,...,0.023151,0.577612,3,1,1,1,1,0,0.43%,1.434765
2023-04-03,0.264095,0.079817,-2.948849,-2.255682,-2.09577,2.329268,-1.756887,13.0,-13.673203,-inf,...,0.022727,0.203307,4,0,0,0,0,0,1.26%,1.701069
2023-04-04,-0.629286,-1.124023,4.298556,-1.678281,-2.48311,6.47619,-8.236579,0.071429,-0.89015,1.333333,...,0.013245,0.031715,0,1,0,0,0,0,0.98%,2.258488
2023-04-05,0.13057,-4.227236,-1.000929,-0.785412,-0.727766,-1.18099,-0.812021,-1.0,-3.530516,-1.257143,...,-0.003794,-0.2324,1,1,1,1,1,0,-0.59%,0.385376
2023-04-06,1.419788,0.740609,-831.266667,-5.646632,2.467463,-3.14131,-0.229431,-inf,-0.456401,-3.333333,...,0.016885,-0.035018,2,0,0,0,0,0,0.24%,-0.352457
2023-04-07,-0.561032,-0.372154,-0.86695,-3.78992,-1.693604,3.119848,-0.803819,-0.1,-2.791809,-1.0,...,0.007073,0.323218,3,0,0,0,0,0,0.01%,0.248216
2023-04-10,-1.262694,-0.773698,-5.125528,-0.811831,0.935947,-1.017675,-1.287611,-1.0,0.417143,inf,...,-0.00857,-0.148364,4,1,0,0,0,0,,
2023-04-11,-6.300989,0.268317,-2.81495,-4.320731,-1.775813,-10.993056,0.846154,-inf,-1.174731,-1.710744,...,0.004322,-0.283388,0,0,0,0,0,0,,


In [16]:
col_inv1 = ['retail_1', 'foreigner_1', 'institution_1', 'financial_1', 'invtrust_1', 'pension_1', 'privequity_1', 
            'bank_1', 'insurance_1', 'financeetc_1', 'corporateetc_1', 'foreigneretc_1']

In [17]:
col_inv2 = ['retail_2', 'foreigner_2', 'institution_2', 'financial_2', 'invtrust_2', 'pension_2',
            'privequity_2', 'bank_2', 'insurance_2', 'financeetc_2', 'corporateetc_2', 'foreigneretc_2']

In [18]:
col_his1 = ['open_1', 'high_1', 'low_1', 'close_1', 'vol_1']

In [19]:
col_his2 = ['open_2', 'high_2', 'low_2', 'close_2', 'vol_2']

In [20]:
col_cr = ['weekday', 'cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20']

In [21]:
col_common1 = ["dji_cr", "dji_f_cr", "dxy_cr", "ixic_f_cr", "bond_kor_10_cr", "bond_kor_2_cr", "kosdaq_cr", "kospi_cr", 
         "krw_cr", "ixic_cr", "spx_f_cr", "sox_cr", "spx_cr", "bond_usa_10_cr", "bond_usa_2_cr", "bond_usa_3m_cr", 
         "vix_cr", "wti_cr", "spsy_cr", "spny_cr", "spxhc_cr", "splrcd_cr", "splrci_cr", "splrcu_cr", "splrcs_cr",
         "splrct_cr", "splrcl_cr", "splrcm_cr", "ixbk_cr", "ixfn_cr", "ixid_cr", "ixis_cr", "ixk_cr", "ixtr_cr",
         "ixut_cr", "nbi_cr", "bkx_cr"]

In [22]:
col_common2 = ["dji_cr_2", "dji_f_cr_2", "dxy_cr_2", "ixic_f_cr_2", "bond_kor_10_cr_2", "bond_kor_2_cr_2", "kosdaq_cr_2", "kospi_cr_2",
         "krw_cr_2", "ixic_cr_2", "spx_f_cr_2", "sox_cr_2", "spx_cr_2", "bond_usa_10_cr_2", "bond_usa_2_cr_2", "bond_usa_3m_cr_2",
         "vix_cr_2", "wti_cr_2", "spsy_cr_2", "spny_cr_2", "spxhc_cr_2", "splrcd_cr_2", "splrci_cr_2", "splrcu_cr_2",
         "splrcs_cr_2", "splrct_cr_2", "splrcl_cr_2", "splrcm_cr_2", "ixbk_cr_2", "ixfn_cr_2", "ixid_cr_2",
         "ixis_cr_2", "ixk_cr_2", "ixtr_cr_2", "ixut_cr_2", "nbi_cr_2", "bkx_cr_2"]

In [23]:
col_col = col_inv1 + col_common1 + col_his1 + col_inv2 + col_common2 + col_his2 + col_cr

In [24]:
# write common data only
df_temp = df_base.copy()
df_temp = df_temp[col_common1+col_common2]
pkl_common_directory = './data/common_pkl/'
df_temp.to_pickle(pkl_common_directory + 'df_common.pkl')
df_temp.to_csv(pkl_common_directory + 'df_common.csv')

In [25]:
'''
df_sec_sel = df_sec_sel.merge(df_base, how='left', left_index=True, right_index=True)
df_hyunmotor_sel = df_hyunmotor_sel.merge(df_base, how='left', left_index=True, right_index=True)
df_naver_sel = df_naver_sel.merge(df_base, how='left', left_index=True, right_index=True)
df_ktng_sel = df_ktng_sel.merge(df_base, how='left', left_index=True, right_index=True)
'''

"\ndf_sec_sel = df_sec_sel.merge(df_base, how='left', left_index=True, right_index=True)\ndf_hyunmotor_sel = df_hyunmotor_sel.merge(df_base, how='left', left_index=True, right_index=True)\ndf_naver_sel = df_naver_sel.merge(df_base, how='left', left_index=True, right_index=True)\ndf_ktng_sel = df_ktng_sel.merge(df_base, how='left', left_index=True, right_index=True)\n"

In [26]:
# # 전체 column을 선정하여, 분석시에 선별하여 사용하도록 함.
# # weekday, 'cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20' column을 마지막으로 이동 
# new_columns = ['retail_1', 'foreigner_1', 'institution_1', 'financial_1', 'invtrust_1', 'pension_1', 
#                'privequity_1', 'bank_1', 'insurance_1', 'financeetc_1', 'corporateetc_1', 'foreigneretc_1', 
#                'dji_cr', 'dji_f_cr', 'ixic_cr', 'ixic_f_cr', 'spx_cr', 'spx_f_cr', 'bond_kor_10_cr',
#                'bond_kor_2_cr', 'dxy_cr', 'bond_usa_10_cr','bond_usa_2_cr', 'bond_usa_3m_cr',
#                'kosdaq_cr', 'kospi_cr', 'krw_cr', 'sox_cr', 'vix_cr', 'wti_cr',                
#                'open_1', 'high_1', 'low_1', 'close_1', 'vol_1',      
#                'retail_2', 'foreigner_2', 'institution_2', 'financial_2', 'invtrust_2', 'pension_2',
#                'privequity_2', 'bank_2', 'insurance_2', 'financeetc_2', 'corporateetc_2', 'foreigneretc_2',
#                'dji_cr_2', 'dji_f_cr_2', 'ixic_cr_2', 'ixic_f_cr_2', 'spx_cr_2', 'spx_f_cr_2',
#                'bond_kor_10_cr_2', 'bond_kor_2_cr_2', 'dxy_cr_2', 'bond_usa_10_cr_2','bond_usa_2_cr_2', 
#                'bond_usa_3m_cr_2', 'kosdaq_cr_2', 'kospi_cr_2', 'krw_cr_2', 'sox_cr_2', 'vix_cr_2', 'wti_cr_2', 
# #                 'open', 'high', 'low', 'close', 'vol', 'weekday',  # close column 제거함 (class column 계산항목으로 사용됨)
#                 'open_2', 'high_2', 'low_2', 'close_2', 'vol_2', 'weekday', 
#                 'cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20']

In [27]:
new_columns = col_col

In [28]:
# column 순서 변경 : weekday, 'cr_00', 'cr_05', 'cr_10', 'cr_15', 'cr_20' column을 마지막으로 이동 
for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])] = globals()['df_{}_sel'.format(val[1])][new_columns]

In [29]:
'''
# drop inf, -inf : replace inf to 1 or -1 . 데이터를 살리고
# 변동률이 무한대가 되는 것을 방지하기 위해서, 나중에 발생하는 에러를 방지
df_sec_sel.replace([np.inf, -np.inf], [1, -1], inplace=True)

.impute 사용하는 것을 고려할 필요 있음.
'''

'\n# drop inf, -inf : replace inf to 1 or -1 . 데이터를 살리고\n# 변동률이 무한대가 되는 것을 방지하기 위해서, 나중에 발생하는 에러를 방지\ndf_sec_sel.replace([np.inf, -np.inf], [1, -1], inplace=True)\n\n.impute 사용하는 것을 고려할 필요 있음.\n'

In [30]:
# drop inf, -inf : replace inf to 1 or -1 . 데이터를 살리고
# 변동률이 무한대가 되는 것을 방지하기 위해서, 나중에 발생하는 에러를 방지
for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])].replace([np.inf, -np.inf], [1, -1], inplace=True)

# **** 데이터 전처리 from sklearn.impute import SimpleImputer, SimpleImputer 사용하기로 함.

In [31]:
# # delete rows which include NaN : dji, spx, nasdaq 지수가 nan인 rows 제거
# df_sec_sel.dropna(inplace=True)

In [32]:
# delete rows which include NaN : dji, spx, nasdaq 지수가 nan인 rows 제거
for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])].dropna(inplace=True)

In [33]:
# string column을 numeric으로 전환
def string_to_num(df):
    df.replace('%', '', regex=True, inplace=True)
    return df.apply(pd.to_numeric)
'''
df_sec_sel = string_to_num(df_sec_sel)
'''

for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])] = string_to_num(globals()['df_{}_sel'.format(val[1])])

In [34]:
# pickle로 데이터 저장

for key, val in code.items():
    globals()['df_{}_sel'.format(val[1])].to_pickle(directory_for_predict + 'df_{}_sel.pkl'.format(val[1]))
    globals()['df_{}_sel'.format(val[1])].to_csv(directory_for_predict + 'df_{}_sel.csv'.format(val[1]))