### 한국, 미국 주식 휴장일 리스트 구하기

- 휴장일을 제외한 자료 사전 처리에 사용하기 위한 휴장일 수집(2020~2022)

In [15]:
import pandas as pd
import numpy as np

In [3]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday between two dates as sorted strings.

    Args:
        start_date: Start of the range, passed to ``pandas.date_range`` as
            ``start``.
        end_date: End of the range, passed to ``pandas.date_range`` as
            ``end``.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings in ascending order.
    """
    weekend_days = []
    # 'W-SAT' / 'W-SUN' are weekly frequencies anchored on Saturday / Sunday.
    for anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=anchor)
        weekend_days.extend(stamps.strftime('%Y-%m-%d').tolist())
    return sorted(weekend_days)

In [4]:
def get_stock_market_holydays(filename, start_date, end_date):
    """Build sorted non-trading-day lists for the Korean and US markets.

    The CSV is expected to have a 'KOR' and a 'USA' column holding each
    market's holidays. Every Saturday and Sunday between ``start_date`` and
    ``end_date`` is merged into both lists.

    NOTE(review): the holiday cells are presumably 'YYYY-MM-DD' strings like
    the weekend entries they are merged with -- confirm against the CSV.

    Args:
        filename: Path or URL handed to ``pandas.read_csv``.
        start_date: Start of the weekend range.
        end_date: End of the weekend range.

    Returns:
        A ``(kor_days, usa_days)`` tuple of sorted, de-duplicated lists.
    """
    holiday_table = pd.read_csv(filename)

    # The columns can differ in length, so drop the missing (NaN) cells
    # before converting each column to a plain list.
    kor_listed = holiday_table['KOR'].dropna().values.tolist()
    usa_listed = holiday_table['USA'].dropna().values.tolist()

    weekends = allsatsundays(start_date, end_date)

    # A set removes holidays that already fall on a weekend.
    kor_closed = sorted({*kor_listed, *weekends})
    usa_closed = sorted({*usa_listed, *weekends})
    return kor_closed, usa_closed

In [5]:
# Holiday table published in the project's GitHub repository.
filename = (
    'https://raw.githubusercontent.com/fasthill/'
    'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
)
# Period for which weekend days are generated.
start_date, end_date = '2020-01-01', '2022-12-31'

# Non-trading days (listed holidays plus weekends) for each market.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date)

In [6]:
def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that belong to trading days.

    Rows whose 'date' value appears in ``holydays`` are removed, and so are
    rows in which every column after the first one is missing.

    Args:
        filename: Path, URL or file-like object handed to ``pandas.read_csv``.
        holydays: Collection of date values to exclude; compared against the
            'date' column with ``Series.isin``.

    Returns:
        A new DataFrame with the remaining rows; the original row labels are
        kept (the index is not reset).
    """
    raw = pd.read_csv(filename)
    data_columns = raw.columns[1:]  # everything except the first column
    on_holiday = raw['date'].isin(holydays)
    # The trailing copy hands the caller an independent frame.
    return raw.loc[~on_holiday].dropna(subset=data_columns, how='all').copy()

In [7]:
# Data file for one stock (presumably Hyundai Motor, judging by the file
# name -- confirm).
filename = 'data/data_hyunmotor.csv'
# Load it without the Korean-market non-trading days in kor_days and without
# rows whose data columns are all empty.
df = get_seq_data(filename, kor_days)

In [8]:
# Preview the last rows of the filtered frame.
df.tail()

Unnamed: 0,date,retail,retail_cr,retail_days,foreigner,foreigner_cr,foreigner_days,institution,institution_cr,institution_days,...,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time
922,2022-07-11,-10749.0,,,11123.0,,,-719.0,,,...,,180000.0,182000.0,179000.0,179000.0,0.56,119200.0,,,
923,2022-07-12,868.0,,,3282.0,,,-3835.0,,,...,,180000.0,181500.0,177000.0,179000.0,0.0,96440.0,,,
924,2022-07-13,-40414.0,,,18096.0,,,22755.0,,,...,,180000.0,185000.0,178500.0,183500.0,2.51,172566.0,,,
925,2022-07-14,13810.0,,,9619.0,,,-23410.0,,,...,,183000.0,184500.0,180500.0,182500.0,-0.54,142369.0,,,
926,2022-07-15,0.0,,,0.0,,,0.0,,,...,,183000.0,184500.0,178500.0,183500.0,0.55,115240.0,,,


In [18]:
# Source columns for which a '<name>_cr' change-rate column is derived.
col_cr = [
    'retail',
    'foreigner',
    'institution',
    'financial',
    'invtrust',
    'pension',
    'privequity',
    'bank',
    'insurance',
    'financeetc',
    'corporateetc',
    'foreigneretc',
    'vol',
]

In [19]:
# Derive the row-over-row change rate of every column listed in `col_cr` and
# store it in the matching '<name>_cr' column.
for cr in col_cr:
    # Value of the preceding remaining row (holiday rows are already gone).
    df_shift_p1 = df[cr].shift(1)
    # Fractional change relative to the previous row.
    # NOTE(review): a previous value of 0 yields inf (or NaN for 0/0), and a
    # negative previous value flips the sign of the result -- confirm this
    # is the intended definition.
    df[cr+'_cr'] = (df[cr] - df_shift_p1)/df_shift_p1

# Blank out missing values once, after all rates are computed. Doing this
# inside the loop turned NaN cells of not-yet-processed source columns into
# '' strings, which makes the arithmetic above raise TypeError for any column
# with a missing value, and rescanned the whole frame on every iteration.
df.replace(np.nan, '', inplace=True)

In [21]:
# Preview the last rows again to inspect the filled '*_cr' columns.
df.tail()

Unnamed: 0,date,retail,retail_cr,retail_days,foreigner,foreigner_cr,foreigner_days,institution,institution_cr,institution_days,...,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time
922,2022-07-11,-10749.0,-0.475121,,11123.0,-0.287763,,-719.0,-1.186608,,...,,180000.0,182000.0,179000.0,179000.0,0.56,119200.0,-0.189832,,
923,2022-07-12,868.0,-1.080752,,3282.0,-0.704936,,-3835.0,4.333797,,...,,180000.0,181500.0,177000.0,179000.0,0.0,96440.0,-0.19094,,
924,2022-07-13,-40414.0,-47.559908,,18096.0,4.513711,,22755.0,-6.933507,,...,,180000.0,185000.0,178500.0,183500.0,2.51,172566.0,0.789361,,
925,2022-07-14,13810.0,-1.341713,,9619.0,-0.468446,,-23410.0,-2.028785,,...,,183000.0,184500.0,180500.0,182500.0,-0.54,142369.0,-0.174988,,
926,2022-07-15,0.0,-1.0,,0.0,-1.0,,0.0,-1.0,,...,,183000.0,184500.0,178500.0,183500.0,0.55,115240.0,-0.190554,,
