In [1]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [2]:
# Notebook-wide pandas display limits: rows shown, columns shown, and
# number of decimal places printed for floats.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

In [3]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday in a date range as sorted strings.

    Args:
        start_date: First day of the range (anything ``pd.date_range`` accepts).
        end_date: Last day of the range.

    Returns:
        list[str]: Dates formatted as ``'YYYY-MM-DD'``, in ascending order.
    """
    weekend_days = []
    for weekly_anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=weekly_anchor)
        weekend_days.extend(stamps.strftime('%Y-%m-%d').tolist())
    return sorted(weekend_days)

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the sorted non-trading-day lists for the Korean and US markets.

    The CSV at ``filename`` must have ``KOR`` and ``USA`` columns. Their
    non-null entries are merged with every Saturday and Sunday between
    ``start_date`` and ``end_date`` (from ``allsatsundays``), de-duplicated
    and sorted.

    Args:
        filename: Path or URL readable by ``pd.read_csv``.
        start_date: Start of the weekend range.
        end_date: End of the weekend range.

    Returns:
        tuple[list, list]: ``(kor_days, usa_days)``.
    """
    holiday_table = pd.read_csv(filename)

    def _listed_days(country):
        # Keep only the non-NaN entries of the column, as a plain list.
        column = holiday_table[country]
        return column[column.notnull()].values.tolist()

    kor_listed = _listed_days('KOR')
    usa_listed = _listed_days('USA')

    weekends = allsatsundays(start_date, end_date)

    kor_days = sorted(set(kor_listed + weekends))
    usa_days = sorted(set(usa_listed + weekends))
    return kor_days, usa_days

def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that fall on non-holiday dates.

    Rows whose ``date`` value appears in ``holydays`` are removed, then rows
    in which every column after the first is NaN are removed as well.

    Args:
        filename: Path, URL or buffer readable by ``pd.read_csv``; must
            contain a ``date`` column.
        holydays: Collection of date values to exclude.

    Returns:
        pd.DataFrame: The filtered rows, keeping their original index labels.
    """
    raw = pd.read_csv(filename)
    on_holiday = raw['date'].isin(holydays)
    value_columns = raw.columns[1:]
    return raw[~on_holiday].dropna(subset=value_columns, how='all')

In [4]:
def classify_data(df, col_name):
    """Classify each listed column into 9 classes by distance from its mean.

    For every column in ``col_name`` the values are binned with ``pd.cut``
    at ``mu - 3s, mu - 2s, mu - s, mu - 0.5s, mu + 0.5s, mu + s, mu + 2s,
    mu + 3s`` (``mu`` = column mean, ``s`` = column standard deviation) and
    labelled 1 (lowest) to 9 (highest). The result is inserted as
    ``<col>_class`` immediately to the right of ``<col>``.

    The outermost bins are open-ended (-inf / +inf). Using the column
    min/max as outer edges made ``pd.cut`` fail with non-monotonic bins
    whenever the data did not reach beyond 3 standard deviations, and left
    the minimum value unclassified.

    Args:
        df: DataFrame to annotate; modified in place.
        col_name: Iterable of column names to classify (any order).

    Returns:
        pd.DataFrame: The same ``df`` object with the class columns added.

    Raises:
        ValueError: If a column's standard deviation is not positive (the
            bin edges would not be strictly increasing), or if the
            ``<col>_class`` column already exists.
    """
    class_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    sigma_steps = [-3, -2, -1, -0.5, 0.5, 1, 2, 3]
    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()
        if not sigma > 0:  # also catches NaN (e.g. fewer than two values)
            raise ValueError(
                "cannot classify column %r: standard deviation is %r" % (col, sigma))
        cut_div = [-np.inf] + [mu + step * sigma for step in sigma_steps] + [np.inf]
        classes = pd.cut(df[col], cut_div, labels=class_labels)
        # Look up the position on every pass so earlier insertions and the
        # order of col_name cannot shift the class column away from its source.
        df.insert(df.columns.get_loc(col) + 1, col + '_class', classes)

    return df

In [5]:
def consecutive_count(df_sr, idx, window=30):
    """Count the current buy(+)/sell(-) streak ending at position ``idx``.

    Looks backwards from ``idx`` over at most ``window`` entries and counts
    how many consecutive entries share the sign of the entry at ``idx``.
    A value counts as positive only if it is ``> 0``; everything else
    (zero, negative, NaN) counts as negative.

    Args:
        df_sr: Series of values, sliced by position.
        idx: Position of the most recent entry to include.
        window: Maximum number of entries inspected (default 30, the value
            that was previously hard-coded).

    Returns:
        int: Streak length, positive for a buy streak and negative for a
        sell streak; 0 only when the inspected slice is empty.
    """
    # Positional slice of at most `window` entries ending at idx, newest first.
    first_pos = max(idx - window + 1, 0)
    newest_first = df_sr[first_pos:idx + 1][::-1]

    run_sign = 0
    run_length = 0
    for value in newest_first:
        sign = 1 if value > 0 else -1
        if run_length == 0:
            run_sign = sign  # the newest entry decides the streak direction
        elif sign != run_sign:
            break
        run_length += 1

    return run_sign * run_length

In [6]:
def consequtive_days(df, col_names):
    """Add a ``<col>_days`` streak column for each listed column.

    Each value is first reduced to +1 (value > 0) or -1 (anything else);
    ``consecutive_count`` is then evaluated at every row position and the
    results are stored as ``<col>_days`` (consecutive buy/sell trading days).

    Args:
        df: DataFrame to annotate; modified in place.
        col_names: Iterable of column names to process.

    Returns:
        pd.DataFrame: The same ``df`` object with the new columns.
    """
    for name in col_names:
        signs = df[name].apply(lambda value: 1 if (value > 0) else -1)
        df[name + '_days'] = [consecutive_count(signs, position)
                              for position in range(len(signs))]

    return df

In [7]:
def get_weight_ratio(df, investors, investor_sum, magnify=10):
    """Add each investor's share of the total absolute buy/sell volume.

    A ``total`` column is written as the row-wise sum of the absolute
    values of the ``investor_sum`` columns. For every column in
    ``investors`` a ``<col>_ratio`` column is then written as
    ``df[col] / total * magnify``.

    Args:
        df: DataFrame to annotate; modified in place.
        investors: Column names that receive a ``_ratio`` column.
        investor_sum: List of column names whose absolute values make up
            ``total``.
        magnify: Scale factor applied to each ratio (default 10, the value
            that was previously hard-coded).

    Returns:
        pd.DataFrame: The same ``df`` object with the new columns.
    """
    # The body used to read an undefined name `investors_sum` instead of
    # the `investor_sum` parameter.
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col + '_ratio'] = df[col] / df['total'] * magnify

    return df

In [8]:
def get_change_rate(df, col_names):
    """Add a ``<col>_cr`` column holding the row-over-row relative change.

    For each listed column the new value is
    ``(current - previous) / previous``, where ``previous`` is the column
    shifted down by one row; the first row is therefore NaN.

    Args:
        df: DataFrame to annotate; modified in place.
        col_names: Iterable of column names to process.

    Returns:
        pd.DataFrame: The same ``df`` object with the new columns.
    """
    for name in col_names:
        current = df[name]
        previous = current.shift(1)
        df[name + '_cr'] = (current - previous) / previous

    return df

In [9]:
# Fill NaN with the preceding value (forward fill) unless method='bfill' is given.
def fillna_with_bfill(df, col_names, method='ffill'):
    """Fill missing values in the listed columns from neighbouring rows.

    Despite the function name, the default is a forward fill: each NaN takes
    the last non-NaN value above it. Pass ``method='bfill'`` to take the
    next non-NaN value below instead.

    The filled column is assigned back to ``df``. The previous
    ``df[col].fillna(..., inplace=True)`` form acted on a temporary Series,
    which is deprecated and leaves ``df`` unchanged when pandas
    Copy-on-Write is active.

    Args:
        df: DataFrame to fill; modified in place.
        col_names: Iterable of column names to fill.
        method: ``'ffill'`` (default) or ``'bfill'``.

    Returns:
        pd.DataFrame: The same ``df`` object.

    Raises:
        ValueError: If ``method`` is neither ``'ffill'`` nor ``'bfill'``.
    """
    if method not in ('ffill', 'bfill'):
        raise ValueError("method must be 'ffill' or 'bfill', got %r" % (method,))

    for col in col_names:
        column = df[col]
        df[col] = column.ffill() if method == 'ffill' else column.bfill()

    return df

In [10]:
# Holiday calendar CSV (read by get_stock_market_holydays) and the date range
# over which weekends are added to the holiday lists.
filename = (
    'https://raw.githubusercontent.com/fasthill/'
    'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
)
start_date, end_date = '2020-01-01', '2022-12-31'

# Sorted non-trading days (listed holidays plus weekends) for each market.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date)

In [34]:
# Load the common market-data CSV, excluding rows dated on a US non-trading
# day (usa_days). Note that `filename` is rebound here; it previously held the
# holiday-calendar URL.
filename = '../data/data_common.csv'
df = get_seq_data(filename, usa_days)

In [35]:
# Columns whose missing values are filled from the previous row
# (fillna_with_bfill forward-fills, despite its name).
fill_columns =  ['cpi', 'cpi_anticipated', 'cpi_previous',
                 'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'fed_rate_ann', 'fed_rate_imp', 
                 'bok_rate', 'bok_rate_fore', 'bok_rate_prev', 'bok_rate_ann', 'bok_rate_imp' ]
# Columns in which a missing value is replaced by 0 instead of being carried forward.
# NOTE(review): these look like 0/1 event-day flags (see the displayed rows) — confirm.
futures_columns =  ['fu_usa_date', 'op_usa_date', 'qw_usa_day', 
                 'fu_kor_date', 'op_kor_date', 'dw_kor_day']
df = fillna_with_bfill(df, fill_columns)
df[futures_columns] = df[futures_columns].replace(np.nan, 0)

In [36]:
# Preview the last rows after the fills; rows with no index values are still present here.
df.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1024,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.10%,8.80%,8.60%,,,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,1.0,0.0,0.0,0.0,0.0
1044,2022-11-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.10%,8.80%,8.60%,,,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,1.0,0.0
1052,2022-11-18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.10%,8.80%,8.60%,,,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,1.0,0.0,0.0,0.0,0.0
1072,2022-12-08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.10%,8.80%,8.60%,,,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,1.0,1.0,1.0
1080,2022-12-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.10%,8.80%,8.60%,,,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,1.0,1.0,1.0,0.0,0.0,0.0


In [37]:
# Remove, in place, every row where 'ixic' or 'dji' is NaN (dropna defaults to how='any').
df.dropna(subset=['ixic', 'dji'], inplace=True) # drop empty rows

In [38]:
# Preview the last rows again, now that rows without ixic/dji values are gone.
df.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
919,2022-07-08,11635.31,0.12%,12152.0,0.11%,31339.2,-0.14%,31310.0,-0.18%,3899.38,-0.08%,3901.25,-0.10%,2350.61,0.70%,766.48,1.12%,104.79,2.01%,106.821,-0.13%,1294.68,-0.23%,3.08,2.67%,3.1047,2.66%,1.963,0.77%,3.365,2.25%,3.318,0.97%,24.64,-5.52%,29.0,2618.0,0.47%,8.60%,8.30%,8.30%,1742.35,0.0052,1.75%,1.50%,1.00%,1.0,1.0,1.75,,,,,0.0,0.0,0.0,0.0,0.0,0.0
922,2022-07-11,11372.6,-2.26%,11884.25,-2.20%,31175.52,-0.52%,31140.0,-0.54%,3854.43,-1.15%,3856.75,-1.14%,2340.27,-0.44%,767.04,0.07%,103.16,-1.20%,107.829,0.94%,1310.81,1.25%,2.993,-3.37%,3.0697,-1.72%,2.061,7.20%,3.385,0.74%,3.308,-0.30%,26.17,6.21%,27.0,2553.7,-2.46%,8.60%,8.30%,8.30%,1733.39,0.0046,1.75%,1.50%,1.00%,1.0,1.0,1.75,,,,,0.0,0.0,0.0,0.0,0.0,0.0
923,2022-07-12,11264.73,-0.95%,11779.0,-0.89%,30985.02,-0.61%,30966.0,-0.56%,3818.8,-0.92%,3823.75,-0.86%,2317.76,-0.96%,750.78,-2.12%,94.41,-8.48%,107.912,0.08%,1305.21,-0.43%,2.971,-0.74%,3.0512,-0.60%,2.205,6.99%,3.355,-0.89%,3.259,-1.48%,27.29,4.28%,25.0,2558.2,0.18%,8.60%,8.30%,8.30%,1725.42,-0.0038,1.75%,1.50%,1.00%,1.0,1.0,1.75,,,,,0.0,0.0,0.0,0.0,0.0,0.0
924,2022-07-13,11247.58,-0.15%,11762.25,-0.14%,30771.54,-0.69%,30758.0,-0.67%,3801.78,-0.45%,3804.5,-0.50%,2328.61,0.47%,763.18,1.65%,95.9,1.58%,107.756,-0.14%,1304.73,-0.04%,2.935,-1.18%,3.1485,3.19%,2.376,7.76%,3.303,-1.55%,3.177,-2.52%,26.82,-1.72%,23.0,2577.4,0.75%,9.10%,8.80%,8.60%,1732.08,0.0119,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0
925,2022-07-14,11251.18,0.03%,11797.5,0.30%,30630.01,-0.46%,30604.0,-0.50%,3790.38,-0.30%,3793.25,-0.30%,2322.32,-0.27%,766.08,0.38%,96.65,0.78%,108.407,0.60%,1316.41,0.90%,2.958,0.75%,3.1279,-0.65%,2.399,0.97%,3.305,0.06%,3.215,1.20%,26.4,-1.57%,23.0,2626.9,1.92%,9.10%,8.80%,8.60%,1711.78,-0.0009,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
# Strip every '%' character from string cells across the whole frame (regex replace, in place).
# NOTE(review): this only removes the sign; the affected cells presumably stay
# strings until cast — confirm a numeric conversion happens later.
df.replace('%', '', regex=True, inplace = True)

In [40]:
# Preview the first rows to check that the '%' signs were removed.
df.head()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1,2020-01-02,9092.19,1.33,8891.75,1.31,28868.8,1.16,28841.0,0.96,3257.85,0.84,3259.0,0.66,2175.17,-1.02,674.02,0.63,61.18,-0.24,96.525,0.49,1157.35,0.29,1.877,-2.29,1.571,0.0,1.533,-1.5,1.638,-2.03,1.335,-2.2,12.47,-9.51,,1887.9,2.07,2.1,2.0,1.8,1528.95,0.0069,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-03,9020.77,-0.79,8810.0,-0.92,28634.88,-0.81,28602.0,-0.83,3234.85,-0.71,3235.5,-0.72,2176.46,0.06,669.93,-0.61,63.05,3.06,96.521,0.0,1164.95,0.66,1.793,-4.46,1.5326,-2.44,1.52,-0.85,1.552,-5.25,1.29,-3.37,14.02,12.43,,1854.0,-1.8,2.1,2.0,1.8,1552.24,0.0152,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-01-06,9071.47,0.56,8847.5,0.43,28703.38,0.24,28642.0,0.14,3246.28,0.35,3243.5,0.25,2155.07,-0.98,655.31,-2.18,63.27,0.35,96.35,-0.18,1166.94,0.17,1.809,2.37,1.5466,2.11,1.538,0.49,1.537,-1.47,1.299,0.7,13.85,-1.21,,1834.7,-1.04,2.1,2.0,1.8,1563.83,0.0075,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-01-07,9068.58,-0.03,8853.0,0.06,28583.68,-0.42,28526.0,-0.4,3237.18,-0.28,3235.25,-0.25,2175.54,0.95,663.44,1.24,62.7,-0.9,96.702,0.37,1167.3,0.03,1.811,0.1,1.5384,-0.53,1.551,0.85,1.616,5.14,1.343,3.39,13.79,-0.43,,1867.3,1.78,2.1,2.0,1.8,1595.24,0.0201,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-01-08,9129.24,0.67,8944.5,1.03,28745.09,0.56,28770.0,0.86,3253.05,0.49,3260.25,0.77,2151.31,-1.11,640.94,-3.39,59.61,-4.93,96.996,0.3,1162.25,-0.43,1.874,3.48,1.5846,3.0,1.544,-0.45,1.628,0.74,1.322,-1.56,13.45,-2.47,,1867.6,0.02,2.1,2.0,1.8,1557.89,-0.0234,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Drop, in place, every column that holds no data at all (all values NaN).
df.dropna(axis=1, how='all', inplace=True)

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1,2020-01-02,9092.19,1.33,8891.75,1.31,28868.80,1.16,28841.0,0.96,3257.85,0.84,3259.00,0.66,61.18,-0.24,96.525,0.49,1157.35,0.29,1.877,-2.29,1.638,-2.03,1.335,-2.20,12.47,-9.51,1887.9,2.07,2.10,2.00,1.80,1528.95,0.0069,1.75,1.75,1.75,1.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-03,9020.77,-0.79,8810.00,-0.92,28634.88,-0.81,28602.0,-0.83,3234.85,-0.71,3235.50,-0.72,63.05,3.06,96.521,0.00,1164.95,0.66,1.793,-4.46,1.552,-5.25,1.290,-3.37,14.02,12.43,1854.0,-1.80,2.10,2.00,1.80,1552.24,0.0152,1.75,1.75,1.75,1.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-01-06,9071.47,0.56,8847.50,0.43,28703.38,0.24,28642.0,0.14,3246.28,0.35,3243.50,0.25,63.27,0.35,96.350,-0.18,1166.94,0.17,1.809,2.37,1.537,-1.47,1.299,0.70,13.85,-1.21,1834.7,-1.04,2.10,2.00,1.80,1563.83,0.0075,1.75,1.75,1.75,1.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-01-07,9068.58,-0.03,8853.00,0.06,28583.68,-0.42,28526.0,-0.40,3237.18,-0.28,3235.25,-0.25,62.70,-0.90,96.702,0.37,1167.30,0.03,1.811,0.10,1.616,5.14,1.343,3.39,13.79,-0.43,1867.3,1.78,2.10,2.00,1.80,1595.24,0.0201,1.75,1.75,1.75,1.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-01-08,9129.24,0.67,8944.50,1.03,28745.09,0.56,28770.0,0.86,3253.05,0.49,3260.25,0.77,59.61,-4.93,96.996,0.30,1162.25,-0.43,1.874,3.48,1.628,0.74,1.322,-1.56,13.45,-2.47,1867.6,0.02,2.10,2.00,1.80,1557.89,-0.0234,1.75,1.75,1.75,1.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,2022-07-08,11635.31,0.12,12152.00,0.11,31339.20,-0.14,31310.0,-0.18,3899.38,-0.08,3901.25,-0.10,104.79,2.01,106.821,-0.13,1294.68,-0.23,3.080,2.67,3.365,2.25,3.318,0.97,24.64,-5.52,2618.0,0.47,8.60,8.30,8.30,1742.35,0.0052,1.75,1.50,1.00,1.0,1.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0
922,2022-07-11,11372.60,-2.26,11884.25,-2.20,31175.52,-0.52,31140.0,-0.54,3854.43,-1.15,3856.75,-1.14,103.16,-1.20,107.829,0.94,1310.81,1.25,2.993,-3.37,3.385,0.74,3.308,-0.30,26.17,6.21,2553.7,-2.46,8.60,8.30,8.30,1733.39,0.0046,1.75,1.50,1.00,1.0,1.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0
923,2022-07-12,11264.73,-0.95,11779.00,-0.89,30985.02,-0.61,30966.0,-0.56,3818.80,-0.92,3823.75,-0.86,94.41,-8.48,107.912,0.08,1305.21,-0.43,2.971,-0.74,3.355,-0.89,3.259,-1.48,27.29,4.28,2558.2,0.18,8.60,8.30,8.30,1725.42,-0.0038,1.75,1.50,1.00,1.0,1.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0
924,2022-07-13,11247.58,-0.15,11762.25,-0.14,30771.54,-0.69,30758.0,-0.67,3801.78,-0.45,3804.50,-0.50,95.90,1.58,107.756,-0.14,1304.73,-0.04,2.935,-1.18,3.303,-1.55,3.177,-2.52,26.82,-1.72,2577.4,0.75,9.10,8.80,8.60,1732.08,0.0119,1.75,1.50,1.00,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0
