In [1]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [2]:
# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', 20)       # truncate long frames to 20 rows
pd.set_option('display.max_columns', 100)   # show up to 100 columns before eliding
pd.set_option('display.precision', 4)       # floats rendered with 4 decimals

In [3]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday in [start_date, end_date].

    Both bounds are passed straight to ``pd.date_range``. The result is a
    list of ``'YYYY-MM-DD'`` strings sorted in ascending order.
    """
    weekend_days = []
    for weekly_anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=weekly_anchor)
        weekend_days.extend(stamps.strftime('%Y-%m-%d').tolist())
    # ISO-formatted strings sort chronologically.
    return sorted(weekend_days)

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the closed-day lists for the Korean and US markets.

    Reads the CSV at ``filename``, keeps the non-null entries of its ``KOR``
    and ``USA`` columns, and merges each with every Saturday and Sunday
    between ``start_date`` and ``end_date`` (via ``allsatsundays``).

    Returns:
        ``(kor_days, usa_days)`` - two sorted lists without duplicates.
    """
    calendar = pd.read_csv(filename)

    # Keep only the non-null entries of each market column, as plain lists.
    listed = {
        market: calendar[market].dropna().values.tolist()
        for market in ('KOR', 'USA')
    }

    weekends = allsatsundays(start_date, end_date)

    # Union with the weekends, de-duplicate, and sort.
    kor_days, usa_days = (
        sorted(set(listed[market] + weekends)) for market in ('KOR', 'USA')
    )
    return kor_days, usa_days

def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that fall on open-market days.

    Rows whose ``date`` value appears in ``holydays`` are removed, then rows
    in which every column after the first is NaN are removed as well.
    The original row index is kept (it is not reset).
    """
    raw = pd.read_csv(filename)
    open_day = ~raw['date'].isin(holydays)
    value_columns = raw.columns[1:]  # everything except the first column
    kept = raw.loc[open_day].dropna(subset=value_columns, how='all')
    # Hand back an independent frame so later assignments do not warn.
    return kept.copy()

In [4]:
def classify_data(df, col_name):
    """Add a 9-level sigma-band class column next to each requested column.

    For every column in ``col_name`` the values are binned by their distance
    from the column mean, using the inner edges mu-3s, mu-2s, mu-s, mu-0.5s,
    mu+0.5s, mu+s, mu+2s and mu+3s (s = sample standard deviation). The nine
    resulting right-closed bins are labelled 1..9, and the labels are inserted
    as ``<col>_class`` immediately after ``<col>``.

    Args:
        df: DataFrame to annotate; it is modified in place.
        col_name: iterable of column names to classify (any order).
            Column names are assumed to be unique.

    Returns:
        The same ``df`` object, with the new ``*_class`` columns inserted.

    Raises:
        ValueError: from ``DataFrame.insert`` if ``<col>_class`` already
            exists, or from ``pd.cut`` if the bin edges are not unique
            (for example a column with zero standard deviation).
    """
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    sigma_steps = (-3, -2, -1, -0.5, 0.5, 1, 2, 3)

    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()

        # Open-ended outer bins: using the column min/max as outer edges
        # breaks monotonicity whenever the data lie within 3 sigma, and
        # leaves the minimum itself unclassified (bins are right-closed).
        cut_div = [-np.inf] + [mu + k * sigma for k in sigma_steps] + [np.inf]
        classes = pd.cut(df[col], cut_div, labels=labels)

        # Look the position up on the live frame so earlier insertions and
        # arbitrary ordering of col_name are both handled.
        df.insert(df.columns.get_loc(col) + 1, col + '_class', classes)

    return df

In [5]:
def consecutive_count(df_sr, idx):
    """Signed length of the buy/sell streak that ends at position ``idx``.

    Looks backwards from ``idx`` over at most 30 rows of ``df_sr``. Each value
    is reduced to a direction: ``+1`` when it is ``> 0``, otherwise ``-1``
    (so zeros and NaN count as negative). The streak is the number of
    consecutive rows, starting at ``idx`` and going back, that share the
    direction of the row at ``idx``.

    Returns:
        A positive count for a buying streak, a negative count for a selling
        streak, or ``0`` when the window is empty.
    """
    # Window of up to 30 rows ending at idx, newest row first.
    first = idx - 29 if idx >= 29 else None
    newest_first = df_sr[first:idx + 1][::-1]
    directions = [1 if (value > 0) else -1 for value in newest_first]

    if not directions:
        return 0

    latest = directions[0]
    streak = 0
    for direction in directions:
        if direction != latest:
            break
        streak += 1

    return streak * latest

In [6]:
def consequtive_days(df, col_names):
    """Add a ``<col>_days`` streak column for every column in ``col_names``.

    Each value is first reduced to ``+1`` (``> 0``) or ``-1`` (otherwise);
    ``consecutive_count`` is then evaluated at every row position to get the
    signed number of consecutive buy (+) or sell (-) days ending at that row.
    ``df`` is modified in place and returned.
    """
    for name in col_names:
        direction = df[name].apply(lambda v: 1 if (v > 0) else -1)
        # Signed length of the buy/sell streak ending at each row.
        df[name + '_days'] = [
            consecutive_count(direction, pos) for pos in range(len(direction))
        ]

    return df

In [7]:
def get_weight_ratio(df, investors, investor_sum):
    """Add each investor's share of the total absolute trading volume.

    Args:
        df: DataFrame to annotate; it is modified in place.
        investors: columns for which a ``<col>_ratio`` column is produced.
        investor_sum: columns whose absolute values are summed row-wise into
            the new ``total`` column (the denominator).

    Returns:
        The same ``df`` with ``total`` and one ``<col>_ratio`` column per
        investor, where ratio = value / total * 10. Rows whose total is zero
        get NaN or inf from the division.
    """
    # Use the investor_sum argument, not a same-looking notebook global.
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col + '_ratio'] = df[col] / df['total'] * 10  # magnify
    return df

In [8]:
def get_change_rate(df, col_names):
    """Add a ``<col>_cr`` column holding the row-over-row relative change.

    For each column the value is ``(current - previous) / previous``, where
    ``previous`` is the value one row above; the first row is therefore NaN.
    ``df`` is modified in place and returned.
    """
    for name in col_names:
        previous = df[name].shift(1)
        df[name + '_cr'] = df[name].sub(previous).div(previous)

    return df

In [9]:
# Fill NaN with the preceding value.
def fillna_with_bfill(df, col_names):
    """Forward-fill missing values in the given columns.

    Despite the function name, this is a forward fill: every NaN takes the
    most recent non-null value above it in the same column. Leading NaNs
    (nothing above them) stay NaN.

    Args:
        df: DataFrame to update; it is modified in place.
        col_names: columns to fill.

    Returns:
        The same ``df`` object.
    """
    for col in col_names:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: that chained call acts on a temporary under copy-on-write,
        # and fillna(method=...) is deprecated.
        df[col] = df[col].ffill()

    return df

In [11]:
# Holiday calendar source. Remote (GitHub raw) alternative, currently disabled:
# filename = 'https://raw.githubusercontent.com/fasthill/'\
#             'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
filename = '../data/stock_market_holydays/stock_market_holidays.csv'

# Date window over which weekends are enumerated.
start_date, end_date = '2020-01-01', '2023-12-31'

# Closed-day lists per market: listed holidays plus every weekend in the window.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date)

In [13]:
# Load the common market data set, keeping only rows whose date is not in
# usa_days (the US closed-day list). Note that `filename` is rebound here.
filename = '../data/data_common.csv'
df = get_seq_data(filename, usa_days)

In [14]:
# Columns passed to fillna_with_bfill: each NaN takes the most recent earlier value.
fill_columns =  ['cpi', 'cpi_anticipated', 'cpi_previous',
                 'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'fed_rate_ann', 'fed_rate_imp', 
                 'bok_rate', 'bok_rate_fore', 'bok_rate_prev', 'bok_rate_ann', 'bok_rate_imp' ]
# Columns whose NaN entries are replaced with 0.
# NOTE(review): these look like futures/options event-day flags - confirm.
futures_columns =  ['fu_usa_date', 'op_usa_date', 'qw_usa_day', 
                 'fu_kor_date', 'op_kor_date', 'dw_kor_day']
df = fillna_with_bfill(df, fill_columns)
df[futures_columns] = df[futures_columns].replace(np.nan, 0)

In [15]:
# Preview the last rows after the fill step.
df.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1087,2022-12-23,10497.86,0.21%,11075.25,0.19%,33204.26,0.53%,33375.0,0.50%,3844.82,0.59%,3869.75,0.53%,2313.69,-1.83%,691.25,-3.32%,79.34,1.13%,104.01,-0.11%,1279.55,-0.94%,3.751,1.76%,4.3275,1.11%,4.335,0.07%,3.552,1.51%,3.743,1.13%,20.87,-5.01%,38.0,2535.5,0.09%,7.10%,7.30%,7.70%,1798.58,0.0029,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1091,2022-12-27,10353.23,-1.38%,10915.5,-2.16%,33241.69,0.11%,33416.0,-0.36%,3829.25,-0.40%,3855.0,-0.99%,2332.79,0.68%,704.19,1.37%,79.77,-1.48%,103.893,-0.11%,1271.79,-0.17%,3.849,2.81%,4.3827,1.18%,4.311,-1.32%,3.612,-0.77%,3.761,-0.37%,21.65,3.74%,36.0,2490.2,-1.79%,7.10%,7.30%,7.70%,1804.88,0.0016,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1092,2022-12-28,10213.29,-1.35%,10772.75,-1.31%,32875.01,-1.10%,33046.0,-1.11%,3783.22,-1.20%,3807.5,-1.23%,2280.45,-2.24%,692.37,-1.68%,78.6,-1.47%,104.182,0.28%,1274.37,0.20%,3.886,0.98%,4.3574,-0.58%,4.457,3.39%,3.675,1.74%,3.779,0.48%,22.14,2.26%,31.0,2453.5,-1.47%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1093,2022-12-29,10478.09,2.59%,11032.75,2.41%,33221.06,1.05%,33375.0,1.00%,3849.28,1.75%,3871.75,1.69%,2236.4,-1.93%,679.29,-1.89%,78.61,0.01%,103.586,-0.57%,1260.85,-1.06%,3.82,-1.71%,4.3656,0.19%,4.423,-0.76%,3.723,1.31%,3.809,0.79%,21.44,-3.16%,36.0,2534.9,3.32%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1094,2022-12-30,10466.48,-0.11%,11022.25,-0.10%,33147.28,-0.22%,33285.0,-0.27%,3839.5,-0.25%,3861.0,-0.28%,,,,,80.47,2.37%,103.269,-0.31%,1260.92,0.01%,3.879,1.53%,4.4279,1.43%,4.405,-0.41%,3.735,0.32%,3.791,-0.47%,21.67,1.07%,37.0,2532.1,-0.11%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Drop rows where either 'ixic' or 'dji' is missing (dropna's default is how='any').
df = df.dropna(subset=['ixic', 'dji'])

In [17]:
# Preview the last rows after dropping rows without 'ixic'/'dji' values.
df.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1087,2022-12-23,10497.86,0.21%,11075.25,0.19%,33204.26,0.53%,33375.0,0.50%,3844.82,0.59%,3869.75,0.53%,2313.69,-1.83%,691.25,-3.32%,79.34,1.13%,104.01,-0.11%,1279.55,-0.94%,3.751,1.76%,4.3275,1.11%,4.335,0.07%,3.552,1.51%,3.743,1.13%,20.87,-5.01%,38.0,2535.5,0.09%,7.10%,7.30%,7.70%,1798.58,0.0029,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1091,2022-12-27,10353.23,-1.38%,10915.5,-2.16%,33241.69,0.11%,33416.0,-0.36%,3829.25,-0.40%,3855.0,-0.99%,2332.79,0.68%,704.19,1.37%,79.77,-1.48%,103.893,-0.11%,1271.79,-0.17%,3.849,2.81%,4.3827,1.18%,4.311,-1.32%,3.612,-0.77%,3.761,-0.37%,21.65,3.74%,36.0,2490.2,-1.79%,7.10%,7.30%,7.70%,1804.88,0.0016,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1092,2022-12-28,10213.29,-1.35%,10772.75,-1.31%,32875.01,-1.10%,33046.0,-1.11%,3783.22,-1.20%,3807.5,-1.23%,2280.45,-2.24%,692.37,-1.68%,78.6,-1.47%,104.182,0.28%,1274.37,0.20%,3.886,0.98%,4.3574,-0.58%,4.457,3.39%,3.675,1.74%,3.779,0.48%,22.14,2.26%,31.0,2453.5,-1.47%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1093,2022-12-29,10478.09,2.59%,11032.75,2.41%,33221.06,1.05%,33375.0,1.00%,3849.28,1.75%,3871.75,1.69%,2236.4,-1.93%,679.29,-1.89%,78.61,0.01%,103.586,-0.57%,1260.85,-1.06%,3.82,-1.71%,4.3656,0.19%,4.423,-0.76%,3.723,1.31%,3.809,0.79%,21.44,-3.16%,36.0,2534.9,3.32%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1094,2022-12-30,10466.48,-0.11%,11022.25,-0.10%,33147.28,-0.22%,33285.0,-0.27%,3839.5,-0.25%,3861.0,-0.28%,,,,,80.47,2.37%,103.269,-0.31%,1260.92,0.01%,3.879,1.53%,4.4279,1.43%,4.405,-0.41%,3.735,0.32%,3.791,-0.47%,21.67,1.07%,37.0,2532.1,-0.11%,7.10%,7.30%,7.70%,,,4.50%,4.50%,4.00%,1.0,1.0,3.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Remove every '%' character from string cells across the whole frame.
# NOTE(review): this line only edits text; confirm the affected columns are
# converted to numeric dtypes before they are used in arithmetic.
df = df.replace('%', '', regex=True)

In [19]:
# Preview the first rows after stripping the '%' characters.
df.head()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1,2020-01-02,9092.19,1.33,8891.75,1.31,28868.8,1.16,28841.0,0.96,3257.85,0.84,3259.0,0.66,2175.17,-1.02,674.02,0.63,61.18,-0.24,96.525,0.49,1157.35,0.29,1.877,-2.29,1.571,0.0,1.533,-1.5,1.638,-2.03,1.335,-2.2,12.47,-9.51,,1887.9,2.07,2.1,2.0,1.8,1528.95,0.0069,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-03,9020.77,-0.79,8810.0,-0.92,28634.88,-0.81,28602.0,-0.83,3234.85,-0.71,3235.5,-0.72,2176.46,0.06,669.93,-0.61,63.05,3.06,96.521,0.0,1164.95,0.66,1.793,-4.46,1.5326,-2.44,1.52,-0.85,1.552,-5.25,1.29,-3.37,14.02,12.43,,1854.0,-1.8,2.1,2.0,1.8,1552.24,0.0152,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-01-06,9071.47,0.56,8847.5,0.43,28703.38,0.24,28642.0,0.14,3246.28,0.35,3243.5,0.25,2155.07,-0.98,655.31,-2.18,63.27,0.35,96.35,-0.18,1166.94,0.17,1.809,2.37,1.5466,2.11,1.538,0.49,1.537,-1.47,1.299,0.7,13.85,-1.21,,1834.7,-1.04,2.1,2.0,1.8,1563.83,0.0075,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-01-07,9068.58,-0.03,8853.0,0.06,28583.68,-0.42,28526.0,-0.4,3237.18,-0.28,3235.25,-0.25,2175.54,0.95,663.44,1.24,62.7,-0.9,96.702,0.37,1167.3,0.03,1.811,0.1,1.5384,-0.53,1.551,0.85,1.616,5.14,1.343,3.39,13.79,-0.43,,1867.3,1.78,2.1,2.0,1.8,1595.24,0.0201,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-01-08,9129.24,0.67,8944.5,1.03,28745.09,0.56,28770.0,0.86,3253.05,0.49,3260.25,0.77,2151.31,-1.11,640.94,-3.39,59.61,-4.93,96.996,0.3,1162.25,-0.43,1.874,3.48,1.5846,3.0,1.544,-0.45,1.628,0.74,1.322,-1.56,13.45,-2.47,,1867.6,0.02,2.1,2.0,1.8,1557.89,-0.0234,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Drop columns in which every value is NaN.
df = df.dropna(axis=1, how='all')

In [21]:
# Preview the last rows after removing the all-NaN columns.
df.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1087,2022-12-23,10497.86,0.21,11075.25,0.19,33204.26,0.53,33375.0,0.5,3844.82,0.59,3869.75,0.53,2313.69,-1.83,691.25,-3.32,79.34,1.13,104.01,-0.11,1279.55,-0.94,3.751,1.76,4.3275,1.11,4.335,0.07,3.552,1.51,3.743,1.13,20.87,-5.01,38.0,2535.5,0.09,7.1,7.3,7.7,1798.58,0.0029,4.5,4.5,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1091,2022-12-27,10353.23,-1.38,10915.5,-2.16,33241.69,0.11,33416.0,-0.36,3829.25,-0.4,3855.0,-0.99,2332.79,0.68,704.19,1.37,79.77,-1.48,103.893,-0.11,1271.79,-0.17,3.849,2.81,4.3827,1.18,4.311,-1.32,3.612,-0.77,3.761,-0.37,21.65,3.74,36.0,2490.2,-1.79,7.1,7.3,7.7,1804.88,0.0016,4.5,4.5,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1092,2022-12-28,10213.29,-1.35,10772.75,-1.31,32875.01,-1.1,33046.0,-1.11,3783.22,-1.2,3807.5,-1.23,2280.45,-2.24,692.37,-1.68,78.6,-1.47,104.182,0.28,1274.37,0.2,3.886,0.98,4.3574,-0.58,4.457,3.39,3.675,1.74,3.779,0.48,22.14,2.26,31.0,2453.5,-1.47,7.1,7.3,7.7,,,4.5,4.5,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1093,2022-12-29,10478.09,2.59,11032.75,2.41,33221.06,1.05,33375.0,1.0,3849.28,1.75,3871.75,1.69,2236.4,-1.93,679.29,-1.89,78.61,0.01,103.586,-0.57,1260.85,-1.06,3.82,-1.71,4.3656,0.19,4.423,-0.76,3.723,1.31,3.809,0.79,21.44,-3.16,36.0,2534.9,3.32,7.1,7.3,7.7,,,4.5,4.5,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1094,2022-12-30,10466.48,-0.11,11022.25,-0.1,33147.28,-0.22,33285.0,-0.27,3839.5,-0.25,3861.0,-0.28,,,,,80.47,2.37,103.269,-0.31,1260.92,0.01,3.879,1.53,4.4279,1.43,4.405,-0.41,3.735,0.32,3.791,-0.47,21.67,1.07,37.0,2532.1,-0.11,7.1,7.3,7.7,,,4.5,4.5,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
