In [1]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [2]:
# Notebook-wide pandas display limits: rows shown, columns shown, float digits.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

In [3]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday in a date range as sorted strings.

    Args:
        start_date: Lower bound, passed to ``pd.date_range`` as ``start``.
        end_date: Upper bound, passed to ``pd.date_range`` as ``end``.

    Returns:
        list[str]: Weekend dates formatted as ``'%Y-%m-%d'``, in ascending order.
    """
    weekend_days = []
    # One weekly-anchored range per weekend day; the two are merged by the sort.
    for weekly_anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=weekly_anchor)
        weekend_days.extend(stamps.strftime('%Y-%m-%d').tolist())
    return sorted(weekend_days)

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the non-trading-day lists for the Korean and US markets.

    The CSV at ``filename`` is read for its ``KOR`` and ``USA`` columns; the
    non-null entries of each column are combined with every Saturday and
    Sunday between ``start_date`` and ``end_date``.

    Returns:
        tuple[list, list]: ``(kor_days, usa_days)``, each de-duplicated and sorted.
    """
    holiday_table = pd.read_csv(filename)
    # Keep only the cells that actually hold a value (NaN cells are dropped).
    listed = {market: holiday_table[market].dropna().values.tolist()
              for market in ('KOR', 'USA')}

    weekend_days = allsatsundays(start_date, end_date)

    kor_days = sorted(set(listed['KOR'] + weekend_days))
    usa_days = sorted(set(listed['USA'] + weekend_days))
    return kor_days, usa_days

def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that fall on trading days.

    Rows whose ``date`` value appears in ``holydays`` are removed, and so are
    rows in which every column after the first one is NaN.

    Returns:
        pandas.DataFrame: A new frame; the original row index is preserved.
    """
    raw = pd.read_csv(filename)
    on_trading_day = ~raw['date'].isin(holydays)
    value_columns = raw.columns[1:]  # everything except the first column
    return raw[on_trading_day].dropna(subset=value_columns, how='all')

In [4]:
def classify_data(df, col_name):
    """Add a 9-level ``<col>_class`` column right after each listed column.

    For every column the mean and standard deviation are computed and the
    values are binned with ``pd.cut`` at mean -/+ 0.5, 1, 2 and 3 standard
    deviations, giving labels 1 (lowest) to 9 (highest).

    Args:
        df: Frame to classify. It is modified in place.
        col_name: Iterable of column names to classify, in any order.

    Returns:
        pandas.DataFrame: The same ``df`` object with the new columns inserted.

    Note:
        The bin edges collapse when a column's standard deviation is 0 or NaN;
        ``pd.cut`` is expected to reject such edges.
    """
    class_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()
        # pd.cut bins exclude their left edge and include their right edge, so
        # the outer edges are infinite to make sure the minimum and maximum
        # values both land inside a bin.
        cut_div = [-np.inf, mu - 3*sigma, mu - 2*sigma, mu - sigma, mu - 0.5*sigma,
                   mu + 0.5*sigma, mu + sigma, mu + 2*sigma, mu + 3*sigma, np.inf]
        classes = pd.cut(df[col], cut_div, labels=class_labels)
        # Look the position up at insertion time: earlier inserts shift the
        # columns, and col_name need not follow the frame's column order.
        position = df.columns.get_loc(col) + 1
        df.insert(position, col + '_class', classes)

    return df

In [5]:
def consecutive_count(df_sr, idx, max_window=30):
    """Signed length of the buy/sell streak that ends at position ``idx``.

    Walks backwards from row ``idx`` (positional) and counts how many
    consecutive rows share the direction of row ``idx``. A value greater than
    zero counts as a buy (+); anything else (zero, negative, NaN) counts as a
    sell (-).

    Args:
        df_sr: pandas Series of amounts (or of pre-computed +1/-1 directions).
        idx: Positional index of the last row of the streak.
        max_window: Number of most recent rows inspected; the returned streak
            length never exceeds it.

    Returns:
        int: ``+n`` for a buy streak of ``n`` rows, ``-n`` for a sell streak,
        ``0`` when the inspected window is empty.
    """
    first = max(idx + 1 - max_window, 0)
    window = df_sr.iloc[first:idx + 1]
    # Newest row first, reduced to its direction.
    directions = [1 if (amount > 0) else -1 for amount in window.iloc[::-1]]
    if not directions:
        return 0

    leading = directions[0]
    streak = 0
    for direction in directions:
        if direction != leading:
            break
        streak += 1
    return leading * streak

In [6]:
def consequtive_days(df, col_names):
    """Add a ``<col>_days`` streak column for every listed column.

    Each value is first reduced to a direction (+1 when greater than zero,
    otherwise -1); ``consecutive_count`` is then evaluated at every row
    position and the results are stored in ``<col>_days``.

    Returns:
        pandas.DataFrame: The same ``df`` object, modified in place.
    """
    def direction(amount):
        return 1 if (amount > 0) else -1

    for col in col_names:
        directions = df[col].apply(direction)
        # Signed run length of consecutive buy (+) / sell (-) days.
        df[col+'_days'] = [consecutive_count(directions, position)
                           for position in range(len(directions))]

    return df

In [7]:
def get_weight_ratio(df, investors, investor_sum, scale=10):
    """Add each investor's share of the total absolute traded amount.

    ``df['total']`` is the row-wise sum of the absolute values of the
    ``investor_sum`` columns. For every column in ``investors`` a
    ``<col>_ratio`` column is added holding ``df[col] / df['total'] * scale``.

    Args:
        df: Frame holding the investor columns. It is modified in place.
        investors: Columns for which a ratio column is produced.
        investor_sum: Columns whose absolute values make up ``total``.
        scale: Multiplier applied to the raw ratio to magnify it.

    Returns:
        pandas.DataFrame: The same ``df`` object with the new columns.
    """
    # Use the parameter, not a same-looking global from the calling notebook.
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col+'_ratio'] = df[col] / df['total'] * scale

    return df

In [8]:
def get_change_rate(df, col_names):
    """Add a ``<col>_cr`` column with the change relative to the previous row.

    The value is ``(current - previous) / previous``; the first row has no
    previous row and therefore gets NaN.

    Returns:
        pandas.DataFrame: The same ``df`` object, modified in place.
    """
    for col in col_names:
        current = df[col]
        previous = current.shift(1)
        df[col+'_cr'] = (current - previous) / previous

    return df

In [9]:
# Fill NaN with the preceding (last valid) value.
def fillna_with_bfill(df, col_names):
    """Forward-fill NaN values in the listed columns.

    Despite the function name, the fill direction is forward: each NaN takes
    the last valid value above it. Leading NaNs stay NaN.

    Args:
        df: Frame to fill. It is modified in place.
        col_names: Columns to forward-fill.

    Returns:
        pandas.DataFrame: The same ``df`` object.
    """
    for col in col_names:
        # Assign the filled column back instead of calling an in-place method
        # on df[col]: that chained form may act on a temporary copy and leave
        # the frame untouched.
        df[col] = df[col].ffill()

    return df

In [10]:
def get_dayofweek(df):
    """Parse ``df['date']`` and insert a ``dayofweek`` column after it.

    The ``date`` column is converted to datetimes (``%Y-%m-%d`` strings, or
    values that are already datetimes). ``dayofweek`` is inserted at column
    position 1 and runs from 1 (Monday) to 7 (Sunday), so trading days are
    1 (Monday) through 5 (Friday).

    Returns:
        pandas.DataFrame: The same ``df`` object, modified in place.
    """
    # Vectorised parse of the whole column instead of a per-row strptime.
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    weekday_number = df['date'].dt.dayofweek + 1  # pandas counts Monday as 0
    df.insert(1, 'dayofweek', weekday_number)

    return df

In [11]:
# Holiday calendar CSV (read for its KOR and USA columns) and the date span to cover.
filename = ('https://raw.githubusercontent.com/fasthill/'
            'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv')
start_date, end_date = '2020-01-01', '2022-12-31'

# Non-trading days per market: listed holidays plus every Saturday and Sunday.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date)

### get and manipulate common data

In [12]:
# Market-wide data, restricted to days that are not in the US non-trading list.
filename_common = '../data/data_common.csv'
df_common = get_seq_data(filename=filename_common, holydays=usa_days)

In [13]:
# Columns whose gaps are filled from the previous row
# (fillna_with_bfill forward-fills, despite its name).
fill_columns = [
    'cpi', 'cpi_anticipated', 'cpi_previous',
    'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'fed_rate_ann', 'fed_rate_imp',
    'bok_rate', 'bok_rate_fore', 'bok_rate_prev', 'bok_rate_ann', 'bok_rate_imp',
]
# Columns in which a missing entry is stored as 0.
futures_columns = [
    'fu_usa_date', 'op_usa_date', 'qw_usa_day',
    'fu_kor_date', 'op_kor_date', 'dw_kor_day',
]

df_common = fillna_with_bfill(df_common, fill_columns)
df_common[futures_columns] = df_common[futures_columns].replace(np.nan, 0)

In [14]:
df_common.head()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1,2020-01-02,9092.19,1.33%,8891.75,1.31%,28868.8,1.16%,28841.0,0.96%,3257.85,0.84%,3259.0,0.66%,2175.17,-1.02%,674.02,0.63%,61.18,-0.24%,96.525,0.49%,1157.35,0.29%,1.877,-2.29%,1.571,0.00%,1.533,-1.50%,1.638,-2.03%,1.335,-2.20%,12.47,-9.51%,,1887.9,2.07%,2.10%,2.00%,1.80%,1528.95,0.0069,1.75%,1.75%,1.75%,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-03,9020.77,-0.79%,8810.0,-0.92%,28634.88,-0.81%,28602.0,-0.83%,3234.85,-0.71%,3235.5,-0.72%,2176.46,0.06%,669.93,-0.61%,63.05,3.06%,96.521,0.00%,1164.95,0.66%,1.793,-4.46%,1.5326,-2.44%,1.52,-0.85%,1.552,-5.25%,1.29,-3.37%,14.02,12.43%,,1854.0,-1.80%,2.10%,2.00%,1.80%,1552.24,0.0152,1.75%,1.75%,1.75%,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-01-06,9071.47,0.56%,8847.5,0.43%,28703.38,0.24%,28642.0,0.14%,3246.28,0.35%,3243.5,0.25%,2155.07,-0.98%,655.31,-2.18%,63.27,0.35%,96.35,-0.18%,1166.94,0.17%,1.809,2.37%,1.5466,2.11%,1.538,0.49%,1.537,-1.47%,1.299,0.70%,13.85,-1.21%,,1834.7,-1.04%,2.10%,2.00%,1.80%,1563.83,0.0075,1.75%,1.75%,1.75%,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-01-07,9068.58,-0.03%,8853.0,0.06%,28583.68,-0.42%,28526.0,-0.40%,3237.18,-0.28%,3235.25,-0.25%,2175.54,0.95%,663.44,1.24%,62.7,-0.90%,96.702,0.37%,1167.3,0.03%,1.811,0.10%,1.5384,-0.53%,1.551,0.85%,1.616,5.14%,1.343,3.39%,13.79,-0.43%,,1867.3,1.78%,2.10%,2.00%,1.80%,1595.24,0.0201,1.75%,1.75%,1.75%,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-01-08,9129.24,0.67%,8944.5,1.03%,28745.09,0.56%,28770.0,0.86%,3253.05,0.49%,3260.25,0.77%,2151.31,-1.11%,640.94,-3.39%,59.61,-4.93%,96.996,0.30%,1162.25,-0.43%,1.874,3.48%,1.5846,3.00%,1.544,-0.45%,1.628,0.74%,1.322,-1.56%,13.45,-2.47%,,1867.6,0.02%,2.10%,2.00%,1.80%,1557.89,-0.0234,1.75%,1.75%,1.75%,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Drop the rows in which 'ixic' or 'dji' is missing.
df_common = df_common.dropna(subset=['ixic', 'dji'])

In [16]:
df_common.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
936,2022-07-25,11782.67,-0.43%,12354.5,-0.56%,31990.0,0.28%,31967.0,0.29%,3966.84,0.13%,3970.0,0.13%,2403.69,0.44%,789.69,-0.01%,96.7,1.89%,106.355,-0.25%,1311.12,0.08%,2.807,0.79%,3.0182,0.49%,2.486,0.83%,3.213,-2.67%,3.16,-1.28%,23.36,1.43%,38.0,2825.0,-0.61%,9.1%,8.8%,8.6%,1719.06,-0.0049,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0
937,2022-07-26,11562.58,-1.87%,12112.5,-1.96%,31760.85,-0.72%,31732.0,-0.74%,3921.05,-1.15%,3923.25,-1.18%,2412.96,0.39%,789.93,0.03%,94.98,-1.78%,107.044,0.65%,1310.1,-0.08%,2.803,-0.13%,3.0589,1.35%,2.53,1.77%,3.184,-0.90%,3.146,-0.44%,24.69,5.69%,35.0,2779.0,-1.63%,9.1%,8.8%,8.6%,1718.02,-0.0006,1.75%,1.50%,1.00%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0
938,2022-07-27,12032.42,4.06%,12619.0,4.18%,32196.0,1.37%,32172.0,1.39%,4023.61,2.62%,4024.5,2.58%,2415.53,0.11%,795.7,0.73%,97.26,2.40%,106.331,-0.67%,1306.08,-0.31%,2.785,-0.65%,2.9837,-2.46%,2.451,-3.12%,3.132,-1.63%,3.116,-0.95%,23.24,-5.87%,37.0,2911.0,4.75%,9.1%,8.8%,8.6%,1738.87,0.0121,2.50%,2.50%,1.75%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0
939,2022-07-28,12162.59,1.08%,12737.5,0.94%,32526.86,1.03%,32490.0,0.99%,4072.43,1.21%,4073.5,1.22%,2435.27,0.82%,798.32,0.33%,96.42,-0.86%,106.236,-0.09%,1295.47,-0.81%,2.671,-4.11%,2.8683,-3.87%,2.386,-2.65%,3.197,2.08%,3.148,1.03%,22.33,-3.92%,38.0,2944.5,1.15%,9.1%,8.8%,8.6%,1755.05,0.0093,2.50%,2.50%,1.75%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0
940,2022-07-29,12390.69,1.88%,12971.5,1.84%,32846.45,0.98%,32825.0,1.03%,4130.29,1.42%,4133.5,1.47%,2451.5,0.67%,803.62,0.66%,98.62,2.28%,105.779,-0.43%,1302.97,0.58%,2.658,-0.48%,2.8905,0.77%,2.373,-0.54%,3.127,-2.19%,3.026,-3.88%,21.33,-4.48%,41.0,2967.1,0.77%,9.1%,8.8%,8.6%,1766.16,0.0063,2.50%,2.50%,1.75%,1.0,1.0,2.25,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Delete the '%' characters, then convert the text columns that now hold only
# numbers to a numeric dtype: replace() edits the text but does not parse it,
# so "1.33" would otherwise stay a string.
df_common = df_common.replace('%', '', regex=True)
for col in df_common.columns.drop('date', errors='ignore'):
    if not pd.api.types.is_numeric_dtype(df_common[col]):
        as_number = pd.to_numeric(df_common[col], errors='coerce')
        # Adopt the conversion only when no existing value would be lost.
        if as_number.notna().sum() == df_common[col].notna().sum():
            df_common[col] = as_number

In [18]:
df_common.head()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,bok_rate_fore,bok_rate_prev,bok_rate_ann,bok_rate_imp,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
1,2020-01-02,9092.19,1.33,8891.75,1.31,28868.8,1.16,28841.0,0.96,3257.85,0.84,3259.0,0.66,2175.17,-1.02,674.02,0.63,61.18,-0.24,96.525,0.49,1157.35,0.29,1.877,-2.29,1.571,0.0,1.533,-1.5,1.638,-2.03,1.335,-2.2,12.47,-9.51,,1887.9,2.07,2.1,2.0,1.8,1528.95,0.0069,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-03,9020.77,-0.79,8810.0,-0.92,28634.88,-0.81,28602.0,-0.83,3234.85,-0.71,3235.5,-0.72,2176.46,0.06,669.93,-0.61,63.05,3.06,96.521,0.0,1164.95,0.66,1.793,-4.46,1.5326,-2.44,1.52,-0.85,1.552,-5.25,1.29,-3.37,14.02,12.43,,1854.0,-1.8,2.1,2.0,1.8,1552.24,0.0152,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-01-06,9071.47,0.56,8847.5,0.43,28703.38,0.24,28642.0,0.14,3246.28,0.35,3243.5,0.25,2155.07,-0.98,655.31,-2.18,63.27,0.35,96.35,-0.18,1166.94,0.17,1.809,2.37,1.5466,2.11,1.538,0.49,1.537,-1.47,1.299,0.7,13.85,-1.21,,1834.7,-1.04,2.1,2.0,1.8,1563.83,0.0075,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-01-07,9068.58,-0.03,8853.0,0.06,28583.68,-0.42,28526.0,-0.4,3237.18,-0.28,3235.25,-0.25,2175.54,0.95,663.44,1.24,62.7,-0.9,96.702,0.37,1167.3,0.03,1.811,0.1,1.5384,-0.53,1.551,0.85,1.616,5.14,1.343,3.39,13.79,-0.43,,1867.3,1.78,2.1,2.0,1.8,1595.24,0.0201,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-01-08,9129.24,0.67,8944.5,1.03,28745.09,0.56,28770.0,0.86,3253.05,0.49,3260.25,0.77,2151.31,-1.11,640.94,-3.39,59.61,-4.93,96.996,0.3,1162.25,-0.43,1.874,3.48,1.5846,3.0,1.544,-0.45,1.628,0.74,1.322,-1.56,13.45,-2.47,,1867.6,0.02,2.1,2.0,1.8,1557.89,-0.0234,1.75,1.75,1.75,1.0,1.0,0.5,,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Remove the columns that contain no value at all.
df_common = df_common.dropna(axis='columns', how='all')

In [20]:
df_common.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,qw_usa_day,fu_kor_date,op_kor_date,dw_kor_day
936,2022-07-25,11782.67,-0.43,12354.5,-0.56,31990.0,0.28,31967.0,0.29,3966.84,0.13,3970.0,0.13,2403.69,0.44,789.69,-0.01,96.7,1.89,106.355,-0.25,1311.12,0.08,2.807,0.79,3.0182,0.49,2.486,0.83,3.213,-2.67,3.16,-1.28,23.36,1.43,38.0,2825.0,-0.61,9.1,8.8,8.6,1719.06,-0.0049,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0
937,2022-07-26,11562.58,-1.87,12112.5,-1.96,31760.85,-0.72,31732.0,-0.74,3921.05,-1.15,3923.25,-1.18,2412.96,0.39,789.93,0.03,94.98,-1.78,107.044,0.65,1310.1,-0.08,2.803,-0.13,3.0589,1.35,2.53,1.77,3.184,-0.9,3.146,-0.44,24.69,5.69,35.0,2779.0,-1.63,9.1,8.8,8.6,1718.02,-0.0006,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0
938,2022-07-27,12032.42,4.06,12619.0,4.18,32196.0,1.37,32172.0,1.39,4023.61,2.62,4024.5,2.58,2415.53,0.11,795.7,0.73,97.26,2.4,106.331,-0.67,1306.08,-0.31,2.785,-0.65,2.9837,-2.46,2.451,-3.12,3.132,-1.63,3.116,-0.95,23.24,-5.87,37.0,2911.0,4.75,9.1,8.8,8.6,1738.87,0.0121,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0
939,2022-07-28,12162.59,1.08,12737.5,0.94,32526.86,1.03,32490.0,0.99,4072.43,1.21,4073.5,1.22,2435.27,0.82,798.32,0.33,96.42,-0.86,106.236,-0.09,1295.47,-0.81,2.671,-4.11,2.8683,-3.87,2.386,-2.65,3.197,2.08,3.148,1.03,22.33,-3.92,38.0,2944.5,1.15,9.1,8.8,8.6,1755.05,0.0093,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0
940,2022-07-29,12390.69,1.88,12971.5,1.84,32846.45,0.98,32825.0,1.03,4130.29,1.42,4133.5,1.47,2451.5,0.67,803.62,0.66,98.62,2.28,105.779,-0.43,1302.97,0.58,2.658,-0.48,2.8905,0.77,2.373,-0.54,3.127,-2.19,3.026,-3.88,21.33,-4.48,41.0,2967.1,0.77,9.1,8.8,8.6,1766.16,0.0063,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0


### get and manipulate company data

In [21]:
# Company data, restricted to days that are not in the Korean non-trading list.
filename_company = '../data/data_hyunmotor.csv'
df_company = get_seq_data(filename=filename_company, holydays=kor_days)

In [22]:
# Parse the date column and add the weekday number (1 = Monday);
# get_dayofweek modifies the frame it is given and returns it.
df_company = get_dayofweek(df=df_company)

In [23]:
# Each day's volume as a percentage of the total volume over all loaded rows.
total_volume = df_company['vol'].sum()
df_company['vol_percent'] = (df_company['vol'] / total_volume) * 100

In [24]:
# Delete the last row. Take an explicit copy: later cells insert and assign
# columns on this frame, and doing that on a bare slice of the original
# triggers pandas' SettingWithCopyWarning.
df_company = df_company.iloc[:-1].copy()
df_company.tail()

Unnamed: 0,date,dayofweek,retail,retail_cr,retail_days,foreigner,foreigner_cr,foreigner_days,institution,institution_cr,institution_days,financial,financial_cr,financial_days,invtrust,invtrust_cr,invtrust_days,pension,pension_cr,pension_days,privequity,privequity_cr,privequity_days,bank,bank_cr,bank_days,insurance,insurance_cr,insurance_days,financeetc,financeetc_cr,financeetc_days,corporateetc,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent
936,2022-07-25,1,-73117.0,,,54863.0,,,17843.0,,,15669.0,,,2824.0,,,-1813.0,,,2397.0,,,-16.0,,,-1100.0,,,-118.0,,,259.0,,,152.0,,,,,,,192500.0,198500.0,192500.0,196000.0,2.62,265515.0,,,,0.1643
937,2022-07-26,2,2783.0,,,-12537.0,,,10542.0,,,3618.0,,,743.0,,,7519.0,,,-852.0,,,-92.0,,,-391.0,,,-2.0,,,-463.0,,,-326.0,,,,,,,196000.0,196500.0,192500.0,196000.0,0.0,142943.0,,,,0.0884
938,2022-07-27,3,-7725.0,,,6548.0,,,886.0,,,6839.0,,,-4710.0,,,-1538.0,,,2859.0,,,-826.0,,,-1739.0,,,0.0,,,327.0,,,-36.0,,,,,,,196000.0,197000.0,193000.0,196000.0,0.0,118451.0,,,,0.0733
939,2022-07-28,4,520.0,,,3470.0,,,-4243.0,,,-923.0,,,-1328.0,,,-941.0,,,949.0,,,-137.0,,,-1874.0,,,12.0,,,239.0,,,14.0,,,,,,,198000.0,198000.0,194000.0,195000.0,-0.51,147792.0,,,,0.0914
940,2022-07-29,5,-24249.0,,,22492.0,,,-2696.0,,,2575.0,,,1871.0,,,-4634.0,,,-3074.0,,,-330.0,,,882.0,,,14.0,,,4613.0,,,-161.0,,,,,,,196000.0,197000.0,194000.0,196000.0,0.51,165275.0,,,,0.1023


In [25]:
# Investor columns that get a 1-9 '<col>_class' column next to them.
investors = [
    'retail', 'foreigner', 'institution',
    'financial', 'invtrust', 'pension',
    'privequity', 'bank', 'insurance',
    'financeetc', 'corporateetc', 'foreigneretc',
]
df_company = classify_data(df=df_company, col_name=investors)

In [26]:
# Add '<col>_days': signed length of the current buy (+) / sell (-) streak
# for every investor column.
df_company = consequtive_days(df=df_company, col_names=investors)

In [27]:
df_company.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,institution,institution_class,institution_cr,institution_days,financial,financial_class,financial_cr,financial_days,invtrust,invtrust_class,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,-12570.0,5,,-1,-2598.0,5,,-1,-531.0,5,,-1,-9936.0,4,,-1,-22.0,5,,-1,50.0,5,,1,355.0,5,,1,112.0,5,,1,4600.0,6,,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,,,,0.041
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,-21214.0,4,,-2,-13522.0,4,,-2,-687.0,5,,-2,-3659.0,5,,-2,-1735.0,5,,-2,-43.0,5,,-1,-1553.0,5,,-1,-17.0,5,,-1,6353.0,7,,2,150.0,5,,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,,,,0.0827
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,-5078.0,5,,-3,-9243.0,4,,-3,-563.0,5,,-3,1346.0,5,,1,127.0,5,,1,0.0,5,,-2,3266.0,7,,1,-9.0,5,,-2,3571.0,6,,3,-7.0,5,,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,,,,0.0241
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,-11405.0,5,,-4,-2705.0,5,,-4,-1738.0,5,,-4,-6291.0,5,,-1,81.0,5,,2,3.0,5,,1,-755.0,5,,-1,0.0,5,,-3,3741.0,6,,4,-51.0,5,,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,,,,0.0484
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,-20996.0,4,,-5,-6117.0,5,,-5,-1014.0,5,,-5,-16321.0,3,,-2,2514.0,6,,3,2.0,5,,2,-3.0,5,,-2,-57.0,5,,-4,4562.0,6,,5,0.0,5,,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,,,,0.0729


In [28]:
# Relative weight of each investor's amount.
# 'institution' is left out of the total: in the rows displayed above it equals
# the sum of financial .. financeetc, so including it would count those twice.
investors_sum = [
    'retail', 'foreigner',
    'financial', 'invtrust', 'pension',
    'privequity', 'bank', 'insurance',
    'financeetc', 'corporateetc', 'foreigneretc',
]
df_company = get_weight_ratio(df_company, investors, investors_sum)

In [29]:
df_company.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,institution,institution_class,institution_cr,institution_days,financial,financial_class,financial_cr,financial_days,invtrust,invtrust_class,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,-12570.0,5,,-1,-2598.0,5,,-1,-531.0,5,,-1,-9936.0,4,,-1,-22.0,5,,-1,50.0,5,,1,355.0,5,,1,112.0,5,,1,4600.0,6,,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,,,,0.041,54274.0,4.0498,-2.5887,-2.316,-0.4787,-0.0978,-1.8307,-0.0041,0.0092,0.0654,0.0206,0.8476,0.0074
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,-21214.0,4,,-2,-13522.0,4,,-2,-687.0,5,,-2,-3659.0,5,,-2,-1735.0,5,,-2,-43.0,5,,-1,-1553.0,5,,-1,-17.0,5,,-1,6353.0,7,,2,150.0,5,,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,,,,0.0827,98309.0,4.3385,-2.842,-2.1579,-1.3755,-0.0699,-0.3722,-0.1765,-0.0044,-0.158,-0.0017,0.6462,0.0153
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,-5078.0,5,,-3,-9243.0,4,,-3,-563.0,5,,-3,1346.0,5,,1,127.0,5,,1,0.0,5,,-2,3266.0,7,,1,-9.0,5,,-2,3571.0,6,,3,-7.0,5,,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,,,,0.0241,21934.0,1.2118,-0.5216,-2.3151,-4.214,-0.2567,0.6137,0.0579,0.0,1.489,-0.0041,1.6281,-0.0032
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,-11405.0,5,,-4,-2705.0,5,,-4,-1738.0,5,,-4,-6291.0,5,,-1,81.0,5,,2,3.0,5,,1,-755.0,5,,-1,0.0,5,,-3,3741.0,6,,4,-51.0,5,,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,,,,0.0484,34714.0,3.8981,-1.6757,-3.2854,-0.7792,-0.5007,-1.8122,0.0233,0.0009,-0.2175,0.0,1.0777,-0.0147
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,-20996.0,4,,-5,-6117.0,5,,-5,-1014.0,5,,-5,-16321.0,3,,-2,2514.0,6,,3,2.0,5,,2,-3.0,5,,-2,-57.0,5,,-4,4562.0,6,,5,0.0,5,,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,,,,0.0729,49853.0,3.5801,-0.2838,-4.2116,-1.227,-0.2034,-3.2738,0.5043,0.0004,-0.0006,-0.0114,0.9151,0.0


In [30]:
# Investor columns for which '<col>_cr' (change versus the previous row) is computed.
investors = [
    'retail', 'foreigner', 'institution',
    'financial', 'invtrust', 'pension',
    'privequity', 'bank', 'insurance',
    'financeetc', 'corporateetc', 'foreigneretc',
]
df_company = get_change_rate(df=df_company, col_names=investors)

In [31]:
df_company.iloc[0:5, 45:60]

Unnamed: 0,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol
1,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0
2,2,150.0,5,2.75,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0
5,3,-7.0,5,-1.0467,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0
6,4,-51.0,5,6.2857,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0
7,5,0.0,5,-1.0,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0


In [32]:
# Remove the columns that contain no value at all.
df_company = df_company.dropna(axis='columns', how='all')

In [33]:
df_company.iloc[0:5, 45:60]

Unnamed: 0,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,open,high,low,close,close_cr,vol,vol_percent,total,retail_ratio,foreigner_ratio
1,1,40.0,5,,1,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,0.041,54274.0,4.0498,-2.5887
2,2,150.0,5,2.75,2,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,0.0827,98309.0,4.3385,-2.842
5,3,-7.0,5,-1.0467,-1,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,0.0241,21934.0,1.2118,-0.5216
6,4,-51.0,5,6.2857,-2,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,0.0484,34714.0,3.8981,-1.6757
7,5,0.0,5,-1.0,-3,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,0.0729,49853.0,3.5801,-0.2838


### Merge common and company

In [34]:
# Cast both date columns to str so the merge keys share one type.
df_common['date'] = df_common['date'].astype(str)
df_company['date'] = df_company['date'].astype(str)

In [35]:
# Right join: keep every company date and attach the common data for that date.
df_merge = df_common.merge(df_company, how='right', on='date')

In [36]:
df_merge.tail()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,...,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,open,high,low,close,close_cr,vol,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
633,2022-07-25,11782.67,-0.43,12354.5,-0.56,31990.0,0.28,31967.0,0.29,3966.84,0.13,3970.0,0.13,2403.69,0.44,789.69,-0.01,96.7,1.89,106.355,-0.25,1311.12,0.08,2.807,0.79,3.0182,0.49,2.486,0.83,3.213,-2.67,3.16,-1.28,23.36,1.43,38.0,2825.0,-0.61,9.1,8.8,8.6,1719.06,-0.0049,1.75,1.5,1.0,1.0,1.0,2.25,0.0,...,1.9083,2,-1813.0,5,-0.5836,-2,2397.0,6,-2.3272,1,-16.0,5,-0.6923,-2,-1100.0,5,-5.955,-1,-118.0,5,15.8571,-6,259.0,5,3.7091,2,152.0,5,12.8182,2,192500.0,198500.0,192500.0,196000.0,2.62,265515.0,0.1643,152328.0,-4.8,3.6016,1.1714,1.0286,0.1854,-0.119,0.1574,-0.0011,-0.0722,-0.0077,0.017,0.01
634,2022-07-26,11562.58,-1.87,12112.5,-1.96,31760.85,-0.72,31732.0,-0.74,3921.05,-1.15,3923.25,-1.18,2412.96,0.39,789.93,0.03,94.98,-1.78,107.044,0.65,1310.1,-0.08,2.803,-0.13,3.0589,1.35,2.53,1.77,3.184,-0.9,3.146,-0.44,24.69,5.69,35.0,2779.0,-1.63,9.1,8.8,8.6,1718.02,-0.0006,1.75,1.5,1.0,1.0,1.0,2.25,0.0,...,-0.7369,3,7519.0,6,-5.1473,1,-852.0,5,-1.3554,-1,-92.0,5,4.75,-3,-391.0,5,-0.6445,-2,-2.0,5,-0.9831,-7,-463.0,5,-2.7876,-1,-326.0,3,-3.1447,-1,196000.0,196500.0,192500.0,196000.0,0.0,142943.0,0.0884,29326.0,0.949,-4.275,3.5948,1.2337,0.2534,2.5639,-0.2905,-0.0314,-0.1333,-0.0007,-0.1579,-0.1112
635,2022-07-27,12032.42,4.06,12619.0,4.18,32196.0,1.37,32172.0,1.39,4023.61,2.62,4024.5,2.58,2415.53,0.11,795.7,0.73,97.26,2.4,106.331,-0.67,1306.08,-0.31,2.785,-0.65,2.9837,-2.46,2.451,-3.12,3.132,-1.63,3.116,-0.95,23.24,-5.87,37.0,2911.0,4.75,9.1,8.8,8.6,1738.87,0.0121,2.5,2.5,1.75,1.0,1.0,2.25,0.0,...,-7.3392,-1,-1538.0,5,-1.2045,-1,2859.0,6,-4.3556,1,-826.0,3,7.9783,-4,-1739.0,5,3.4476,-3,0.0,5,-1.0,-8,327.0,5,-1.7063,1,-36.0,5,-0.8896,-2,196000.0,197000.0,193000.0,196000.0,0.0,118451.0,0.0733,33147.0,-2.3305,1.9754,0.2673,2.0632,-1.4209,-0.464,0.8625,-0.2492,-0.5246,0.0,0.0987,-0.0109
636,2022-07-28,12162.59,1.08,12737.5,0.94,32526.86,1.03,32490.0,0.99,4072.43,1.21,4073.5,1.22,2435.27,0.82,798.32,0.33,96.42,-0.86,106.236,-0.09,1295.47,-0.81,2.671,-4.11,2.8683,-3.87,2.386,-2.65,3.197,2.08,3.148,1.03,22.33,-3.92,38.0,2944.5,1.15,9.1,8.8,8.6,1755.05,0.0093,2.5,2.5,1.75,1.0,1.0,2.25,0.0,...,-0.718,-2,-941.0,5,-0.3882,-2,949.0,5,-0.6681,2,-137.0,5,-0.8341,-5,-1874.0,5,0.0776,-4,12.0,5,inf,1,239.0,5,-0.2691,2,14.0,5,-1.3889,1,198000.0,198000.0,194000.0,195000.0,-0.51,147792.0,0.0914,10407.0,0.4997,3.3343,-4.0771,-0.8869,-1.2761,-0.9042,0.9119,-0.1316,-1.8007,0.0115,0.2297,0.0135
637,2022-07-29,12390.69,1.88,12971.5,1.84,32846.45,0.98,32825.0,1.03,4130.29,1.42,4133.5,1.47,2451.5,0.67,803.62,0.66,98.62,2.28,105.779,-0.43,1302.97,0.58,2.658,-0.48,2.8905,0.77,2.373,-0.54,3.127,-2.19,3.026,-3.88,21.33,-4.48,41.0,2967.1,0.77,9.1,8.8,8.6,1766.16,0.0063,2.5,2.5,1.75,1.0,1.0,2.25,0.0,...,-2.4089,1,-4634.0,5,3.9245,-3,-3074.0,4,-4.2392,-1,-330.0,4,1.4088,-6,882.0,5,-1.4707,1,14.0,5,0.1667,2,4613.0,6,18.3013,3,-161.0,4,-12.5,-1,196000.0,197000.0,194000.0,196000.0,0.51,165275.0,0.1023,64895.0,-3.7367,3.4659,-0.4154,0.3968,0.2883,-0.7141,-0.4737,-0.0509,0.1359,0.0022,0.7108,-0.0248


In [37]:
# In the columns that came from df_common, replace NaN with the value of the
# row directly above.
col_names = df_common.columns
df_merge = fillna_with_bfill(df=df_merge, col_names=col_names)

In [38]:
df_merge.head()

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,f_g_index,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,...,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,open,high,low,close,close_cr,vol,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
0,2020-01-02,9092.19,1.33,8891.75,1.31,28868.8,1.16,28841.0,0.96,3257.85,0.84,3259.0,0.66,2175.17,-1.02,674.02,0.63,61.18,-0.24,96.525,0.49,1157.35,0.29,1.877,-2.29,1.571,0.0,1.533,-1.5,1.638,-2.03,1.335,-2.2,12.47,-9.51,,1887.9,2.07,2.1,2.0,1.8,1528.95,0.0069,1.75,1.75,1.75,1.0,1.0,0.5,0.0,...,,-1,-9936.0,4,,-1,-22.0,5,,-1,50.0,5,,1,355.0,5,,1,112.0,5,,1,4600.0,6,,1,40.0,5,,1,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,0.041,54274.0,4.0498,-2.5887,-2.316,-0.4787,-0.0978,-1.8307,-0.0041,0.0092,0.0654,0.0206,0.8476,0.0074
1,2020-01-03,9020.77,-0.79,8810.0,-0.92,28634.88,-0.81,28602.0,-0.83,3234.85,-0.71,3235.5,-0.72,2176.46,0.06,669.93,-0.61,63.05,3.06,96.521,0.0,1164.95,0.66,1.793,-4.46,1.5326,-2.44,1.52,-0.85,1.552,-5.25,1.29,-3.37,14.02,12.43,,1854.0,-1.8,2.1,2.0,1.8,1552.24,0.0152,1.75,1.75,1.75,1.0,1.0,0.5,0.0,...,0.2938,-2,-3659.0,5,-0.6317,-2,-1735.0,5,77.8636,-2,-43.0,5,-1.86,-1,-1553.0,5,-5.3746,-1,-17.0,5,-1.1518,-1,6353.0,7,0.3811,2,150.0,5,2.75,2,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,0.0827,98309.0,4.3385,-2.842,-2.1579,-1.3755,-0.0699,-0.3722,-0.1765,-0.0044,-0.158,-0.0017,0.6462,0.0153
2,2020-01-06,9071.47,0.56,8847.5,0.43,28703.38,0.24,28642.0,0.14,3246.28,0.35,3243.5,0.25,2155.07,-0.98,655.31,-2.18,63.27,0.35,96.35,-0.18,1166.94,0.17,1.809,2.37,1.5466,2.11,1.538,0.49,1.537,-1.47,1.299,0.7,13.85,-1.21,,1834.7,-1.04,2.1,2.0,1.8,1563.83,0.0075,1.75,1.75,1.75,1.0,1.0,0.5,0.0,...,-0.1805,-3,1346.0,5,-1.3679,1,127.0,5,-1.0732,1,0.0,5,-1.0,-2,3266.0,7,-3.103,1,-9.0,5,-0.4706,-2,3571.0,6,-0.4379,3,-7.0,5,-1.0467,-1,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,0.0241,21934.0,1.2118,-0.5216,-2.3151,-4.214,-0.2567,0.6137,0.0579,0.0,1.489,-0.0041,1.6281,-0.0032
3,2020-01-07,9068.58,-0.03,8853.0,0.06,28583.68,-0.42,28526.0,-0.4,3237.18,-0.28,3235.25,-0.25,2175.54,0.95,663.44,1.24,62.7,-0.9,96.702,0.37,1167.3,0.03,1.811,0.1,1.5384,-0.53,1.551,0.85,1.616,5.14,1.343,3.39,13.79,-0.43,,1867.3,1.78,2.1,2.0,1.8,1595.24,0.0201,1.75,1.75,1.75,1.0,1.0,0.5,0.0,...,2.087,-4,-6291.0,5,-5.6738,-1,81.0,5,-0.3622,2,3.0,5,inf,1,-755.0,5,-1.2312,-1,0.0,5,-1.0,-3,3741.0,6,0.0476,4,-51.0,5,6.2857,-2,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,0.0484,34714.0,3.8981,-1.6757,-3.2854,-0.7792,-0.5007,-1.8122,0.0233,0.0009,-0.2175,0.0,1.0777,-0.0147
4,2020-01-08,9129.24,0.67,8944.5,1.03,28745.09,0.56,28770.0,0.86,3253.05,0.49,3260.25,0.77,2151.31,-1.11,640.94,-3.39,59.61,-4.93,96.996,0.3,1162.25,-0.43,1.874,3.48,1.5846,3.0,1.544,-0.45,1.628,0.74,1.322,-1.56,13.45,-2.47,,1867.6,0.02,2.1,2.0,1.8,1557.89,-0.0234,1.75,1.75,1.75,1.0,1.0,0.5,0.0,...,-0.4166,-5,-16321.0,3,1.5943,-2,2514.0,6,30.037,3,2.0,5,-0.3333,2,-3.0,5,-0.996,-2,-57.0,5,-inf,-4,4562.0,6,0.2195,5,0.0,5,-1.0,-3,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,0.0729,49853.0,3.5801,-0.2838,-4.2116,-1.227,-0.2034,-3.2738,0.5043,0.0004,-0.0006,-0.0114,0.9151,0.0


In [39]:
# Delete the first row, which has a lot of NaN values.
df_merge = df_merge.iloc[1:]

In [40]:
# Columns that still contain at least one NaN ...
has_nan = df_merge.isna().any()
del_c1 = df_merge.columns[has_nan].tolist()
# ... and columns that contain +inf or -inf.
has_inf = df_merge.isin([np.inf, -np.inf]).any()
del_c2 = df_merge.columns[has_inf].tolist()
# Union of both lists, without duplicates.
del_column = list(set(del_c1) | set(del_c2))

In [41]:
# Final frame: df_merge without the columns selected above.
df = df_merge.drop(columns=del_column)

In [42]:
# Preview the last five rows of the cleaned frame.
df.tail(5)

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,...,financial_class,financial_cr,financial_days,invtrust,invtrust_class,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_days,insurance,insurance_class,insurance_days,financeetc,financeetc_class,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_days,open,high,low,close,close_cr,vol,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
633,2022-07-25,11782.67,-0.43,12354.5,-0.56,31990.0,0.28,31967.0,0.29,3966.84,0.13,3970.0,0.13,2403.69,0.44,789.69,-0.01,96.7,1.89,106.355,-0.25,1311.12,0.08,2.807,0.79,3.0182,0.49,2.486,0.83,3.213,-2.67,3.16,-1.28,23.36,1.43,2825.0,-0.61,9.1,8.8,8.6,1719.06,-0.0049,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,...,7,7.5811,2,2824.0,6,2,-1813.0,5,-0.5836,-2,2397.0,6,-2.3272,1,-16.0,5,-2,-1100.0,5,-1,-118.0,5,-6,259.0,5,3.7091,2,152.0,5,2,192500.0,198500.0,192500.0,196000.0,2.62,265515.0,0.1643,152328.0,-4.8,3.6016,1.1714,1.0286,0.1854,-0.119,0.1574,-0.0011,-0.0722,-0.0077,0.017,0.01
634,2022-07-26,11562.58,-1.87,12112.5,-1.96,31760.85,-0.72,31732.0,-0.74,3921.05,-1.15,3923.25,-1.18,2412.96,0.39,789.93,0.03,94.98,-1.78,107.044,0.65,1310.1,-0.08,2.803,-0.13,3.0589,1.35,2.53,1.77,3.184,-0.9,3.146,-0.44,24.69,5.69,2779.0,-1.63,9.1,8.8,8.6,1718.02,-0.0006,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,...,5,-0.7691,3,743.0,5,3,7519.0,6,-5.1473,1,-852.0,5,-1.3554,-1,-92.0,5,-3,-391.0,5,-2,-2.0,5,-7,-463.0,5,-2.7876,-1,-326.0,3,-1,196000.0,196500.0,192500.0,196000.0,0.0,142943.0,0.0884,29326.0,0.949,-4.275,3.5948,1.2337,0.2534,2.5639,-0.2905,-0.0314,-0.1333,-0.0007,-0.1579,-0.1112
635,2022-07-27,12032.42,4.06,12619.0,4.18,32196.0,1.37,32172.0,1.39,4023.61,2.62,4024.5,2.58,2415.53,0.11,795.7,0.73,97.26,2.4,106.331,-0.67,1306.08,-0.31,2.785,-0.65,2.9837,-2.46,2.451,-3.12,3.132,-1.63,3.116,-0.95,23.24,-5.87,2911.0,4.75,9.1,8.8,8.6,1738.87,0.0121,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,6,0.8903,4,-4710.0,4,-1,-1538.0,5,-1.2045,-1,2859.0,6,-4.3556,1,-826.0,3,-4,-1739.0,5,-3,0.0,5,-8,327.0,5,-1.7063,1,-36.0,5,-2,196000.0,197000.0,193000.0,196000.0,0.0,118451.0,0.0733,33147.0,-2.3305,1.9754,0.2673,2.0632,-1.4209,-0.464,0.8625,-0.2492,-0.5246,0.0,0.0987,-0.0109
636,2022-07-28,12162.59,1.08,12737.5,0.94,32526.86,1.03,32490.0,0.99,4072.43,1.21,4073.5,1.22,2435.27,0.82,798.32,0.33,96.42,-0.86,106.236,-0.09,1295.47,-0.81,2.671,-4.11,2.8683,-3.87,2.386,-2.65,3.197,2.08,3.148,1.03,22.33,-3.92,2944.5,1.15,9.1,8.8,8.6,1755.05,0.0093,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,5,-1.135,-1,-1328.0,5,-2,-941.0,5,-0.3882,-2,949.0,5,-0.6681,2,-137.0,5,-5,-1874.0,5,-4,12.0,5,1,239.0,5,-0.2691,2,14.0,5,1,198000.0,198000.0,194000.0,195000.0,-0.51,147792.0,0.0914,10407.0,0.4997,3.3343,-4.0771,-0.8869,-1.2761,-0.9042,0.9119,-0.1316,-1.8007,0.0115,0.2297,0.0135
637,2022-07-29,12390.69,1.88,12971.5,1.84,32846.45,0.98,32825.0,1.03,4130.29,1.42,4133.5,1.47,2451.5,0.67,803.62,0.66,98.62,2.28,105.779,-0.43,1302.97,0.58,2.658,-0.48,2.8905,0.77,2.373,-0.54,3.127,-2.19,3.026,-3.88,21.33,-4.48,2967.1,0.77,9.1,8.8,8.6,1766.16,0.0063,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,5,-3.7898,1,1871.0,5,1,-4634.0,5,3.9245,-3,-3074.0,4,-4.2392,-1,-330.0,4,-6,882.0,5,1,14.0,5,2,4613.0,6,18.3013,3,-161.0,4,-1,196000.0,197000.0,194000.0,196000.0,0.51,165275.0,0.1023,64895.0,-3.7367,3.4659,-0.4154,0.3968,0.2883,-0.7141,-0.4737,-0.0509,0.1359,0.0022,0.7108,-0.0248


In [43]:
price_range = ['close_cr']
# classify_data() inserts a '<col>_class' column into the frame it receives, and
# DataFrame.insert raises ValueError when that column already exists. Dropping
# any earlier result first (drop returns a new frame) makes this cell safe to
# re-run and keeps the frame built in the previous cell from being mutated.
class_cols = [col + '_class' for col in price_range]
df = classify_data(df.drop(columns=class_cols, errors='ignore'), price_range)  # bin each column into 9 classes

In [44]:
# Label-based slice: rows labelled 250 through 255 (both ends included), all columns.
df.loc[250:255, :]

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,...,financial_cr,financial_days,invtrust,invtrust_class,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_days,insurance,insurance_class,insurance_days,financeetc,financeetc_class,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_days,open,high,low,close,close_cr,close_cr_class,vol,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
250,2021-01-06,12740.79,-0.61,12616.75,-1.38,30829.4,1.44,30720.0,1.44,3748.14,0.57,3740.5,0.6,2968.21,-0.75,981.39,-0.44,50.63,1.4,89.502,0.1,1087.93,0.14,1.039,8.8,0.1388,12.85,0.089,5.95,1.731,2.43,0.857,1.18,25.07,-1.07,2828.0,-0.32,1.2,1.1,1.2,1924.34,-0.0134,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,1.491,-2,-4322.0,4,-6,-34631.0,2,1.9481,-2,-5342.0,3,6.0755,-5,-218.0,5,-3,-3940.0,3,-9,-118.0,5,-2,3471.0,6,-0.5906,2,78.0,5,2,209500.0,211500.0,202500.0,203000.0,-3.1,3,585671.0,0.3624,268884.0,4.868,-1.8875,-3.1125,-1.3061,-0.1607,-1.288,-0.1987,-0.0081,-0.1465,-0.0044,0.1291,0.0029
251,2021-01-07,13067.48,2.56,12928.0,2.47,31041.13,0.69,30942.0,0.72,3803.79,1.48,3795.5,1.47,3031.68,2.14,988.86,0.76,50.83,0.4,89.791,0.32,1094.28,0.58,1.081,4.07,0.1408,1.44,0.086,-3.37,1.725,-0.35,0.865,0.93,22.37,-10.77,2937.0,3.86,1.2,1.1,1.2,1916.69,-0.004,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,-2.0075,1,-5935.0,4,-7,-11073.0,4,-0.6803,-3,-1103.0,5,-0.7935,-6,-104.0,5,-4,-2144.0,4,-10,157.0,6,1,14989.0,9,3.3184,3,161.0,5,3,206000.0,210500.0,204500.0,206000.0,1.48,6,515292.0,0.3188,101381.0,-0.7568,-2.2349,1.4975,3.4902,-0.5854,-1.0922,-0.1088,-0.0103,-0.2115,0.0155,1.4785,0.0159
252,2021-01-08,13201.98,1.03,13097.25,1.31,31097.97,0.18,30993.0,0.16,3824.68,0.55,3817.5,0.58,3152.18,3.97,987.79,-0.11,52.24,2.77,90.068,0.31,1092.93,-0.12,1.119,3.47,0.1369,-2.77,0.086,0.0,1.733,0.46,0.885,2.31,21.56,-3.62,2936.5,-0.02,1.2,1.1,1.2,1849.15,-0.0352,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,-0.5416,2,-14421.0,2,-8,-19542.0,3,0.7648,-4,10255.0,8,-10.2974,1,-2729.0,1,-5,-5051.0,3,-11,-357.0,3,-1,-15721.0,1,-2.0488,-1,-174.0,4,-1,240500.0,257000.0,225000.0,246000.0,19.42,9,4851142.0,3.0014,398530.0,-3.5448,4.3357,-0.392,0.407,-0.3619,-0.4904,0.2573,-0.0685,-0.1267,-0.009,-0.3945,-0.0044
253,2021-01-11,13036.43,-1.25,12897.0,-1.53,31008.69,-0.29,30902.0,-0.29,3799.61,-0.66,3792.0,-0.67,3148.45,-0.12,976.63,-1.13,52.25,0.02,90.442,0.42,1097.23,0.39,1.144,2.29,0.1489,8.77,0.086,-0.23,1.693,-1.57,0.865,-0.57,24.08,11.69,2969.9,1.14,1.2,1.1,1.2,1847.34,-0.001,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,-4.6352,-1,-21867.0,1,-9,-49749.0,1,1.5457,-5,-8092.0,2,-1.7891,-1,-941.0,2,-6,-17290.0,1,-12,-2356.0,1,-2,-28243.0,1,0.7965,-2,1526.0,9,1,256500.0,289000.0,256000.0,267500.0,8.74,9,3919184.0,2.4248,667761.0,4.9772,-2.192,-2.385,-0.883,-0.3275,-0.745,-0.1212,-0.0141,-0.2589,-0.0353,-0.423,0.0229
254,2021-01-12,13072.43,0.28,12890.25,-0.05,31068.69,0.19,30974.0,0.23,3801.19,0.04,3794.5,0.07,3125.95,-0.71,973.72,-0.3,53.21,1.84,90.064,-0.42,1094.48,-0.25,1.129,-1.33,0.1469,-1.34,0.086,0.0,1.718,1.48,0.871,0.69,23.33,-3.11,2988.0,0.61,1.2,1.1,1.2,1858.6,0.0061,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,-0.8141,-2,-9514.0,3,-10,-41804.0,1,-0.1597,-6,-8638.0,2,0.0675,-2,-585.0,3,-7,-3840.0,3,-13,-252.0,4,-3,719.0,5,-1.0255,1,-195.0,4,-1,263500.0,274500.0,250500.0,261000.0,-2.43,3,2106295.0,1.3032,257019.0,4.972,-2.0513,-2.9411,-0.4264,-0.3702,-1.6265,-0.3361,-0.0228,-0.1494,-0.0098,0.028,-0.0076
255,2021-01-13,13128.95,0.43,12972.25,0.64,31060.47,-0.03,30959.0,-0.05,3809.84,0.23,3803.75,0.24,3148.29,0.71,979.13,0.56,52.91,-0.56,90.334,0.3,1097.12,0.24,1.092,-3.31,0.145,-1.29,0.086,0.0,1.719,0.06,0.865,-0.69,22.21,-4.8,2991.9,0.13,1.4,1.3,1.2,1848.2,-0.0056,0.25,0.25,0.25,1.0,1.0,0.5,0.0,0.0,...,-2.1623,1,-15901.0,2,-11,-59921.0,1,0.4334,-7,-6344.0,3,-0.2656,-3,-915.0,3,-8,-11195.0,1,-14,-485.0,3,-4,5363.0,6,6.459,2,589.0,7,1,259500.0,265000.0,250000.0,259000.0,-0.77,5,1496753.0,0.9261,189522.0,3.5512,0.4626,-4.3278,0.6721,-0.839,-3.1617,-0.3347,-0.0483,-0.5907,-0.0256,0.283,0.0311


In [45]:
# Write the classified frame to disk (index column omitted) so it can be reloaded later.
dir_name = '../data/analysis/'
company = 'hyunmotor_ml.csv'
df.to_csv(f'{dir_name}{company}', index=False)

### Select known daily values

In [46]:
# Columns selected as known daily values, grouped by topic.
col_common_fixed = [
    # consumer price index
    'cpi', 'cpi_anticipated', 'cpi_previous',
    # FED rate fields
    'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'fed_rate_ann', 'fed_rate_imp',
    # BOK rate
    'bok_rate',
    # USA date / day fields
    'fu_usa_date', 'op_usa_date', 'qw_usa_day',
    # KOR date / day fields
    'fu_kor_date', 'op_kor_date', 'dw_kor_day',
]

#### transpose 후 각 column의 예측값을 구하고 이를 이용하여 다시 transpose 후 예측값을 구하는 방식 검토. 한 개의 column을 target값으로 하여 각 column 별로 예측값을 구하는 방식.

In [47]:
# Reload the saved CSV; the row index is rebuilt from 0 on read.
dir_name = '../data/analysis/'
company = 'hyunmotor_ml.csv'
df = pd.read_csv(''.join((dir_name, company)))

In [48]:
# Number of rows in the reloaded frame.
len(df.index)

637

In [49]:
# Preview the last five rows of the reloaded frame.
df.tail(5)

Unnamed: 0,date,ixic,ixic_cr,ixic_f,ixic_f_cr,dji,dji_cr,dji_f,dji_f_cr,spx,spx_cr,spx_f,spx_f_cr,kospi,kospi_cr,kosdaq,kosdaq_cr,wti,wti_cr,dxy,dxy_cr,krw,krw_cr,bond_usa_10,bond_usa_10_cr,bond_usa_2,bond_usa_2_cr,bond_usa_3m,bond_usa_3m_cr,bond_kor_10,bond_kor_10_cr,bond_kor_2,bond_kor_2_cr,vix,vix_cr,sox,sox_cr,cpi,cpi_anticipated,cpi_previous,gold,gold_cr,fed_rate,fed_rate_fore,fed_rate_prev,fed_rate_ann,fed_rate_imp,bok_rate,fu_usa_date,op_usa_date,...,financial_cr,financial_days,invtrust,invtrust_class,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_days,insurance,insurance_class,insurance_days,financeetc,financeetc_class,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_days,open,high,low,close,close_cr,close_cr_class,vol,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
632,2022-07-25,11782.67,-0.43,12354.5,-0.56,31990.0,0.28,31967.0,0.29,3966.84,0.13,3970.0,0.13,2403.69,0.44,789.69,-0.01,96.7,1.89,106.355,-0.25,1311.12,0.08,2.807,0.79,3.0182,0.49,2.486,0.83,3.213,-2.67,3.16,-1.28,23.36,1.43,2825.0,-0.61,9.1,8.8,8.6,1719.06,-0.0049,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,...,7.5811,2,2824.0,6,2,-1813.0,5,-0.5836,-2,2397.0,6,-2.3272,1,-16.0,5,-2,-1100.0,5,-1,-118.0,5,-6,259.0,5,3.7091,2,152.0,5,2,192500.0,198500.0,192500.0,196000.0,2.62,6,265515.0,0.1643,152328.0,-4.8,3.6016,1.1714,1.0286,0.1854,-0.119,0.1574,-0.0011,-0.0722,-0.0077,0.017,0.01
633,2022-07-26,11562.58,-1.87,12112.5,-1.96,31760.85,-0.72,31732.0,-0.74,3921.05,-1.15,3923.25,-1.18,2412.96,0.39,789.93,0.03,94.98,-1.78,107.044,0.65,1310.1,-0.08,2.803,-0.13,3.0589,1.35,2.53,1.77,3.184,-0.9,3.146,-0.44,24.69,5.69,2779.0,-1.63,9.1,8.8,8.6,1718.02,-0.0006,1.75,1.5,1.0,1.0,1.0,2.25,0.0,0.0,...,-0.7691,3,743.0,5,3,7519.0,6,-5.1473,1,-852.0,5,-1.3554,-1,-92.0,5,-3,-391.0,5,-2,-2.0,5,-7,-463.0,5,-2.7876,-1,-326.0,3,-1,196000.0,196500.0,192500.0,196000.0,0.0,5,142943.0,0.0884,29326.0,0.949,-4.275,3.5948,1.2337,0.2534,2.5639,-0.2905,-0.0314,-0.1333,-0.0007,-0.1579,-0.1112
634,2022-07-27,12032.42,4.06,12619.0,4.18,32196.0,1.37,32172.0,1.39,4023.61,2.62,4024.5,2.58,2415.53,0.11,795.7,0.73,97.26,2.4,106.331,-0.67,1306.08,-0.31,2.785,-0.65,2.9837,-2.46,2.451,-3.12,3.132,-1.63,3.116,-0.95,23.24,-5.87,2911.0,4.75,9.1,8.8,8.6,1738.87,0.0121,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,0.8903,4,-4710.0,4,-1,-1538.0,5,-1.2045,-1,2859.0,6,-4.3556,1,-826.0,3,-4,-1739.0,5,-3,0.0,5,-8,327.0,5,-1.7063,1,-36.0,5,-2,196000.0,197000.0,193000.0,196000.0,0.0,5,118451.0,0.0733,33147.0,-2.3305,1.9754,0.2673,2.0632,-1.4209,-0.464,0.8625,-0.2492,-0.5246,0.0,0.0987,-0.0109
635,2022-07-28,12162.59,1.08,12737.5,0.94,32526.86,1.03,32490.0,0.99,4072.43,1.21,4073.5,1.22,2435.27,0.82,798.32,0.33,96.42,-0.86,106.236,-0.09,1295.47,-0.81,2.671,-4.11,2.8683,-3.87,2.386,-2.65,3.197,2.08,3.148,1.03,22.33,-3.92,2944.5,1.15,9.1,8.8,8.6,1755.05,0.0093,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,-1.135,-1,-1328.0,5,-2,-941.0,5,-0.3882,-2,949.0,5,-0.6681,2,-137.0,5,-5,-1874.0,5,-4,12.0,5,1,239.0,5,-0.2691,2,14.0,5,1,198000.0,198000.0,194000.0,195000.0,-0.51,5,147792.0,0.0914,10407.0,0.4997,3.3343,-4.0771,-0.8869,-1.2761,-0.9042,0.9119,-0.1316,-1.8007,0.0115,0.2297,0.0135
636,2022-07-29,12390.69,1.88,12971.5,1.84,32846.45,0.98,32825.0,1.03,4130.29,1.42,4133.5,1.47,2451.5,0.67,803.62,0.66,98.62,2.28,105.779,-0.43,1302.97,0.58,2.658,-0.48,2.8905,0.77,2.373,-0.54,3.127,-2.19,3.026,-3.88,21.33,-4.48,2967.1,0.77,9.1,8.8,8.6,1766.16,0.0063,2.5,2.5,1.75,1.0,1.0,2.25,0.0,0.0,...,-3.7898,1,1871.0,5,1,-4634.0,5,3.9245,-3,-3074.0,4,-4.2392,-1,-330.0,4,-6,882.0,5,1,14.0,5,2,4613.0,6,18.3013,3,-161.0,4,-1,196000.0,197000.0,194000.0,196000.0,0.51,5,165275.0,0.1023,64895.0,-3.7367,3.4659,-0.4154,0.3968,0.2883,-0.7141,-0.4737,-0.0509,0.1359,0.0022,0.7108,-0.0248
