In [2]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [3]:
# Notebook-wide pandas display limits: rows, columns and float precision.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

In [4]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday between two dates, inclusive.

    Parameters
    ----------
    start_date, end_date :
        Range bounds, passed straight to ``pd.date_range``.

    Returns
    -------
    list of str
        Dates formatted as ``YYYY-MM-DD``, sorted in ascending order.
    """
    weekend_days = []
    # One weekly date range per weekend anchor (Saturday, then Sunday).
    for anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=anchor)
        weekend_days.extend(stamps.strftime('%Y-%m-%d').tolist())
    # ISO-formatted strings sort chronologically.
    return sorted(weekend_days)

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the sorted non-trading date lists for the KOR and USA markets.

    Parameters
    ----------
    filename :
        Path or URL of a CSV file with ``KOR`` and ``USA`` columns.
    start_date, end_date :
        Range over which weekend dates are generated by ``allsatsundays``.

    Returns
    -------
    tuple of (list, list)
        ``(kor_days, usa_days)``: for each market, the de-duplicated, sorted
        union of the non-null entries of its column and all weekend dates.
    """
    table = pd.read_csv(filename)
    # Keep only the non-NaN entries of each market column, as plain lists.
    listed = {
        market: table[market].dropna().values.tolist()
        for market in ('KOR', 'USA')
    }

    weekends = allsatsundays(start_date, end_date)

    kor_days, usa_days = (
        sorted(set(listed[market] + weekends)) for market in ('KOR', 'USA')
    )
    return kor_days, usa_days

def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that fall on non-holiday dates.

    Parameters
    ----------
    filename :
        Path or URL of a CSV file that has a ``date`` column.
    holydays :
        Collection of ``date`` values to exclude.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``date`` is not in ``holydays``, minus any row in which
        every column after the first is NaN. The original row index is kept.
    """
    raw = pd.read_csv(filename)
    value_columns = raw.columns[1:]          # everything except the first column
    is_kept_date = ~raw['date'].isin(holydays)
    return raw.loc[is_kept_date].dropna(subset=value_columns, how='all')

In [5]:
def classify_data(df, col_name):
    """Add a 9-level ``<col>_class`` column right after each requested column.

    Each column is binned around its own mean ``mu`` and sample standard
    deviation ``sigma`` with right-closed intervals whose inner edges are
    ``mu - 3s, mu - 2s, mu - s, mu - 0.5s, mu + 0.5s, mu + s, mu + 2s, mu + 3s``.
    The outer edges are ``-inf`` and ``+inf``. Using the column min/max as
    outer edges (as was done before) breaks in two ways: ``pd.cut`` raises when
    the data stays within +/-3 sigma because the edges are then not increasing,
    and the minimum value itself falls outside the first right-closed bin and
    is labelled NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    col_name : iterable of str
        Names of the numeric columns to classify, in any order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a categorical ``<col>_class`` column
        (labels 1..9, 5 being the band around the mean) immediately to the
        right of each ``<col>``. An existing ``<col>_class`` column is
        overwritten, so the function can be re-run on its own output.

    Raises
    ------
    ValueError
        From ``pd.cut`` when the bin edges are not unique (e.g. a column with
        zero standard deviation).
    """
    labels = list(range(1, 10))
    sigma_offsets = (-3, -2, -1, -0.5, 0.5, 1, 2, 3)

    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()
        edges = [-np.inf] + [mu + k * sigma for k in sigma_offsets] + [np.inf]
        classes = pd.cut(df[col], edges, labels=labels)

        class_col = col + '_class'
        if class_col in df.columns:
            df[class_col] = classes
        else:
            # Look the position up on the live frame so earlier insertions and
            # the order of `col_name` cannot shift the target slot.
            df.insert(df.columns.get_loc(col) + 1, class_col, classes)

    return df

In [6]:
def consecutive_count(df_sr, idx, window=30):
    """Signed length of the buy/sell streak that ends at position ``idx``.

    The series is read backwards from ``idx`` (positional, not label based)
    over at most ``window`` rows, and the run of same-side values starting at
    ``idx`` is counted. A value counts as "buy" when it is ``> 0``; everything
    else (negative, zero, NaN) counts as "sell".

    Parameters
    ----------
    df_sr : pandas.Series
        Numeric series of traded amounts (or their signs).
    idx : int
        Position of the row whose streak is measured.
    window : int, default 30
        Maximum number of rows inspected, so also the largest streak length
        that can be reported.

    Returns
    -------
    int
        ``+n`` for a streak of ``n`` buy rows, ``-n`` for ``n`` sell rows, and
        ``0`` when the inspected slice is empty.
    """
    # NOTE(review): zero is deliberately kept on the sell side to match the
    # existing results; confirm whether flat days should break a streak.
    start = max(idx - window + 1, 0)
    recent = df_sr.iloc[start:idx + 1].to_numpy()[::-1]  # newest row first
    if recent.size == 0:
        return 0

    is_buy = recent > 0
    streak_side = is_buy[0]
    length = 0
    for flag in is_buy:
        if flag != streak_side:
            break
        length += 1

    return length if streak_side else -length

In [7]:
def get_dayofweek(df):
    """Parse the ``date`` column and add a 1-based ``dayofweek`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column holding ``YYYY-MM-DD`` strings (or values
        that are already datetimes). It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``date`` converted to datetimes and a
        ``dayofweek`` column (1 = Monday ... 7 = Sunday). The column is
        inserted at position 1 when it does not exist yet and overwritten
        otherwise, so the function can be re-run on its own output.
    """
    # Vectorised parse; also accepts a column that has already been converted.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    weekday = df['date'].dt.dayofweek + 1  # pandas counts Monday as 0

    if 'dayofweek' in df.columns:
        df['dayofweek'] = weekday
    else:
        df.insert(1, 'dayofweek', weekday)

    return df

In [8]:
def consequtive_days(df, col_names, max_days=30):
    """Add ``<col>_days``: the signed buy/sell streak length ending at each row.

    For every row, the value is ``+n`` when the row closes a run of ``n``
    consecutive buy rows (value ``> 0``) and ``-n`` when it closes a run of
    ``n`` sell rows (anything else, including zero and NaN). Streaks are capped
    at ``max_days``, which matches the 30-row look-back of
    ``consecutive_count``.

    The run lengths are computed with array operations in a single pass per
    column instead of re-slicing the series for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to analyse. It is modified in place.
    col_names : iterable of str
        Names of the numeric columns to process.
    max_days : int, default 30
        Upper bound on the reported streak length.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with one ``<col>_days`` column per input column
        (existing columns of that name are overwritten).
    """
    for col in col_names:
        is_buy = (df[col] > 0).to_numpy()
        positions = np.arange(is_buy.size)

        # A run starts at row 0 and wherever the side differs from the row before.
        starts_run = np.ones(is_buy.size, dtype=bool)
        starts_run[1:] = is_buy[1:] != is_buy[:-1]

        # Carry the position of the latest run start forward to every row.
        run_start = np.maximum.accumulate(np.where(starts_run, positions, 0))
        streak = np.minimum(positions - run_start + 1, max_days)

        df[col + '_days'] = np.where(is_buy, streak, -streak)  # consecutive buy (+) / sell (-) days

    return df

In [9]:
def get_weight_ratio(df, investors, investor_sum, scale=10):
    """Add a ``total`` column and one ``<col>_ratio`` column per investor.

    ``total`` is the row-wise sum of the absolute values of the
    ``investor_sum`` columns. Each ratio is ``df[col] / total * scale``, so it
    keeps the sign of the original amount.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the investor columns. It is modified in place.
    investors : iterable of str
        Columns for which a ``<col>_ratio`` column is produced.
    investor_sum : list of str
        Columns whose absolute values make up ``total``. This argument is now
        actually used; the body previously read a global named
        ``investors_sum`` and ignored the parameter.
    scale : float, default 10
        Multiplier applied to every ratio.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the added columns. Rows whose ``total`` is
        zero yield ``inf``/``NaN`` ratios, as ordinary pandas division does.
    """
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col + '_ratio'] = df[col] / df['total'] * scale

    return df

In [10]:
def get_change_rate(df, col_names):
    """Add ``<col>_cr``: the relative change of each column versus the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns. It is modified in place.
    col_names : iterable of str
        Columns for which the change rate is computed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``<col>_cr`` is ``(value - previous) / previous``
        and is NaN on the first row, which has no predecessor.
    """
    for name in col_names:
        current = df[name]
        previous = current.shift(1)
        df[name + '_cr'] = (current - previous) / previous

    return df

In [11]:
# Holiday calendar CSV hosted on GitHub (raw file URL).
filename = (
    'https://raw.githubusercontent.com/fasthill/'
    'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
)
# Date range over which weekend dates are generated.
start_date, end_date = '2020-01-01', '2022-12-31'

# Sorted non-trading dates (listed holidays plus weekends) for each market.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date)

In [12]:
# Load the stock data set and drop the rows dated on Korean non-trading days
# (get_seq_data also removes rows whose non-date columns are all NaN).
filename = '../data/data_hyunmotor.csv'
df = get_seq_data(filename, kor_days)

In [13]:
# Convert 'date' to datetimes and insert a 'dayofweek' column (1 = Monday).
df = get_dayofweek(df)

In [14]:
# Each row's volume as a percentage of the summed volume of all loaded rows.
df['vol_percent'] = df['vol'].div(df['vol'].sum()).mul(100)

In [15]:
# Drop the final row and take an explicit copy, so the columns added by later
# cells are written to an independent frame instead of a slice of the previous
# one (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): the reason for discarding the last row is not stated - confirm.
df = df.iloc[:-1].copy()
df.tail()

Unnamed: 0,date,dayofweek,retail,retail_cr,retail_days,foreigner,foreigner_cr,foreigner_days,institution,institution_cr,institution_days,financial,financial_cr,financial_days,invtrust,invtrust_cr,invtrust_days,pension,pension_cr,pension_days,privequity,privequity_cr,privequity_days,bank,bank_cr,bank_days,insurance,insurance_cr,insurance_days,financeetc,financeetc_cr,financeetc_days,corporateetc,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent
919,2022-07-08,5,-20479.0,,,15617.0,,,3853.0,,,-1893.0,,,485.0,,,-1307.0,,,6348.0,,,-2.0,,,456.0,,,-234.0,,,1017.0,,,-8.0,,,,,,,177500.0,180500.0,177000.0,178000.0,1.71,147130.0,,,,0.0922
922,2022-07-11,1,-10749.0,,,11123.0,,,-719.0,,,-3528.0,,,4684.0,,,-354.0,,,-3569.0,,,0.0,,,2056.0,,,-6.0,,,348.0,,,-5.0,,,,,,,180000.0,182000.0,179000.0,179000.0,0.56,119200.0,,,,0.0747
923,2022-07-12,2,868.0,,,3282.0,,,-3835.0,,,-4108.0,,,-563.0,,,3325.0,,,-3446.0,,,28.0,,,1048.0,,,-119.0,,,-204.0,,,-111.0,,,,,,,180000.0,181500.0,177000.0,179000.0,0.0,96440.0,,,,0.0605
924,2022-07-13,3,-40414.0,,,18096.0,,,22755.0,,,6795.0,,,6382.0,,,6062.0,,,2539.0,,,0.0,,,956.0,,,21.0,,,-359.0,,,-79.0,,,,,,,180000.0,185000.0,178500.0,183500.0,2.51,172566.0,,,,0.1082
925,2022-07-14,4,13810.0,,,9619.0,,,-23410.0,,,-14077.0,,,949.0,,,-10782.0,,,141.0,,,25.0,,,318.0,,,17.0,,,56.0,,,-74.0,,,,,,,183000.0,184500.0,180500.0,182500.0,-0.54,142369.0,,,,0.0892


In [16]:
# Investor columns processed by the following cells.
investors = ['retail', 'foreigner', 'institution', 'financial', 'invtrust', 'pension',
             'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df = classify_data(df, investors) # add a 9-level '<col>_class' column after each investor column

In [17]:
# Fill '<col>_days' with the signed length of the current run of consecutive
# buy (+) / sell (-) days for every investor column.
df = consequtive_days(df, investors)

In [18]:
# Preview the first rows with the new '_class' and '_days' columns.
df.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,institution,institution_class,institution_cr,institution_days,financial,financial_class,financial_cr,financial_days,invtrust,invtrust_class,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,-12570.0,5,,-1,-2598.0,5,,-1,-531.0,5,,-1,-9936.0,4,,-1,-22.0,5,,-1,50.0,5,,1,355.0,5,,1,112.0,5,,1,4600.0,6,,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,,,,0.0415
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,-21214.0,4,,-2,-13522.0,4,,-2,-687.0,5,,-2,-3659.0,5,,-2,-1735.0,5,,-2,-43.0,5,,-1,-1553.0,5,,-1,-17.0,5,,-1,6353.0,7,,2,150.0,5,,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,,,,0.0838
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,-5078.0,5,,-3,-9243.0,4,,-3,-563.0,5,,-3,1346.0,5,,1,127.0,5,,1,0.0,5,,-2,3266.0,7,,1,-9.0,5,,-2,3571.0,6,,3,-7.0,5,,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,,,,0.0244
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,-11405.0,5,,-4,-2705.0,5,,-4,-1738.0,5,,-4,-6291.0,5,,-1,81.0,5,,2,3.0,5,,1,-755.0,5,,-1,0.0,5,,-3,3741.0,6,,4,-51.0,5,,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,,,,0.049
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,-20996.0,4,,-5,-6117.0,5,,-5,-1014.0,5,,-5,-16321.0,3,,-2,2514.0,6,,3,2.0,5,,2,-3.0,5,,-2,-57.0,5,,-4,4562.0,6,,5,0.0,5,,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,,,,0.0739


In [19]:
# Add 'total' (sum of absolute amounts over `investors_sum`) and one
# '<col>_ratio' column per entry of `investors`.
# NOTE(review): 'institution' is absent from `investors_sum`; in the rows shown
# it equals the sum of financial..financeetc, so it is presumably left out to
# avoid double counting - confirm.
investors_sum = ['retail', 'foreigner', 'financial', 'invtrust', 'pension',
         'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df = get_weight_ratio(df, investors, investors_sum)

In [20]:
# Preview the first rows with the new 'total' and '_ratio' columns.
df.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,institution,institution_class,institution_cr,institution_days,financial,financial_class,financial_cr,financial_days,invtrust,invtrust_class,invtrust_cr,invtrust_days,pension,pension_class,pension_cr,pension_days,privequity,privequity_class,privequity_cr,privequity_days,bank,bank_class,bank_cr,bank_days,insurance,insurance_class,insurance_cr,insurance_days,financeetc,financeetc_class,financeetc_cr,financeetc_days,corporateetc,corporateetc_class,corporateetc_cr,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time,vol_percent,total,retail_ratio,foreigner_ratio,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,-12570.0,5,,-1,-2598.0,5,,-1,-531.0,5,,-1,-9936.0,4,,-1,-22.0,5,,-1,50.0,5,,1,355.0,5,,1,112.0,5,,1,4600.0,6,,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,,,,0.0415,54274.0,4.0498,-2.5887,-2.316,-0.4787,-0.0978,-1.8307,-0.0041,0.0092,0.0654,0.0206,0.8476,0.0074
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,-21214.0,4,,-2,-13522.0,4,,-2,-687.0,5,,-2,-3659.0,5,,-2,-1735.0,5,,-2,-43.0,5,,-1,-1553.0,5,,-1,-17.0,5,,-1,6353.0,7,,2,150.0,5,,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,,,,0.0838,98309.0,4.3385,-2.842,-2.1579,-1.3755,-0.0699,-0.3722,-0.1765,-0.0044,-0.158,-0.0017,0.6462,0.0153
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,-5078.0,5,,-3,-9243.0,4,,-3,-563.0,5,,-3,1346.0,5,,1,127.0,5,,1,0.0,5,,-2,3266.0,7,,1,-9.0,5,,-2,3571.0,6,,3,-7.0,5,,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,,,,0.0244,21934.0,1.2118,-0.5216,-2.3151,-4.214,-0.2567,0.6137,0.0579,0.0,1.489,-0.0041,1.6281,-0.0032
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,-11405.0,5,,-4,-2705.0,5,,-4,-1738.0,5,,-4,-6291.0,5,,-1,81.0,5,,2,3.0,5,,1,-755.0,5,,-1,0.0,5,,-3,3741.0,6,,4,-51.0,5,,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,,,,0.049,34714.0,3.8981,-1.6757,-3.2854,-0.7792,-0.5007,-1.8122,0.0233,0.0009,-0.2175,0.0,1.0777,-0.0147
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,-20996.0,4,,-5,-6117.0,5,,-5,-1014.0,5,,-5,-16321.0,3,,-2,2514.0,6,,3,2.0,5,,2,-3.0,5,,-2,-57.0,5,,-4,4562.0,6,,5,0.0,5,,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,,,,0.0739,49853.0,3.5801,-0.2838,-4.2116,-1.227,-0.2034,-3.2738,0.5043,0.0004,-0.0006,-0.0114,0.9151,0.0


In [21]:
# NOTE(review): this list repeats the `investors` definition from the
# classification cell above with the same entries.
investors = ['retail', 'foreigner', 'institution', 'financial', 'invtrust', 'pension',
             'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df = get_change_rate(df, investors) # fill '<col>_cr' with the row-over-row relative change

In [22]:
# Spot-check the first five rows of columns 45-59, which include a freshly
# computed '_cr' column.
df.iloc[0:5, 45:60]

Unnamed: 0,corporateetc_days,foreigneretc,foreigneretc_class,foreigneretc_cr,foreigneretc_days,rc1_pcr,rc2_pcr,rc3_pcr,rc4_pcr,open,high,low,close,close_cr,vol
1,1,40.0,5,,1,,,,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0
2,2,150.0,5,2.75,2,,,,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0
5,3,-7.0,5,-1.0467,-1,,,,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0
6,4,-51.0,5,6.2857,-2,,,,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0
7,5,0.0,5,-1.0,-3,,,,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0
