In [1]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [2]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday in a date range as ISO date strings.

    This function used to be defined twice in this cell with identical
    bodies; the second definition silently shadowed the first, so only one
    definition is kept.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Bounds of the range, passed straight to ``pd.date_range`` as
        ``start`` and ``end``.

    Returns
    -------
    list of str
        ``'YYYY-MM-DD'`` strings for all Saturdays and Sundays in the range,
        sorted in ascending order. Empty when the range holds no weekend day.
    """
    weekend_days = []
    # 'W-SAT' / 'W-SUN' are weekly frequencies anchored on Saturday / Sunday.
    for anchor in ('W-SAT', 'W-SUN'):
        weekly = pd.date_range(start=start_date, end=end_date, freq=anchor)
        weekend_days.extend(weekly.strftime('%Y-%m-%d').tolist())
    # Zero-padded ISO dates sort chronologically when sorted as plain strings.
    weekend_days.sort()
    return weekend_days

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the lists of non-trading days for the KOR and USA markets.

    Parameters
    ----------
    filename : str or file-like
        CSV source readable by ``pd.read_csv``; it must contain the columns
        ``'KOR'`` and ``'USA'`` holding holiday dates (missing cells allowed).
    start_date, end_date : str or datetime-like
        Range used to generate the weekend days via ``allsatsundays``.
        Holidays read from the CSV are NOT filtered by this range.

    Returns
    -------
    tuple of (list, list)
        ``(kor_days, usa_days)``: for each market, the sorted, de-duplicated
        union of its CSV holidays and all weekend days in the range.
    """
    calendar = pd.read_csv(filename)
    weekends = allsatsundays(start_date, end_date)

    def closed_days(market):
        # Keep only the non-null entries of the market's holiday column.
        listed = calendar[market].dropna().values.tolist()
        # A set removes dates that are both a listed holiday and a weekend.
        return sorted({*listed, *weekends})

    return closed_days('KOR'), closed_days('USA')

def get_seq_data(filename, holydays):
    """Load a CSV and keep only the rows that fall on trading days.

    Parameters
    ----------
    filename : str or file-like
        CSV source readable by ``pd.read_csv``; it must have a ``'date'``
        column whose values are comparable with the entries of ``holydays``.
    holydays : list-like
        Dates to exclude (rows whose ``'date'`` is in this collection are
        removed).

    Returns
    -------
    pandas.DataFrame
        Independent copy of the remaining rows, with the original index
        labels preserved. Rows in which every column after the first one is
        missing are dropped as well.
    """
    raw = pd.read_csv(filename)
    is_trading_day = ~raw['date'].isin(holydays)
    # Every column except the first one counts as a data column.
    value_columns = raw.columns[1:]
    trading = raw.loc[is_trading_day].dropna(subset=value_columns, how='all')
    return trading.copy()

In [3]:
def classify_data(df, col_name):
    """Bucket each listed column into 9 classes based on its mean and std.

    For every column ``c`` in ``col_name`` a new categorical column
    ``c + '_class'`` is inserted directly to the right of ``c``. The class
    boundaries are ``mu - 3s, mu - 2s, mu - s, mu - 0.5s, mu + 0.5s, mu + s,
    mu + 2s, mu + 3s`` (``mu`` = column mean, ``s`` = column standard
    deviation), giving labels 1 (far below the mean) to 9 (far above).

    Fixes compared with the previous version:

    * The outermost bin edges are now ``-inf`` / ``+inf`` instead of the
      column min / max. With min / max the edge list was not increasing
      whenever the min or max lay inside ``mu +/- 3s`` (``pd.cut`` then
      raises), and the minimum value itself fell outside the first
      right-closed bin and was classified as NaN.
    * The insert position is looked up on the current frame, so the new
      column lands next to its source column even when ``col_name`` is not
      in the same order as the frame's columns.
    * A leftover ``.head()`` expression inside the loop (no effect) is gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to classify. It is modified in place.
    col_name : iterable of str
        Names of the numeric columns to classify.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``*_class`` columns added.

    Raises
    ------
    ValueError
        From ``pd.cut`` when the edges are not strictly increasing (e.g. the
        column's standard deviation is 0 or NaN), or from ``DataFrame.insert``
        when the ``*_class`` column already exists (function run twice).
    """
    class_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    sigma_steps = [-3, -2, -1, -0.5, 0.5, 1, 2, 3]
    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()
        # Open-ended outer bins: every non-missing value gets a class.
        edges = ([float('-inf')]
                 + [mu + step * sigma for step in sigma_steps]
                 + [float('inf')])
        classes = pd.cut(df[col], edges, labels=class_labels)
        # Look the position up now: earlier inserts have shifted the columns.
        position = list(df.columns).index(col) + 1
        df.insert(position, col + '_class', classes)

    return df

In [4]:
def consecutive_count(df_sr, idx, max_days=30):
    """Length of the buy/sell streak that ends at row position ``idx``.

    Starting at position ``idx`` and walking backwards, count how many
    consecutive values share the direction of the value at ``idx``. A value
    counts as "buy" when it is ``> 0``; anything else (negative, zero, NaN)
    counts as "sell", because only ``> 0`` is tested.

    Changes compared with the previous version:

    * The look-back window (previously a hard-coded 30 rows, although the
      comment said 20) is now the ``max_days`` parameter, default 30.
    * The branch counting zeros was unreachable (the direction mapping only
      produces +1 / -1) and has been removed.
    * Rows are selected with ``.iloc`` so the slice is explicitly positional
      regardless of the Series' index labels.

    Parameters
    ----------
    df_sr : pandas.Series
        Numeric series (e.g. daily net purchase amounts).
    idx : int
        Zero-based row POSITION (not index label) where the streak ends.
    max_days : int, default 30
        Maximum number of rows inspected, so the result is capped at
        ``+/- max_days``.

    Returns
    -------
    int
        Positive streak length for a buy streak, negative for a sell streak,
        ``0`` when the selected window is empty.
    """
    first = max(idx + 1 - max_days, 0)
    # Most recent row first, so the streak is counted backwards from idx.
    recent_first = df_sr.iloc[first:idx + 1].iloc[::-1]
    directions = [1 if value > 0 else -1 for value in recent_first]
    if not directions:
        return 0

    head = directions[0]
    streak = 0
    for direction in directions:
        if direction != head:
            break
        streak += 1
    return head * streak

In [5]:
def get_dayofweek(df):
    """Parse the ``'date'`` column and add a weekday-number column.

    The ``'date'`` column (strings formatted ``YYYY-MM-DD``) is replaced by
    parsed datetimes, and a new ``'dayofweek'`` column is inserted at column
    position 1 with 1 = Monday ... 7 = Sunday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``'date'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    def _to_datetime(text):
        return datetime.datetime.strptime(text, "%Y-%m-%d")

    df['date'] = df['date'].map(_to_datetime)
    # pandas numbers weekdays from Monday = 0; shift so that Monday = 1.
    weekday_number = 1 + df['date'].dt.dayofweek
    df.insert(1, 'dayofweek', weekday_number)

    return df

In [6]:
def consequtive_days(df, col_names):
    """Add a ``<col>_days`` buy/sell streak column for every listed column.

    For each column the values are first reduced to a direction (+1 when the
    value is ``> 0``, otherwise -1); then, for every row position,
    ``consecutive_count`` supplies the signed length of the streak ending at
    that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``col_names``. Modified in place.
    col_names : iterable of str
        Columns to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``*_days`` columns set.
    """
    def _direction(value):
        return 1 if (value > 0) else -1

    for name in col_names:
        directions = df[name].map(_direction)
        # Signed run length of consecutive buying (+) / selling (-) days.
        df[name + '_days'] = [
            consecutive_count(directions, position)
            for position in range(len(directions))
        ]

    return df

In [7]:
def get_weight_ratio(df, investors, investor_sum):
    """Add each investor's share of the total absolute traded amount.

    A ``'total'`` column is computed as the row-wise sum of the absolute
    values of the ``investor_sum`` columns; then, for every column ``c`` in
    ``investors``, ``c + '_ratio'`` is set to ``c / total * 10`` (the factor
    10 only magnifies the values).

    Fix: the body previously read the notebook-level global
    ``investors_sum`` instead of the ``investor_sum`` parameter, so the
    function ignored its third argument and raised ``NameError`` unless that
    global happened to exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the investor columns. Modified in place.
    investors : iterable of str
        Columns for which a ``*_ratio`` column is produced.
    investor_sum : list of str
        Columns whose absolute values are summed into ``'total'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``'total'`` and the ``*_ratio`` columns.
        Rows whose total is 0 produce ``inf`` / ``NaN`` ratios.
    """
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col + '_ratio'] = df[col] / df['total'] * 10  # magnify

    return df

In [8]:
# Remote CSV of stock-market holidays; get_stock_market_holydays reads its
# 'KOR' and 'USA' columns.
filename = 'https://raw.githubusercontent.com/fasthill/'\
            'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
# Range for which weekend days are generated and added to the holiday lists.
start_date = '2020-01-01'
end_date = '2022-12-31'

# NOTE: this cell downloads the CSV, so it needs network access.
kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date) # get stock market holidays (CSV holidays + weekends)

In [9]:
# NOTE: `filename` is re-bound here; it no longer points at the holiday CSV.
# Presumably Hyundai Motor investor/price data - TODO confirm.
filename = '../data/data_hyunmotor.csv'
# Load the CSV and drop the rows whose date is a Korean non-trading day.
df = get_seq_data(filename, kor_days)

In [10]:
# Parse 'date' into datetimes and insert a 'dayofweek' column (1 = Monday).
# NOTE: get_dayofweek modifies df in place, so this cell cannot be re-run
# on an already processed frame.
df = get_dayofweek(df)

In [11]:
# Drop the last row (presumably the most recent day is incomplete - TODO
# confirm). An explicit copy is taken because later cells add columns to
# `df` in place; mutating a bare slice of another frame is what triggers
# pandas' SettingWithCopyWarning.
# NOTE: re-running this cell removes one more row each time.
df = df.iloc[:-1].copy()
df.tail()

Unnamed: 0,date,dayofweek,retail,retail_cr,retail_days,foreigner,foreigner_cr,foreigner_days,institution,institution_cr,...,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time
919,2022-07-08,5,-20479.0,,,15617.0,,,3853.0,,...,,177500.0,180500.0,177000.0,178000.0,1.71,147130.0,,,
922,2022-07-11,1,-10749.0,,,11123.0,,,-719.0,,...,,180000.0,182000.0,179000.0,179000.0,0.56,119200.0,,,
923,2022-07-12,2,868.0,,,3282.0,,,-3835.0,,...,,180000.0,181500.0,177000.0,179000.0,0.0,96440.0,,,
924,2022-07-13,3,-40414.0,,,18096.0,,,22755.0,,...,,180000.0,185000.0,178500.0,183500.0,2.51,172566.0,,,
925,2022-07-14,4,13810.0,,,9619.0,,,-23410.0,,...,,183000.0,184500.0,180500.0,182500.0,-0.54,142369.0,,,


In [12]:
# Investor columns that get a '<name>_class' column from classify_data.
investors = ['retail', 'foreigner', 'institution', 'financial', 'invtrust', 'pension',
             'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df = classify_data(df, investors) # categorize each investor column into classes 1-9

In [13]:
# Set a '<name>_days' column per investor: signed length of the current
# run of consecutive buying (+) or selling (-) days.
df = consequtive_days(df, investors)

In [14]:
# Preview the first rows with the new '*_class' and '*_days' columns.
df.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,...,rc4_pcr,open,high,low,close,close_cr,vol,vol_cr,high_time,low_time
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,...,,121000.0,121000.0,118000.0,118000.0,-2.07,66205.0,,,
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,...,,118500.0,119000.0,115000.0,116000.0,-1.69,133657.0,,,
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,...,,116000.0,117000.0,115500.0,116000.0,0.0,38907.0,,,
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,...,,116500.0,117000.0,115500.0,115500.0,-0.43,78229.0,,,
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,...,,115000.0,115500.0,111500.0,112000.0,-3.03,117813.0,,,


In [15]:
# Compute each investor's share of the total absolute traded amount.
# investors_sum lists the columns summed into the total; it is `investors`
# without 'institution' (presumably because 'institution' aggregates the
# institutional sub-categories and would be double-counted - TODO confirm).
investors_sum = ['retail', 'foreigner', 'financial', 'invtrust', 'pension',
         'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df = get_weight_ratio(df, investors, investors_sum)

In [16]:
# Preview the first rows with the new '*_ratio' columns.
df.head()

Unnamed: 0,date,dayofweek,retail,retail_class,retail_cr,retail_days,foreigner,foreigner_class,foreigner_cr,foreigner_days,...,institution_ratio,financial_ratio,invtrust_ratio,pension_ratio,privequity_ratio,bank_ratio,insurance_ratio,financeetc_ratio,corporateetc_ratio,foreigneretc_ratio
1,2020-01-02,4,21980.0,5,,1,-14050.0,5,,-1,...,-0.231603,-0.047868,-0.009784,-0.183071,-0.000405,0.000921,0.006541,0.002064,0.084755,0.000737
2,2020-01-03,5,42651.0,6,,2,-27939.0,4,,-2,...,-0.215789,-0.137546,-0.006988,-0.037219,-0.017648,-0.000437,-0.015797,-0.000173,0.064623,0.001526
5,2020-01-06,1,2658.0,5,,3,-1144.0,5,,-3,...,-0.231513,-0.421401,-0.025668,0.061366,0.00579,0.0,0.148901,-0.00041,0.162807,-0.000319
6,2020-01-07,2,13532.0,5,,4,-5817.0,5,,-4,...,-0.328542,-0.077922,-0.050066,-0.181224,0.002333,8.6e-05,-0.021749,0.0,0.107766,-0.001469
7,2020-01-08,3,17848.0,5,,5,-1415.0,5,,-5,...,-0.421158,-0.122701,-0.02034,-0.327383,0.050428,4e-05,-6e-05,-0.001143,0.091509,0.0
