In [None]:
import pandas as pd
import numpy as np
import datetime, time

import matplotlib.pyplot as plt

In [None]:
# Notebook-wide pandas display settings: show at most 20 rows and 100 columns,
# and render floats with 4 digits of precision.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 100
pd.options.display.precision = 4

In [None]:
def allsatsundays(start_date, end_date):
    """Return every Saturday and Sunday between two dates.

    Parameters
    ----------
    start_date, end_date :
        Range bounds, passed unchanged to ``pd.date_range(start=..., end=...)``.

    Returns
    -------
    list of str
        Dates formatted as 'YYYY-MM-DD', sorted ascending.
    """
    weekend = []
    # One weekly-anchored range per weekend day; the merged list is sorted at the end.
    for weekly_anchor in ('W-SAT', 'W-SUN'):
        stamps = pd.date_range(start=start_date, end=end_date, freq=weekly_anchor)
        weekend.extend(stamps.strftime('%Y-%m-%d').tolist())
    return sorted(weekend)

def get_stock_market_holydays(filename, start_date, end_date):
    """Build the sorted non-trading-day lists for the KOR and USA markets.

    Reads the CSV at ``filename``, takes the non-null entries of its 'KOR' and
    'USA' columns, and merges each with every Saturday/Sunday between
    ``start_date`` and ``end_date`` (from ``allsatsundays``).

    Returns
    -------
    (kor_days, usa_days) : tuple of list
        De-duplicated, sorted day lists, one per market.
        NOTE(review): the CSV entries are presumably 'YYYY-MM-DD' strings so that
        they sort and de-duplicate together with the weekend strings - confirm.
    """
    calendar = pd.read_csv(filename)
    # Keep only the filled-in cells of each market column (NaN rows are skipped).
    listed = {market: calendar[market].dropna().values.tolist()
              for market in ('KOR', 'USA')}

    weekends = allsatsundays(start_date, end_date)

    kor_days, usa_days = (sorted(set(listed[market] + weekends))
                          for market in ('KOR', 'USA'))
    return kor_days, usa_days

def get_seq_data(filename, holydays):
    """Load a CSV and keep only rows that are neither holidays nor empty.

    Parameters
    ----------
    filename :
        Anything ``pd.read_csv`` accepts; the data must have a 'date' column.
    holydays :
        Values of 'date' to exclude.

    Returns
    -------
    pandas.DataFrame
        An independent copy with the original row labels, without rows whose
        'date' is in ``holydays`` and without rows where every column after the
        first is NaN.
    """
    raw = pd.read_csv(filename)
    value_cols = raw.columns[1:]  # everything except the leading column
    is_open_day = ~raw['date'].isin(holydays)
    trading = raw.loc[is_open_day].dropna(subset=value_cols, how='all')
    return trading.copy()

In [None]:
def classify_data(df, col_name):
    """Bucket each listed column into nine sigma-based classes (labels 1..9).

    For every column the bin edges are mean +/- 0.5, 1, 2 and 3 standard
    deviations, with open-ended outer bins, so class 5 is the band within half a
    standard deviation of the mean and classes 1 / 9 are the tails beyond three.
    The result is inserted as ``<col>_class`` immediately to the right of the
    source column.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (new columns are inserted) and also returned.
    col_name : iterable of str
        Columns to classify, in any order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        From ``pd.cut`` if a column's standard deviation is zero or NaN (the bin
        edges are then not strictly increasing), or from ``DataFrame.insert`` if
        ``<col>_class`` already exists.
    """
    labels = list(range(1, 10))
    for col in col_name:
        mu = df[col].mean()
        sigma = df[col].std()
        # Outer edges are +/-inf rather than the column min/max: min/max can lie
        # inside the 3-sigma band (non-monotonic edges -> ValueError), and a finite
        # lowest edge would leave the minimum value itself unclassified.
        edges = [-np.inf,
                 mu - 3*sigma, mu - 2*sigma, mu - sigma, mu - 0.5*sigma,
                 mu + 0.5*sigma, mu + sigma, mu + 2*sigma, mu + 3*sigma,
                 np.inf]
        classes = pd.cut(df[col], edges, labels=labels)
        # Look the position up on the live frame so earlier insertions and the
        # order of col_name cannot shift the target slot.
        df.insert(df.columns.get_loc(col) + 1, col + '_class', classes)

    return df

In [None]:
def consecutive_count(df_sr, idx, max_days=30):
    """Signed length of the buy/sell streak that ends at position ``idx``.

    Walks backwards from ``idx`` and counts how many consecutive values share
    the direction of the value at ``idx``. A value counts as a buy (+) when it
    is > 0 and as a sell (-) otherwise, so zero and NaN count as sells.

    Parameters
    ----------
    df_sr : pandas.Series
        Values whose sign is inspected.
    idx : int
        Positional index of the last value of the streak.
    max_days : int, default 30
        Look-back window; the returned magnitude never exceeds it.

    Returns
    -------
    int
        ``+n`` for n consecutive buys, ``-n`` for n consecutive sells, ``0`` if
        the window is empty.
    """
    first_pos = max(idx - max_days + 1, 0)
    window = df_sr.iloc[first_pos:idx + 1].tolist()
    if not window:
        return 0

    direction = 1 if window[-1] > 0 else -1
    streak = 0
    # Most recent value first; stop at the first value going the other way.
    for value in reversed(window):
        if (1 if value > 0 else -1) != direction:
            break
        streak += 1

    return direction * streak

In [None]:
def consequtive_days(df, col_names, max_days=30):
    """Add ``<col>_days``: the signed length of the current buy/sell streak.

    For each row, the new column holds how many consecutive rows up to and
    including that row share its direction: positive for values > 0 (buy),
    negative otherwise (sell; zero and NaN count as sells). The magnitude is
    capped at ``max_days``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (one new column per entry of ``col_names``) and returned.
    col_names : iterable of str
        Columns to analyse.
    max_days : int, default 30
        Upper bound on the reported streak length.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in col_names:
        # Work positionally (fresh default index) so the frame's own labels,
        # duplicated or not, play no role in the run detection.
        signs = pd.Series(np.where(df[col].to_numpy() > 0, 1, -1))
        # A new run starts wherever the sign differs from the previous row.
        run_id = signs.ne(signs.shift()).cumsum()
        run_len = signs.groupby(run_id).cumcount() + 1
        # Consecutive buy (+) / sell (-) trading days.
        df[col + '_days'] = (signs * run_len.clip(upper=max_days)).to_numpy()

    return df

In [None]:
def get_weight_ratio(df, investors, investor_sum):
    """Add each investor's share of total absolute trading, scaled by 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and returned. Gains a 'total' column plus one
        ``<col>_ratio`` column per entry of ``investors``.
    investors : iterable of str
        Columns for which a ratio is computed.
    investor_sum : list of str
        Columns whose absolute values are summed row-wise into 'total'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows whose 'total' is 0 yield inf/NaN ratios.
    """
    # Use the argument, not a same-looking notebook global, for the denominator.
    df['total'] = df[investor_sum].abs().sum(axis=1)
    for col in investors:
        df[col + '_ratio'] = df[col] / df['total'] * 10  # x10 to magnify the ratio

    return df

In [None]:
def get_change_rate(df, col_names):
    """Add ``<col>_cr``: the relative change of each column versus the previous row.

    The value is ``(current - previous) / previous``; the first row has no
    predecessor and is therefore NaN. ``df`` is modified in place and returned.
    """
    for name in col_names:
        previous = df[name].shift(1)
        df[name + '_cr'] = (df[name] - previous) / previous

    return df

In [None]:
def fillna_with_bfill(df, col_names):
    """Fill NaNs in the given columns with the preceding non-null value.

    Despite the function name this is a *forward* fill (each gap takes the last
    value seen above it); the name is kept for existing callers. Leading NaNs,
    which have no preceding value, stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and returned.
    col_names : iterable of str
        Columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in col_names:
        # Assign the result back: an in-place fillna on the df[col] selection
        # does not reliably write through to df (no-op under copy-on-write).
        df[col] = df[col].ffill()

    return df

In [None]:
def get_dayofweek(df):
    """Parse the 'date' column and add a 1-based day-of-week column.

    'date' is converted to datetime (strings are parsed as '%Y-%m-%d') and a
    'dayofweek' column is placed at position 1, where 1 = Monday ... 7 = Sunday.
    Calling the function again on its own output is safe: the already-parsed
    dates are kept and the existing 'dayofweek' column is refreshed in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column. Modified in place and returned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    weekday = df['date'].dt.dayofweek + 1  # pandas uses 0 = Monday; shift to 1 = Monday
    if 'dayofweek' in df.columns:
        df['dayofweek'] = weekday
    else:
        df.insert(1, 'dayofweek', weekday)

    return df

In [None]:
# Holiday calendar CSV, read straight from GitHub (needs network access).
filename = 'https://raw.githubusercontent.com/fasthill/'\
            'factors_affecting_stock_price/main/data/stock_market_holydays/stock_market_holidays.csv'
# Date window used to generate the Saturdays/Sundays that are merged into the holiday lists.
start_date = '2020-01-01'
end_date = '2022-12-31'

kor_days, usa_days = get_stock_market_holydays(filename, start_date, end_date) # per-market non-trading days (holidays + weekends)

### get and manipulate common data

In [None]:
# Load the common data set and drop rows dated on a USA non-trading day
# (and rows with no data at all).
filename = '../data/data_common.csv'
df_common = get_seq_data(filename, usa_days)

In [None]:
# Columns whose gaps are filled with the last value seen above them (forward fill).
fill_columns =  ['cpi', 'cpi_anticipated', 'cpi_previous',
                 'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'fed_rate_ann', 'fed_rate_imp', 
                 'bok_rate', 'bok_rate_fore', 'bok_rate_prev', 'bok_rate_ann', 'bok_rate_imp' ]
# Columns whose missing values are replaced with 0.
# NOTE(review): 'qw_usa_day' vs 'dw_kor_day' - the differing prefixes may be a typo; confirm against the CSV header.
futures_columns =  ['fu_usa_date', 'op_usa_date', 'qw_usa_day', 
                 'fu_kor_date', 'op_kor_date', 'dw_kor_day']
df_common = fillna_with_bfill(df_common, fill_columns)
df_common[futures_columns] = df_common[futures_columns].replace(np.nan, 0)

In [None]:
# Preview the first rows after the forward fill / zero fill.
df_common.head()

In [None]:
df_common.dropna(subset=['ixic', 'dji'], inplace=True) # drop rows where 'ixic' or 'dji' is missing

In [None]:
# Check the last rows after dropping rows without 'ixic' / 'dji'.
df_common.tail()

In [None]:
# Remove every '%' character from string cells across the whole frame.
# NOTE(review): this only edits the text; confirm whether the affected columns
# still need an explicit numeric cast afterwards.
df_common.replace('%', '', regex=True, inplace = True) # delete '%'

In [None]:
# Preview the first rows after removing '%'.
df_common.head()

In [None]:
# Drop columns that contain no data at all (every value NaN).
df_common.dropna(axis=1, how='all', inplace=True)

In [None]:
# Check the last rows of the cleaned common data.
df_common.tail()

### get and manipulate company data

In [None]:
# Load the company data set and drop rows dated on a KOR non-trading day
# (and rows with no data at all).
filename = '../data/data_hyunmotor.csv'
df_company = get_seq_data(filename, kor_days)

In [None]:
# Parse 'date' to datetime and insert a 'dayofweek' column (1 = Monday).
df_company = get_dayofweek(df_company)

In [None]:
# Each row's share of the 'vol' column total, in percent.
# NOTE(review): the total includes the last row, which a later cell removes - confirm that is intended.
df_company['vol_percent'] = (df_company['vol'] / df_company['vol'].sum()) * 100

In [None]:
# Delete the last row. NOTE(review): the reason for discarding it is not recorded here.
# .copy() makes the result an independent frame, so the column additions performed
# in place by later cells do not raise SettingWithCopyWarning on a slice.
# Re-running this cell removes one more row each time.
df_company = df_company.iloc[:-1].copy()
df_company.tail()

In [None]:
# Investor columns to process; each gets a '<name>_class' column (labels 1..9)
# inserted right after it.
investors = ['retail', 'foreigner', 'institution', 'financial', 'invtrust', 'pension',
             'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df_company = classify_data(df_company, investors) # categorize the column data

In [None]:
# Add '<name>_days' per investor column: signed length of the current
# buy (+) / sell (-) streak.
df_company = consequtive_days(df_company, investors)

In [None]:
# Preview the first rows with the new '_class' and '_days' columns.
df_company.head()

In [None]:
# Relative weight of each investor: value / row-wise sum of absolute values, x10.
# The denominator list below omits 'institution'.
# NOTE(review): presumably because 'institution' aggregates other columns and
# would be double counted - confirm.
investors_sum = ['retail', 'foreigner', 'financial', 'invtrust', 'pension',
         'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df_company = get_weight_ratio(df_company, investors, investors_sum)

In [None]:
# Preview the first rows with the new 'total' and '_ratio' columns.
df_company.head()

In [None]:
# Same investor list as defined in the classification cell, redefined here
# so this cell does not depend on it.
investors = ['retail', 'foreigner', 'institution', 'financial', 'invtrust', 'pension',
             'privequity', 'bank',  'insurance', 'financeetc', 'corporateetc', 'foreigneretc']
df_company = get_change_rate(df_company, investors) # add '<name>_cr': change relative to the previous row

In [None]:
# Show the first five rows of the columns at positions 45-59.
# NOTE(review): positional selection - the columns shown shift whenever columns are added or dropped.
df_company.iloc[0:5, 45:60]

In [None]:
# Drop columns that contain no data at all (every value NaN).
df_company.dropna(axis=1, how='all', inplace=True)

In [None]:
# Same positional view (rows 0-4, column positions 45-59) after dropping all-NaN columns.
df_company.iloc[0:5, 45:60]