In [13]:
import os
import glob
import datetime
import statistics
import numpy as np
import pandas as pd
import import_ipynb
from common import *
from marcap import marcap_data
import matplotlib.pyplot as plt
from datetime import timedelta, date

In [18]:
# Directory holding the snapshot CSVs; each file name (minus ".csv") is used
# as a date string below.
DATA_DIR = 'data/daily/8-1'

# Sorting the paths orders the snapshots by file name, which is chronological
# as long as the names are ISO dates (YYYY-MM-DD).
filepaths = sorted(glob.glob(DATA_DIR + "/*.csv"))

# Pair each snapshot with the following one: the current file's date starts a
# holding period and the next file's date ends it. The last file therefore
# only ever serves as an end date and gets no row of its own.
days = []
for current_path, next_path in zip(filepaths, filepaths[1:]):
    start_date = os.path.splitext(os.path.basename(current_path))[0]
    end_date = os.path.splitext(os.path.basename(next_path))[0]
    days.append({'start_date': start_date, 'end_date': end_date, 'filepath': current_path})

# DataFrame.append was removed in pandas 2.0, so build the frame in one call.
# Passing `columns` keeps the expected schema even when no files were found.
df_account = pd.DataFrame(days, columns=['start_date', 'end_date', 'filepath'])

In [19]:
# Inspect the period table: one row per snapshot file with its start/end date.
df_account

Unnamed: 0,start_date,end_date,filepath
0,2002-08-01,2003-08-01,data/daily/8-1/2002-08-01.csv
1,2003-08-01,2004-08-02,data/daily/8-1/2003-08-01.csv
2,2004-08-02,2005-08-01,data/daily/8-1/2004-08-02.csv
3,2005-08-01,2006-08-01,data/daily/8-1/2005-08-01.csv
4,2006-08-01,2007-08-01,data/daily/8-1/2006-08-01.csv
5,2007-08-01,2008-08-01,data/daily/8-1/2007-08-01.csv
6,2008-08-01,2009-08-03,data/daily/8-1/2008-08-01.csv
7,2009-08-03,2010-08-02,data/daily/8-1/2009-08-03.csv
8,2010-08-02,2011-08-01,data/daily/8-1/2010-08-02.csv
9,2011-08-01,2012-08-01,data/daily/8-1/2011-08-01.csv


In [22]:
def get_stocks(filepath, start_date, top_ratio=0.2, min_trading_days=200, max_stocks=30):
    """Pick the most volatile large-cap stocks for the period starting at `start_date`.

    The snapshot CSV at `filepath` is filtered (holdings/finance companies and
    foreign corporations removed via the project helpers), reduced to the top
    `top_ratio` share of rows by market cap, and the surviving codes are ranked
    by the standard deviation of their closing price over the 365 days that
    precede `start_date`.

    Args:
        filepath: Path of the snapshot CSV to read.
        start_date: Period start as a 'YYYY-MM-DD' string.
        top_ratio: Fraction of companies (by market cap, descending) to keep.
        min_trading_days: A stock needs strictly more than this many price rows
            in the look-back window to be ranked.
        max_stocks: Maximum number of stock codes returned.

    Returns:
        List of stock codes, most volatile first (at most `max_stocks` entries).

    NOTE(review): volatility is the std of the raw closing price, not of
    returns, so it presumably favours high-priced stocks -- confirm this is
    intended.
    """
    print(start_date, filepath)
    df = pd.read_csv(filepath, dtype={"기업코드":"string", "종목코드":"string"})

    cols = ['종목코드', '회사명', '시가총액', 'PBR', 'GP/A', '당기순이익']
    df = df[cols]

    # Exclude holding companies and financial companies.
    df = exclude_holdings_and_finances(df, '회사명')

    # Exclude foreign stocks.
    df = exclude_foreign_corps(df, '종목코드')

    # Keep the top `top_ratio` by market cap; rows without market-cap data
    # (non-positive or missing values fail the comparison) are dropped first.
    df = df[df['시가총액'] > 0]
    df = df.sort_values(by=['시가총액'], ascending=False)
    df = df.head(int(len(df) * top_ratio))

    big_stock_codes = df['종목코드'].tolist()

    # Look-back window: the 365 calendar days ending the day before start_date,
    # so the selection never sees prices from the holding period itself.
    start_date_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    year_ago = start_date_dt - timedelta(days=365)
    day_ago = start_date_dt - timedelta(days=1)

    df_price = marcap_data(year_ago.strftime("%Y-%m-%d"), day_ago.strftime("%Y-%m-%d")).reset_index()
    df_price = df_price[['Code', 'Close']]
    df_price = df_price[df_price['Code'].isin(big_stock_codes)]

    # The string 'std' keeps today's result (groupby std); passing np.std to
    # agg is deprecated in recent pandas and will change meaning later.
    df_price = df_price.groupby('Code').agg(['std', 'count'])
    df_price = df_price[df_price[('Close', 'count')] > min_trading_days]

    df_price = df_price.sort_values([('Close', 'std')], ascending=False)

    return df_price.head(max_stocks).index.tolist()

In [23]:
# For every period row, select the stock codes to trade from that row's
# snapshot file; get_stocks prints one progress line per row.
df_account['stocks'] = df_account.apply(
    lambda account_row: get_stocks(account_row['filepath'], account_row['start_date']),
    axis=1,
)

2002-08-01 data/daily/8-1/2002-08-01.csv
2003-08-01 data/daily/8-1/2003-08-01.csv
2004-08-02 data/daily/8-1/2004-08-02.csv
2005-08-01 data/daily/8-1/2005-08-01.csv
2006-08-01 data/daily/8-1/2006-08-01.csv
2007-08-01 data/daily/8-1/2007-08-01.csv
2008-08-01 data/daily/8-1/2008-08-01.csv
2009-08-03 data/daily/8-1/2009-08-03.csv
2010-08-02 data/daily/8-1/2010-08-02.csv
2011-08-01 data/daily/8-1/2011-08-01.csv
2012-08-01 data/daily/8-1/2012-08-01.csv
2013-08-01 data/daily/8-1/2013-08-01.csv
2014-08-01 data/daily/8-1/2014-08-01.csv
2015-08-03 data/daily/8-1/2015-08-03.csv
2016-08-01 data/daily/8-1/2016-08-01.csv
2017-08-01 data/daily/8-1/2017-08-01.csv
2018-08-01 data/daily/8-1/2018-08-01.csv


In [24]:
# Inspect the table again, now with the 'stocks' column of selected codes.
df_account

Unnamed: 0,start_date,end_date,filepath,stocks
0,2002-08-01,2003-08-01,data/daily/8-1/2002-08-01.csv,"[005300, 004990, 005930, 003920, 052270, 00417..."
1,2003-08-01,2004-08-02,data/daily/8-1/2003-08-01.csv,"[005300, 004990, 003920, 005930, 017670, 03657..."
2,2004-08-02,2005-08-01,data/daily/8-1/2004-08-02.csv,"[005930, 035250, 005300, 003640, 004370, 06908..."
3,2005-08-01,2006-08-01,data/daily/8-1/2005-08-01.csv,"[003240, 005300, 003920, 004990, 010780, 00593..."
4,2006-08-01,2007-08-01,data/daily/8-1/2006-08-01.csv,"[004990, 005300, 003920, 035420, 002790, 01078..."
5,2007-08-01,2008-08-01,data/daily/8-1/2007-08-01.csv,"[003240, 000670, 004990, 005300, 090430, 00549..."
6,2008-08-01,2009-08-03,data/daily/8-1/2008-08-01.csv,"[003240, 005300, 004990, 000670, 010060, 00392..."
7,2009-08-03,2010-08-02,data/daily/8-1/2009-08-03.csv,"[003240, 004990, 005300, 003920, 000670, 00593..."
8,2010-08-02,2011-08-01,data/daily/8-1/2010-08-02.csv,"[030000, 004990, 090430, 000670, 003240, 00549..."
9,2011-08-01,2012-08-01,data/daily/8-1/2011-08-01.csv,"[003240, 005300, 004990, 000670, 004170, 00392..."


In [25]:
def get_daily_yield(row, k=0.5, fee_factor=0.997):
    """Return one day's gross yield multiplier for a breakout entry.

    The entry threshold is the day's open plus `k` times `row['volatility']`.
    If the day's high exceeds that threshold, the position is treated as
    bought at the threshold and sold at the close, and the result is scaled by
    `fee_factor`. Otherwise no trade happens and the multiplier is 1.0.

    Args:
        row: Mapping-like object (dict or pandas Series) with the keys
            'Open', 'High', 'Close' and 'volatility'.
        k: Share of the volatility added to the open to form the threshold.
        fee_factor: Multiplier applied to a traded day's result
            (0.997 by default, i.e. 0.3% taken off).

    Returns:
        The day's multiplier as a float; 1.0 when the threshold is not crossed
        (including when the threshold is NaN, since the comparison is False).
    """
    threshold = row['Open'] + row['volatility'] * k
    if row['High'] > threshold:
        return (row['Close'] / threshold) * fee_factor
    return 1.0

In [26]:
def get_stock_yield(df_price, stock_code, start_date, end_date):
    """Compound the daily breakout yields of one stock over the rows in `df_price`.

    Each day's volatility is the previous row's High minus Low; the first row
    has no predecessor and is dropped. The remaining rows are passed to
    `get_daily_yield` and the daily multipliers are multiplied together.

    Args:
        df_price: Price frame with at least the columns 'Code', 'Date',
            'Open', 'Close', 'High' and 'Low'. Rows are used in the order
            given -- presumably chronological; verify against marcap_data.
        stock_code: Value matched against the 'Code' column.
        start_date: Unused here; kept for interface compatibility.
        end_date: Unused here; kept for interface compatibility.

    Returns:
        The cumulative yield multiplier as a float, or None when the stock has
        fewer than two price rows (no previous-day range can be formed).
    """
    price_columns = ['Date', 'Open', 'Close', 'High', 'Low']

    # .copy() detaches the selection from the caller's frame, so the column
    # assignment below neither warns (SettingWithCopyWarning) nor leaks back.
    df_stock = df_price.loc[df_price['Code'] == stock_code, price_columns].copy()

    # With zero rows there is nothing to trade; with one row the only row is
    # dropped below and the final lookup would fail on an empty Series.
    if len(df_stock) < 2:
        return None

    # Previous row's trading range (prev High - prev Low).
    df_stock['volatility'] = (df_stock['High'] - df_stock['Low']).shift(1)
    df_stock = df_stock.iloc[1:]

    daily_yields = df_stock.apply(get_daily_yield, axis=1)

    return float(daily_yields.cumprod().iloc[-1])

In [27]:
# Compute the yield of every period.
df_account['yield'] = 0.0
# 'yields' stores a list per row, so the column must be of object dtype.
df_account['yields'] = 0.0
df_account['yields'] = df_account['yields'].astype('object')

# Iterate over index labels so that reads and the .at writes below always
# address the same row, whatever the frame's index looks like.
for idx in df_account.index:
    start_date = df_account.at[idx, 'start_date']
    end_date = df_account.at[idx, 'end_date']
    stocks = df_account.at[idx, 'stocks']

    # Load the period's prices once and reuse them for every stock.
    df_price = marcap_data(start_date, end_date).reset_index()

    # Yield of each stock over the period.
    yields = []
    for stock in stocks:
        stock_yield = get_stock_yield(df_price, stock, start_date, end_date)
        # Only None means "no data"; a plain truthiness test would also drop
        # a legitimate yield of 0.0.
        if stock_yield is not None:
            yields.append(stock_yield)
            print(stock, stock_yield)

    # With equal weights the period yield is the mean of the stock yields.
    # If no stock produced a yield the account is left unchanged (1.0)
    # instead of crashing in statistics.mean on an empty list.
    mean_yields = statistics.mean(yields) if yields else 1.0
    df_account.at[idx, 'yield'] = mean_yields
    df_account.at[idx, 'yields'] = yields

    print(start_date, '~', end_date, ":", mean_yields)

005300 1.1198305576377618
004990 1.671484226639845
005930 1.0434811262371182
003920 1.1821749048044627
052270 0.6796762967389185
004170 0.8618334403314072
036570 1.2717428626827076
002270 0.7530004983633373
028150 0.868150310135971
035760 1.0093445807749706
002380 0.971316767836231
030000 0.6331491339157532
002790 0.8701106945172875
006400 1.1952955974985697
005490 0.7278973302248725
017670 0.7279498122247826
003240 1.0449407255738925
009150 1.2442255682516314
007340 0.9295093230544709
004370 1.0763653875089771
001450 0.7739042209749233
016800 1.0514395670020997
013000 0.268104268687157
010280 0.3726642927303884
051910 1.1700240092844347
005440 0.8200788851291566
005500 0.5333080191563977
034120 0.9732566324802454
053800 0.4751471292170847
029530 0.8431746099912955
2002-08-01 ~ 2003-08-01 : 0.905419359320205
005300 1.3438418092831474
004990 1.3726597409038543
003920 1.8117949170598133
005930 0.9506268942556576
017670 0.8149366144022668
036570 0.8027078486807043
026960 0.733712852152002

KeyboardInterrupt: 

In [None]:
# Running product of the per-period yields = cumulative account multiplier.
df_account['total'] = df_account['yield'].cumprod()

# Annualised growth after n periods is the n-th root of the cumulative
# multiplier; periods are counted from 1 in row order.
df_account['cagr'] = [
    pow(cumulative, 1 / n_periods)
    for n_periods, cumulative in enumerate(df_account['total'], start=1)
]

In [None]:
# Cumulative account multiplier per period start date.
fig, ax = plt.subplots()
# start_date holds 'YYYY-MM-DD' strings; convert them so the x axis is a real
# date axis. Axes.plot_date is deprecated in Matplotlib 3.9 in favour of plot().
ax.plot(pd.to_datetime(df_account['start_date']), df_account['total'], marker='', linestyle='-')
ax.set_xlabel('period start date')
ax.set_ylabel('cumulative yield (total)')
ax.set_title('Cumulative yield')
fig.autofmt_xdate()
plt.show()

In [None]:
# Annualised growth rate (CAGR multiplier) per period start date.
fig, ax = plt.subplots()
# start_date holds 'YYYY-MM-DD' strings; convert them so the x axis is a real
# date axis. Axes.plot_date is deprecated in Matplotlib 3.9 in favour of plot().
ax.plot(pd.to_datetime(df_account['start_date']), df_account['cagr'], marker='', linestyle='-')
ax.set_xlabel('period start date')
ax.set_ylabel('CAGR multiplier (cagr)')
ax.set_title('CAGR')
fig.autofmt_xdate()
plt.show()