In [1]:
import os
import glob
import random
import datetime
import importlib
import statistics
import numpy as np
import pandas as pd
from marcap import marcap_data
import FinanceDataReader as fdr
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from prettytable import PrettyTable

In [2]:
# Strategy selection: exactly one of the imports below should be active.
# Whichever module is bound to `model` is the one whose get_stocks() is
# called by the simulation cell further down.
import models.qp as model # 1/PBR + GP/A
#import models.ncav as model # NCAV
#import models.super_value as model # PBR + PCR + PER + PSR
#import models.super_quality as model
#import models.lu_zhang as model

# Re-execute the module so edits made to the model source file are picked up
# without restarting the kernel.
importlib.reload(model)

<module 'models.qp' from '/Users/seunghun/Documents/Documents - Seunghun’s MacBook Pro/Projects/stock-backtesting/backtest-playground/quantitative-value/models/qp.py'>

In [3]:
# Run configuration.
# MODEL_NAME is a label only; it is not referenced by the visible cells.
# NOTE(review): presumably it should match the module imported as `model` - confirm.
MODEL_NAME = 'qp'
# DATA_DIR is globbed for "*.csv" snapshot files; each file name (minus ".csv")
# is used as the date of that snapshot.
#DATA_DIR = '../data/quarterly/3-6-9-12' # quarter
# DATA_DIR = '../data/half-year/2-8' # half year
DATA_DIR = '../data/yearly/8-1'
# Number of rebalances per year; divides the period count when annualising the
# cumulative return into a CAGR. Keep it consistent with the DATA_DIR chosen above.
REBALANCE = 1 # 1: yearly, 2: half-yearly 4: quarterly

In [4]:
# Build the holding-period table: one row per pair of consecutive snapshot
# files. The file name (without ".csv") is the snapshot date, so each row
# runs from one snapshot date (start_date) to the next one (end_date).
filepaths = sorted(glob.glob(DATA_DIR + "/*.csv"))

# Pair every snapshot with its successor; the last snapshot has no successor
# and therefore only ever serves as an end_date.
days = [
    {
        'start_date': os.path.basename(current_path)[:-4],
        'end_date': os.path.basename(next_path)[:-4],
        'filepath': current_path,
    }
    for current_path, next_path in zip(filepaths, filepaths[1:])
]

# Construct the frame in one step. DataFrame.append was deprecated and then
# removed in pandas 2.0, so the previous "empty frame + append" form no longer runs.
df_account = pd.DataFrame(days, columns=['start_date', 'end_date', 'filepath', 'mc_filepath'])

# mc_filepath is filled in later with path strings; keep it as an object column
# so that assignment does not have to upcast an all-NaN float column.
df_account['mc_filepath'] = df_account['mc_filepath'].astype('object')

In [5]:
# Show the sorted snapshot files that were discovered, as a sanity check.
filepaths

['../data/yearly/8-1/2002-08-01.csv',
 '../data/yearly/8-1/2003-08-01.csv',
 '../data/yearly/8-1/2004-08-02.csv',
 '../data/yearly/8-1/2005-08-01.csv',
 '../data/yearly/8-1/2006-08-01.csv',
 '../data/yearly/8-1/2007-08-01.csv',
 '../data/yearly/8-1/2008-08-01.csv',
 '../data/yearly/8-1/2009-08-03.csv',
 '../data/yearly/8-1/2010-08-02.csv',
 '../data/yearly/8-1/2011-08-01.csv',
 '../data/yearly/8-1/2012-08-01.csv',
 '../data/yearly/8-1/2013-08-01.csv',
 '../data/yearly/8-1/2014-08-01.csv',
 '../data/yearly/8-1/2015-08-03.csv',
 '../data/yearly/8-1/2016-08-01.csv',
 '../data/yearly/8-1/2017-08-01.csv',
 '../data/yearly/8-1/2018-08-01.csv',
 '../data/yearly/8-1/2019-08-01.csv']

### 80% 기업만 샘플링

In [8]:
# Collect the universe of stock codes: every code that appears in at least one
# snapshot file. The sampling cell draws its random sub-universes from this set.
stock_codes = set()
for filepath in filepaths:
    # Only the code column is needed here, so skip parsing the other columns.
    # Codes are read as strings so leading zeros are preserved.
    codes = pd.read_csv(filepath, usecols=['종목코드'], dtype={'종목코드': 'string'})['종목코드']
    stock_codes.update(codes.tolist())

In [11]:
# Monte Carlo robustness check.
# Each trial keeps a random subset of the stock universe, writes the filtered
# snapshots to an "mc/" sub-directory, lets the model pick stocks from those
# filtered snapshots, and computes the CAGR of an equal-weighted portfolio
# that is rebalanced at every snapshot date. The per-trial CAGRs end up in `cagrs`.
N_TRIALS = 100       # number of random sub-universes to backtest
SAMPLE_RATIO = 0.8   # fraction of the stock universe kept in each trial
CSV_DTYPES = {"기업코드": "string", "종목코드": "string"}  # keep codes as strings


def _period_yields(stocks, df_this_year, df_next_year):
    """Return the gross return (end price / start price) of each stock.

    Parameters
    ----------
    stocks : iterable of stock codes held during the period.
    df_this_year, df_next_year : DataFrames with columns '종목코드' and '주가'
        for the snapshot at the start and at the end of the period.

    A stock that has no row, or no price, in the end snapshot is treated as
    delisted: its end price is taken as 0, i.e. a total loss.
    """
    yields = []
    for stock in stocks:
        start_price = df_this_year[df_this_year['종목코드'] == stock]['주가'].iloc[0]
        end_prices = df_next_year[df_next_year['종목코드'] == stock]['주가']
        end_price = end_prices.fillna(0).iloc[0] if len(end_prices) else 0
        yields.append(end_price / start_price)
    return yields


# --- Work that does not depend on the trial: do it once. ---
# Full snapshots, read from disk a single time instead of once per trial.
daily_frames = [pd.read_csv(path, dtype=CSV_DTYPES) for path in filepaths]

# Destination of the sampled snapshots: "<snapshot dir>/mc/<snapshot file>".
# The same files are overwritten on every trial.
mc_filepaths = [os.path.dirname(path) + '/mc/' + os.path.basename(path) for path in filepaths]
for mc_path in mc_filepaths:
    os.makedirs(os.path.dirname(mc_path), exist_ok=True)

# df_account has one row per holding period (one fewer than the snapshots), so
# only the leading paths belong in it. Assigning the whole column at once
# replaces the per-row chained assignment that raised SettingWithCopyWarning
# and also addressed one row past the end of the frame.
df_account['mc_filepath'] = mc_filepaths[:len(df_account)]

# random.sample needs a sequence (passing a set raises TypeError on Python 3.11+).
# Sorting gives a stable order, so seeding `random` beforehand makes runs repeatable.
population = sorted(stock_codes, key=str)
sample_size = int(len(population) * SAMPLE_RATIO)

cagrs = []
for e in range(N_TRIALS):
    # Draw this trial's random sub-universe.
    sampled_stock_codes = random.sample(population, sample_size)

    # Write filtered snapshots for the model and keep the price columns in
    # memory for the return calculation below.
    sampled_prices = []
    for df_daily, mc_path in zip(daily_frames, mc_filepaths):
        df_sampled = df_daily[df_daily['종목코드'].isin(sampled_stock_codes)]
        df_sampled.to_csv(mc_path, index=False)
        sampled_prices.append(df_sampled[['종목코드', '주가']])

    # Stocks selected by the model at the start of each period.
    df_account['stocks'] = df_account.apply(lambda x: model.get_stocks(x.mc_filepath, x.start_date, False), axis=1)

    # Per-period results: 'yield' is the portfolio return, 'yields' the list of
    # individual stock returns (object column so a list fits into one cell).
    df_account['yield'] = 0.0
    df_account['yields'] = 0.0
    df_account['yields'] = df_account['yields'].astype('object')

    for i in range(len(mc_filepaths) - 1):  # the last snapshot only provides end prices
        yields = _period_yields(df_account.iloc[i].stocks, sampled_prices[i], sampled_prices[i + 1])
        # Equal weighting: the portfolio return is the plain average of the stock returns.
        df_account.at[i, 'yield'] = statistics.mean(yields)
        df_account.at[i, 'yields'] = yields

    # Compound the period returns and annualise them.
    df_account['total'] = df_account['yield'].cumprod()
    years = len(df_account) / REBALANCE
    cagr = pow(df_account['total'].iloc[-1], 1 / years)
    print(e, cagr)

    cagrs.append(cagr)

0 1.38035724290533


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1 1.4094700812569338
2 1.4369384076603002
3 1.4290831342580879
4 1.4229557308563363
5 1.4141389380701297
6 1.3602019737126765
7 1.427249048946543
8 1.4450726593404917
9 1.365563916285668
10 1.4057922688239237
11 1.4036415855037438
12 1.3815497254960882
13 1.4044191461156543
14 1.3863190953348652
15 1.4500249609773483
16 1.4702132548298763
17 1.4108600983728483
18 1.4144292766892708
19 1.4220393832382392
20 1.4197983052657703
21 1.4207428688583554
22 1.435923629727126
23 1.4283622826510265
24 1.4380884591855096
25 1.4473270530788094
26 1.4045990772785515
27 1.3565741841535082
28 1.4248399514708012
29 1.4402815212714586
30 1.3889531148042307
31 1.4417057999218115
32 1.3802427639847266
33 1.4258916499355683
34 1.3998133580293644
35 1.397397493597402
36 1.4602645631659785
37 1.3820272583516098
38 1.4329960262304855
39 1.4178560888479976
40 1.4931951771083545
41 1.408770997393518
42 1.4316230485901078
43 1.4483656701818455
44 1.4164962454571077
45 1.4193015332576586
46 1.41295253740105
47 1

In [12]:
# Average CAGR across the Monte Carlo trials collected in `cagrs`.
np.mean(cagrs)

1.4214970302124785

In [13]:
# Spread of the CAGR across trials (population standard deviation, NumPy's default ddof=0).
np.std(cagrs)

0.025966703696751015