In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# import tushare as ts
# import QUANTAXIS as QA
import talib as ta
import datetime, time
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from tqdm import tqdm_notebook

In [3]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.style.use('ggplot')

### load the crsp data

In [4]:
crsp = pd.read_csv('data/return_ensembled.zip',
                   parse_dates=['date'], infer_datetime_format=True)
crsp.date = pd.to_datetime(crsp.date)
crsp = crsp.set_index('date')

In [5]:
crsp.head()

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1973-01-31,63845,106005.375,526734.998484,0.724138,0.724138,0.945299
1973-01-31,15580,4425.0,21987.586651,1.043478,1.043478,0.999527
1973-01-31,28820,6768.75,33633.554157,1.1875,1.1875,0.988575
1973-01-31,11253,3038.0,15095.658361,0.942308,0.942308,0.999527
1973-01-31,61903,766.5,3808.697213,0.954545,0.954545,1.015606


In [6]:
def calendar_resample(df):
    res = df.resample('Y').apply({'ret_p1':'cumprod', 'retx_p1':'cumprod',
                                 'dret_p1':'cumprod'})
    res = pd.concat([df.mv, df.mv_adj, res], axis=1)
    res = res.resample('Y').last()
    
#     ret_p1 = df.ret_p1.cumprod().iloc[-1]
#     retx_p1 = df.retx_p1.cumprod().iloc[-1]
#     mv = df.mv.resample('Y').last().iloc[-1]
#     mv_adj = df.mv_adj.resample('Y').last().iloc[-1]
#     res = pd.DataFrame({'ret_p1':ret_p1, 'retx_p1':retx_p1,
#                         'mv':mv, 'mv_adj':mv_adj}, index=[df.index[-1]])
    return res

In [13]:
# crsp[crsp.permno==77418].groupby('permno').apply(calendar_resample)

In [9]:
tic = time.perf_counter()
crsp_ca = crsp.groupby('permno').apply(calendar_resample)
toc = time.perf_counter()
print(toc-tic)
crsp_ca.reset_index().to_csv('data/crsp_ca.csv', index=False)

215.8994486


In [15]:
crsp_ca = crsp_ca.reset_index()

In [90]:
crsp_ca[crsp_ca.mv_adj > 5e+4].shape

(160224, 7)

In [22]:
def calc_exf(df):
    exf = df.mv - df.mv.shift(1) * df.retx_p1
    res = pd.DataFrame({
        'date':df.date,
        'mv':df.mv,
        'mv_adj':df.mv_adj,
        'ret_p1':df.ret_p1,
        'retx_p1':df.retx_p1,
        'dret_p1':df.dret_p1,
        'exf':exf
    })
    return res

In [24]:
sample = crsp_ca.query('permno == 77418').groupby('permno').apply(calc_exf)

Unnamed: 0,date,mv,mv_adj,ret_p1,retx_p1,dret_p1,exf
168264,1992-12-31,167807.2,249291.1,1.918033,1.918033,1.080662,
168265,1993-12-31,411196.5,594163.5,2.000764,2.000001,1.170278,75581.84
168266,1994-12-31,929152.0,1308600.0,1.914528,1.914528,0.97026,141904.7
168267,1995-12-31,3246112.0,4458880.0,2.678568,2.678568,1.260227,757315.6
168268,1996-12-31,3104486.0,4124966.0,0.886665,0.886665,1.222912,226270.6
168269,1997-12-31,9313355.0,12168260.0,2.72181,2.72181,1.322655,863534.7
168270,1998-12-31,71069900.0,91387070.0,6.934765,6.934712,1.271768,6484468.0
168271,1999-12-31,169617500.0,212421800.0,1.956485,1.956485,1.232681,30570330.0
168272,2000-12-31,80879100.0,97924850.0,0.458649,0.458649,0.8869,3084190.0
168273,2001-12-31,136599900.0,162778700.0,0.922413,0.922413,0.863769,61995920.0


In [77]:
writer = pd.ExcelWriter('data/example.xlsx', datetime_format='YYYY-MM-DD')
sample.to_excel(writer, sheet_name='sample', index=False)
writer.save()

In [73]:
sample.set_index('date')['2002':'2006'].ret_p1.cumprod() / \
sample.set_index('date')['2002':'2006'].dret_p1.cumprod()

date
2002-12-31    0.520230
2003-12-31    0.549838
2004-12-31    0.532392
2005-12-31    0.447775
2006-12-31    0.487503
dtype: float64

In [37]:
def calc_wrt(df):
    ret_p1_t = df.ret_p1.prod()
#     retx_p1_t = df.retx_p1.prod()
    dret_p1_t = df.dret_p1_t.prod()
    return pd.DataFrame({
        'ret_p1_t':ret_p1_t,
        'dret_p1_t':dret_p1_t
    })

In [52]:
sample.ret_p1[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1] / \
sample.dret_p1[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1]

168264     9.227099
168265    10.698182
168266    34.121402
168267    27.445991
168268     6.677762
168269     9.835455
168270     2.486446
168271     0.481942
168272     0.294013
168273     0.478177
168274     0.487503
168275     0.663855
168276     0.630132
168277     0.650781
168278     0.773761
168279     0.710705
168280     1.003224
dtype: float64

In [None]:
sample.cumprod()

In [79]:
crsp_fa = pd.read_csv('data/crsp_fa.csv', 
                     parse_dates=['date'], infer_datetime_format=True)

In [80]:
crsp_fa.head()

Unnamed: 0,date,permno,ret_p1,retx_p1,dret_p1,mv,mv_adj
0,1986-10-31,10000,0.178571,0.178571,1.059979,3002.34375,5911.919982
1,1986-06-30,10001,1.03245,1.0,1.252963,6033.125,11992.517933
2,1987-06-30,10001,1.023885,0.959184,1.018842,5822.125,11155.037611
3,1988-06-30,10001,1.140535,1.063829,0.827947,6200.0,11426.022034
4,1989-06-30,10001,1.199811,1.12,1.005611,7007.0,12278.511209


In [92]:
crsp_fa[crsp_fa.mv_adj > 5e+4].shape

(145087, 7)

In [84]:
5e+6

5000000.0

In [45]:
sample_fa = crsp_fa.query('permno == 77418').groupby('permno').apply(calc_exf)

In [47]:
sample_fa

Unnamed: 0,date,mv,mv_adj,ret_p1,retx_p1,dret_p1,exf
147787,1992-06-30,73993.5,114852.6,0.885246,0.885246,0.902617,
147788,1993-06-30,218004.0,328536.4,2.741783,2.740738,1.256202,15207.21
147789,1994-06-30,412965.0,607198.2,1.540542,1.540542,1.046047,77120.78
147790,1995-06-30,1652420.0,2357875.0,3.087717,3.087717,1.174147,377301.0
147791,1996-06-30,3935138.0,5461052.0,1.988631,1.988631,1.220468,649084.0
147792,1997-06-30,5443073.0,7388683.0,1.27143,1.27143,1.321836,439823.2
147793,1998-06-30,22728240.0,30359650.0,3.823007,3.822978,1.302787,1919487.0
147794,1999-06-30,119515800.0,156567800.0,4.185493,4.185493,1.216303,24386900.0
147795,2000-06-30,121650700.0,153626700.0,0.957955,0.957955,1.091818,7159994.0
147796,2001-12-31,136599900.0,162778700.0,0.922413,0.922413,0.863769,24387660.0


In [48]:
sample_fa.ret_p1[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1] / \
sample_fa.dret_p1[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1]

147787    13.508259
147788    13.248084
147789    17.811951
147790    41.619246
147791    13.885897
147792     9.100687
147793     4.922155
147794     1.772812
147795     0.498832
147796     0.478177
147797     0.487503
147798     0.663855
147799     0.630132
147800     0.650781
147801     0.773761
147802     0.710705
147803     1.003224
dtype: float64

In [58]:
sample_ret = sample.ret_p1.round(4)

In [59]:
sample_dret = sample.dret_p1.round(4)

In [60]:
sample_ret[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1] / \
sample_dret[::-1].rolling(5, min_periods=0).apply(np.prod, raw=True)[::-1]

168264     9.226849
168265    10.698055
168266    34.120349
168267    27.446476
168268     6.676942
168269     9.833269
168270     2.485860
168271     0.481856
168272     0.293970
168273     0.478157
168274     0.487516
168275     0.663914
168276     0.630169
168277     0.650802
168278     0.773788
168279     0.710719
168280     1.003232
dtype: float64

In [61]:
decile_return = pd.read_csv('data/decile_return.zip')

In [69]:
(decile_return[decile_return.permno==77418].set_index('date')['2000':'2001'].decret + 1).\
cumprod()

date
2000-01-31    0.955385
2000-02-29    0.973068
2000-03-31    1.039210
2000-04-28    0.983929
2000-05-31    0.948141
2000-06-30    0.991265
2000-07-31    0.976189
2000-08-31    1.049117
2000-09-29    0.993515
2000-10-31    0.974344
2000-11-30    0.876169
2000-12-29    0.886900
Name: decret, dtype: float64