In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# import tushare as ts
# import wrds
import talib as ta
import datetime, time

from scipy import stats
from dateutil.relativedelta import *
from pandas.tseries.offsets import *

In [3]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.style.use('ggplot')

In [4]:
crsp = pd.read_csv('data/return.zip', 
                   parse_dates=['date'], infer_datetime_format=True)
crsp[['PERMNO', 'PERMCO']] = crsp[['PERMNO', 'PERMCO']].astype(int)
crsp_name = pd.read_csv('data/crsp_name.csv', 
                   parse_dates=['namedt','nameendt'], infer_datetime_format=True)
crsp_name = crsp_name.drop('Unnamed: 0', axis=1)
crsp_name[['permno', 'shrcd', 'exchcd']] = \
crsp_name[['permno', 'shrcd', 'exchcd']].astype(int)

crsp_name['namedt'] = pd.to_datetime(crsp_name['namedt'])
crsp_name['nameendt'] = pd.to_datetime(crsp_name['nameendt'])
# if nameendt is missing then set to today date
crsp_name['nameendt'] = crsp_name['nameendt'].fillna(pd.to_datetime('today'))
crsp_merge = pd.merge(crsp, crsp_name, 'inner', left_on='PERMNO', right_on='permno')
crsp_merge = crsp_merge[(crsp_merge.date <= crsp_merge.nameendt) &
                       (crsp_merge.date >= crsp_merge.namedt)]
crsp_merge.RET = crsp_merge.RET.replace(['B', 'C'], [np.nan, np.nan]).astype(np.float64)
crsp_merge.RETX = crsp_merge.RETX.replace(['B', 'C'], [np.nan, np.nan]).astype(np.float64)

crsp_merge.RET = crsp_merge.RET.fillna(0)
crsp_merge.RETX = crsp_merge.RETX.fillna(0)

crsp_merge = crsp_merge.drop(['permno', 'namedt', 'nameendt'], axis=1)

crsp_merge.columns = [i.lower() for i in crsp_merge.columns]

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
crsp_merge = crsp_merge[['permno', 'date', 'ticker', 'comnam',  
                         'prc', 'ret', 'shrout', 'retx']]

### calculate the mv, ret+1, retx+1

In [10]:
# calculate the mv, ret+1, retx+1
crsp1 = crsp_merge.assign(mv = crsp_merge.prc.abs() * crsp_merge.shrout,
                         ret_p1 = crsp_merge.ret+1,
                         retx_p1 = crsp_merge.retx+1)

In [12]:
# resample : use calendar year
def normal_resample(df):
    res = df.resample('Y').apply({'ret_p1':'cumprod', 'retx_p1':'cumprod'})
    res = pd.concat([res, df.mv], axis=1)
    res = res.resample('Y').last()
    return res

In [15]:
# test_code_list = crsp1.permno.unique().tolist()[-5:]
# test_data = crsp1.set_index('permno').iloc[test_code_list]
# test_data = test_data.reset_index().set_index('date')
# test_data.groupby('permno').apply(normal_resample)

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,mv
permno,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10234,1974-12-31,1.0,1.0,
17778,1981-12-31,1.119318,1.119318,485605.0
19271,2000-12-31,1.072917,1.072917,137801.125


In [16]:
tic = time.perf_counter()
crsp2 = crsp1.set_index('date').groupby('permno').apply(normal_resample)
toc = time.perf_counter()
print(toc-tic)

210.2191183


In [17]:
crsp2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,mv
permno,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000,1986-12-31,0.117857,0.117857,1981.56609
10000,1987-12-31,0.424243,0.424243,851.59375
10001,1986-12-31,1.217369,1.142857,6937.0
10001,1987-12-31,0.898725,0.839285,5828.0
10001,1988-12-31,1.16316,1.085106,6362.25


example : America Online

In [24]:
code = 77418 # America Online的permno
sample_data = crsp2.loc[(code, slice(None))]

In [29]:
# calculate the EXF
sample_data = sample_data.assign(
    exf = sample_data.mv - sample_data.mv.shift(1) * sample_data.retx_p1
)

In [30]:
sample_data

Unnamed: 0_level_0,ret_p1,retx_p1,mv,exf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-12-31,1.918033,1.918033,167807.2,
1993-12-31,2.000764,2.000001,411196.5,75581.84
1994-12-31,1.914528,1.914528,929152.0,141904.7
1995-12-31,2.678568,2.678568,3246112.0,757315.6
1996-12-31,0.886665,0.886665,3104486.0,226270.6
1997-12-31,2.72181,2.72181,9313355.0,863534.7
1998-12-31,6.934765,6.934712,71069900.0,6484468.0
1999-12-31,1.956485,1.956485,169617500.0,30570330.0
2000-12-31,0.458649,0.458649,80879100.0,3084190.0
2001-12-31,0.922413,0.922413,136599900.0,61995920.0


In [36]:
mv_2000 = sample_data['2000'].mv.iloc[0]
mv_2001 = sample_data['2001'].mv.iloc[0]
rx_2001 = sample_data['2001'].retx_p1.iloc[0] - 1
exf_2001 = sample_data['2001'].exf.iloc[0]
print(mv_2000, mv_2001, rx_2001, exf_2001)

80879097.6 136599880.8 -0.0775866884424 61995924.547


通过上表,我们可以观察到$MV_{2000}$为80879, $MV_{2001}$为136600(四舍五入),

$rx_{2001}$为-7.76%, $EXF_{2001}$为61996.

为了方便计算$WRT_{t+1, t+T}$, 我们需要对normal_resample函数进行改造

In [56]:
def normal_resample_t(df, period):
    if type(period) is int:
        period = str(period)
#     res = df.rolling(int(period)).apply({'ret_p1':'cumprod', 'retx_p1':'cumprod'})
    ret_rolling = df.ret_p1.rolling(int(period)).apply(pd.Series.cumprod, raw=False)
    retx_rolling = df.retx_p1.rolling(int(period)).apply(pd.Series.cumprod, raw=False)
    res = pd.concat([ret_rolling, retx_rolling, df.mv], axis=1)
#     res = res.resample(period+'Y').last()
    return res

In [57]:
test_code_list = crsp1.permno.unique().tolist()[-5:]
test_data = crsp1.set_index('permno').iloc[test_code_list]
test_data = test_data.reset_index().set_index('date')
test_data.groupby('permno').apply(normal_resample_t, period=5)

Unnamed: 0_level_0,ret_p1,retx_p1,mv
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1974-02-28,,,
2000-07-31,,,137801.125
1974-03-29,,,
1974-05-31,,,
1981-02-27,,,485605.0


In [58]:
tic = time.perf_counter()
crsp3 = crsp2.reset_index().set_index('date')
crsp3 = crsp3.groupby('permno').apply(normal_resample_t, period=5)
toc = time.perf_counter()
print(toc-tic)

TypeError: cannot convert the series to <class 'float'>

In [48]:
crsp3.head()

Unnamed: 0_level_0,ret_p1,retx_p1,mv
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1986-12-31,0.117857,0.117857,1981.56609
1987-12-31,0.424243,0.424243,851.59375
1986-12-31,1.217369,1.142857,6937.0
1987-12-31,0.898725,0.839285,5828.0
1988-12-31,1.045361,0.910713,6362.25


In [47]:
crsp3.loc[(code, slice(None))]

TypeError: cannot do index indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> with these indexers [77418] of <class 'int'>