In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import datetime, time
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from tqdm import tqdm_notebook

In [3]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.style.use('ggplot')

In [4]:
# Load the filtered fundamental data.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0
# (strict format inference became the default) - confirm the target pandas
# version before removing it.
comp = pd.read_csv(
    'data/fundamental_fiscal_filtered.zip',
    parse_dates=['datadate', 'fysm'],
    infer_datetime_format=True,
)
# Coerce both date columns explicitly as well, exactly as before.
for date_col in ('datadate', 'fysm'):
    comp[date_col] = pd.to_datetime(comp[date_col])

In [5]:
comp.head()

Unnamed: 0,gvkey,datadate,fyear,fystr,fysm,permno
0,1000,1973-12-31,1973.0,A-Dec,1973-01-01,25881
1,1000,1974-12-31,1974.0,A-Dec,1974-01-01,25881
2,1000,1975-12-31,1975.0,A-Dec,1975-01-01,25881
3,1000,1976-12-31,1976.0,A-Dec,1976-01-01,25881
4,1000,1977-12-31,1977.0,A-Dec,1977-01-01,25881


In [4]:
# Load the ensembled return data and index it by date, so that later cells can
# slice it by date range and resample it.
crsp = (
    pd.read_csv('data/return_ensembled.zip',
                parse_dates=['date'], infer_datetime_format=True)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

验证 mv_adj（经 CPI 调整后的市值）是否计算正确

In [10]:
# CPI series (CPIAUCSL) read from Excel, with the first sheet column used as
# the index. The displayed output below shows a date index and the columns
# `cpiaucsl` and `cpi_adjust`.
cpi = pd.read_excel('data/CPIAUCSL.xlsx', index_col=0)

In [37]:
# Show the 1986 rows. Rows are selected with .loc: `frame['<date string>']`
# row selection was deprecated in pandas 1.2 and is not supported by
# pandas 2.x, where it is treated as a column lookup.
cpi.loc['1986']

Unnamed: 0_level_0,cpiaucsl,cpi_adjust
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1986-01-01,109.9,0.517971
1986-02-01,109.7,0.515781
1986-03-01,109.1,0.511132
1986-04-01,108.7,0.508082
1986-05-01,109.0,0.506487
1986-06-01,109.4,0.503074
1986-07-01,109.5,0.499963
1986-08-01,109.6,0.501166
1986-09-01,110.0,0.502565
1986-10-01,110.2,0.507846


In [7]:
crsp.head()

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1986-01-31,10000,16100.0,31214.773929,1.0,1.0,1.051443
1986-01-31,10001,6033.125,11697.057948,1.0,1.0,1.062222
1986-01-31,10002,13659.375,26482.87594,1.0,1.0,1.066751
1986-01-31,10003,41800.0,81042.083865,1.0,1.0,1.030587
1986-01-31,10005,1745.625,3384.427934,1.0,1.0,1.08672


 在1986-01-31这一天，选择permno==10000的mv值，除以1986-02的cpi_adjust,得到mv的adjust值;
 
 然后和我们已经计算好的mv_adj做比较, 比较的方法就是两个相减, 可以看到误差非常小

In [31]:
# Deflate permno 10000's raw market value on 1986-01-31 with the 1986-02
# cpi_adjust and subtract the stored mv_adj; a result near zero means the
# stored value was built this way.
# Rows are selected with .loc because `frame['<date string>']` row selection
# was deprecated in pandas 1.2 and is not supported by pandas 2.x.
jan_row = crsp.loc['1986-01-31'].query('permno == 10000')
jan_row.mv.iloc[0] / cpi.loc['1986-02'].cpi_adjust.iloc[0] - jan_row.mv_adj.iloc[0]

-3.637978807091713e-12

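因为 cpi_adjust 在 1986 年 1 月和 2 月的差距并不大，为了排除上面的结果只是巧合，这里改用 1986-01 的 cpi_adjust 重新计算；可以看到误差明显变大（约 -132），说明 mv_adj 确实是用下一个月（1986-02）的 cpi_adjust 计算的。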

In [29]:
# Control check: repeat the comparison with the 1986-01 cpi_adjust instead of
# 1986-02. A clearly non-zero difference here shows that the near-zero result
# obtained with the February factor is not a coincidence.
# Rows are selected with .loc because `frame['<date string>']` row selection
# was deprecated in pandas 1.2 and is not supported by pandas 2.x.
jan_row = crsp.loc['1986-01-31'].query('permno == 10000')
jan_row.mv.iloc[0] / cpi.loc['1986-01'].cpi_adjust.iloc[0] - jan_row.mv_adj.iloc[0]

-131.95864227279526

In [10]:
crsp.head()

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,vwretd_p1,vwretx_p1,ewretd_p1,ewretx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986-01-31,10000,16100.0,31214.773929,1.0,1.0,1.009829,1.008006,1.044071,1.043082,1.051443
1986-01-31,10001,6033.125,11697.057948,1.0,1.0,1.009829,1.008006,1.044071,1.043082,1.062222
1986-01-31,10002,13659.375,26482.87594,1.0,1.0,1.009829,1.008006,1.044071,1.043082,1.066751
1986-01-31,10003,41800.0,81042.083865,1.0,1.0,1.009829,1.008006,1.044071,1.043082,1.030587
1986-01-31,10005,1745.625,3384.427934,1.0,1.0,1.009829,1.008006,1.044071,1.043082,1.08672


In [11]:
def fiscal_resample(df, fys, per):
    """Collapse the rows of one fiscal period into a single summary row.

    Each return column (``*_p1``) is compounded by taking the product of its
    values over the period; ``mv`` and ``mv_adj`` are taken from the last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one period. Must contain every column named in
        ``return_cols`` and ``level_cols`` below.
    fys : str
        Fiscal-year frequency string supplied by the caller. It is not used
        inside this function; the parameter is kept so existing
        ``apply(fiscal_resample, fys=..., per=...)`` calls keep working.
    per : hashable
        Label used as the index of the returned row (callers pass the permno).

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``per`` with the compounded returns followed by
        ``mv`` and ``mv_adj``. If ``df`` has no rows, an empty frame with the
        same columns is returned instead of raising ``IndexError``, so an
        empty period contributes nothing when results are concatenated.
    """
    return_cols = ['ret_p1', 'retx_p1', 'dret_p1',
                   'vwretd_p1', 'vwretx_p1', 'ewretd_p1', 'ewretx_p1']
    level_cols = ['mv', 'mv_adj']
    out_cols = return_cols + level_cols

    # Attribute-style lookup mirrors the original `df.<column>` access, so an
    # input that lacks these columns still fails with the same exception type.
    # Series.prod skips NaN by default, so missing values do not poison the
    # compounded figure.
    row = {col: getattr(df, col).prod() for col in return_cols}

    # An empty period has no "last row"; `.iloc[-1]` would raise IndexError.
    if len(df) == 0:
        return pd.DataFrame(columns=out_cols, dtype=float)

    # Period-end levels: value on the last row of the period.
    for col in level_cols:
        row[col] = getattr(df, col).iloc[-1]

    return pd.DataFrame(row, index=[per], columns=out_cols)

In [13]:
comp[comp.gvkey==25056]

Unnamed: 0,gvkey,datadate,fyear,fystr,fysm,permno
189670,25056,1992-06-30,1992.0,A-Jun,1991-07-01,77418
189671,25056,1993-06-30,1993.0,A-Jun,1992-07-01,77418
189672,25056,1994-06-30,1994.0,A-Jun,1993-07-01,77418
189673,25056,1995-06-30,1995.0,A-Jun,1994-07-01,77418
189674,25056,1996-06-30,1996.0,A-Jun,1995-07-01,77418
189675,25056,1997-06-30,1997.0,A-Jun,1996-07-01,77418
189676,25056,1998-06-30,1998.0,A-Jun,1997-07-01,77418
189677,25056,1999-06-30,1999.0,A-Jun,1998-07-01,77418
189678,25056,2000-06-30,2000.0,A-Jun,1999-07-01,77418
189679,25056,2001-12-31,2001.0,A-Dec,2001-01-01,77418


In [10]:
crsp[crsp.permno==77418]

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-03-31,77418,8.337175e+04,1.279334e+05,1.000000,1.000000,0.979203
1992-04-30,77418,6.423725e+04,9.858713e+04,0.770492,0.770492,0.957847
1992-05-31,77418,6.833750e+04,1.052740e+05,1.063830,1.063830,1.007207
1992-06-30,77418,7.399350e+04,1.148526e+05,1.080000,1.080000,0.955467
1992-07-31,77418,7.741912e+04,1.206835e+05,1.046296,1.046296,1.036125
1992-08-31,77418,7.399350e+04,1.149264e+05,0.955752,0.955752,0.974647
1992-09-30,77418,8.358000e+04,1.296509e+05,1.111111,1.111111,1.021800
1992-10-31,77418,9.333100e+04,1.429242e+05,1.116667,1.116667,1.024671
1992-11-30,77418,1.100470e+05,1.650728e+05,1.179104,1.179104,1.085735
1992-12-31,77418,1.678072e+05,2.492911e+05,1.481013,1.481013,1.042923


In [11]:
g = comp[comp.gvkey==25056].groupby(['permno', 'fyear'])

In [14]:
g.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,gvkey
permno,fyear,Unnamed: 2_level_1
77418,1991.0,25056
77418,1992.0,25056
77418,1993.0,25056
77418,1994.0,25056
77418,1995.0,25056
77418,1996.0,25056
77418,1997.0,25056
77418,1998.0,25056
77418,1999.0,25056
77418,2000.0,25056


In [46]:
comp.query('gvkey == 25056')

Unnamed: 0,gvkey,datadate,fyear,fystr,fysm,permno
189670,25056,1992-06-30,1992.0,A-Jun,1991-07-01,77418
189671,25056,1993-06-30,1993.0,A-Jun,1992-07-01,77418
189672,25056,1994-06-30,1994.0,A-Jun,1993-07-01,77418
189673,25056,1995-06-30,1995.0,A-Jun,1994-07-01,77418
189674,25056,1996-06-30,1996.0,A-Jun,1995-07-01,77418
189675,25056,1997-06-30,1997.0,A-Jun,1996-07-01,77418
189676,25056,1998-06-30,1998.0,A-Jun,1997-07-01,77418
189677,25056,1999-06-30,1999.0,A-Jun,1998-07-01,77418
189678,25056,2000-06-30,2000.0,A-Jun,1999-07-01,77418
189679,25056,2001-12-31,2001.0,A-Dec,2001-01-01,77418


In [57]:
crsp.query('permno == 77418')['1992-07-01':'1993-06-30']

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-07-31,77418,77419.125,120247.077033,1.046296,1.046296,1.036125
1992-08-31,77418,73993.5,114780.122605,0.955752,0.955752,0.974647
1992-09-30,77418,83580.0,127991.828511,1.111111,1.111111,1.0218
1992-10-31,77418,93331.0,139998.470394,1.116667,1.116667,1.024671
1992-11-30,77418,110047.0,163483.595966,1.179104,1.179104,1.085735
1992-12-31,77418,167807.25,249330.080263,1.481013,1.481013,1.042923
1993-01-31,77418,164938.75,245145.548017,0.982906,0.982906,1.030472
1993-02-28,77418,141990.75,211497.847913,0.86087,0.86087,0.977384
1993-03-31,77418,157767.5,234722.492942,1.111111,1.111111,1.0219
1993-04-30,77418,150596.25,224753.937379,0.954909,0.954545,0.977536


In [58]:
crsp.query('permno == 77418')['1992-07-01':'1993-06-30'].resample('Y').mean()

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-12-31,77418,101029.645833,152638.529129,1.148324,1.148324,1.030984
1993-12-31,77418,171153.5,256389.895594,1.048634,1.048574,1.008362


In [56]:
crsp.query('permno == 77418')['1992-07-01':'1993-06-30'].resample('A-Jun').mean()

Unnamed: 0_level_0,permno,mv,mv_adj,ret_p1,retx_p1,dret_p1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-06-30,77418,136091.572917,204514.212361,1.098479,1.098449,1.019673


In [52]:
g = comp[comp.gvkey==25056].groupby(['permno', 'fyear'])
for name, group in g:
    print(name[0])

77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418
77418


In [12]:
# Smoke test: run the fiscal-year aggregation for a single company
# (gvkey 25056) and time it.
tic = time.perf_counter()
g = comp[comp.gvkey==25056].groupby(['permno', 'fyear'])

# Alternative harness (disabled): use the first 30 gvkeys instead.
# code_list = comp.gvkey.unique().tolist()[:30]
# test_comp = comp.set_index('gvkey').loc[code_list].reset_index()
# g = test_comp.groupby(['permno', 'fyear'])

df_list = []
for name, group in tqdm_notebook(g):
    permno = name[0]
    fystring = group.fystr.iloc[0]
    # Fiscal-year window [fysm, datadate] of the group's first row, as
    # ISO date strings for label slicing on the date index.
    start, end = (stamp.strftime('%Y-%m-%d')
                  for stamp in (group.fysm.iloc[0], group.datadate.iloc[0]))
    sample = crsp.loc[crsp.permno == permno].loc[start:end]
    if sample.empty:
        # No return rows inside this fiscal year.
        continue
    r = sample.resample(fystring)
    df_list.append(r.apply(fiscal_resample, fys=fystring, per=permno))

df_test = pd.concat(df_list, axis=0)
df_test.index.names = ['date', 'permno']
toc = time.perf_counter()
print(toc - tic)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))


0.4710479000000021


In [14]:
df_test = df_test.reset_index().set_index('date')

In [19]:
df_test.columns

Index(['permno', 'ret_p1', 'retx_p1', 'dret_p1', 'vwretd_p1', 'vwretx_p1',
       'ewretd_p1', 'ewretx_p1', 'mv', 'mv_adj'],
      dtype='object')

In [16]:
df_test['2002':'2006'].ret_p1.prod()

0.69040020436816008

In [17]:
df_test['2002':'2006'].dret_p1.prod()

1.4161960357477106

In [25]:
df_test['2002':'2006'].ret_p1.prod() / df_test['2002':'2006'].vwretd_p1.prod()

0.46491053907605395

In [27]:
df_test['2002':'2006'].ewretd_p1.prod() / df_test['2002':'2006'].dret_p1.prod()

1.65742858843602

In [29]:
df_test.loc['2002':'2006', 'ret_p1'].prod()

0.69040020436816008

In [39]:
# Columns 1..7 of df_test are taken positionally; per the column listing shown
# earlier these are the seven *_p1 return columns (permno is column 0).
df_col_list = df_test.columns[1:8].tolist()

# Compounded value of each series over 2002-2006, computed once per column.
window_prod = {col: df_test.loc['2002':'2006', col].prod() for col in df_col_list}

# Print the ratio for every ordered pair of distinct columns.
for i, numer in enumerate(df_col_list):
    for j, denom in enumerate(df_col_list):
        if i == j:
            continue
        print(numer+'/'+denom, window_prod[numer] / window_prod[denom])

ret_p1/retx_p1 1.01753081251
ret_p1/dret_p1 0.487503274223
ret_p1/vwretd_p1 0.464910539076
ret_p1/vwretx_p1 0.50875799136
ret_p1/ewretd_p1 0.294132294823
ret_p1/ewretx_p1 0.320858673878
retx_p1/ret_p1 0.982771221964
retx_p1/dret_p1 0.479104188519
retx_p1/vwretd_p1 0.456900698592
retx_p1/vwretx_p1 0.499992712853
retx_p1/ewretd_p1 0.289064754803
retx_p1/ewretx_p1 0.315330671005
dret_p1/ret_p1 2.05126827424
dret_p1/retx_p1 2.08722867377
dret_p1/vwretd_p1 0.953656239166
dret_p1/vwretx_p1 1.04359912694
dret_p1/ewretd_p1 0.6033442448
dret_p1/ewretx_p1 0.658167218239
vwretd_p1/ret_p1 2.15095145399
vwretd_p1/retx_p1 2.18865938065
vwretd_p1/dret_p1 1.04859587651
vwretd_p1/vwretx_p1 1.09431374124
vwretd_p1/ewretd_p1 0.632664287215
vwretd_p1/ewretx_p1 0.690151431102
vwretx_p1/ret_p1 1.96557109074
vwretx_p1/retx_p1 2.00002914901
vwretx_p1/dret_p1 0.958222342452
vwretx_p1/vwretd_p1 0.913814715388
vwretx_p1/ewretd_p1 0.578137935558
vwretx_p1/ewretx_p1 0.630670533586
ewretd_p1/ret_p1 3.39983068027
ew

In [40]:
# Same pairwise 2002-2006 ratios as the printed version, collected into a dict
# keyed by 'numerator/denominator'.
df_col_list = df_test.columns[1:8].tolist()
wrt_dict = {
    numer + '/' + denom: (
        df_test.loc['2002':'2006', numer].prod()
        / df_test.loc['2002':'2006', denom].prod()
    )
    for i, numer in enumerate(df_col_list)
    for j, denom in enumerate(df_col_list)
    if i != j
}

In [48]:
# Ratios sorted ascending. A Series built straight from the dict replaces the
# previous DataFrame -> transpose -> sort -> dict -> [1] round trip; name=1
# keeps the same Series name that round trip produced.
pd.Series(wrt_dict, name=1).sort_values()

retx_p1/ewretd_p1      0.289065
ret_p1/ewretd_p1       0.294132
retx_p1/ewretx_p1      0.315331
ret_p1/ewretx_p1       0.320859
retx_p1/vwretd_p1      0.456901
ret_p1/vwretd_p1       0.464911
retx_p1/dret_p1        0.479104
ret_p1/dret_p1         0.487503
retx_p1/vwretx_p1      0.499993
ret_p1/vwretx_p1       0.508758
vwretx_p1/ewretd_p1    0.578138
dret_p1/ewretd_p1      0.603344
vwretx_p1/ewretx_p1    0.630671
vwretd_p1/ewretd_p1    0.632664
dret_p1/ewretx_p1      0.658167
vwretd_p1/ewretx_p1    0.690151
vwretx_p1/vwretd_p1    0.913815
ewretx_p1/ewretd_p1    0.916704
dret_p1/vwretd_p1      0.953656
vwretx_p1/dret_p1      0.958222
retx_p1/ret_p1         0.982771
ret_p1/retx_p1         1.017531
dret_p1/vwretx_p1      1.043599
vwretd_p1/dret_p1      1.048596
ewretd_p1/ewretx_p1    1.090865
vwretd_p1/vwretx_p1    1.094314
ewretx_p1/vwretd_p1    1.448957
ewretx_p1/dret_p1      1.519371
ewretd_p1/vwretd_p1    1.580617
ewretx_p1/vwretx_p1    1.585614
ewretd_p1/dret_p1      1.657429
ewretd_p

In [65]:
df_list[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,dret_p1,mv,mv_adj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-06-30,77418,0.885246,0.885246,0.902617,73993.5,115343.490363


In [66]:
df_list[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,dret_p1,mv,mv_adj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-06-30,77418,2.741783,2.740738,1.256202,218004.0,330424.664803


In [64]:
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,dret_p1,mv,mv_adj
date,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-06-30,77418,0.885246,0.885246,0.902617,73993.5,115343.5
1993-06-30,77418,2.741783,2.740738,1.256202,218004.0,330424.7
1994-06-30,77418,1.540542,1.540542,1.046047,412965.0,609474.0
1995-06-30,77418,3.087717,3.087717,1.174147,1652420.0,2371602.0
1996-06-30,77418,1.988631,1.988631,1.220468,3935138.0,5489542.0
1997-06-30,77418,1.27143,1.27143,1.321836,5443073.0,7432170.0
1998-06-30,77418,3.823007,3.822978,1.302787,22728240.0,30501510.0
1999-06-30,77418,4.185493,4.185493,1.216303,119515800.0,157023800.0
2000-06-30,77418,0.957955,0.957955,1.091818,121650700.0,154275900.0
2001-12-31,77418,0.922413,0.922413,0.863769,136599900.0,163100400.0


In [20]:
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_p1,retx_p1,dret_p1,mv,mv_adj
date,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-06-30,77418,0.885246,0.885246,0.902617,73993.5,114852.6
1993-06-30,77418,2.741783,2.740738,1.256202,218004.0,328536.4
1994-06-30,77418,1.540542,1.540542,1.046047,412965.0,607198.2
1995-06-30,77418,3.087717,3.087717,1.174147,1652420.0,2357875.0
1996-06-30,77418,1.988631,1.988631,1.220468,3935138.0,5461052.0
1997-06-30,77418,1.27143,1.27143,1.321836,5443073.0,7388683.0
1998-06-30,77418,3.823007,3.822978,1.302787,22728240.0,30359650.0
1999-06-30,77418,4.185493,4.185493,1.216303,119515800.0,156567800.0
2000-06-30,77418,0.957955,0.957955,1.091818,121650700.0,153626700.0
2001-12-31,77418,0.922413,0.922413,0.863769,136599900.0,162778700.0


In [14]:
# Build the fiscal-year aggregate for every (permno, fyear) group in `comp`
# and write it to data/crsp_fa_filtered.csv.
tic = time.perf_counter()
g = comp.groupby(['permno', 'fyear'])

# Group the return rows by permno once, up front. Filtering the whole `crsp`
# frame with a boolean mask inside the loop rescans every return row for each
# (permno, fyear) group.
crsp_by_permno = crsp.groupby('permno')
crsp_row_positions = crsp_by_permno.indices  # permno -> positions of its rows

df_list = []

for name, group in tqdm_notebook(g):
    permno = name[0]
    if permno not in crsp_row_positions:
        # This permno has no return rows at all: nothing to aggregate.
        continue

    # Fiscal-year window and frequency come from the first row of the group.
    start = group.fysm.iloc[0].strftime('%Y-%m-%d')
    end = group.datadate.iloc[0].strftime('%Y-%m-%d')
    fystring = group.fystr.iloc[0]

    sample = crsp_by_permno.get_group(permno).loc[start:end]
    if len(sample) > 0:
        r = sample.resample(fystring)
        df_list.append(r.apply(fiscal_resample, fys=fystring, per=permno))

if not df_list:
    # pd.concat([]) would raise a ValueError with a generic message.
    raise ValueError('no (permno, fyear) group overlaps the return data; '
                     'nothing to write')

crsp1 = pd.concat(df_list, axis=0)
crsp1.index.names = ['date', 'permno']
crsp1.to_csv('data/crsp_fa_filtered.csv')
toc = time.perf_counter()
print(toc - tic)

HBox(children=(IntProgress(value=0, max=248144), HTML(value='')))


4502.843347700001
