In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# import tushare as ts
# import QUANTAXIS as QA
import talib as ta
import datetime, time
from dateutil.relativedelta import *
from pandas.tseries.offsets import *

In [3]:
from tqdm import tqdm_notebook

In [4]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.style.use('ggplot')

1. 数据年化处理

1.1. crsp 没有fiscal end month 数据, 所以我们需要把comp中的相关数据link到ccm中, 然后再和crsp结合起来

1.2. 把crsp按照每个permno进行年化处理

In [5]:
# 先看看fundamental final数据有何不同
comp = pd.read_csv('data/fundamental_final_1.1.zip',
                  parse_dates=['datadate'], infer_datetime_format=True)
# comp = comp.drop(['indfmt', 'consol', 'popsrc', 'datafmt'], axis=1)
comp = comp.drop(['cusip'], axis=1)
comp = comp[comp.fyr > 0]

要么全本都是按照一个fyr去做计算，然后提取相应年份的值返回
要么先按照月份去取数据，然后按照fyr去做年化处理，返回值

那我选择第二个方案

首先对comp添加fiscal year end month字段，方便后面去做处理
然后循环

In [6]:
comp = comp.assign(fystr = comp.datadate.apply(lambda x :  'A-' + x.strftime('%b')),
                  fysm = comp.datadate - MonthBegin(12))

In [7]:
# comp.head()

In [34]:
# test = comp[(comp.gvkey==1003) | (comp.gvkey==1001) ]

In [35]:
# g = test.groupby('gvkey')

引入ccm, 链接上permno

In [None]:
ccm = pd.read_csv('data/ccm_hist.csv')
ccm = ccm.drop('Unnamed: 0', axis=1)
ccm = ccm[((ccm.linktype == 'LU') | (ccm.linktype == 'LC')) & \
   ((ccm.linkprim == 'P') | (ccm.linkprim == 'C'))]
ccm['linkdt']=pd.to_datetime(ccm['linkdt'])
ccm['linkenddt']=pd.to_datetime(ccm['linkenddt'])
# if linkenddt is missing then set to today date
ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today'))

ccm.columns = ['gvkey', 'linkprim', 'liid', 'linktype', 'permno', 'permco', 'linkdt',
       'linkenddt']

In [None]:
ccm1 = ccm[['gvkey', 'permno']]
ccm1.permno = ccm1.permno.astype(int)

In [None]:
comp1 = pd.merge(left=comp, right=ccm1, on=['gvkey'])

In [None]:
comp1 = comp1[['gvkey', 'datadate', 'fyear', 'fystr', 'fysm', 'permno']]

取crsp数据, 我们开始做实验

In [11]:
crsp = pd.read_csv('data/return.zip', 
                   parse_dates=['date'], infer_datetime_format=True)
crsp[['PERMNO', 'PERMCO']] = crsp[['PERMNO', 'PERMCO']].astype(int)

crsp.RET = crsp.RET.replace(['B', 'C'], [np.nan, np.nan]).astype(np.float64)
crsp.RETX = crsp.RETX.replace(['B', 'C'], [np.nan, np.nan]).astype(np.float64)

crsp.RET = crsp.RET.fillna(0)
crsp.RETX = crsp.RETX.fillna(0)

crsp = crsp.drop(['PERMCO', 'CUSIP', 'TICKER', 'COMNAM'], axis=1)
crsp.columns = [i.lower() for i in crsp.columns]

crsp = crsp.assign(mv = crsp.prc.abs() * crsp.shrout,
                         ret_p1 = crsp.ret+1,
                         retx_p1 = crsp.retx+1)
crsp.date = pd.to_datetime(crsp.date) + MonthEnd(0)

crsp = crsp.reset_index()


cpi = pd.read_excel('data/CPIAUCSL.xlsx')
crsp = pd.merge(crsp, cpi[['date', 'cpi_adjust']], on=['date'])
crsp = crsp.assign(ret_p1_adj = crsp.ret * crsp.cpi_adjust + 1)
crsp = crsp.set_index('date')
crsp = crsp[['permno', 'mv', 'ret_p1', 'retx_p1', 'ret_p1_adj']]

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# crsp.head()

In [12]:
def fiscal_resample(df, fys, per):
#     res = df.apply({'ret_p1':'cumprod', 'retx_p1':'cumprod'})
    ret_p1 = df.ret_p1.cumprod().iloc[-1]
    retx_p1 = df.retx_p1.cumprod().iloc[-1]
    ret_p1_adj = df.ret_p1_adj.cumprod().iloc[-1]
#     res = pd.concat([res, df.mv], axis=1).resample(fys).last()
    mv = df.mv.resample(fys).last().iloc[-1]
#     res.index = res.index.droplevel(0)
    res = pd.DataFrame({'ret_p1':ret_p1, 'retx_p1':retx_p1,
                        'ret_p1_adj':ret_p1_adj, 'mv':mv}, index=[per])
    return res
    
# def fiscal_resample(df, fys, per):
#     res = df.apply({'ret_p1':'cumprod', 'retx_p1':'cumprod'})
#     res = pd.concat([res, df.mv], axis=1).resample(fys).last()
# #     res = pd.DataFrame({'ret_p1':res.ret_p1, 'retx_p1':res.retx_p1, 'mv':res.mv}, 
# #                        index=[per])
#     return res

In [None]:
tic = time.perf_counter()
code_list = comp1.gvkey.unique().tolist()[:30]
test_comp = comp1.set_index('gvkey').loc[code_list].reset_index()
g = test_comp.groupby(['permno', 'fyear'])
# g = comp1.groupby(['permno', 'fyear'])

# g = comp1[comp1.gvkey==25056].groupby(['permno', 'fyear'])

df_list = []
    
for name, group in tqdm_notebook(g):
#     crsp[crsp.permno == name][]
#     print(name)
#     g.apply(fiscal_resample)
#     print(group.fysm.iloc[0], group.datadate.iloc[0])
    start = group.fysm.iloc[0].strftime('%Y-%m-%d')
    end = group.datadate.iloc[0].strftime('%Y-%m-%d')
    fystring = group.fystr.iloc[0]
    sample = crsp[crsp.permno==name[0]][start:end]
    if len(sample) > 0:
#         print(sample.resample(fystring).mean())
        r = sample.resample(fystring)
        df_list.append(r.apply(fiscal_resample, fys=fystring, per=name[0]))

df_test = pd.concat(df_list, axis=0)
df_test.index.names = ['date', 'permno']
# crsp1 = pd.concat(df_list, axis=0)
# crsp1.index.names = ['date', 'permno']
# df_test
toc= time.perf_counter()
print(toc - tic)

HBox(children=(IntProgress(value=0, max=300526), HTML(value='')))

In [None]:
tic = time.perf_counter()
# code_list = comp1.gvkey.unique().tolist()[:30]
# test_comp = comp1.set_index('gvkey').loc[code_list].reset_index()
# g = test_comp.groupby(['permno', 'fyear'])
g = comp1.groupby(['permno', 'fyear'])

# g = comp1[comp1.gvkey==25056].groupby(['permno', 'fyear'])

df_list = []
    
for name, group in tqdm_notebook(g):
#     crsp[crsp.permno == name][]
#     print(name)
#     g.apply(fiscal_resample)
#     print(group.fysm.iloc[0], group.datadate.iloc[0])
    start = group.fysm.iloc[0].strftime('%Y-%m-%d')
    end = group.datadate.iloc[0].strftime('%Y-%m-%d')
    fystring = group.fystr.iloc[0]
    sample = crsp[crsp.permno==name[0]][start:end]
    if len(sample) > 0:
#         print(sample.resample(fystring).mean())
        r = sample.resample(fystring)
        df_list.append(r.apply(fiscal_resample, fys=fystring, per=name[0]))

# df_test = pd.concat(df_list, axis=0)
# df_test.index.names = ['date', 'permno']
crsp1 = pd.concat(df_list, axis=0)
crsp1.index.names = ['date', 'permno']
# df_test
toc= time.perf_counter()
print(toc - tic)

HBox(children=(IntProgress(value=0, max=300526), HTML(value='')))

In [40]:
# 先实验少量的sample, 看需要多少时间，然后再估算整体要的时间
3e+5 * 0.001 / 60

5.0

In [None]:
comp1.index.drop()

'gvkey', 
'datadate', 
'fyear', 
'indfmt', 
'consol', 
'popsrc', 
'datafmt',
'tic', 
'cusip', 
'conm', 
'curcd', 
'fyr', Fiscal Yearend Month of Data
'bkvlps', 
'dlto', 
'dltt',
'ebit', 
'ib', 
'pstk', 
'revt', 
'xrd', 
'costat', 
'cshtr_f', 
'fyrc' Fiscal Year-end Month - Current

In [7]:
comp.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'bkvlps', 'dlto', 'dltt',
       'ebit', 'ib', 'pstk', 'revt', 'xrd', 'costat', 'cshtr_f', 'fyrc'],
      dtype='object')

In [12]:
comp.to_csv('data/fundamental_final_1.1.csv', index=False)

In [40]:
comp_mon = pd.read_csv('data/fundamental.zip', 
                   parse_dates=['datadate'], infer_datetime_format=True)
comp_mon = comp_mon.drop(['indfmt', 'consol', 'popsrc', 'datafmt'], axis=1)
comp_mon = comp_mon.reset_index().drop('index', axis=1)

In [46]:
def retrieve_year_end_month(df):
    res = (df[df.fqtr == 1].datadate + MonthEnd(9)).\
        apply(lambda x : 'A-' + x.strftime('%b'))
    return pd.DataFrame({'fsm':res})

In [41]:
data_ANTQ = comp_mon[(comp_mon.gvkey==1003)]
data_ANTQ_fsm = data_ANTQ.groupby(['gvkey', 'fyearq']).apply(retrieve_year_end_month)
pd.merge(data_ANTQ, data_ANTQ_fsm, on=['gvkey', 'fyearq']).shape

In [55]:
sample = comp_mon[(comp_mon.gvkey==1003) | (comp_mon.gvkey==1001)]
sample_fsm = sample.groupby(['gvkey', 'fyearq']).apply(retrieve_year_end_month)
pd.merge(sample, sample_fsm, on=['gvkey', 'fyearq']).shape

In [61]:
tic = time.perf_counter()
comp_mon_fsm = comp_mon.groupby(['gvkey', 'fyearq']).apply(retrieve_year_end_month)
toc= time.perf_counter()
print(toc - tic)

649.5835717


下一步就是根据year end month去做resample

http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [10]:
p = pd.Period('2011', freq='A-DEC')

In [11]:
p

Period('2011', 'A-DEC')

In [12]:
p.asfreq('M', how='start')

Period('2011-01', 'M')