# 03 - Data Summary Stats



## Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import datetime as dt

import sys

# Put the parent directory on the module search path so that the
# project-local `src` package can be imported from this notebook.
sys.path.append('../')
import src

# sys.path.append('../../kungfu/')
# import kungfu as kf

# `pandas.util.testing` is a deprecated alias (removed in pandas 2.0);
# `pandas.testing` is the public home of the same helper.
from pandas.testing import assert_frame_equal


#### Assets & EW & VW

In [23]:
%%time
# Per-asset summary statistics for every monthly sampling date.
#
# For each (year, month) the cell
#   1. loads the 'back' CRSP window, subtracts the values returned by
#      `src.loader.load_rf` from `retadj`, and summarises it with
#      `src.utils.summarise_returns`;
#   2. loads the 'forward' window (skipped for the very last sampling month)
#      and summarises cumulative horizons of 1..k forward months;
#   3. joins both sets of columns on the (sampling_date, permno) index.
# The accumulated table is written to CSV after every year as a checkpoint.
FIRST_YEAR, LAST_YEAR = 1994, 2020

# One frame per sampling date. Frames are concatenated once per year instead of
# growing a DataFrame row-block by row-block (`DataFrame.append` is quadratic
# and no longer exists in pandas >= 2.0).
monthly_stats = []
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    for month in range(1, 12 + 1):
        ## backward part
        df_back = src.loader.load_monthly_crsp(year, month, which='back')
        rf_back = src.loader.load_rf(year=year, month=month, which='back')
        # Subtract the load_rf values from the date x permno return matrix.
        # NOTE(review): presumably this is the risk-free rate, making `retadj`
        # an excess return; relies on NumPy broadcasting of `rf_back.values`
        # against the unstacked frame -- confirm the shapes line up.
        df_back['retadj'] = (df_back['retadj'].unstack() - rf_back.values).stack()
        back_dates = df_back.unstack().index
        sampling_date = back_dates[-1]
        n_months = len(back_dates.to_period('M').unique())

        # estimate
        back_stats = src.utils.summarise_returns(df_back)
        back_stats.index = pd.MultiIndex.from_product(
            [[sampling_date], back_stats.index],
            names=['sampling_date', 'permno'],
        )
        back_stats.columns = [column+'_back_{}M'.format(n_months) for column in back_stats.columns]

        ## forward part
        # Starts as an empty frame on the backward index so that the final
        # sampling month (which has no forward window) still joins cleanly.
        forward_stats = pd.DataFrame(index=back_stats.index)
        if not (year == LAST_YEAR and month == 12):
            df_forward = src.loader.load_monthly_crsp(year, month, which='forward')
            rf_forward = src.loader.load_rf(year=year, month=month, which='forward')
            df_forward['retadj'] = (df_forward['retadj'].unstack() - rf_forward.values).stack()

            # Unstack once; the wide frame and its monthly periods are reused
            # for every horizon below.
            forward_wide = df_forward.unstack()
            forward_periods = forward_wide.index.to_period('M')
            months = forward_periods.unique().tolist()

            # estimate: horizon i covers the first i forward months (cumulative)
            for i in range(1, len(months) + 1):
                data = forward_wide[forward_periods.isin(months[:i])].stack()
                month_stats = src.utils.summarise_returns(data)
                month_stats.index = pd.MultiIndex.from_product(
                    [[sampling_date], month_stats.index],
                    names=['sampling_date', 'permno'],
                )
                month_stats.columns = [column+'_forward_{}M'.format(i) for column in month_stats.columns]
                forward_stats = forward_stats.join(month_stats)

        # combine
        stats = back_stats.join(forward_stats)
        monthly_stats.append(stats)
    print('Done estimating year {}'.format(year))

    # Checkpoint: persist everything estimated so far after each year.
    df_stats = pd.concat(monthly_stats)
    df_stats.to_csv('../data/estimated/summary_stats.csv')

Done estimating year 1994
Done estimating year 1995
Done estimating year 1996
Done estimating year 1997
Done estimating year 1998
Done estimating year 1999
Done estimating year 2000
Done estimating year 2001
Done estimating year 2002
Done estimating year 2003
Done estimating year 2004
Done estimating year 2005
Done estimating year 2006
Done estimating year 2007
Done estimating year 2008
Done estimating year 2009
Done estimating year 2010
Done estimating year 2011
Done estimating year 2012
Done estimating year 2013
Done estimating year 2014
Done estimating year 2015
Done estimating year 2016
Done estimating year 2017
Done estimating year 2018
Done estimating year 2019
Done estimating year 2020
CPU times: user 13min 27s, sys: 6.5 s, total: 13min 34s
Wall time: 13min 39s


#### SPY

In [6]:
%%time
# Summary statistics of SPY returns for every monthly sampling date.
#
# The sampling calendar is taken from the CRSP 'back'/'forward' windows so the
# rows line up with the per-asset statistics. For each sampling date one row
# holds the compounded return and the variance scaled by 252 over the backward
# window, plus the same two numbers for cumulative forward horizons 1..k months.
# The accumulated table is written to CSV after every year as a checkpoint.
FIRST_YEAR, LAST_YEAR = 1994, 2020
ANNUALISATION = 252  # presumably trading days per year -- scales daily variance

spy = src.loader.load_spy()
spy_periods = spy.index.to_period('M')  # loop-invariant, computed once

# One dict per sampling date; the DataFrame is built from the full list rather
# than appended to row by row (`DataFrame.append` is gone in pandas >= 2.0).
records = []
sampling_dates = []
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    for month in range(1, 12 + 1):
        ## backward part
        back_dates = src.loader.load_monthly_crsp(year, month, which='back').index.get_level_values('date').unique()
        sampling_date = back_dates[-1]
        n_months = len(back_dates.to_period('M').unique())

        # estimate
        back_spy = spy.loc[spy.index.isin(back_dates), 'ret']
        record = {
            'ret_back_{}M'.format(n_months): (1 + back_spy).prod() - 1,
            'var_back_{}M'.format(n_months): back_spy.var() * ANNUALISATION,
        }

        ## forward part
        # NOTE(review): unlike the per-asset cell, the forward window is also
        # loaded for the final sampling month -- confirm the loader supports it.
        forward_dates = src.loader.load_monthly_crsp(year, month, which='forward').index.get_level_values('date').unique()
        months = forward_dates.to_period('M').unique().tolist()

        # estimate: horizon i covers the first i forward months (cumulative)
        for i in range(1, len(months) + 1):
            months_spy = spy.loc[spy_periods.isin(months[:i]), 'ret']
            record['ret_forward_{}M'.format(i)] = (1 + months_spy).prod() - 1
            record['var_forward_{}M'.format(i)] = months_spy.var() * ANNUALISATION

        # combine
        records.append(record)
        sampling_dates.append(sampling_date)
    print('Done estimating year {}'.format(year))

    # Checkpoint: persist everything estimated so far after each year.
    spy_stats = pd.DataFrame(records, index=sampling_dates)
    spy_stats.to_csv('../data/estimated/spy_stats.csv')

Done estimating year 1994
Done estimating year 1995
Done estimating year 1996
Done estimating year 1997
Done estimating year 1998
Done estimating year 1999
Done estimating year 2000
Done estimating year 2001
Done estimating year 2002
Done estimating year 2003
Done estimating year 2004
Done estimating year 2005
Done estimating year 2006
Done estimating year 2007
Done estimating year 2008
Done estimating year 2009
Done estimating year 2010
Done estimating year 2011
Done estimating year 2012
Done estimating year 2013
Done estimating year 2014
Done estimating year 2015
Done estimating year 2016
Done estimating year 2017
Done estimating year 2018
Done estimating year 2019
Done estimating year 2020
CPU times: user 46.9 s, sys: 14 s, total: 1min
Wall time: 1min 1s
