In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import datetime as dt
import pandas_datareader.data as web
import timeit

# Notebook-wide pandas display settings: limit how many columns/rows are shown
# and render floats with thousands separators and four decimal places.
pd.options.display.max_columns = 15
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.4f}'.format)

In [2]:
# Directory holding the helper scripts, given relative to the notebook's
# working directory.
helper_dir = '../helper'

# %run executes each script and loads the names it defines into this notebook's
# namespace. Later cells call calc_beta, rolling_calc_beta, roll, groll,
# rolling_apply_pd, rolling_apply_np and equal, none of which are defined in the
# notebook itself -- presumably they come from these scripts (TODO confirm which
# file provides which).
%run {helper_dir}/utils.py
%run {helper_dir}/rolling_funcs.py
%run {helper_dir}/rolling_apply_funcs.py
%run {helper_dir}/rolling_calc_beta.py   # super fast beta calculation

## Generate a large synthetic dataset

In [3]:
# Problem size: number of simulated securities (columns) and number of monthly
# observations (rows) generated before taking period-over-period changes.
num_sec_dfs, num_periods = 10000, 480

# Fixed seed and a private generator so the simulated data -- and every beta
# computed from it below -- is identical on each Restart & Run All, without
# touching NumPy's global random state.
SEED = 42
rng = np.random.RandomState(SEED)

# Month-end dates. The offset object is passed instead of the 'M' string alias,
# which newer pandas releases deprecate in favour of 'ME'.
dates = pd.date_range('1995-12-31', periods=num_periods,
                      freq=pd.offsets.MonthEnd(), name='Date')

# Uniform random levels turned into period-over-period changes; dropna() removes
# the first row, which pct_change() leaves as NaN.
stocks = pd.DataFrame(data=rng.rand(num_periods, num_sec_dfs), index=dates,
                      columns=['s{:04d}'.format(i) for i in range(num_sec_dfs)]).pct_change().dropna()
market = pd.DataFrame(data=rng.rand(num_periods), index=dates,
                      columns=['Market']).pct_change().dropna()

# One wide frame: every stock column followed by the 'Market' column.
rets = stocks.join(market)

In [4]:
# Preview the first rows of the simulated per-stock changes.
stocks.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9993,s9994,s9995,s9996,s9997,s9998,s9999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.2552,-0.436,0.474,-0.1088,-0.6849,8.0987,-0.1609,...,3.3021,16.4735,3.284,-0.4234,-0.0316,-0.5841,-0.1496
1996-02-29,1.8324,0.3984,-0.3393,-0.7651,-0.7041,-0.0158,-0.0424,...,-0.7185,-0.8019,-0.6368,1.456,-0.3674,0.1784,-0.0986
1996-03-31,0.1321,-0.5633,0.4187,0.0924,2.3899,0.276,-0.6598,...,5.1903,6.1442,3.3365,-0.8595,-0.4619,0.2862,0.4565
1996-04-30,-0.5596,2.7147,0.1948,1.4398,2.9091,-0.1054,3.6864,...,-0.5549,-0.7296,-0.4175,2.0233,0.942,1.3871,-0.0217
1996-05-31,0.6188,-0.5385,0.2699,0.6606,-0.3397,-0.6707,-0.3234,...,0.9743,1.6078,-0.3892,1.9815,-0.1723,-0.4115,0.0164


In [5]:
# Preview the first rows of the simulated market changes.
market.head()

Unnamed: 0_level_0,Market
Date,Unnamed: 1_level_1
1996-01-31,0.1814
1996-02-29,1.8237
1996-03-31,0.7323
1996-04-30,-0.1911
1996-05-31,-0.2124


In [6]:
# Preview the joined frame: the stock columns with 'Market' appended last.
rets.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9994,s9995,s9996,s9997,s9998,s9999,Market
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.2552,-0.436,0.474,-0.1088,-0.6849,8.0987,-0.1609,...,16.4735,3.284,-0.4234,-0.0316,-0.5841,-0.1496,0.1814
1996-02-29,1.8324,0.3984,-0.3393,-0.7651,-0.7041,-0.0158,-0.0424,...,-0.8019,-0.6368,1.456,-0.3674,0.1784,-0.0986,1.8237
1996-03-31,0.1321,-0.5633,0.4187,0.0924,2.3899,0.276,-0.6598,...,6.1442,3.3365,-0.8595,-0.4619,0.2862,0.4565,0.7323
1996-04-30,-0.5596,2.7147,0.1948,1.4398,2.9091,-0.1054,3.6864,...,-0.7296,-0.4175,2.0233,0.942,1.3871,-0.0217,-0.1911
1996-05-31,0.6188,-0.5385,0.2699,0.6606,-0.3397,-0.6707,-0.3234,...,1.6078,-0.3892,1.9815,-0.1723,-0.4115,0.0164,-0.2124


## Our function and statsmodels should give the same result. Let's check.

In [7]:
import statsmodels.api as sm

# Design matrix for the reference regression: the market column plus the
# constant added by statsmodels.
X = sm.add_constant(market)


def _beta_gap(y):
    """Absolute gap between the statsmodels slope on 'Market' and calc_beta()."""
    slope = sm.OLS(y, X).fit().params['Market']
    return abs(slope - calc_beta(y, market['Market']))


# Total gap over every stock column; the two estimates should agree, so the
# sum is expected to stay at (numerically) zero.
tot_diff = sum(_beta_gap(stocks[name]) for name in stocks.columns)
assert tot_diff < 1e-8

## Calculate rolling beta of each stock against the market

In [8]:
# Rolling window length handed to every rolling-beta routine below. Despite the
# name, the data rows here are month-end observations, and the first non-NaN
# beta in the outputs appears on the 30th row.
ndays = 30

In [9]:
# Rolling beta against the market from a single call on the whole `stocks`
# frame; later cells treat the result as a frame and take its first column
# (s0000) for comparison.
betas_00 = rolling_calc_beta(stocks, market, ndays)

In [10]:
# The same rolling beta, for the first stock column only, computed four other
# ways so the results can be compared with betas_00.
first_stock = stocks.iloc[:, 0]
market_ret = market['Market']

# Window-by-window apply over the joined frame.
betas_01 = roll(rets, ndays).apply(
    lambda win: calc_beta(win.iloc[:, 0], win['Market'])
)

# Explicit iteration over windows; each beta is labelled with the last date of
# its window.
betas_02 = pd.concat([
    pd.Series(calc_beta(win.iloc[:, 0], win['Market']), index=[win.index[-1]])
    for win in groll(rets, ndays)
])

# Two-series rolling-apply helpers.
betas_03 = rolling_apply_pd(first_stock, market_ret, ndays, calc_beta)
betas_04 = rolling_apply_np(first_stock, market_ret, ndays, calc_beta)

In [12]:
# Leading values from each method, printed one after another for a visual
# comparison (betas_00 is reduced to its first column to line up with the rest).
previews = [
    betas_00.dropna().head().iloc[:, 0],
    betas_01.head(),
    betas_02.head(),
    betas_03.dropna().head(),
    betas_04.dropna().head(),
]
for preview in previews:
    print(preview, '\n\n')

Date
1998-06-30   -0.0841
1998-07-31   -0.0787
1998-08-31   -0.0829
1998-09-30   -0.0967
1998-10-31   -0.1121
Freq: M, Name: s0000, dtype: float64 


Date
1998-06-30   -0.0841
1998-07-31   -0.0787
1998-08-31   -0.0829
1998-09-30   -0.0967
1998-10-31   -0.1121
dtype: float64 


1998-06-30   -0.0841
1998-07-31   -0.0787
1998-08-31   -0.0829
1998-09-30   -0.0967
1998-10-31   -0.1121
dtype: float64 


Date
1998-06-30   -0.0841
1998-07-31   -0.0787
1998-08-31   -0.0829
1998-09-30   -0.0967
1998-10-31   -0.1121
Freq: M, dtype: float64 


Date
1998-06-30   -0.0841
1998-07-31   -0.0787
1998-08-31   -0.0829
1998-09-30   -0.0967
1998-10-31   -0.1121
Freq: M, dtype: float64 




In [13]:
# Trailing values from each method, printed one after another for a visual
# comparison (betas_00 is reduced to its first column to line up with the rest).
previews = [
    betas_00.dropna().tail().iloc[:, 0],
    betas_01.tail(),
    betas_02.tail(),
    betas_03.dropna().tail(),
    betas_04.dropna().tail(),
]
for preview in previews:
    print(preview, '\n\n')

Date
2035-07-31   -0.0426
2035-08-31   -0.1754
2035-09-30   -0.1735
2035-10-31   -0.1621
2035-11-30   -0.1687
Freq: M, Name: s0000, dtype: float64 


Date
2035-07-31   -0.0426
2035-08-31   -0.1754
2035-09-30   -0.1735
2035-10-31   -0.1621
2035-11-30   -0.1687
dtype: float64 


2035-07-31   -0.0426
2035-08-31   -0.1754
2035-09-30   -0.1735
2035-10-31   -0.1621
2035-11-30   -0.1687
dtype: float64 


Date
2035-07-31   -0.0426
2035-08-31   -0.1754
2035-09-30   -0.1735
2035-10-31   -0.1621
2035-11-30   -0.1687
Freq: M, dtype: float64 


Date
2035-07-31   -0.0426
2035-08-31   -0.1754
2035-09-30   -0.1735
2035-10-31   -0.1621
2035-11-30   -0.1687
Freq: M, dtype: float64 




In [15]:
# Compare each method's result with the next one using the equal() helper and
# print one verdict per pair.
pairs = [
    (betas_00.iloc[:, 0], betas_01),
    (betas_01, betas_02),
    (betas_02, betas_03),
    (betas_03, betas_04),
]
for left, right in pairs:
    print(equal(left, right))

True
True
True
True


### Remark: all methods give the same results. Let's compare their speed.

In [16]:
%%timeit
# Benchmark: a single rolling_calc_beta call on the whole `stocks` frame.
betas_00 = rolling_calc_beta(stocks, market, ndays)

1.3 s ± 4.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
betas_01 = pd.concat([roll(rets, ndays).apply(lambda x: calc_beta(x.iloc[:, j], x['Market'])) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 

In [None]:
%%timeit
betas_04 = pd.concat([rolling_apply_np(stocks.iloc[:, j], market['Market'], ndays, calc_beta) for j in range(stocks.shape[1])], axis=1)

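### On this large dataset with 10,000 stocks (columns), our `rolling_calc_beta()` took only about 1.3 s, while the per-column alternatives were too slow to finish: the `roll(...).apply` benchmark was interrupted manually and the `rolling_apply_np` benchmark has no recorded timing.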