In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import datetime as dt
import pandas_datareader.data as web
import timeit

# Notebook-wide pandas display settings, applied through one uniform set_option call.
_display_options = {
    'display.max_columns': 15,
    'display.max_rows': 50,
    'display.float_format': '{:,.4f}'.format,  # e.g. 1234.5 -> "1,234.5000"
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [2]:
# Directory holding the helper scripts, given relative to the notebook's working directory.
helper_dir = '../helper'

# Each %run executes one helper script and loads the names it defines into this notebook's
# namespace, so the cells below can call them directly.
# NOTE(review): presumably these scripts provide equal(), roll()/groll(),
# rolling_apply_pd()/rolling_apply_np(), calc_beta() and rolling_calc_beta(), which are used
# further down but defined nowhere in this notebook — confirm against the helper files.
%run {helper_dir}/utils.py
%run {helper_dir}/rolling_funcs.py
%run {helper_dir}/rolling_apply_funcs.py
%run {helper_dir}/rolling_calc_beta.py   # super fast beta calculation

## Generate a large synthetic dataset

In [3]:
# Problem size: number of simulated securities (columns) and number of month-end dates (rows).
num_sec_dfs, num_periods = 10000, 480

# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same data, and
# therefore the same betas, on every run. Without it each run produced different numbers.
SEED = 42
np.random.seed(SEED)

dates = pd.date_range('1995-12-31', periods=num_periods, freq='M', name='Date')

# Uniform(0, 1) draws are turned into period-over-period changes with pct_change(); dropna()
# then removes the first row, which has no previous value to compare against.
stocks = pd.DataFrame(data=np.random.rand(num_periods, num_sec_dfs), index=dates,
                      columns=['s{:04d}'.format(i) for i in range(num_sec_dfs)]).pct_change().dropna()
market = pd.DataFrame(data=np.random.rand(num_periods), index=dates,
                      columns=['Market']).pct_change().dropna()

# Single frame with every stock column followed by the 'Market' column, joined on the Date index.
rets = stocks.join(market)

In [4]:
# Preview the first rows of the simulated per-stock changes (one column per stock).
stocks.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9993,s9994,s9995,s9996,s9997,s9998,s9999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,2.0716,-0.7195,-0.8851,0.7121,-0.5315,0.6447,0.6147,...,-0.397,0.3772,4.2316,1.381,-0.7467,1.3127,2.3029
1996-02-29,0.1994,1.7505,7.9473,-0.2553,1.0365,0.7656,-0.7554,...,0.825,-0.0204,0.6434,-0.6399,1.2617,-0.0953,-0.012
1996-03-31,1.0193,-0.3647,-0.5895,-0.6796,-0.234,-0.0423,4.376,...,-0.9697,-0.1974,-0.015,-0.1762,-0.883,0.3798,-0.982
1996-04-30,-0.5489,0.8102,0.3462,0.2527,-0.6725,-0.7887,-0.0401,...,31.1022,1.3547,-0.2044,-0.4182,8.7875,-0.9766,22.5053
1996-05-31,-0.2855,-0.9701,0.8056,1.5557,0.3439,3.631,-0.3826,...,-0.8966,-0.6407,-0.2172,2.1814,-0.7637,27.9006,-0.9496


In [5]:
# Preview the first rows of the simulated market series (single 'Market' column).
market.head()

Unnamed: 0_level_0,Market
Date,Unnamed: 1_level_1
1996-01-31,-0.8216
1996-02-29,1.1235
1996-03-31,0.9962
1996-04-30,0.6555
1996-05-31,-0.9904


In [6]:
# Preview the joined frame: all stock columns with 'Market' appended as the last column.
rets.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9994,s9995,s9996,s9997,s9998,s9999,Market
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,2.0716,-0.7195,-0.8851,0.7121,-0.5315,0.6447,0.6147,...,0.3772,4.2316,1.381,-0.7467,1.3127,2.3029,-0.8216
1996-02-29,0.1994,1.7505,7.9473,-0.2553,1.0365,0.7656,-0.7554,...,-0.0204,0.6434,-0.6399,1.2617,-0.0953,-0.012,1.1235
1996-03-31,1.0193,-0.3647,-0.5895,-0.6796,-0.234,-0.0423,4.376,...,-0.1974,-0.015,-0.1762,-0.883,0.3798,-0.982,0.9962
1996-04-30,-0.5489,0.8102,0.3462,0.2527,-0.6725,-0.7887,-0.0401,...,1.3547,-0.2044,-0.4182,8.7875,-0.9766,22.5053,0.6555
1996-05-31,-0.2855,-0.9701,0.8056,1.5557,0.3439,3.631,-0.3826,...,-0.6407,-0.2172,2.1814,-0.7637,27.9006,-0.9496,-0.9904


## Our function and statsmodels should give the same result. Let's check.

In [7]:
import statsmodels.api as sm

# Regressors shared by every fit: a constant column plus the market series.
X = sm.add_constant(market)
market_series = market['Market']

# For each stock, compare the 'Market' coefficient of a full-sample OLS fit with the value
# returned by calc_beta() on the same data, and add up the absolute gaps over all stocks.
tot_diff = sum(
    abs(
        sm.OLS(stocks[ticker], X).fit().params['Market']
        - calc_beta(stocks[ticker], market_series)
    )
    for ticker in stocks.columns
)

# The two estimates should agree, so the accumulated gap must be numerically zero.
assert tot_diff < 1e-8

## Calculate rolling beta of each stock against the market

In [8]:
# Rolling window length passed to every rolling-beta call below.
# NOTE(review): despite the name, the data is month-end (freq='M') and the first non-missing
# result appears on the 30th row, so this looks like 30 observations rather than 30 days.
ndays = 30

In [9]:
# Rolling beta of every stock against the market from a single helper call.
# The result is indexed positionally by column below (column 0 is the first stock, s0000).
betas_00 = rolling_calc_beta(stocks, market, ndays)

In [10]:
# The same rolling beta, for the first stock only, computed with four alternative helpers so the
# results can be compared against betas_00.
def _first_stock_beta(window):
    """Beta of the window's first column against its 'Market' column, via calc_beta()."""
    return calc_beta(window.iloc[:, 0], window['Market'])


# Method 1: apply over the windows produced by roll().
betas_01 = roll(rets, ndays).apply(_first_stock_beta)

# Method 2: iterate over the sub-frames produced by groll(), labelling each value with the
# last date of its window.
betas_02 = pd.concat([
    pd.Series(_first_stock_beta(window), index=[window.index[-1]])
    for window in groll(rets, ndays)
])

# Methods 3 and 4: the two rolling_apply_* helpers fed with the stock and market series.
betas_03 = rolling_apply_pd(stocks.iloc[:, 0], market['Market'], ndays, calc_beta)
betas_04 = rolling_apply_np(stocks.iloc[:, 0], market['Market'], ndays, calc_beta)

In [11]:
# Leading rows of each method's result for the first stock, printed one after another so they
# can be compared by eye. Results that carry leading missing values are trimmed with dropna().
leading_rows = [
    betas_00.dropna().head().iloc[:, 0],
    betas_01.head(),
    betas_02.head(),
    betas_03.dropna().head(),
    betas_04.dropna().head(),
]
for rows in leading_rows:
    print(rows, '\n\n')

Date
1998-06-30   -0.0004
1998-07-31    0.0007
1998-08-31    0.0005
1998-09-30    0.0013
1998-10-31    0.0008
Freq: M, Name: s0000, dtype: float64 


Date
1998-06-30   -0.0004
1998-07-31    0.0007
1998-08-31    0.0005
1998-09-30    0.0013
1998-10-31    0.0008
dtype: float64 


1998-06-30   -0.0004
1998-07-31    0.0007
1998-08-31    0.0005
1998-09-30    0.0013
1998-10-31    0.0008
dtype: float64 


Date
1998-06-30   -0.0004
1998-07-31    0.0007
1998-08-31    0.0005
1998-09-30    0.0013
1998-10-31    0.0008
Freq: M, dtype: float64 


Date
1998-06-30   -0.0004
1998-07-31    0.0007
1998-08-31    0.0005
1998-09-30    0.0013
1998-10-31    0.0008
Freq: M, dtype: float64 




In [12]:
# Trailing rows of each method's result for the first stock, printed one after another so they
# can be compared by eye.
trailing_rows = [
    betas_00.dropna().tail().iloc[:, 0],
    betas_01.tail(),
    betas_02.tail(),
    betas_03.dropna().tail(),
    betas_04.dropna().tail(),
]
for rows in trailing_rows:
    print(rows, '\n\n')

Date
2035-07-31   -0.0030
2035-08-31   -0.0029
2035-09-30   -0.0029
2035-10-31   -0.0028
2035-11-30   -0.0028
Freq: M, Name: s0000, dtype: float64 


Date
2035-07-31   -0.0030
2035-08-31   -0.0029
2035-09-30   -0.0029
2035-10-31   -0.0028
2035-11-30   -0.0028
dtype: float64 


2035-07-31   -0.0030
2035-08-31   -0.0029
2035-09-30   -0.0029
2035-10-31   -0.0028
2035-11-30   -0.0028
dtype: float64 


Date
2035-07-31   -0.0030
2035-08-31   -0.0029
2035-09-30   -0.0029
2035-10-31   -0.0028
2035-11-30   -0.0028
Freq: M, dtype: float64 


Date
2035-07-31   -0.0030
2035-08-31   -0.0029
2035-09-30   -0.0029
2035-10-31   -0.0028
2035-11-30   -0.0028
Freq: M, dtype: float64 




In [13]:
# Chain of pairwise comparisons: if every neighbouring pair matches, all five results agree.
result_pairs = [
    (betas_00.iloc[:, 0], betas_01),
    (betas_01, betas_02),
    (betas_02, betas_03),
    (betas_03, betas_04),
]
for left, right in result_pairs:
    print(equal(left, right))

True
True
True
True


### Remark: all methods give the same results. Let's compare their speed.

In [14]:
%%timeit
# Time rolling_calc_beta() on all stock columns in a single call.
# NOTE(review): a %%timeit cell runs its body repeatedly for timing; the betas_00 assigned here
# is presumably not written back to the notebook namespace — confirm before relying on it.
betas_00 = rolling_calc_beta(stocks, market, ndays)

1.3 s ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### On this large dataset with 10,000 stocks (columns), our `rolling_calc_beta()` took only 1.3 s, while the other methods ran for so long that I had to interrupt them before they finished.

In [15]:
%%timeit
betas_04 = pd.concat([rolling_apply_np(stocks.iloc[:, j], market['Market'], ndays, calc_beta) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 

In [15]:
%%timeit
betas_01 = pd.concat([roll(rets, ndays).apply(lambda x: calc_beta(x.iloc[:, j], x['Market'])) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 