In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import datetime as dt
import pandas_datareader.data as web
import timeit

# Notebook-wide pandas display settings: show at most 15 columns and 50 rows,
# and render floats with thousands separators and four decimal places.
pd.options.display.max_columns = 15
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.4f}'.format)

In [2]:
helper_dir = '../helper'

%run {helper_dir}/utils.py
%run {helper_dir}/rolling_funcs.py
%run {helper_dir}/rolling_apply_funcs.py
%run {helper_dir}/rolling_calc_beta.py   # super fast beta calculation

## Make some large data

In [3]:
# Seed the global NumPy generator so the synthetic data, and every beta
# computed from it, is reproducible across kernel restarts.
SEED = 42
np.random.seed(SEED)

# Number of stock columns and number of month-end dates to generate.
num_sec_dfs, num_periods = 10000, 480

# Month-end dates starting at 1995-12-31. An explicit MonthEnd offset is used
# instead of the 'M' alias, which newer pandas versions deprecate.
dates = pd.date_range('1995-12-31', periods=num_periods,
                      freq=pd.offsets.MonthEnd(), name='Date')

# Uniform random levels converted to period-over-period changes. The first
# row is NaN after pct_change() and is removed by dropna().
stocks = pd.DataFrame(data=np.random.rand(num_periods, num_sec_dfs), index=dates,
                      columns=['s{:04d}'.format(i) for i in range(num_sec_dfs)]).pct_change().dropna()
market = pd.DataFrame(data=np.random.rand(num_periods), index=dates,
                      columns=['Market']).pct_change().dropna()

# One wide frame: all stock columns followed by the 'Market' column.
rets = stocks.join(market)

In [4]:
stocks.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9993,s9994,s9995,s9996,s9997,s9998,s9999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.1793,0.6279,-0.3482,-0.1508,-0.4307,-0.6558,-0.6637,...,-0.4682,0.4026,-0.1473,-0.9064,1.4639,0.7412,0.0527
1996-02-29,0.4921,-0.9326,0.014,-0.0102,1.4229,12.3782,0.5114,...,2.0332,0.1416,0.5034,11.2329,-0.9751,-0.0525,-0.6542
1996-03-31,-0.6947,8.8882,-0.0632,-0.8839,0.1535,-0.5721,-0.198,...,-0.876,-0.801,-0.182,-0.9339,6.7918,0.4074,0.1519
1996-04-30,1.7662,-0.9612,0.0419,6.4507,0.2836,0.2843,0.8433,...,6.1072,1.9199,0.2962,13.1883,3.4692,-0.5948,1.6535
1996-05-31,-0.6943,24.831,-0.1244,0.1969,-0.1332,0.0822,-0.7083,...,0.46,-0.9443,-0.6734,-0.7406,-0.4233,-0.9666,-0.6909


In [5]:
market.head()

Unnamed: 0_level_0,Market
Date,Unnamed: 1_level_1
1996-01-31,0.1122
1996-02-29,-0.2348
1996-03-31,0.3248
1996-04-30,-0.8459
1996-05-31,-0.8646


In [6]:
rets.head()

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9994,s9995,s9996,s9997,s9998,s9999,Market
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.1793,0.6279,-0.3482,-0.1508,-0.4307,-0.6558,-0.6637,...,0.4026,-0.1473,-0.9064,1.4639,0.7412,0.0527,0.1122
1996-02-29,0.4921,-0.9326,0.014,-0.0102,1.4229,12.3782,0.5114,...,0.1416,0.5034,11.2329,-0.9751,-0.0525,-0.6542,-0.2348
1996-03-31,-0.6947,8.8882,-0.0632,-0.8839,0.1535,-0.5721,-0.198,...,-0.801,-0.182,-0.9339,6.7918,0.4074,0.1519,0.3248
1996-04-30,1.7662,-0.9612,0.0419,6.4507,0.2836,0.2843,0.8433,...,1.9199,0.2962,13.1883,3.4692,-0.5948,1.6535,-0.8459
1996-05-31,-0.6943,24.831,-0.1244,0.1969,-0.1332,0.0822,-0.7083,...,-0.9443,-0.6734,-0.7406,-0.4233,-0.9666,-0.6909,-0.8646


## Our function and statsmodels should give the same result. Let's check.

In [7]:
import statsmodels.api as sm

# Regressors for the OLS fit: the market column plus an intercept term.
X = sm.add_constant(market)

# Sum, over every stock column, the absolute gap between the OLS slope on
# 'Market' from statsmodels and the value returned by calc_beta().
# The total should be (numerically) zero.
tot_diff = sum(
    abs(sm.OLS(stocks[yvar], X).fit().params['Market']
        - calc_beta(stocks[yvar], market['Market']))
    for yvar in stocks.columns
)
assert tot_diff < 1e-8

## Calculate rolling beta of each stock against the market

In [8]:
# Rolling window length, in rows. NOTE: despite the name, the date index of
# the data is monthly, so this is 30 monthly observations rather than 30 days.
ndays = 30

In [9]:
# Rolling beta of the stock columns against the market in a single call,
# using a window of `ndays` rows. The result is indexed per stock column
# (later cells take its first column with .iloc[:, 0]).
betas_00 = rolling_calc_beta(stocks, market, ndays)

In [10]:
# Four alternative ways of computing the rolling beta, each applied to the
# FIRST stock column only, to compare against rolling_calc_beta().

# 1) roll(...).apply over windows of the combined returns frame.
betas_01 = roll(rets, ndays).apply(
    lambda window: calc_beta(window.iloc[:, 0], window['Market'])
)

# 2) Iterate over the windows from groll() and label each beta with the
#    last date of its window.
betas_02 = pd.concat([
    pd.Series(calc_beta(window.iloc[:, 0], window['Market']),
              index=[window.index[-1]])
    for window in groll(rets, ndays)
])

# 3) and 4) The rolling-apply helpers, fed the two series directly.
betas_03 = rolling_apply_pd(stocks.iloc[:, 0], market['Market'], ndays, calc_beta)
betas_04 = rolling_apply_np(stocks.iloc[:, 0], market['Market'], ndays, calc_beta)

In [11]:
# First five non-missing betas from each method, for a visual comparison.
# betas_00 covers all stocks, so only its first column is shown.
for beta_preview in (
    betas_00.dropna().head().iloc[:, 0],
    betas_01.head(),
    betas_02.head(),
    betas_03.dropna().head(),
    betas_04.dropna().head(),
):
    print(beta_preview, '\n\n')

Date
1998-06-30   -0.0713
1998-07-31   -0.0710
1998-08-31   -0.0722
1998-09-30   -0.0794
1998-10-31   -0.0783
Freq: M, Name: s0000, dtype: float64 


Date
1998-06-30   -0.0713
1998-07-31   -0.0710
1998-08-31   -0.0722
1998-09-30   -0.0794
1998-10-31   -0.0783
dtype: float64 


1998-06-30   -0.0713
1998-07-31   -0.0710
1998-08-31   -0.0722
1998-09-30   -0.0794
1998-10-31   -0.0783
dtype: float64 


Date
1998-06-30   -0.0713
1998-07-31   -0.0710
1998-08-31   -0.0722
1998-09-30   -0.0794
1998-10-31   -0.0783
Freq: M, dtype: float64 


Date
1998-06-30   -0.0713
1998-07-31   -0.0710
1998-08-31   -0.0722
1998-09-30   -0.0794
1998-10-31   -0.0783
Freq: M, dtype: float64 




In [12]:
# Last five non-missing betas from each method, for a visual comparison.
# betas_00 covers all stocks, so only its first column is shown.
for beta_preview in (
    betas_00.dropna().tail().iloc[:, 0],
    betas_01.tail(),
    betas_02.tail(),
    betas_03.dropna().tail(),
    betas_04.dropna().tail(),
):
    print(beta_preview, '\n\n')

Date
2035-07-31   -0.0086
2035-08-31   -0.0086
2035-09-30   -0.0094
2035-10-31   -0.0091
2035-11-30   -0.0093
Freq: M, Name: s0000, dtype: float64 


Date
2035-07-31   -0.0086
2035-08-31   -0.0086
2035-09-30   -0.0094
2035-10-31   -0.0091
2035-11-30   -0.0093
dtype: float64 


2035-07-31   -0.0086
2035-08-31   -0.0086
2035-09-30   -0.0094
2035-10-31   -0.0091
2035-11-30   -0.0093
dtype: float64 


Date
2035-07-31   -0.0086
2035-08-31   -0.0086
2035-09-30   -0.0094
2035-10-31   -0.0091
2035-11-30   -0.0093
Freq: M, dtype: float64 


Date
2035-07-31   -0.0086
2035-08-31   -0.0086
2035-09-30   -0.0094
2035-10-31   -0.0091
2035-11-30   -0.0093
Freq: M, dtype: float64 




In [13]:
# Compare each method with the next one in the chain using the equal()
# helper; one result is printed per pair.
for left, right in (
    (betas_00.iloc[:, 0], betas_01),
    (betas_01, betas_02),
    (betas_02, betas_03),
    (betas_03, betas_04),
):
    print(equal(left, right))

True
True
True
True


### Remark: all methods give the same results. Let's compare their speed.

In [14]:
%%timeit
# Time one rolling_calc_beta() call over the full `stocks` frame against
# `market`, with a window of `ndays` rows.
betas_00 = rolling_calc_beta(stocks, market, ndays)

1.3 s ± 8.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### On this large dataset with 10,000 stocks (columns), our `rolling_calc_beta()` took only 1.3 s, while the other methods ran for so long that I had to kill the run before they finished.

In [15]:
%%timeit
betas_01 = pd.concat([roll(rets, ndays).apply(lambda x: calc_beta(x.iloc[:, j], x['Market'])) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 

In [None]:
%%timeit
betas_04 = pd.concat([rolling_apply_np(stocks.iloc[:, j], market['Market'], ndays, calc_beta) for j in range(stocks.shape[1])], axis=1)