In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import datetime as dt
import pandas_datareader.data as web
import timeit

# Keep rendered frames compact: show at most 15 columns and 50 rows, and
# format floats with thousands separators and four decimal places.
pd.options.display.max_columns = 15
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.4f}'.format)

In [2]:
helper_dir = '../helper'

%run {helper_dir}/utils.py
%run {helper_dir}/rolling_funcs.py
%run {helper_dir}/rolling_apply_funcs.py
%run {helper_dir}/rolling_calc_beta.py   # super fast beta calculation

## Make some large data

In [3]:
# Size of the synthetic panel: number of stock columns and number of
# month-end dates generated before returns are computed.
num_sec_dfs, num_periods = 10000, 480

# Seed the global NumPy RNG so a fresh "Restart & Run All" regenerates the
# same random data (and therefore the same betas) every time.
np.random.seed(42)

# An explicit MonthEnd offset is used instead of the 'M' string alias, which
# newer pandas releases deprecate; the offset object works across versions.
dates = pd.date_range('1995-12-31', periods=num_periods,
                      freq=pd.offsets.MonthEnd(), name='Date')

# Uniform random levels are converted to period-over-period changes; the first
# row is NaN after pct_change() and is removed by dropna().
stocks = pd.DataFrame(data=np.random.rand(num_periods, num_sec_dfs), index=dates,
                      columns=['s{:04d}'.format(i) for i in range(num_sec_dfs)]).pct_change().dropna()
market = pd.DataFrame(data=np.random.rand(num_periods), index=dates,
                      columns=['Market']).pct_change().dropna()

# One wide frame with every stock column followed by the 'Market' column.
rets = stocks.join(market)

In [4]:
# Peek at the first five rows of the simulated stock changes.
stocks.head(n=5)

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9993,s9994,s9995,s9996,s9997,s9998,s9999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.417,-0.8087,-0.1262,0.4313,0.2713,7.5734,-0.1859,...,0.353,-0.732,0.7611,0.7108,-0.17,1.329,-0.2942
1996-02-29,0.0678,2.8339,-0.4521,0.8239,-0.0275,-0.334,-0.5956,...,0.0296,11.5064,-0.2647,-0.6271,-0.9468,1.2059,0.0905
1996-03-31,-0.7101,1.2186,1.805,-0.0518,-0.13,-0.7163,2.1421,...,-0.1175,-0.2209,-0.1098,-0.193,18.2286,-0.8276,-0.8006
1996-04-30,4.0682,0.0462,-0.4999,-0.9466,-0.5576,-0.2221,-0.8269,...,0.1484,0.4351,0.3432,0.081,0.859,6.7228,6.4918
1996-05-31,0.0326,-0.7567,0.0539,9.7919,0.7413,3.9235,4.2968,...,-0.752,-0.5217,0.0501,0.4294,-0.2384,-0.6531,0.0865


In [5]:
# Peek at the first five rows of the simulated market series.
market.head(n=5)

Unnamed: 0_level_0,Market
Date,Unnamed: 1_level_1
1996-01-31,0.0325
1996-02-29,0.1505
1996-03-31,-0.0414
1996-04-30,-0.7863
1996-05-31,-0.8236


In [6]:
# Peek at the first five rows of the joined frame (stocks plus 'Market').
rets.head(n=5)

Unnamed: 0_level_0,s0000,s0001,s0002,s0003,s0004,s0005,s0006,...,s9994,s9995,s9996,s9997,s9998,s9999,Market
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-31,-0.417,-0.8087,-0.1262,0.4313,0.2713,7.5734,-0.1859,...,-0.732,0.7611,0.7108,-0.17,1.329,-0.2942,0.0325
1996-02-29,0.0678,2.8339,-0.4521,0.8239,-0.0275,-0.334,-0.5956,...,11.5064,-0.2647,-0.6271,-0.9468,1.2059,0.0905,0.1505
1996-03-31,-0.7101,1.2186,1.805,-0.0518,-0.13,-0.7163,2.1421,...,-0.2209,-0.1098,-0.193,18.2286,-0.8276,-0.8006,-0.0414
1996-04-30,4.0682,0.0462,-0.4999,-0.9466,-0.5576,-0.2221,-0.8269,...,0.4351,0.3432,0.081,0.859,6.7228,6.4918,-0.7863
1996-05-31,0.0326,-0.7567,0.0539,9.7919,0.7413,3.9235,4.2968,...,-0.5217,0.0501,0.4294,-0.2384,-0.6531,0.0865,-0.8236


## Our function and statsmodels should give the same result. Let's check.

In [7]:
import statsmodels.api as sm

# Regressors for the reference fit: the market series plus an intercept column.
X = sm.add_constant(market)
market_ret = market['Market']

# Running sum of |statsmodels slope - calc_beta| over all stocks;
# it should still be (numerically) zero after the loop.
tot_diff = 0
for yvar, y in stocks.items():
    results = sm.OLS(y, X).fit()
    beta_statsmod = results.params['Market']
    beta_mine = calc_beta(y, market_ret)
    tot_diff = tot_diff + abs(beta_statsmod - beta_mine)
assert tot_diff < 1e-8

## Calculate rolling beta of each stock against the market

In [8]:
# Rolling window length passed to every rolling helper below, counted in rows.
# NOTE: the dates generated above are month-ends, so despite the name this is
# a window of 30 monthly observations rather than 30 calendar days.
ndays = 30

In [9]:
# Rolling beta for the whole stock panel against the market in a single call,
# using a window of `ndays` rows. The result is indexed by column position
# further down (`.iloc[:, 0]`), so it is a frame; presumably one column per
# stock -- confirm against rolling_calc_beta.py.
betas_00 = rolling_calc_beta(stocks, market, ndays)

In [10]:
# Four alternative ways of computing the rolling beta of the first stock
# column against 'Market'; they are compared with betas_00 further down.
betas_01 = roll(rets, ndays).apply(
    lambda window: calc_beta(window.iloc[:, 0], window['Market'])
)
betas_02 = pd.concat([
    pd.Series(calc_beta(chunk.iloc[:, 0], chunk['Market']), index=[chunk.index[-1]])
    for chunk in groll(rets, ndays)
])
betas_03 = rolling_apply_pd(
    stocks.iloc[:, 0], market['Market'], ndays, calc_beta
)
betas_04 = rolling_apply_np(
    stocks.iloc[:, 0], market['Market'], ndays, calc_beta
)

In [11]:
# First five values from each method, printed one after another so they can
# be compared by eye (betas_00 is reduced to its first column).
head_views = (
    betas_00.dropna().head().iloc[:, 0],
    betas_01.head(),
    betas_02.head(),
    betas_03.dropna().head(),
    betas_04.dropna().head(),
)
for view in head_views:
    print(view, '\n\n')

Date
1998-06-30   -0.1236
1998-07-31   -0.1187
1998-08-31    0.0531
1998-09-30    0.0431
1998-10-31    0.0668
Freq: M, Name: s0000, dtype: float64 


Date
1998-06-30   -0.1236
1998-07-31   -0.1187
1998-08-31    0.0531
1998-09-30    0.0431
1998-10-31    0.0668
dtype: float64 


1998-06-30   -0.1236
1998-07-31   -0.1187
1998-08-31    0.0531
1998-09-30    0.0431
1998-10-31    0.0668
dtype: float64 


Date
1998-06-30   -0.1236
1998-07-31   -0.1187
1998-08-31    0.0531
1998-09-30    0.0431
1998-10-31    0.0668
Freq: M, dtype: float64 


Date
1998-06-30   -0.1236
1998-07-31   -0.1187
1998-08-31    0.0531
1998-09-30    0.0431
1998-10-31    0.0668
Freq: M, dtype: float64 




In [12]:
# Last five values from each method, printed one after another so they can
# be compared by eye (betas_00 is reduced to its first column).
tail_views = (
    betas_00.dropna().tail().iloc[:, 0],
    betas_01.tail(),
    betas_02.tail(),
    betas_03.dropna().tail(),
    betas_04.dropna().tail(),
)
for view in tail_views:
    print(view, '\n\n')

Date
2035-07-31   0.0265
2035-08-31   0.0263
2035-09-30   0.0283
2035-10-31   0.0256
2035-11-30   0.0579
Freq: M, Name: s0000, dtype: float64 


Date
2035-07-31   0.0265
2035-08-31   0.0263
2035-09-30   0.0283
2035-10-31   0.0256
2035-11-30   0.0579
dtype: float64 


2035-07-31   0.0265
2035-08-31   0.0263
2035-09-30   0.0283
2035-10-31   0.0256
2035-11-30   0.0579
dtype: float64 


Date
2035-07-31   0.0265
2035-08-31   0.0263
2035-09-30   0.0283
2035-10-31   0.0256
2035-11-30   0.0579
Freq: M, dtype: float64 


Date
2035-07-31   0.0265
2035-08-31   0.0263
2035-09-30   0.0283
2035-10-31   0.0256
2035-11-30   0.0579
Freq: M, dtype: float64 




In [13]:
# Chain of pairwise comparisons: 00 vs 01, 01 vs 02, 02 vs 03, 03 vs 04.
# Each line printed is the result of the `equal` helper for one pair.
beta_pairs = (
    (betas_00.iloc[:, 0], betas_01),
    (betas_01, betas_02),
    (betas_02, betas_03),
    (betas_03, betas_04),
)
for left, right in beta_pairs:
    print(equal(left, right))

True
True
True
True


### Remark: all methods give the same results. Let's compare their speed.

In [14]:
%%timeit
# Time the single-call helper on the full panel: every stock column against
# the market, with a rolling window of `ndays` rows.
betas_00 = rolling_calc_beta(stocks, market, ndays)

1.33 s ± 28.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### On this large dataset with 10,000 stocks (columns), our `rolling_calc_beta()` took only 1.3 s, while the other methods took so long that I had to kill the runs before they could finish.

In [15]:
%%timeit
betas_04 = pd.concat([rolling_apply_np(stocks.iloc[:, j], market['Market'], ndays, calc_beta) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 

In [15]:
%%timeit
betas_01 = pd.concat([roll(rets, ndays).apply(lambda x: calc_beta(x.iloc[:, j], x['Market'])) for j in range(stocks.shape[1])], axis=1)

KeyboardInterrupt: 