In [71]:
# Use np.nan for NaN values, do not import NaN directly from numpy

from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2023-09-27'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date, auto_adjust=False).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

[*********************100%***********************]  503 of 503 completed

4 Failed downloads:
['VLTO', 'SOLV', 'SW', 'GEV']: YFPricesMissingError('possibly delisted; no price data found  (1d 2015-09-29 00:00:00 -> 2023-09-27) (Yahoo error = "Data doesn\'t exist for startDate = 1443499200, endDate = 1695787200")')


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-09-29,A,31.251011,33.740002,34.060001,33.240002,33.360001,2252400.0
2015-09-29,AAPL,24.536383,27.264999,28.377501,26.965000,28.207500,293461600.0
2015-09-29,ABBV,35.061211,52.790001,54.189999,51.880001,53.099998,12842800.0
2015-09-29,ABT,32.820744,39.500000,40.150002,39.029999,39.259998,12287500.0
2015-09-29,ACGL,23.217773,24.416668,24.456667,24.100000,24.170000,1888800.0
...,...,...,...,...,...,...,...
2023-09-26,XYL,87.701057,89.519997,90.849998,89.500000,90.379997,1322400.0
2023-09-26,YUM,119.860718,124.010002,124.739998,123.449997,124.239998,1500600.0
2023-09-26,ZBH,110.800148,112.459999,117.110001,112.419998,116.769997,3610500.0
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0


In [72]:

# Compute RSI manually
def compute_rsi(series, length=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=length, min_periods=length).mean()
    avg_loss = loss.rolling(window=length, min_periods=length).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Compute Bollinger Bands manually
def compute_bbands(series, length=20, num_std=2):
    sma = series.rolling(window=length, min_periods=length).mean()
    std = series.rolling(window=length, min_periods=length).std()
    upper_band = sma + num_std * std
    lower_band = sma - num_std * std
    return lower_band, sma, upper_band

# Compute Bollinger Bands (on log1p of adj close)
def bbands_transform(x):
    log_prices = np.log1p(x)
    low, mid, high = compute_bbands(log_prices, length=20)
    return pd.DataFrame({
        'bb_low': low,
        'bb_mid': mid,
        'bb_high': high
    }, index=x.index)

# Compute ATR manually and standardize it
def compute_atr(stock_data, length=14):
    high = stock_data['high']
    low = stock_data['low']
    close = stock_data['adj close']

    prev_close = close.shift(1)
    tr = pd.concat([
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs()
    ], axis=1).max(axis=1)

    atr = tr.rolling(window=length, min_periods=length).mean()
    atr_zscore = (atr - atr.mean()) / atr.std()
    return atr_zscore

# Compute MACD manually and standardize it
def compute_macd(series, fast=12, slow=26):
    ema_fast = series.ewm(span=fast, min_periods=fast).mean()
    ema_slow = series.ewm(span=slow, min_periods=slow).mean()
    macd_line = ema_fast - ema_slow
    macd_zscore = (macd_line - macd_line.mean()) / macd_line.std()
    return macd_zscore

# Garman-Klass volatility estimator
df['garman_klass_vol'] = (
    ((np.log(df['high']) - np.log(df['low'])) ** 2) / 2
    - (2 * np.log(2) - 1) * ((np.log(df['adj close']) - np.log(df['open'])) ** 2)
)

# Compute RSI grouped by level 1
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: compute_rsi(x, length=20))

# Compute Bollinger Bands grouped by level 1
bb_df = df.groupby(level=1)['adj close'].apply(bbands_transform)
df[['bb_low', 'bb_mid', 'bb_high']] = bb_df.reset_index(level=0, drop=True)

# Compute ATR grouped by level 1
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

# Compute MACD grouped by level 1
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].transform(lambda x: compute_macd(x))

df['dollar_volume'] = (df['adj close']* df['volume'])/1e6

df


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-29,A,31.251011,33.740002,34.060001,33.240002,33.360001,2252400.0,-0.001351,,,,,,,70.389777
2015-09-29,AAPL,24.536383,27.264999,28.377501,26.965000,28.207500,293461600.0,-0.006207,,,,,,,7200.486118
2015-09-29,ABBV,35.061211,52.790001,54.189999,51.880001,53.099998,12842800.0,-0.065607,,,,,,,450.284116
2015-09-29,ABT,32.820744,39.500000,40.150002,39.029999,39.259998,12287500.0,-0.011997,,,,,,,403.284887
2015-09-29,ACGL,23.217773,24.416668,24.456667,24.100000,24.170000,1888800.0,-0.000516,,,,,,,43.853730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,XYL,87.701057,89.519997,90.849998,89.500000,90.379997,1322400.0,-0.000238,22.653337,4.472030,4.556072,4.640113,-3.011218,-2.157410,115.975878
2023-09-26,YUM,119.860718,124.010002,124.739998,123.449997,124.239998,1500600.0,-0.000443,36.971543,4.791669,4.822408,4.853147,-2.828853,-1.367168,179.862993
2023-09-26,ZBH,110.800148,112.459999,117.110001,112.419998,116.769997,3610500.0,-0.000229,41.303575,4.738303,4.778997,4.819692,-2.199154,-0.878965,400.043934
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0,0.000133,21.657597,5.397402,5.539167,5.680932,-0.078248,-1.600810,79.595386


In [73]:
# Unstack the DataFrame to get a multi-index DataFrame with tickers as columns
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open', 'high', 'low', 'close', 'close']]
# Unstack the DataFrame to get a multi-index DataFrame with tickers as columns

data = (pd.concat([df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume'),
            df.unstack()[last_cols].resample('M').last().stack('ticker')], axis=1)).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-11-30,A,134.988344,38.734943,-0.002430,70.422442,3.536910,3.611226,3.685542,-0.915930,0.440816
2015-11-30,AAPL,4005.252647,26.729130,-0.003654,48.630926,3.276063,3.320493,3.364922,-0.494415,-0.209164
2015-11-30,ABBV,325.730935,38.977566,-0.070930,46.106818,3.690632,3.740093,3.789553,0.344526,0.018850
2015-11-30,ABT,207.499349,37.540985,-0.013992,50.738837,3.636157,3.658567,3.680977,0.049593,0.121017
2015-11-30,ACGL,28.174423,22.970539,-0.001121,30.793654,3.177527,3.195190,3.212853,-1.119424,-0.551905
...,...,...,...,...,...,...,...,...,...,...
2023-09-30,EXE,116.689768,79.277855,-0.000348,43.297817,4.368068,4.422967,4.477865,-1.840199,-0.828975
2023-09-30,COIN,506.793576,70.519997,0.001007,46.416531,4.270839,4.378785,4.486731,-1.113593,0.008323
2023-09-30,CEG,195.364200,107.145668,-0.000064,56.041815,4.644504,4.685718,4.726932,-0.095322,0.371209
2023-09-30,GEHC,211.929508,66.022316,0.000183,41.379348,4.152334,4.211363,4.270392,-0.700161,-1.176709


In [74]:
data['dollar_volume'] = (data.loc[:, 'dollar_volume'].unstack('ticker').rolling(5*12, min_periods=12).mean().stack())

data['dollar_vol_rank'] = (data.groupby('date')['dollar_volume'].rank(ascending=False))


data = data[data['dollar_vol_rank']<150].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

data


Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-10-31,AAPL,26.090458,-0.002767,53.857856,3.288994,3.318620,3.348245,-1.004341,-0.194257
2016-10-31,ABBV,38.834366,-0.056807,25.623775,3.717208,3.772733,3.828259,-0.101567,-0.758456
2016-10-31,ABT,33.619499,-0.009785,33.584997,3.534045,3.585802,3.637559,-1.060312,-0.646878
2016-10-31,ACN,101.760155,-0.006263,38.045414,4.619587,4.631524,4.643462,-0.491809,-0.132574
2016-10-31,ADBE,107.510002,0.000059,46.160151,4.679120,4.694639,4.710159,-1.180271,-0.107781
...,...,...,...,...,...,...,...,...,...
2023-09-30,CRWD,160.479996,0.000144,65.687578,5.024174,5.103696,5.183218,-0.894312,0.250658
2023-09-30,PLTR,13.960000,0.000214,45.277776,2.699917,2.779743,2.859570,-0.490510,-0.433592
2023-09-30,DASH,74.580002,0.000326,40.373286,4.327250,4.403906,4.480561,-1.083109,-0.102635
2023-09-30,ABNB,132.279999,0.000213,56.841285,4.854868,4.940924,5.026980,-0.961350,-0.010573


In [75]:
# Calculate returns for each ticker in the DataFrame
# This function calculates the returns for each ticker based on the adjusted close prices
def calculate_returns(df):

    outlier_cutoff = 0.005

    lags = [1,2, 3, 6, 9, 12]

    for lag in lags:
        df[f'return_{lag}m'] = (df['adj close'].pct_change(lag).pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff), 
                                                                            upper=x.quantile(1-outlier_cutoff))).add(1).pow(1/lag).sub(1))
    return df
    

data = data.groupby(level=1, group_keys=False).apply(calculate_returns).dropna()
data 

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-10-31,AAPL,39.529030,-0.001263,75.669832,3.588975,3.637059,3.685143,-0.543957,-0.037583,0.096807,0.015249,0.044955,0.028875,0.038941,0.035228
2017-10-31,ABBV,65.125298,-0.042703,52.833437,4.160361,4.207901,4.255442,1.795616,0.477132,0.022728,0.098590,0.091379,0.056495,0.047273,0.044026
2017-10-31,ABT,47.540348,-0.007128,54.700622,3.872516,3.896688,3.920860,-0.349540,0.282161,0.021276,0.034308,0.034801,0.038672,0.031320,0.029294
2017-10-31,ACN,127.138985,-0.005423,81.931427,4.784164,4.824869,4.865574,-0.206881,0.355248,0.064180,0.048454,0.037203,0.028692,0.027398,0.018728
2017-10-31,ADBE,175.160004,0.000067,77.957910,4.948186,5.089292,5.230398,-0.710748,0.613325,0.174152,0.062497,0.061392,0.045993,0.049515,0.041515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30,CRWD,160.479996,0.000144,65.687578,5.024174,5.103696,5.183218,-0.894312,0.250658,-0.015641,-0.003656,0.029981,0.026391,0.047942,-0.002216
2023-09-30,PLTR,13.960000,0.000214,45.277776,2.699917,2.779743,2.859570,-0.490510,-0.433592,-0.068091,-0.161174,-0.030723,0.087272,0.090143,0.046083
2023-09-30,DASH,74.580002,0.000326,40.373286,4.327250,4.403906,4.480561,-1.083109,-0.102635,-0.113515,-0.093658,-0.008091,0.027006,0.048207,0.034568
2023-09-30,ABNB,132.279999,0.000213,56.841285,4.854868,4.940924,5.026980,-0.961350,-0.010573,0.005549,-0.067704,0.010603,0.010289,0.049124,0.019401


In [85]:
factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3', 'famafrench', start='2010')[0].drop('RF', axis=1)

factor_data.index = factor_data.index.to_timestamp()

factor_data = factor_data.resample('M').last().div(100)

factor_data.index.names = ['date']

factor_data = factor_data.join(data['return_1m']).sort_index()

factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,AAPL,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.096807
2017-10-31,ABBV,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.022728
2017-10-31,ABT,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.021276
2017-10-31,ACN,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.064180
2017-10-31,ADBE,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.174152
...,...,...,...,...,...,...,...
2023-09-30,VZ,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.056890
2023-09-30,WDAY,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.062413
2023-09-30,WFC,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.015500
2023-09-30,WMT,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.000676


In [91]:


observations = factor_data.groupby(level=1).size()

valid_stocks = observations[observations >= 10]

factor_data = factor_data[factor_data.index.get_level_values('ticker').isin(valid_stocks.index)]

factor_data



Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,AAPL,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.096807
2017-10-31,ABBV,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.022728
2017-10-31,ABT,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.021276
2017-10-31,ACN,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.064180
2017-10-31,ADBE,0.0225,-0.0191,0.0013,0.0092,-0.0314,0.174152
...,...,...,...,...,...,...,...
2023-09-30,VZ,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.056890
2023-09-30,WDAY,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.062413
2023-09-30,WFC,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.015500
2023-09-30,WMT,-0.0523,-0.0176,0.0148,0.0193,-0.0077,-0.000676


In [95]:
betas = (factor_data.groupby(level=1, group_keys=False).apply(lambda x: RollingOLS(endog=x['return_1m'], 
                                                                          exog=sm.add_constant(x.drop('return_1m', axis=1)), 
                                                                          window=min(24, x.shape[0]), 
                                                                          min_nobs=len(x.columns)+1).fit(params_only=True).params.drop('const', axis=1)))


In [97]:
data = (data.join(betas.groupby('ticker').shift()))

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,Mkt-RF,SMB,HML,RMW,CMA
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2017-10-31,AAPL,39.529030,-0.001263,75.669832,3.588975,3.637059,3.685143,-0.543957,-0.037583,0.096807,0.015249,0.044955,0.028875,0.038941,0.035228,,,,,
2017-10-31,ABBV,65.125298,-0.042703,52.833437,4.160361,4.207901,4.255442,1.795616,0.477132,0.022728,0.098590,0.091379,0.056495,0.047273,0.044026,,,,,
2017-10-31,ABT,47.540348,-0.007128,54.700622,3.872516,3.896688,3.920860,-0.349540,0.282161,0.021276,0.034308,0.034801,0.038672,0.031320,0.029294,,,,,
2017-10-31,ACN,127.138985,-0.005423,81.931427,4.784164,4.824869,4.865574,-0.206881,0.355248,0.064180,0.048454,0.037203,0.028692,0.027398,0.018728,,,,,
2017-10-31,ADBE,175.160004,0.000067,77.957910,4.948186,5.089292,5.230398,-0.710748,0.613325,0.174152,0.062497,0.061392,0.045993,0.049515,0.041515,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30,CRWD,160.479996,0.000144,65.687578,5.024174,5.103696,5.183218,-0.894312,0.250658,-0.015641,-0.003656,0.029981,0.026391,0.047942,-0.002216,0.590055,-0.991339,-0.681599,-2.499766,-0.831525
2023-09-30,PLTR,13.960000,0.000214,45.277776,2.699917,2.779743,2.859570,-0.490510,-0.433592,-0.068091,-0.161174,-0.030723,0.087272,0.090143,0.046083,,,,,
2023-09-30,DASH,74.580002,0.000326,40.373286,4.327250,4.403906,4.480561,-1.083109,-0.102635,-0.113515,-0.093658,-0.008091,0.027006,0.048207,0.034568,,,,,
2023-09-30,ABNB,132.279999,0.000213,56.841285,4.854868,4.940924,5.026980,-0.961350,-0.010573,0.005549,-0.067704,0.010603,0.010289,0.049124,0.019401,,,,,
