In [2]:
# Use np.nan for NaN values, do not import NaN directly from numpy

from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2023-09-27'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date, auto_adjust=False).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

[*********************100%***********************]  503 of 503 completed

4 Failed downloads:
['VLTO', 'SOLV', 'SW', 'GEV']: YFPricesMissingError('possibly delisted; no price data found  (1d 2015-09-29 00:00:00 -> 2023-09-27) (Yahoo error = "Data doesn\'t exist for startDate = 1443499200, endDate = 1695787200")')


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-09-29,A,31.251009,33.740002,34.060001,33.240002,33.360001,2252400.0
2015-09-29,AAPL,24.536381,27.264999,28.377501,26.965000,28.207500,293461600.0
2015-09-29,ABBV,35.061218,52.790001,54.189999,51.880001,53.099998,12842800.0
2015-09-29,ABT,32.820751,39.500000,40.150002,39.029999,39.259998,12287500.0
2015-09-29,ACGL,23.217773,24.416668,24.456667,24.100000,24.170000,1888800.0
...,...,...,...,...,...,...,...
2023-09-26,XYL,87.701065,89.519997,90.849998,89.500000,90.379997,1322400.0
2023-09-26,YUM,119.860710,124.010002,124.739998,123.449997,124.239998,1500600.0
2023-09-26,ZBH,110.800163,112.459999,117.110001,112.419998,116.769997,3610500.0
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0


In [3]:
import numpy as np
import pandas as pd

# Compute RSI manually
def compute_rsi(series, length=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=length, min_periods=length).mean()
    avg_loss = loss.rolling(window=length, min_periods=length).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Compute Bollinger Bands manually
def compute_bbands(series, length=20, num_std=2):
    sma = series.rolling(window=length, min_periods=length).mean()
    std = series.rolling(window=length, min_periods=length).std()
    upper_band = sma + num_std * std
    lower_band = sma - num_std * std
    return lower_band, sma, upper_band

# Compute Bollinger Bands (on log1p of adj close)
def bbands_transform(x):
    log_prices = np.log1p(x)
    low, mid, high = compute_bbands(log_prices, length=20)
    return pd.DataFrame({
        'bb_low': low,
        'bb_mid': mid,
        'bb_high': high
    }, index=x.index)

# Compute ATR manually and standardize it
def compute_atr(stock_data, length=14):
    high = stock_data['high']
    low = stock_data['low']
    close = stock_data['adj close']

    prev_close = close.shift(1)
    tr = pd.concat([
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs()
    ], axis=1).max(axis=1)

    atr = tr.rolling(window=length, min_periods=length).mean()
    atr_zscore = (atr - atr.mean()) / atr.std()
    return atr_zscore

# Compute MACD manually and standardize it
def compute_macd(series, fast=12, slow=26):
    ema_fast = series.ewm(span=fast, min_periods=fast).mean()
    ema_slow = series.ewm(span=slow, min_periods=slow).mean()
    macd_line = ema_fast - ema_slow
    macd_zscore = (macd_line - macd_line.mean()) / macd_line.std()
    return macd_zscore

# Garman-Klass volatility estimator
df['garman_klass_vol'] = (
    ((np.log(df['high']) - np.log(df['low'])) ** 2) / 2
    - (2 * np.log(2) - 1) * ((np.log(df['adj close']) - np.log(df['open'])) ** 2)
)

# Compute RSI grouped by level 1
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: compute_rsi(x, length=20))

# Compute Bollinger Bands grouped by level 1
bb_df = df.groupby(level=1)['adj close'].apply(bbands_transform)
df[['bb_low', 'bb_mid', 'bb_high']] = bb_df.reset_index(level=0, drop=True)

# Compute ATR grouped by level 1
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

# Compute MACD grouped by level 1
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].transform(lambda x: compute_macd(x))

df['dollar_volume'] = (df['adj close']* df['volume'])/1e6

df


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-29,A,31.251009,33.740002,34.060001,33.240002,33.360001,2252400.0,-0.001351,,,,,,,70.389773
2015-09-29,AAPL,24.536381,27.264999,28.377501,26.965000,28.207500,293461600.0,-0.006207,,,,,,,7200.485558
2015-09-29,ABBV,35.061218,52.790001,54.189999,51.880001,53.099998,12842800.0,-0.065607,,,,,,,450.284214
2015-09-29,ABT,32.820751,39.500000,40.150002,39.029999,39.259998,12287500.0,-0.011997,,,,,,,403.284980
2015-09-29,ACGL,23.217773,24.416668,24.456667,24.100000,24.170000,1888800.0,-0.000516,,,,,,,43.853730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,XYL,87.701065,89.519997,90.849998,89.500000,90.379997,1322400.0,-0.000238,22.653298,4.472030,4.556072,4.640113,-3.011215,-2.157409,115.975888
2023-09-26,YUM,119.860710,124.010002,124.739998,123.449997,124.239998,1500600.0,-0.000443,36.971512,4.791669,4.822408,4.853147,-2.828855,-1.367168,179.862982
2023-09-26,ZBH,110.800163,112.459999,117.110001,112.419998,116.769997,3610500.0,-0.000229,41.303626,4.738303,4.778997,4.819692,-2.199154,-0.878964,400.043989
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0,0.000133,21.657597,5.397402,5.539167,5.680932,-0.078248,-1.600810,79.595386


In [5]:
# Unstack the DataFrame to get a multi-index DataFrame with tickers as columns
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open', 'high', 'low', 'close', 'close']]
# Unstack the DataFrame to get a multi-index DataFrame with tickers as columns
df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume')

df.unstack()[last_cols].resample('M').last().stack('ticker')

Unnamed: 0_level_0,Price,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-09-30,A,31.797482,-0.001607,,,,,,
2015-09-30,AAPL,24.815359,-0.003874,,,,,,
2015-09-30,ABBV,36.137157,-0.059472,,,,,,
2015-09-30,ABT,33.419003,-0.011689,,,,,,
2015-09-30,ACGL,23.287504,-0.000921,,,,,,
...,...,...,...,...,...,...,...,...,...
2023-09-30,XYL,87.701065,-0.000238,22.653298,4.472030,4.556072,4.640113,-3.011215,-2.157409
2023-09-30,YUM,119.860710,-0.000443,36.971512,4.791669,4.822408,4.853147,-2.828855,-1.367168
2023-09-30,ZBH,110.800163,-0.000229,41.303626,4.738303,4.778997,4.819692,-2.199154,-0.878964
2023-09-30,ZBRA,223.960007,0.000133,21.657597,5.397402,5.539167,5.680932,-0.078248,-1.600810
