In [23]:
# importing necessary libraries
import pandas as pd
import yfinance as yf
import requests
from datetime import timedelta


In [24]:
spy_stocklist = pd.read_csv(r'raw_data/sp_500_historical_components.csv')

# Data cleaning
In this portion of the code we will 
1) Get the list of stocks that ever existed from sp_500_historical_components.csv from https://github.com/hanshof/sp500_constituents/blob/main/sp_500_historical_components.csv
2) Use yfinance library to get historical data (2012-2020) OHLCV, PE ratio, PB ratio from the lists of stocks, indicate their presence in the stock data at any point of time on a daily level. we get 2012 data for a buffer to calculate moving averages later on
3) Filter out stocks that newly entered the stock market before 2016, this is to ensure that we have sufficient training data for each stock

In [None]:

# Convert 'date' column to datetime
spy_stocklist['date'] = pd.to_datetime(spy_stocklist['date'])

# Ensure 'tickers' is string
spy_stocklist['tickers'] = spy_stocklist['tickers'].astype(str)
spy_stocklist

Unnamed: 0,date,tickers
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
...,...,...
3477,2025-08-19,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3478,2025-08-20,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3479,2025-08-21,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3480,2025-08-22,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."


In [26]:
# Filter date range for 2012-01-01 to 2021-01-07, we include the year 2012 to introduce a buffer for moving averages calculation
spy_stocklist_filtered = spy_stocklist[(spy_stocklist['date'] >= '2013-01-01') & (spy_stocklist['date'] <= '2020-12-31')]

In [27]:
spy_stocklist_filtered_expanded = (
    spy_stocklist_filtered.assign(ticker=spy_stocklist_filtered["tickers"].str.split(","))
      .explode("ticker")
      .drop(columns="tickers")
      .reset_index(drop=True)
)

spy_stocklist_filtered_expanded

Unnamed: 0,date,ticker
0,2013-01-02,A
1,2013-01-02,AAPL
2,2013-01-02,ABBV
3,2013-01-02,ABC
4,2013-01-02,ABT
...,...,...
343971,2020-12-21,YUM
343972,2020-12-21,ZBH
343973,2020-12-21,ZBRA
343974,2020-12-21,ZION


In [28]:
# there are certain cases where a stock may have rebranded, we will update the list accordingly later
## TO DO
updated_ticker_mapping = {
    'GOOGL': 'GOOG',  # Alphabet Inc. Class A to Class C
    'FB': 'META',     # Facebook, Inc. to Meta Platforms, Inc.
    'TWTR': 'X',      # Twitter, Inc. to X Corp.
}

In [29]:
## defining a function to replace the tickers

def update_tickers_in_df_from_dict(df, column, ticker_map):

    df_updated = df.copy()

    # Replace using pandas built-in mapping
    df_updated[column] = df_updated[column].replace(ticker_map)

    return df_updated

In [30]:
spy_stocklist_final = update_tickers_in_df_from_dict(spy_stocklist_filtered_expanded, "ticker", updated_ticker_mapping)
spy_stocklist_final[spy_stocklist_final['ticker'].isin(updated_ticker_mapping.values())]

Unnamed: 0,date,ticker
185,2013-01-02,GOOG
448,2013-01-02,X
643,2013-01-03,GOOG
906,2013-01-03,X
1101,2013-01-08,GOOG
...,...,...
343416,2020-11-17,X
343649,2020-12-21,META
343674,2020-12-21,GOOG
343675,2020-12-21,GOOG


In [31]:
### getting list of unique tickers that ever entered the SP500 between 2013-2020
unique_tickers = spy_stocklist_final['ticker'].unique().tolist()
print(f"Total unique tickers in S&P 500 from 2013 to 2020: {len(unique_tickers)}")
unique_tickers

Total unique tickers in S&P 500 from 2013 to 2020: 639


['A',
 'AAPL',
 'ABBV',
 'ABC',
 'ABT',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AET',
 'AFL',
 'AIG',
 'AIV',
 'AIZ',
 'AKAM',
 'ALL',
 'ALXN',
 'AMAT',
 'AMD',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'AN',
 'ANDV',
 'ANF',
 'ANTM',
 'AON',
 'APA',
 'APC',
 'APD',
 'APH',
 'APOL',
 'APTV',
 'ARG',
 'ATI',
 'AVB',
 'AVP',
 'AVY',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BAX',
 'BBBY',
 'BBT',
 'BBY',
 'BCR',
 'BDX',
 'BEN',
 'BF-B',
 'BIG',
 'BIIB',
 'BK',
 'BKNG',
 'BLK',
 'BLL',
 'BMS',
 'BMY',
 'BRCM',
 'BRK-B',
 'BSX',
 'BTUUQ',
 'BWA',
 'BXP',
 'C',
 'CA',
 'CAG',
 'CAH',
 'CAM',
 'CAT',
 'CB',
 'CBRE',
 'CBS',
 'CCI',
 'CCL',
 'CELG',
 'CERN',
 'CF',
 'CFN',
 'CHRW',
 'CI',
 'CINF',
 'CL',
 'CLF',
 'CLX',
 'CMA',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNP',
 'CNX',
 'COF',
 'COG',
 'COL',
 'COP',
 'COST',
 'COV',
 'CPB',
 'CRM',
 'CSCO',
 'CSX',
 'CTAS',
 'CTL',
 'CTSH',
 'CTXS',
 'CVC',
 'CVS',
 'CVX',
 'D',
 'DD',
 'DE',
 'DF',
 'DFS',
 'DG',
 'DGX',


In [32]:
### now we will use the yfinance library to get the historical data for these tickers that ever existed in the SP500 between 2013-2020
def get_stock_data(ticker, start_date, end_date):
    try:
        # Convert string to datetime so we can safely add 1 day, because yfinance end date is exclusive we add one day to include the end date
        end_date_dt = pd.to_datetime(end_date) + pd.Timedelta(days=1)
        
        stock = yf.Ticker(ticker)
        stock_data = stock.history(
            start=start_date,
            end=end_date_dt,
            auto_adjust=True,  # adjust for stock splits/dividends
            actions=True       # include Dividends and Stock Splits columns
        )
        
        if stock_data.empty:
            return pd.DataFrame()
        
        stock_data["ticker"] = ticker
        return stock_data.reset_index()

    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

In [None]:
start_date = '2012-01-01' # we start from 2012 to have a buffer for moving averages calculation
end_date = '2020-12-31' # we end at 2020-12-31, 2019-2020 will be held out for stress testing
yfinance_data = []
failed_tickers = []
successful_tickers = []
for ticker in unique_tickers:
    print(f"Fetching data for {ticker}...")
    ticker_data = get_stock_data(ticker, start_date, end_date)
    if not ticker_data.empty:
        yfinance_data.append(ticker_data)
        successful_tickers.append(ticker)
    else:
        failed_tickers.append(ticker)

    

In [34]:
# combine all individual DataFrames into one big DataFrame
yfinance_df = pd.concat(yfinance_data, ignore_index=True)

# optional: ensure 'Date' is a proper datetime column
yfinance_df["Date"] = pd.to_datetime(yfinance_df["Date"])

# preview result
yfinance_df.drop(columns=['Capital Gains'], inplace=True)
yfinance_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,2012-01-03 00:00:00-05:00,22.776919,23.507763,22.713367,23.183651,4156394,0.0,0.0,A
1,2012-01-04 00:00:00-05:00,22.973931,23.107388,22.61804,22.99935,4651845,0.0,0.0,A
2,2012-01-05 00:00:00-05:00,22.802346,23.71749,22.700664,23.514126,6842651,0.0,0.0,A
3,2012-01-06 00:00:00-05:00,23.571322,23.870016,23.393379,23.768333,4711400,0.0,0.0,A
4,2012-01-09 00:00:00-05:00,23.908147,24.416561,23.812819,24.39114,4429563,0.0,0.0,A


In [35]:
# checking for stock split price adjustment, for example apple, check the dates around 2020-08-31 when apple had a 4-for-1 stock split
yfinance_df[yfinance_df['Date'].between('2020-08-24', '2020-09-07') & (yfinance_df['ticker'] == 'AAPL')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
4439,2020-08-24 00:00:00-04:00,125.186343,125.271465,120.556212,122.423828,345937600,0.0,0.0,AAPL
4440,2020-08-25 00:00:00-04:00,121.295478,121.764813,119.695353,121.419495,211495600,0.0,0.0,AAPL
4441,2020-08-26 00:00:00-04:00,122.737532,123.527865,121.66997,123.070686,163022400,0.0,0.0,AAPL
4442,2020-08-27 00:00:00-04:00,123.673769,124.006924,120.454069,121.599449,155552400,0.0,0.0,AAPL
4443,2020-08-28 00:00:00-04:00,122.5746,122.992869,121.178753,121.402481,187630000,0.0,0.0,AAPL
4444,2020-08-31 00:00:00-04:00,124.099326,127.426019,122.56243,125.519485,225702700,0.0,4.0,AAPL
4445,2020-09-01 00:00:00-04:00,129.137999,131.122352,126.968843,130.519257,151948100,0.0,0.0,AAPL
4446,2020-09-02 00:00:00-04:00,133.836243,134.215602,123.535164,127.815117,200119000,0.0,0.0,AAPL
4447,2020-09-03 00:00:00-04:00,123.447617,125.324955,117.212493,117.582123,257599600,0.0,0.0,AAPL
4448,2020-09-04 00:00:00-04:00,116.794209,120.325172,107.864661,117.659927,332607200,0.0,0.0,AAPL


In [36]:
# Ensure both columns are in the same format
yfinance_df["Date"] = pd.to_datetime(yfinance_df["Date"]).dt.date
spy_stocklist_filtered_expanded["date"] = pd.to_datetime(spy_stocklist_filtered_expanded["date"]).dt.date

# Create a set of (date, ticker) pairs for fast lookup
spy_set = set(zip(spy_stocklist_filtered_expanded["date"], spy_stocklist_filtered_expanded["ticker"]))

# Add indicator column: 1 if (Date, Ticker) is in SPY list, else 0
yfinance_df["is_in_sp500"] = [
    1 if (d, t) in spy_set else 0 for d, t in zip(yfinance_df["Date"], yfinance_df["ticker"])
]

yfinance_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,0.0,A,0
1,2012-01-04,22.973931,23.107388,22.61804,22.99935,4651845,0.0,0.0,A,0
2,2012-01-05,22.802346,23.71749,22.700664,23.514126,6842651,0.0,0.0,A,0
3,2012-01-06,23.571322,23.870016,23.393379,23.768333,4711400,0.0,0.0,A,0
4,2012-01-09,23.908147,24.416561,23.812819,24.39114,4429563,0.0,0.0,A,0


In [37]:
# filtering stocks that entered the stock market after 2016-01-01 to ensure we have enough training data per stock

first_trade = yfinance_df.groupby("ticker")["Date"].min().reset_index()
first_trade["Date"] = pd.to_datetime(first_trade["Date"])
first_trade.columns = ["ticker", "first_trade_date"]

# keep only tickers that started before or on 2016-12-31
eligible_tickers = first_trade[first_trade["first_trade_date"] <= "2016-12-31"]["ticker"]

yfinance_df_filtered_relavant_stocks = filtered_df = yfinance_df[yfinance_df["ticker"].isin(eligible_tickers)].copy()


print(f"Total tickers after filtering: {eligible_tickers.nunique()}, tickers filtered out: {len(unique_tickers) - eligible_tickers.nunique()}")

Total tickers after filtering: 509, tickers filtered out: 130


In [38]:
# getting daily returns, monthly returns and adding to the dataframe
# we take 21 trading days as approximately 1 month

yfinance_df_filtered_relavant_stocks.sort_values(by=['ticker', 'Date'], inplace=True)
yfinance_df_filtered_relavant_stocks['daily_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change()
yfinance_df_filtered_relavant_stocks['monthly_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change(periods=21)

### Creating new features
#### Moving averages
https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp we will use short:20 days, medium: 50 days, long: 100 days moving averages
1) Simple moving average SMA: SMA_20, SMA_50, SMA_100
2) Exponential Moving Average EMA https://www.investopedia.com/terms/e/ema.asp#toc-formula-for-exponential-moving-average-ema: EMA_20, EMA_50, EMA_100

#### RSI and MACD
1) Relative Strength Index 14 days https://www.investopedia.com/terms/r/rsi.asp : RSI
2) Moving Average Convergence/Divergence indicator: https://www.investopedia.com/terms/m/macd.asp: MACD_26, MACD_12, MACD_9 

#### Stock Splits
1) Forward stock split (Stock split > 1): Commonly known as a bullish indicator which provides more liquidity (when fractional shares werent common)
2) Reverse stock split (0 < Stock split < 1): Bearish indicator
3) Days since stock forward stock split
4) Days since reverse stock split

In [39]:
def sma(series: pd.Series, window: int) -> pd.Series:
    """Simple Moving Average."""
    return series.rolling(window, min_periods=window).mean()

In [40]:
def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential Moving Average."""
    return series.ewm(span=span, adjust=False, min_periods=span).mean()

In [41]:
def rsi_wilder(series: pd.Series, period: int = 14) -> pd.Series:
    """Wilder's RSI (default 14)."""
    delta = series.diff()
    gain  = delta.clip(lower=0)
    loss  = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    avg_loss = loss.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [42]:
def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """
    MACD parts: returns DataFrame with MACD_Line, MACD_Signal, MACD_Hist.
    """
    ema_fast = ema(series, fast)
    ema_slow = ema(series, slow)
    macd_line = ema_fast - ema_slow
    macd_signal = macd_line.ewm(span=signal, adjust=False, min_periods=signal).mean()
    macd_hist = macd_line - macd_signal
    return pd.DataFrame(
        {"MACD_Line": macd_line, "MACD_Signal": macd_signal, "MACD_Hist": macd_hist},
        index=series.index
    )

In [44]:
temp_df = yfinance_df_filtered_relavant_stocks.copy()

# Ensure types/order; compute per-ticker
temp_df['Date'] = pd.to_datetime(temp_df['Date'])
temp_df = temp_df.drop_duplicates(subset=['ticker','Date']).sort_values(['ticker','Date'])

# Prefer adjusted close if available
price_col = 'Adj Close' if 'Adj Close' in temp_df.columns else 'Close'

g = temp_df.groupby('ticker', group_keys=False)

# SMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'SMA_{w}'] = g[price_col].transform(lambda s, w=w: sma(s, w))

# EMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'EMA_{w}'] = g[price_col].transform(lambda s, w=w: ema(s, w))

# RSI 14
temp_df['RSI'] = g[price_col].transform(rsi_wilder)

# MACD (12,26,9)
macd_df = g[price_col].apply(macd)
temp_df = temp_df.join(macd_df)

# Stock Splits
temp_df["is_forward_split"] = (yfinance_df["Stock Splits"] > 1).astype(int)
temp_df["is_reverse_split"] = ((yfinance_df["Stock Splits"] > 0) & 
                                   (yfinance_df["Stock Splits"] < 1)).astype(int) 

SP500_all_stock_data_Final = temp_df


SP500_all_stock_data_Final

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,0.0,A,0,...,,,,,,,,,0,0
1,2012-01-04,22.973931,23.107388,22.618040,22.999350,4651845,0.0,0.0,A,0,...,,,,,,,,,0,0
2,2012-01-05,22.802346,23.717490,22.700664,23.514126,6842651,0.0,0.0,A,0,...,,,,,,,,,0,0
3,2012-01-06,23.571322,23.870016,23.393379,23.768333,4711400,0.0,0.0,A,0,...,,,,,,,,,0,0
4,2012-01-09,23.908147,24.416561,23.812819,24.391140,4429563,0.0,0.0,A,0,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836856,2020-12-24,153.424264,155.295055,153.357115,154.191772,417400,0.0,0.0,ZTS,0,...,154.709183,154.477101,154.663485,151.699310,48.711796,-0.298371,-0.514776,0.216405,0,0
836857,2020-12-28,154.882506,156.216045,153.798400,155.793915,1522400,0.0,0.0,ZTS,0,...,154.755877,154.602512,154.707816,151.780391,52.813310,-0.160577,-0.443936,0.283359,0,0
836858,2020-12-29,156.580603,158.393833,155.803492,156.494247,1188400,0.0,0.0,ZTS,0,...,154.774984,154.782677,154.777872,151.873735,54.525236,0.005079,-0.354133,0.359212,0,0
836859,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,0.0,ZTS,0,...,154.828601,155.050761,154.888448,151.987078,57.161989,0.222820,-0.238742,0.461563,0,0


In [45]:
# we dont want to backward fill the data as it may introduce lookahead bias, 
# nor do we want to delete the entries, so for stocks that newly entered the SP500 with insufficient data we fill with 0
SP500_all_stock_data_Final.fillna(0, inplace=True)
SP500_all_stock_data_Final.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,0.0,A,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,2012-01-04,22.973931,23.107388,22.61804,22.99935,4651845,0.0,0.0,A,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2012-01-05,22.802346,23.71749,22.700664,23.514126,6842651,0.0,0.0,A,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2012-01-06,23.571322,23.870016,23.393379,23.768333,4711400,0.0,0.0,A,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,2012-01-09,23.908147,24.416561,23.812819,24.39114,4429563,0.0,0.0,A,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [None]:
# splitting the data appropriately into training, test and covid stress test data 
# Covid data 
training_data = SP500_all_stock_data_Final[SP500_all_stock_data_Final['Date'] < '2019-01-01']
