In [39]:
# importing necessary libraries
import pandas as pd
import yfinance as yf


In [40]:
spy_stocklist = pd.read_csv(r'raw_data/sp_500_historical_components.csv')

# Data cleaning
In this portion of the code we will 
1) Get the list of stocks that ever existed from sp_500_historical_components.csv from https://github.com/hanshof/sp500_constituents/blob/main/sp_500_historical_components.csv
2) Use yfinance library to get historical data (2012-2020) OHLCV, PE ratio, PB ratio from the lists of stocks, indicate their presence in the stock data at any point of time on a daily level. we get 2012 data for a buffer to calculate moving averages later on
3) Filter out stocks that newly entered the stock market before 2016, this is to ensure that we have sufficient training data for each stock

In [41]:

# Convert 'date' column to datetime
spy_stocklist['date'] = pd.to_datetime(spy_stocklist['date'])

# Ensure 'tickers' is string
spy_stocklist['tickers'] = spy_stocklist['tickers'].astype(str)
spy_stocklist

Unnamed: 0,date,tickers
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
...,...,...
3477,2025-08-19,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3478,2025-08-20,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3479,2025-08-21,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3480,2025-08-22,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."


In [42]:
# Filter date range for 2012-01-01 to 2021-01-07, we include the year 2012 to introduce a buffer for moving averages calculation
spy_stocklist_filtered = spy_stocklist[(spy_stocklist['date'] >= '2013-01-01') & (spy_stocklist['date'] <= '2020-12-31')]

In [43]:
spy_stocklist_filtered_expanded = (
    spy_stocklist_filtered.assign(ticker=spy_stocklist_filtered["tickers"].str.split(","))
      .explode("ticker")
      .drop(columns="tickers")
      .reset_index(drop=True)
)

spy_stocklist_filtered_expanded

Unnamed: 0,date,ticker
0,2013-01-02,A
1,2013-01-02,AAPL
2,2013-01-02,ABBV
3,2013-01-02,ABC
4,2013-01-02,ABT
...,...,...
343971,2020-12-21,YUM
343972,2020-12-21,ZBH
343973,2020-12-21,ZBRA
343974,2020-12-21,ZION


In [44]:
# update tickers symbols for known companies that changed their ticker symbols
updated_ticker_mapping = {
    'GOOGL': 'GOOG',  
    'FB': 'META',     
    'TWTR': 'X',      
}

In [45]:
## defining a function to replace the tickers

def update_tickers_in_df_from_dict(df, column, ticker_map):

    df_updated = df.copy()

    # Replace using pandas built-in mapping
    df_updated[column] = df_updated[column].replace(ticker_map)

    return df_updated

In [46]:
spy_stocklist_final = update_tickers_in_df_from_dict(spy_stocklist_filtered_expanded, "ticker", updated_ticker_mapping)
spy_stocklist_final[spy_stocklist_final['ticker'].isin(updated_ticker_mapping.values())]

Unnamed: 0,date,ticker
185,2013-01-02,GOOG
448,2013-01-02,X
643,2013-01-03,GOOG
906,2013-01-03,X
1101,2013-01-08,GOOG
...,...,...
343416,2020-11-17,X
343649,2020-12-21,META
343674,2020-12-21,GOOG
343675,2020-12-21,GOOG


In [47]:
### getting list of unique tickers that ever entered the SP500 between 2013-2020
unique_tickers = spy_stocklist_final['ticker'].unique().tolist()
print(f"Total unique tickers in S&P 500 from 2013 to 2020: {len(unique_tickers)}")
unique_tickers

Total unique tickers in S&P 500 from 2013 to 2020: 639


['A',
 'AAPL',
 'ABBV',
 'ABC',
 'ABT',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AET',
 'AFL',
 'AIG',
 'AIV',
 'AIZ',
 'AKAM',
 'ALL',
 'ALXN',
 'AMAT',
 'AMD',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'AN',
 'ANDV',
 'ANF',
 'ANTM',
 'AON',
 'APA',
 'APC',
 'APD',
 'APH',
 'APOL',
 'APTV',
 'ARG',
 'ATI',
 'AVB',
 'AVP',
 'AVY',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BAX',
 'BBBY',
 'BBT',
 'BBY',
 'BCR',
 'BDX',
 'BEN',
 'BF-B',
 'BIG',
 'BIIB',
 'BK',
 'BKNG',
 'BLK',
 'BLL',
 'BMS',
 'BMY',
 'BRCM',
 'BRK-B',
 'BSX',
 'BTUUQ',
 'BWA',
 'BXP',
 'C',
 'CA',
 'CAG',
 'CAH',
 'CAM',
 'CAT',
 'CB',
 'CBRE',
 'CBS',
 'CCI',
 'CCL',
 'CELG',
 'CERN',
 'CF',
 'CFN',
 'CHRW',
 'CI',
 'CINF',
 'CL',
 'CLF',
 'CLX',
 'CMA',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNP',
 'CNX',
 'COF',
 'COG',
 'COL',
 'COP',
 'COST',
 'COV',
 'CPB',
 'CRM',
 'CSCO',
 'CSX',
 'CTAS',
 'CTL',
 'CTSH',
 'CTXS',
 'CVC',
 'CVS',
 'CVX',
 'D',
 'DD',
 'DE',
 'DF',
 'DFS',
 'DG',
 'DGX',


In [48]:
### now we will use the yfinance library to get the historical data for these tickers that ever existed in the SP500 between 2013-2020
def get_stock_data(ticker, start_date, end_date):
    try:
        # Convert string to datetime so we can safely add 1 day, because yfinance end date is exclusive we add one day to include the end date
        end_date_dt = pd.to_datetime(end_date) + pd.Timedelta(days=1)
        
        stock = yf.Ticker(ticker)
        stock_data = stock.history(
            start=start_date,
            end=end_date_dt,
            auto_adjust=True,  # adjust for stock splits/dividends
            actions=True       # include Dividends and Stock Splits columns
        )
        
        if stock_data.empty:
            return pd.DataFrame()
        
        stock_data["ticker"] = ticker
        return stock_data.reset_index()

    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

In [None]:
start_date = '2012-01-01' # we start from 2012 to have a buffer for moving averages calculation
end_date = '2020-12-31' # we end at 2020-12-31, 2019-2020 will be held out for stress testing
yfinance_data = []
failed_tickers = []
successful_tickers = []
for ticker in unique_tickers:
    #print(f"Fetching data for {ticker}...")
    ticker_data = get_stock_data(ticker, start_date, end_date)
    if not ticker_data.empty:
        yfinance_data.append(ticker_data)
        successful_tickers.append(ticker)
    else:
        failed_tickers.append(ticker)

    

$ABC: possibly delisted; no timezone found
$ALXN: possibly delisted; no timezone found
$ANTM: possibly delisted; no timezone found
$APC: possibly delisted; no timezone found
$APOL: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$ARG: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$AVP: possibly delisted; no timezone found
$BCR: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$BIG: possibly delisted; no timezone found
$BLL: possibly delisted; no timezone found
$BRCM: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$BTUUQ: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$CA: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")
$CAM: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error 

In [None]:
# combine all individual DataFrames into one big DataFrame
yfinance_df = pd.concat(yfinance_data, ignore_index=True)

# optional: ensure 'Date' is a proper datetime column
yfinance_df["Date"] = pd.to_datetime(yfinance_df["Date"])

# preview result
yfinance_df.drop(columns=['Capital Gains'], inplace=True)
yfinance_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,2012-01-03 00:00:00-05:00,22.776925,23.507769,22.713373,23.183657,4156394,0.0,0.0,A
1,2012-01-04 00:00:00-05:00,22.973927,23.107384,22.618036,22.999346,4651845,0.0,0.0,A
2,2012-01-05 00:00:00-05:00,22.802342,23.717486,22.70066,23.514122,6842651,0.0,0.0,A
3,2012-01-06 00:00:00-05:00,23.571324,23.870017,23.39338,23.768335,4711400,0.0,0.0,A
4,2012-01-09 00:00:00-05:00,23.908141,24.416555,23.812814,24.391134,4429563,0.0,0.0,A


In [None]:
# checking for stock split price adjustment, for example apple, check the dates around 2020-08-31 when apple had a 4-for-1 stock split
yfinance_df[yfinance_df['Date'].between('2020-08-24', '2020-09-07') & (yfinance_df['ticker'] == 'AAPL')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
4439,2020-08-24 00:00:00-04:00,125.186359,125.27148,120.556227,122.423843,345937600,0.0,0.0,AAPL
4440,2020-08-25 00:00:00-04:00,121.295471,121.764805,119.695346,121.419487,211495600,0.0,0.0,AAPL
4441,2020-08-26 00:00:00-04:00,122.737516,123.52785,121.669955,123.070671,163022400,0.0,0.0,AAPL
4442,2020-08-27 00:00:00-04:00,123.673769,124.006924,120.454069,121.599449,155552400,0.0,0.0,AAPL
4443,2020-08-28 00:00:00-04:00,122.5746,122.992869,121.178753,121.402481,187630000,0.0,0.0,AAPL
4444,2020-08-31 00:00:00-04:00,124.099356,127.42605,122.56246,125.519516,225702700,0.0,4.0,AAPL
4445,2020-09-01 00:00:00-04:00,129.137984,131.122336,126.968828,130.519241,151948100,0.0,0.0,AAPL
4446,2020-09-02 00:00:00-04:00,133.836227,134.215586,123.53515,127.815102,200119000,0.0,0.0,AAPL
4447,2020-09-03 00:00:00-04:00,123.447601,125.324939,117.212478,117.582108,257599600,0.0,0.0,AAPL
4448,2020-09-04 00:00:00-04:00,116.79424,120.325203,107.864689,117.659958,332607200,0.0,0.0,AAPL


In [None]:
# Ensure both columns are in the same format
yfinance_df["Date"] = pd.to_datetime(yfinance_df["Date"]).dt.date
spy_stocklist_filtered_expanded["date"] = pd.to_datetime(spy_stocklist_filtered_expanded["date"]).dt.date

# Create a set of (date, ticker) pairs for fast lookup
spy_set = set(zip(spy_stocklist_filtered_expanded["date"], spy_stocklist_filtered_expanded["ticker"]))

# Add indicator column: 1 if (Date, Ticker) is in SPY list, else 0
yfinance_df["is_in_sp500"] = [
    1 if (d, t) in spy_set else 0 for d, t in zip(yfinance_df["Date"], yfinance_df["ticker"])
]

yfinance_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500
0,2012-01-03,22.776925,23.507769,22.713373,23.183657,4156394,0.0,0.0,A,0
1,2012-01-04,22.973927,23.107384,22.618036,22.999346,4651845,0.0,0.0,A,0
2,2012-01-05,22.802342,23.717486,22.70066,23.514122,6842651,0.0,0.0,A,0
3,2012-01-06,23.571324,23.870017,23.39338,23.768335,4711400,0.0,0.0,A,0
4,2012-01-09,23.908141,24.416555,23.812814,24.391134,4429563,0.0,0.0,A,0


In [None]:
# filtering stocks that entered the stock market after 2016-01-01 to ensure we have enough training data per stock

first_trade = yfinance_df.groupby("ticker")["Date"].min().reset_index()
first_trade["Date"] = pd.to_datetime(first_trade["Date"])
first_trade.columns = ["ticker", "first_trade_date"]

# keep only tickers that started before or on 2015-12-31
eligible_tickers = first_trade[first_trade["first_trade_date"] <= "2015-12-31"]["ticker"]

yfinance_df_filtered_relavant_stocks = filtered_df = yfinance_df[yfinance_df["ticker"].isin(eligible_tickers)].copy()


print(f"Total tickers after filtering: {eligible_tickers.nunique()}, tickers filtered out: {len(unique_tickers) - eligible_tickers.nunique()}")

Total tickers after filtering: 506, tickers filtered out: 133


In [None]:
# getting daily returns, monthly returns and adding to the dataframe
# we take 21 trading days as approximately 1 month

yfinance_df_filtered_relavant_stocks.sort_values(by=['ticker', 'Date'], inplace=True)
yfinance_df_filtered_relavant_stocks['daily_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change()
yfinance_df_filtered_relavant_stocks['monthly_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change(periods=21)

In [None]:
# getting monthly standard deviation (volatility) 
yfinance_df_filtered_relavant_stocks["monthly_var_3"] = (
    yfinance_df_filtered_relavant_stocks.groupby("ticker")["monthly_return"]
      .transform(lambda x: x.rolling(window=3, min_periods=3).var(ddof=1))
)

yfinance_df_filtered_relavant_stocks["daily_vol_21"] = (
    yfinance_df_filtered_relavant_stocks
    .groupby("ticker")["daily_return"]
    .transform(lambda x: x.rolling(window=21, min_periods=21).std(ddof=1))
)

### EDA for new stocks that entered after our 2013 cut off
- the problem with these stocks is that they might not have enough historical data to compute fields such as:
1) monthly returns
2) monthly variance (3 months)
3) EMA/SMA

In [None]:
start_date = pd.Timestamp("2013-01-01")

# ensure Date is Timestamp
yfinance_df_filtered_relavant_stocks["Date"] = pd.to_datetime(yfinance_df_filtered_relavant_stocks["Date"])

# find earliest date per ticker
first_dates = (
    yfinance_df_filtered_relavant_stocks
    .groupby("ticker")["Date"]
    .min()
    .reset_index(name="first_date")
)

# flag tickers that started trading after 2013-01-01
first_dates["entered_after_2013"] = (first_dates["first_date"] > start_date).astype(int)

# count and preview
num_after = first_dates["entered_after_2013"].sum()
num_total = len(first_dates)

print(f"{num_after} out of {num_total} tickers started trading after {start_date.date()} before 2015-12-31.")
print(first_dates.query("entered_after_2013 == 1").head())

25 out of 506 tickers started trading after 2013-01-01 before 2015-12-31.
   ticker first_date  entered_after_2013
4    ABBV 2013-01-02                   1
26   ALLE 2013-11-18                   1
38   ANET 2014-06-06                   1
88    CDW 2013-06-27                   1
91    CFG 2014-09-24                   1


### Creating new features
#### Moving averages
https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp we will use short:20 days, medium: 50 days, long: 100 days moving averages
1) Simple moving average SMA: SMA_20, SMA_50, SMA_100
2) Exponential Moving Average EMA https://www.investopedia.com/terms/e/ema.asp#toc-formula-for-exponential-moving-average-ema: EMA_20, EMA_50, EMA_100

#### RSI and MACD
1) Relative Strength Index 14 days https://www.investopedia.com/terms/r/rsi.asp : RSI
2) Moving Average Convergence/Divergence indicator: https://www.investopedia.com/terms/m/macd.asp: MACD_26, MACD_12, MACD_9 

#### Stock Splits
1) Forward stock split (Stock split > 1): Commonly known as a bullish indicator which provides more liquidity (when fractional shares werent common)
2) Reverse stock split (0 < Stock split < 1): Bearish indicator

In [None]:
def sma(series: pd.Series, window: int) -> pd.Series:
    """Simple Moving Average."""
    return series.rolling(window, min_periods=window).mean()

In [None]:
def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential Moving Average."""
    return series.ewm(span=span, adjust=False, min_periods=span).mean()

In [None]:
def rsi_wilder(series: pd.Series, period: int = 14) -> pd.Series:
    """Wilder's RSI (default 14)."""
    delta = series.diff()
    gain  = delta.clip(lower=0)
    loss  = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    avg_loss = loss.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [None]:
def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """
    MACD parts: returns DataFrame with MACD_Line, MACD_Signal, MACD_Hist.
    """
    ema_fast = ema(series, fast)
    ema_slow = ema(series, slow)
    macd_line = ema_fast - ema_slow
    macd_signal = macd_line.ewm(span=signal, adjust=False, min_periods=signal).mean()
    macd_hist = macd_line - macd_signal
    return pd.DataFrame(
        {"MACD_Line": macd_line, "MACD_Signal": macd_signal, "MACD_Hist": macd_hist},
        index=series.index
    )

In [None]:
temp_df = yfinance_df_filtered_relavant_stocks.copy()

# Ensure types/order; compute per-ticker
temp_df['Date'] = pd.to_datetime(temp_df['Date'])
temp_df = temp_df.drop_duplicates(subset=['ticker','Date']).sort_values(['ticker','Date'])


g = temp_df.groupby('ticker', group_keys=False)

# SMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'SMA_{w}'] = g['Close'].transform(lambda s, w=w: sma(s, w))

# EMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'EMA_{w}'] = g['Close'].transform(lambda s, w=w: ema(s, w))

# RSI 14
temp_df['RSI'] = g['Close'].transform(rsi_wilder)

# MACD (12,26,9)
macd_df = g['Close'].apply(macd)
temp_df = temp_df.join(macd_df)

# Stock Splits
temp_df["forward_split"] = yfinance_df["Stock Splits"].apply(
    lambda x: x if x > 1 else 0
)
temp_df["reverse_split"] = yfinance_df["Stock Splits"].apply(
    lambda x: 1 / x if 0 < x < 1 else 0
)

SP500_all_stock_data = temp_df


SP500_all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,forward_split,reverse_split
0,2012-01-03,22.776925,23.507769,22.713373,23.183657,4156394,0.0,0.0,A,0,...,,,,,,,,,0.0,0.0
1,2012-01-04,22.973927,23.107384,22.618036,22.999346,4651845,0.0,0.0,A,0,...,,,,,,,,,0.0,0.0
2,2012-01-05,22.802342,23.717486,22.700660,23.514122,6842651,0.0,0.0,A,0,...,,,,,,,,,0.0,0.0
3,2012-01-06,23.571324,23.870017,23.393380,23.768335,4711400,0.0,0.0,A,0,...,,,,,,,,,0.0,0.0
4,2012-01-09,23.908141,24.416555,23.812814,24.391134,4429563,0.0,0.0,A,0,...,,,,,,,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,153.424280,155.295071,153.357131,154.191788,417400,0.0,0.0,ZTS,0,...,154.709183,154.477101,154.663484,151.699310,48.711848,-0.298368,-0.514775,0.216407,0.0,0.0
839122,2020-12-28,154.882521,156.216060,153.798415,155.793930,1522400,0.0,0.0,ZTS,0,...,154.755877,154.602513,154.707815,151.780391,52.813336,-0.160573,-0.443935,0.283362,0.0,0.0
839123,2020-12-29,156.580633,158.393864,155.803522,156.494278,1188400,0.0,0.0,ZTS,0,...,154.774985,154.782681,154.777873,151.873736,54.525288,0.005084,-0.354131,0.359215,0.0,0.0
839124,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,0.0,ZTS,0,...,154.828601,155.050764,154.888448,151.987079,57.161955,0.222825,-0.238740,0.461565,0.0,0.0


In [None]:
# for daily prices we use spreads instead of absolute prices
SP500_all_stock_data['high_low_spread'] = (SP500_all_stock_data['High'] - SP500_all_stock_data['Low'])/SP500_all_stock_data['Close']
SP500_all_stock_data['open_close_spread'] = (SP500_all_stock_data['Open'] - SP500_all_stock_data['Close'])/SP500_all_stock_data['Close']

# for moving averages we use moving averages of returns instead of absolute prices
for w in [20, 50, 100]:
    SP500_all_stock_data[f"SMA_ret_{w}"] = g["daily_return"].transform(lambda s, w=w: s.rolling(window=w, min_periods=w).mean())

# EMA (Exponential Moving Average) of daily returns
for w in [20, 50, 100]:
    SP500_all_stock_data[f"EMA_ret_{w}"] = g["daily_return"].transform(lambda s, w=w: s.ewm(span=w, adjust=False, min_periods=w).mean())

In [None]:
# filter for stock data from 2013-01-01 onwards 
SP500_all_stock_data_final = SP500_all_stock_data[SP500_all_stock_data['Date'] >= pd.to_datetime('2013-01-01')]
SP500_all_stock_data_final

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,forward_split,reverse_split,high_low_spread,open_close_spread,SMA_ret_20,SMA_ret_50,SMA_ret_100,EMA_ret_20,EMA_ret_50,EMA_ret_100
250,2013-01-02,27.067743,27.067743,26.413034,26.881601,8790205,0.0,0.0,A,1,...,0.0,0.0,0.024355,0.006924,0.005231,0.002119,0.000592,0.004949,0.003209,0.001800
251,2013-01-03,26.920111,27.048486,26.689037,26.977880,5751791,0.0,0.0,A,1,...,0.0,0.0,0.013324,-0.002141,0.005134,0.002936,0.000526,0.004818,0.003224,0.001835
252,2013-01-04,27.048487,27.568403,26.868762,27.510633,6432897,0.0,0.0,A,0,...,0.0,0.0,0.025432,-0.016799,0.005426,0.003817,0.000689,0.006240,0.003872,0.002190
253,2013-01-07,27.343756,27.472131,27.202544,27.311663,3589505,0.0,0.0,A,0,...,0.0,0.0,0.009871,0.001175,0.005504,0.003533,0.000683,0.004957,0.003436,0.002003
254,2013-01-08,27.260297,27.459278,27.022805,27.093410,3896925,0.0,0.0,A,1,...,0.0,0.0,0.016110,0.006160,0.005770,0.003645,0.000640,0.003724,0.002988,0.001805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,153.424280,155.295071,153.357131,154.191788,417400,0.0,0.0,ZTS,0,...,0.0,0.0,0.012568,-0.004978,0.000207,-0.000220,0.000439,-0.000127,-0.000022,0.000498
839122,2020-12-28,154.882521,156.216060,153.798415,155.793930,1522400,0.0,0.0,ZTS,0,...,0.0,0.0,0.015518,-0.005850,0.000327,0.000070,0.000409,0.000875,0.000387,0.000694
839123,2020-12-29,156.580633,158.393864,155.803522,156.494278,1188400,0.0,0.0,ZTS,0,...,0.0,0.0,0.016552,0.000552,0.000889,0.000091,0.000225,0.001220,0.000548,0.000770
839124,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,0.0,ZTS,0,...,0.0,0.0,0.009984,-0.004627,0.000936,0.000667,0.000447,0.001775,0.000803,0.000894


In [None]:
len(SP500_all_stock_data_final)

1003227

In [None]:
SP500_all_stock_data_final.isna().sum()

Date                    0
Open                    0
High                    0
Low                     0
Close                   0
Volume                  0
Dividends               0
Stock Splits            0
ticker                  0
is_in_sp500             0
daily_return           25
monthly_return        525
monthly_var_3         575
daily_vol_21          525
SMA_20                475
SMA_50               1225
SMA_100              2521
EMA_20                475
EMA_50               1225
EMA_100              2521
RSI                   350
MACD_Line             625
MACD_Signal           825
MACD_Hist             825
forward_split           0
reverse_split           0
high_low_spread         0
open_close_spread       0
SMA_ret_20            500
SMA_ret_50           1250
SMA_ret_100          2547
EMA_ret_20            500
EMA_ret_50           1250
EMA_ret_100          2547
dtype: int64

In [None]:
# dropping new stock data with insufficient data to compute the fields
SP500_all_stock_data_final = SP500_all_stock_data_final.dropna()
len(SP500_all_stock_data_final)

1000680

In [None]:
SP500_all_stock_data_final.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'ticker', 'is_in_sp500', 'daily_return',
       'monthly_return', 'monthly_var_3', 'daily_vol_21', 'SMA_20', 'SMA_50',
       'SMA_100', 'EMA_20', 'EMA_50', 'EMA_100', 'RSI', 'MACD_Line',
       'MACD_Signal', 'MACD_Hist', 'forward_split', 'reverse_split',
       'high_low_spread', 'open_close_spread', 'SMA_ret_20', 'SMA_ret_50',
       'SMA_ret_100', 'EMA_ret_20', 'EMA_ret_50', 'EMA_ret_100'],
      dtype='object')

#### Stationarity checks
we will use ADF and KPSS at test at 95% confidence for stationarity checks and eliminate serial correlation through transformations if required. 

We use both for robustness of the checks

ADF GOAL: checks for unit root
ADF h0: has unit root 
adf h1: no unit root 

KPSS GOAL: checks the stationarity of a series
KPSS ho: series is trend stationary
KPSS h1: the series has unit root

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss
import numpy as np

In [None]:
def check_adf_all_columns(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    results = []

    for col in numeric_cols:
        series = df[col].dropna()

        adf_p = np.nan
        adf_verdict = "error"
        kpss_p = np.nan
        kpss_verdict = "error"
        combined_verdict = "inconclusive"

        # ADF Test
        try:
            p_val = adfuller(series, autolag="AIC")[1]  # uses AIC to determine optimal lags
            adf_p = p_val
            adf_verdict = "stationary" if p_val < 0.05 else "non_stationary"
        except Exception as e:
            adf_p = np.nan
            adf_verdict = f"adf_error: {e}"

        # KPSS Test
        # logic is inverted due to h0 being stationary for KPSS in contrast to ADF 
        try:
            kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, auto will use newey-west lags
            kpss_p = kpss_res[1]
            kpss_verdict = "stationary" if kpss_p >= 0.05 else "non_stationary"
        except Exception as e:
            kpss_p = np.nan
            kpss_verdict = f"kpss_error: {e}"

        # Combine ADF and KPSS results verdict
        if isinstance(adf_verdict, str) and isinstance(kpss_verdict, str):
            if adf_verdict == "stationary" and kpss_verdict == "stationary":
                combined_verdict = "stationary"
            elif adf_verdict == "non_stationary" and kpss_verdict == "non_stationary":
                combined_verdict = "non_stationary"
            elif adf_verdict.startswith("adf_error") or kpss_verdict.startswith("kpss_error"):
                combined_verdict = "inconclusive"
            else:
                # Disagreement → flag for manual review (trend, breaks, low power, etc.)
                combined_verdict = "ambiguous"

        results.append({
            "column": col,
            # ADF
            "ADF_p": adf_p,
            "ADF_Verdict": adf_verdict,
            # KPSS
            "KPSS_p": kpss_p,
            "KPSS_Verdict": kpss_verdict,
            # Combined
            "Combined_Verdict": combined_verdict
        })

    return pd.DataFrame(results)

In [None]:
# Using only one stock (appl) for ADF because my kernel keeps crashing and its computationally expensive :"
appl_stock_data = SP500_all_stock_data_final[SP500_all_stock_data_final['ticker'] == 'AAPL']
adf_results = check_adf_all_columns(appl_stock_data)
display(adf_results)

look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, auto will use newey-west lags
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, auto will use newey-west lags
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, auto will use newey-west lags
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, auto will use newey-west lags
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(series, regression="c", nlags="auto")  # c is chosen to test for level stationarity, 

Unnamed: 0,column,ADF_p,ADF_Verdict,KPSS_p,KPSS_Verdict,Combined_Verdict
0,Open,1.0,non_stationary,0.01,non_stationary,non_stationary
1,High,1.0,non_stationary,0.01,non_stationary,non_stationary
2,Low,1.0,non_stationary,0.01,non_stationary,non_stationary
3,Close,1.0,non_stationary,0.01,non_stationary,non_stationary
4,Volume,0.0003941899,stationary,0.01,non_stationary,ambiguous
5,Stock Splits,0.0,stationary,0.1,stationary,stationary
6,is_in_sp500,0.00162188,stationary,0.01,non_stationary,ambiguous
7,daily_return,8.005959e-27,stationary,0.1,stationary,stationary
8,monthly_return,4.571972e-09,stationary,0.099575,stationary,stationary
9,monthly_var_3,9.980824e-05,stationary,0.01,non_stationary,ambiguous


In [None]:
# dropping non-stationary columns
SP500_all_stock_data_final.drop(columns=['Volume','Open', 'High', 'Low', 'Close', 'SMA_20', 'SMA_50', 'SMA_100', 'EMA_20', 'EMA_50', 'EMA_100'], inplace=True)
SP500_all_stock_data_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SP500_all_stock_data_final.drop(columns=['Volume','Open', 'High', 'Low', 'Close', 'SMA_20', 'SMA_50', 'SMA_100', 'EMA_20', 'EMA_50', 'EMA_100'], inplace=True)


Unnamed: 0,Date,Dividends,Stock Splits,ticker,is_in_sp500,daily_return,monthly_return,monthly_var_3,daily_vol_21,RSI,...,forward_split,reverse_split,high_low_spread,open_close_spread,SMA_ret_20,SMA_ret_50,SMA_ret_100,EMA_ret_20,EMA_ret_50,EMA_ret_100
250,2013-01-02,0.0,0.0,A,1,0.022960,0.096439,0.000508,0.016450,64.959820,...,0.0,0.0,0.024355,0.006924,0.005231,0.002119,0.000592,0.004949,0.003209,0.001800
251,2013-01-03,0.0,0.0,A,1,0.003582,0.111103,0.000424,0.016129,65.650225,...,0.0,0.0,0.013324,-0.002141,0.005134,0.002936,0.000526,0.004818,0.003224,0.001835
252,2013-01-04,0.0,0.0,A,0,0.019748,0.126805,0.000231,0.016441,69.259555,...,0.0,0.0,0.025432,-0.016799,0.005426,0.003817,0.000689,0.006240,0.003872,0.002190
253,2013-01-07,0.0,0.0,A,0,-0.007233,0.103319,0.000143,0.016569,66.451190,...,0.0,0.0,0.009871,0.001175,0.005504,0.003533,0.000683,0.004957,0.003436,0.002003
254,2013-01-08,0.0,0.0,A,1,-0.007991,0.104214,0.000177,0.016537,63.413727,...,0.0,0.0,0.016110,0.006160,0.005770,0.003645,0.000640,0.003724,0.002988,0.001805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,0.0,0.0,ZTS,0,0.005443,-0.002111,0.000204,0.009227,48.711848,...,0.0,0.0,0.012568,-0.004978,0.000207,-0.000220,0.000439,-0.000127,-0.000022,0.000498
839122,2020-12-28,0.0,0.0,ZTS,0,0.010391,0.013734,0.000429,0.009411,52.813336,...,0.0,0.0,0.015518,-0.005850,0.000327,0.000070,0.000409,0.000875,0.000387,0.000694
839123,2020-12-29,0.0,0.0,ZTS,0,0.004495,0.010219,0.000069,0.009306,54.525288,...,0.0,0.0,0.016552,0.000552,0.000889,0.000091,0.000225,0.001220,0.000548,0.000770
839124,2020-12-30,0.0,0.0,ZTS,0,0.007050,0.024255,0.000053,0.009254,57.161955,...,0.0,0.0,0.009984,-0.004627,0.000936,0.000667,0.000447,0.001775,0.000803,0.000894


In [None]:
SP500_all_stock_data_final.isna().sum()

Date                 0
Dividends            0
Stock Splits         0
ticker               0
is_in_sp500          0
daily_return         0
monthly_return       0
monthly_var_3        0
daily_vol_21         0
RSI                  0
MACD_Line            0
MACD_Signal          0
MACD_Hist            0
forward_split        0
reverse_split        0
high_low_spread      0
open_close_spread    0
SMA_ret_20           0
SMA_ret_50           0
SMA_ret_100          0
EMA_ret_20           0
EMA_ret_50           0
EMA_ret_100          0
dtype: int64

#### Covid hold out set 
https://en.wikipedia.org/wiki/2020_stock_market_crash#:~:text=Though%20the%20crash%20began%20on,13%25%20in%20most%20global%20markets.

Covid stock crash happened on: 20 Feb
we will use 2013-01-02 (first trading day of 2013) to 2020-02-19 data as our training test set and 2020-02-19 to 2020-12-12 as our holdout set

Purging will happen later using 100 days during time-series CV

In [None]:
# splitting the data appropriately into training, test and covid stress test data 
# Covid start https://en.wikipedia.org/wiki/2020_stock_market_crash#:~:text=Though%20the%20crash%20began%20on,13%25%20in%20most%20global%20markets.
COVID_start_date = '2020-02-20'
training_data =  SP500_all_stock_data_final[SP500_all_stock_data_final['Date'] < pd.to_datetime(COVID_start_date)]
covid_stress_test_data = SP500_all_stock_data_final[SP500_all_stock_data_final['Date'] >= pd.to_datetime(COVID_start_date)]

In [None]:
# checking the newly listed stocks in the training data
first_trade_training = training_data.groupby("ticker")["Date"].min().reset_index()
first_trade_training["Date"] = pd.to_datetime(first_trade_training["Date"])
first_trade_training.columns = ["ticker", "first_trade_date"]
first_trade_training.query("first_trade_date >= '2013-01-01'")


Unnamed: 0,ticker,first_trade_date
0,A,2013-01-02
1,AAL,2013-01-02
2,AAP,2013-01-02
3,AAPL,2013-01-02
4,ABBV,2013-05-28
...,...,...
501,YUM,2013-01-02
502,ZBH,2013-01-02
503,ZBRA,2013-01-02
504,ZION,2013-01-02


In [None]:
training_data.columns

Index(['Date', 'Dividends', 'Stock Splits', 'ticker', 'is_in_sp500',
       'daily_return', 'monthly_return', 'monthly_var_3', 'daily_vol_21',
       'RSI', 'MACD_Line', 'MACD_Signal', 'MACD_Hist', 'forward_split',
       'reverse_split', 'high_low_spread', 'open_close_spread', 'SMA_ret_20',
       'SMA_ret_50', 'SMA_ret_100', 'EMA_ret_20', 'EMA_ret_50', 'EMA_ret_100'],
      dtype='object')