In [1]:
# importing necessary libraries
import pandas as pd
import yfinance as yf
from datetime import timedelta


In [2]:
spy_stocklist = pd.read_csv(r'raw_data/sp_500_historical_components.csv')

# Data cleaning
In this portion of the code we will 
1) Get the list of stocks that ever existed from sp_500_historical_components.csv from https://github.com/hanshof/sp500_constituents/blob/main/sp_500_historical_components.csv
2) Use yfinance library to get historical data (2012-2020) OHLCV, PE ratio, PB ratio from the lists of stocks, indicate their presence in the stock data at any point of time on a daily level. we get 2012 data for a buffer to calculate moving averages later on
3) Filter out stocks that newly entered the stock market before 2016, this is to ensure that we have sufficient training data for each stock

In [3]:

# Convert 'date' column to datetime
spy_stocklist['date'] = pd.to_datetime(spy_stocklist['date'])

# Ensure 'tickers' is string
spy_stocklist['tickers'] = spy_stocklist['tickers'].astype(str)
spy_stocklist

Unnamed: 0,date,tickers
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
...,...,...
3477,2025-08-19,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3478,2025-08-20,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3479,2025-08-21,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."
3480,2025-08-22,"A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP..."


In [4]:
# Filter date range for 2012-01-01 to 2021-01-07, we include the year 2012 to introduce a buffer for moving averages calculation
spy_stocklist_filtered = spy_stocklist[(spy_stocklist['date'] >= '2013-01-01') & (spy_stocklist['date'] <= '2021-01-08')]

spy_stocklist_filtered

Unnamed: 0,date,tickers
1908,2013-01-02,"A,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,ADM,ADP,ADSK,..."
1909,2013-01-03,"A,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,ADM,ADP,ADSK,..."
1910,2013-01-08,"A,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,ADM,ADP,ADSK,..."
1911,2013-01-14,"A,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,ADM,ADP,ADSK,..."
1912,2013-01-15,"A,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,ADM,ADP,ADSK,..."
...,...,...
2625,2020-10-07,"A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,ADI,..."
2626,2020-10-12,"A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,ADI,..."
2627,2020-11-17,"A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,ADI,..."
2628,2020-12-21,"A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,ADI,..."


In [None]:
# Split the tickers into lists
spy_stocklist_filtered_expanded_unexploded = (
    spy_stocklist_filtered.assign(
        ticker=spy_stocklist_filtered["tickers"].str.split(",")
    )
    .drop(columns="tickers")
    .reset_index(drop=True)
)

# Apply mapping to each list of tickers
updated_ticker_mapping = {
    'GOOGL': 'GOOG',
    'FB': 'META',
    'TWTR': 'X',
}

spy_stocklist_filtered_expanded_unexploded["ticker"] = (
    spy_stocklist_filtered_expanded_unexploded["ticker"]
    .apply(lambda lst: [updated_ticker_mapping.get(t, t) for t in lst])
)


# Explode the lists into separate rows
spy_stocklist_filtered_expanded = (
    spy_stocklist_filtered_expanded_unexploded
    .explode("ticker")
    .reset_index(drop=True)
)

spy_stocklist_filtered_expanded


Unnamed: 0,date,ticker
0,2013-01-02,A
1,2013-01-02,AAPL
2,2013-01-02,ABBV
3,2013-01-02,ABC
4,2013-01-02,ABT
...,...,...
344472,2021-01-07,YUM
344473,2021-01-07,ZBH
344474,2021-01-07,ZBRA
344475,2021-01-07,ZION


In [None]:
def get_monthly_survivors(df):
    df['year_month'] = df['date'].dt.to_period('M')
    
    # Get first date of each month
    month_starts = df.groupby('year_month')['date'].min().reset_index()
    month_starts.columns = ['year_month', 'start_date']
    
    results = []
    
    for i in range(len(month_starts) - 1):
        start = month_starts.loc[i, 'start_date']
        end = month_starts.loc[i + 1, 'start_date']

        # Filter month data
        month_df = df[(df['date'] >= start) & (df['date'] < end)]
        if month_df.empty:
            continue

        # All tickers ever observed in this month
        all_tickers = set().union(*month_df['ticker'])

        # Intersection across all days → surviving tickers
        surviving = set(month_df.iloc[0]['ticker'])
        for tickers in month_df['ticker']:
            surviving &= set(tickers)

        results.append({
            'date': start,
            'ticker_list': sorted(list(surviving)),
        })

    return pd.DataFrame(results)

survivors_df = get_monthly_survivors(spy_stocklist_filtered_expanded_unexploded)

# Convert 'date' to first day of the month and create 'month' column
survivors_df['month'] = survivors_df['date'].dt.to_period('M').dt.to_timestamp()
# Set 'month' as index
survivors_df.set_index('month', inplace=True)
# Create full month range
full_month_range = pd.date_range(start='2013-01-01', end='2020-12-31', freq='MS')
# Reindex to include all months 
survivors_df = survivors_df.reindex(full_month_range)

# Forward-fill missing months
survivors_df.ffill(inplace=True)
survivors_df = survivors_df.reset_index()
survivors_df.rename(columns={'index': 'month'}, inplace=True)

survivors_df

Unnamed: 0,month,date,ticker_list
0,2013-01-01,2013-01-02,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,..."
1,2013-02-01,2013-02-04,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,..."
2,2013-03-01,2013-03-11,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,..."
3,2013-04-01,2013-04-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,..."
4,2013-05-01,2013-05-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,..."
...,...,...,...
91,2020-08-01,2020-06-22,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
92,2020-09-01,2020-09-18,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
93,2020-10-01,2020-10-07,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
94,2020-11-01,2020-11-17,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."


In [8]:
### getting list of unique tickers that ever entered the SP500 between 2013-2020

spy_stocklist_final = spy_stocklist_filtered_expanded[
    (spy_stocklist_filtered_expanded['date'] >= '2013-01-01') &
    (spy_stocklist_filtered_expanded['date'] <= '2020-12-31')]

unique_tickers = spy_stocklist_final['ticker'].unique().tolist()
print(f"Total unique tickers in S&P 500 from 2013 to 2020: {len(unique_tickers)}")
unique_tickers

Total unique tickers in S&P 500 from 2013 to 2020: 639


['A',
 'AAPL',
 'ABBV',
 'ABC',
 'ABT',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AET',
 'AFL',
 'AIG',
 'AIV',
 'AIZ',
 'AKAM',
 'ALL',
 'ALXN',
 'AMAT',
 'AMD',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'AN',
 'ANDV',
 'ANF',
 'ANTM',
 'AON',
 'APA',
 'APC',
 'APD',
 'APH',
 'APOL',
 'APTV',
 'ARG',
 'ATI',
 'AVB',
 'AVP',
 'AVY',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BAX',
 'BBBY',
 'BBT',
 'BBY',
 'BCR',
 'BDX',
 'BEN',
 'BF-B',
 'BIG',
 'BIIB',
 'BK',
 'BKNG',
 'BLK',
 'BLL',
 'BMS',
 'BMY',
 'BRCM',
 'BRK-B',
 'BSX',
 'BTUUQ',
 'BWA',
 'BXP',
 'C',
 'CA',
 'CAG',
 'CAH',
 'CAM',
 'CAT',
 'CB',
 'CBRE',
 'CBS',
 'CCI',
 'CCL',
 'CELG',
 'CERN',
 'CF',
 'CFN',
 'CHRW',
 'CI',
 'CINF',
 'CL',
 'CLF',
 'CLX',
 'CMA',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNP',
 'CNX',
 'COF',
 'COG',
 'COL',
 'COP',
 'COST',
 'COV',
 'CPB',
 'CRM',
 'CSCO',
 'CSX',
 'CTAS',
 'CTL',
 'CTSH',
 'CTXS',
 'CVC',
 'CVS',
 'CVX',
 'D',
 'DD',
 'DE',
 'DF',
 'DFS',
 'DG',
 'DGX',


In [9]:
### now we will use the yfinance library to get the historical data for these tickers that ever existed in the SP500 between 2013-2020
def get_stock_data(ticker, start_date, end_date):
    try:
        # Convert string to datetime so we can safely add 1 day, because yfinance end date is exclusive we add one day to include the end date
        end_date_dt = pd.to_datetime(end_date) + pd.Timedelta(days=1)
        
        stock = yf.Ticker(ticker)
        stock_data = stock.history(
            start=start_date,
            end=end_date_dt,
            auto_adjust=True,  # adjust for stock splits/dividends
            actions=True       # include Dividends and Stock Splits columns
        )
        
        if stock_data.empty:
            return pd.DataFrame()
        
        stock_data["ticker"] = ticker
        return stock_data.reset_index()

    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

In [10]:
start_date = '2012-01-01' # we start from 2012 to have a buffer for moving averages calculation
end_date = '2020-12-31' # we end at 2020-12-31, 2019-2020 will be held out for stress testing
yfinance_data = []
failed_tickers = []
successful_tickers = []
for ticker in unique_tickers:
    print(f"Fetching data for {ticker}...")
    ticker_data = get_stock_data(ticker, start_date, end_date)
    if not ticker_data.empty:
        yfinance_data.append(ticker_data)
        successful_tickers.append(ticker)
    else:
        failed_tickers.append(ticker)

    

Fetching data for A...
Fetching data for AAPL...
Fetching data for ABBV...
Fetching data for ABC...


$ABC: possibly delisted; no timezone found


Fetching data for ABT...
Fetching data for ACN...
Fetching data for ADBE...
Fetching data for ADI...
Fetching data for ADM...
Fetching data for ADP...
Fetching data for ADSK...
Fetching data for AEE...
Fetching data for AEP...
Fetching data for AES...
Fetching data for AET...
Fetching data for AFL...
Fetching data for AIG...
Fetching data for AIV...
Fetching data for AIZ...
Fetching data for AKAM...
Fetching data for ALL...
Fetching data for ALXN...


$ALXN: possibly delisted; no timezone found


Fetching data for AMAT...
Fetching data for AMD...
Fetching data for AMGN...
Fetching data for AMP...
Fetching data for AMT...
Fetching data for AMZN...
Fetching data for AN...
Fetching data for ANDV...
Fetching data for ANF...
Fetching data for ANTM...


$ANTM: possibly delisted; no timezone found


Fetching data for AON...
Fetching data for APA...
Fetching data for APC...


$APC: possibly delisted; no timezone found


Fetching data for APD...
Fetching data for APH...


$APOL: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$ARG: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for APOL...
Fetching data for APTV...
Fetching data for ARG...
Fetching data for ATI...
Fetching data for AVB...
Fetching data for AVP...


$AVP: possibly delisted; no timezone found


Fetching data for AVY...
Fetching data for AXP...
Fetching data for AZO...
Fetching data for BA...
Fetching data for BAC...
Fetching data for BAX...
Fetching data for BBBY...
Fetching data for BBT...


$BCR: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for BBY...
Fetching data for BCR...
Fetching data for BDX...
Fetching data for BEN...
Fetching data for BF-B...
Fetching data for BIG...


$BIG: possibly delisted; no timezone found


Fetching data for BIIB...
Fetching data for BK...
Fetching data for BKNG...
Fetching data for BLK...
Fetching data for BLL...


$BLL: possibly delisted; no timezone found


Fetching data for BMS...
Fetching data for BMY...


$BRCM: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$BTUUQ: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for BRCM...
Fetching data for BRK-B...
Fetching data for BSX...
Fetching data for BTUUQ...
Fetching data for BWA...
Fetching data for BXP...
Fetching data for C...
Fetching data for CA...


$CA: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")


Fetching data for CAG...
Fetching data for CAH...
Fetching data for CAM...


$CAM: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")


Fetching data for CAT...
Fetching data for CB...
Fetching data for CBRE...
Fetching data for CBS...


$CBS: possibly delisted; no timezone found


Fetching data for CCI...
Fetching data for CCL...
Fetching data for CELG...


$CELG: possibly delisted; no timezone found


Fetching data for CERN...


$CERN: possibly delisted; no timezone found
$CFN: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for CF...
Fetching data for CFN...
Fetching data for CHRW...
Fetching data for CI...
Fetching data for CINF...
Fetching data for CL...
Fetching data for CLF...
Fetching data for CLX...
Fetching data for CMA...
Fetching data for CMCSA...
Fetching data for CME...
Fetching data for CMG...
Fetching data for CMI...
Fetching data for CMS...
Fetching data for CNP...
Fetching data for CNX...
Fetching data for COF...
Fetching data for COG...


$COG: possibly delisted; no timezone found


Fetching data for COL...
Fetching data for COP...
Fetching data for COST...
Fetching data for COV...
Fetching data for CPB...
Fetching data for CRM...
Fetching data for CSCO...
Fetching data for CSX...
Fetching data for CTAS...
Fetching data for CTL...


$CTL: possibly delisted; no timezone found


Fetching data for CTSH...
Fetching data for CTXS...


$CTXS: possibly delisted; no timezone found
$CVC: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for CVC...
Fetching data for CVS...
Fetching data for CVX...
Fetching data for D...
Fetching data for DD...
Fetching data for DE...
Fetching data for DF...


$DF: possibly delisted; no timezone found


Fetching data for DFS...


$DFS: possibly delisted; no timezone found


Fetching data for DG...
Fetching data for DGX...
Fetching data for DHI...
Fetching data for DHR...
Fetching data for DIS...
Fetching data for DISCA...


$DISCA: possibly delisted; no timezone found


Fetching data for DLTR...
Fetching data for DNR...


$DNR: possibly delisted; no timezone found


Fetching data for DOV...
Fetching data for DRI...
Fetching data for DTE...
Fetching data for DUK...
Fetching data for DVA...
Fetching data for DVN...
Fetching data for DXC...
Fetching data for EA...
Fetching data for EBAY...
Fetching data for ECL...
Fetching data for ED...
Fetching data for EFX...
Fetching data for EIX...
Fetching data for EL...
Fetching data for EMN...
Fetching data for EMR...
Fetching data for EOG...
Fetching data for EQR...
Fetching data for EQT...
Fetching data for ES...
Fetching data for ESRX...
Fetching data for ESV...


$ESV: possibly delisted; no timezone found


Fetching data for ETFC...


$ETFC: possibly delisted; no timezone found


Fetching data for ETN...
Fetching data for ETR...
Fetching data for EW...
Fetching data for EXC...
Fetching data for EXPD...
Fetching data for EXPE...
Fetching data for F...
Fetching data for FAST...


$FDO: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for FCX...
Fetching data for FDO...
Fetching data for FDX...
Fetching data for FE...
Fetching data for FFIV...
Fetching data for FHN...
Fetching data for FIS...
Fetching data for FISV...


$FISV: possibly delisted; no timezone found


Fetching data for FITB...
Fetching data for FLIR...


$FLIR: possibly delisted; no timezone found


Fetching data for FLR...
Fetching data for FLS...
Fetching data for FMC...
Fetching data for FRX...


$FRX: possibly delisted; no timezone found


Fetching data for FSLR...
Fetching data for FTI...
Fetching data for FTR...


$FTR: possibly delisted; no timezone found


Fetching data for GD...
Fetching data for GE...
Fetching data for GHC...
Fetching data for GILD...
Fetching data for GIS...
Fetching data for GLW...
Fetching data for GME...
Fetching data for GNW...
Fetching data for GOOG...
Fetching data for GPC...
Fetching data for GPS...


$GPS: possibly delisted; no timezone found


Fetching data for GRMN...
Fetching data for GS...
Fetching data for GT...
Fetching data for GWW...
Fetching data for HAL...
Fetching data for HAS...


$HCBK: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for HBAN...
Fetching data for HCBK...
Fetching data for HD...
Fetching data for HES...


$HES: possibly delisted; no timezone found


Fetching data for HIG...
Fetching data for HNZ...
Fetching data for HOG...
Fetching data for HON...
Fetching data for HP...
Fetching data for HPQ...
Fetching data for HRB...
Fetching data for HRL...
Fetching data for HRS...


$HRS: possibly delisted; no timezone found
$HSP: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for HSP...
Fetching data for HST...
Fetching data for HSY...
Fetching data for HUM...
Fetching data for IBM...
Fetching data for ICE...
Fetching data for IFF...
Fetching data for IGT...


$IGT: possibly delisted; no timezone found


Fetching data for INTC...
Fetching data for INTU...
Fetching data for IP...
Fetching data for IPG...
Fetching data for IRM...
Fetching data for ISRG...
Fetching data for ITW...
Fetching data for IVZ...
Fetching data for JBL...
Fetching data for JCI...
Fetching data for JCP...


$JCP: possibly delisted; no timezone found


Fetching data for JEC...


$JEC: possibly delisted; no timezone found


Fetching data for JEF...
Fetching data for JNJ...
Fetching data for JNPR...


$JNPR: possibly delisted; no timezone found
$JOY: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for JOY...
Fetching data for JPM...
Fetching data for JWN...


$JWN: possibly delisted; no timezone found


Fetching data for K...
Fetching data for KDP...
Fetching data for KEY...
Fetching data for KIM...
Fetching data for KLAC...
Fetching data for KMB...
Fetching data for KMI...
Fetching data for KMX...
Fetching data for KO...
Fetching data for KR...
Fetching data for KRFT...


$KRFT: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for KSS...
Fetching data for L...
Fetching data for LB...


$LB: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")


Fetching data for LDOS...
Fetching data for LEG...
Fetching data for LEN...
Fetching data for LH...


$LLTC: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for LLTC...
Fetching data for LLY...
Fetching data for LM...


$LM: possibly delisted; no timezone found


Fetching data for LMT...
Fetching data for LNC...


$LO: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for LO...
Fetching data for LOW...
Fetching data for LRCX...
Fetching data for LSI...


$LSI: possibly delisted; no timezone found


Fetching data for LUV...
Fetching data for LYB...
Fetching data for M...
Fetching data for MA...
Fetching data for MAR...
Fetching data for MAS...
Fetching data for MAT...
Fetching data for MCD...
Fetching data for MCHP...
Fetching data for MCK...
Fetching data for MCO...
Fetching data for MDLZ...
Fetching data for MDT...
Fetching data for MET...


$MJN: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for MJN...
Fetching data for MKC...
Fetching data for MMC...
Fetching data for MMM...
Fetching data for MNST...
Fetching data for MO...
Fetching data for MOLX...
Fetching data for MOS...
Fetching data for MPC...
Fetching data for MRK...
Fetching data for MRO...


$MRO: possibly delisted; no timezone found


Fetching data for MS...
Fetching data for MSFT...
Fetching data for MSI...
Fetching data for MTB...
Fetching data for MU...
Fetching data for MUR...


$MWV: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for MWV...
Fetching data for MYL...


$MYL: possibly delisted; no timezone found


Fetching data for NBL...


$NBL: possibly delisted; no timezone found


Fetching data for NBR...
Fetching data for NDAQ...
Fetching data for NEE...
Fetching data for NEM...
Fetching data for NFLX...
Fetching data for NFX...
Fetching data for NI...
Fetching data for NKE...
Fetching data for NOC...
Fetching data for NOV...
Fetching data for NRG...
Fetching data for NSC...
Fetching data for NTAP...
Fetching data for NTRS...
Fetching data for NUE...
Fetching data for NVDA...
Fetching data for NWL...


$NYX: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for NYX...
Fetching data for OI...
Fetching data for OKE...
Fetching data for OMC...
Fetching data for ORCL...
Fetching data for ORLY...
Fetching data for OXY...
Fetching data for PAYX...
Fetching data for PBCT...


$PBCT: possibly delisted; no timezone found


Fetching data for PBI...
Fetching data for PCAR...
Fetching data for PCG...
Fetching data for PDCO...


$PDCO: possibly delisted; no timezone found


Fetching data for PEG...
Fetching data for PEP...
Fetching data for PETM...
Fetching data for PFE...
Fetching data for PFG...
Fetching data for PG...
Fetching data for PGR...
Fetching data for PH...
Fetching data for PHM...
Fetching data for PKI...


$PKI: possibly delisted; no timezone found


Fetching data for PLD...
Fetching data for PM...
Fetching data for PNC...
Fetching data for PNR...
Fetching data for PNW...
Fetching data for PPG...
Fetching data for PPL...
Fetching data for PRGO...
Fetching data for PRU...
Fetching data for PSA...
Fetching data for PSX...
Fetching data for PWR...
Fetching data for PXD...


$PXD: possibly delisted; no timezone found


Fetching data for QCOM...
Fetching data for QEP...


$QEP: possibly delisted; no timezone found
$RAI: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for R...
Fetching data for RAI...
Fetching data for RDC...


$RDC: possibly delisted; no timezone found


Fetching data for RF...
Fetching data for RHI...
Fetching data for RHT...


$RHT: possibly delisted; no timezone found


Fetching data for RL...
Fetching data for ROK...
Fetching data for ROP...
Fetching data for ROST...
Fetching data for RRC...
Fetching data for RSG...
Fetching data for RTN...


$RTN: possibly delisted; no timezone found


Fetching data for SBUX...
Fetching data for SCG...
Fetching data for SCHW...
Fetching data for SEE...


$SIAL: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for SHW...
Fetching data for SIAL...
Fetching data for SJM...
Fetching data for SLB...
Fetching data for SLM...
Fetching data for SNA...
Fetching data for SNDK...


$SNDK: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")
$SNI: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for SNI...
Fetching data for SO...
Fetching data for SPG...


$SPLS: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for SPGI...
Fetching data for SPLS...
Fetching data for SRCL...


$SRCL: possibly delisted; no timezone found


Fetching data for SRE...
Fetching data for STI...


$STI: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")
$STJ: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for STJ...
Fetching data for STT...
Fetching data for STX...
Fetching data for STZ...
Fetching data for SWK...
Fetching data for SWN...


$SWN: possibly delisted; no timezone found
$SWY: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for SWY...
Fetching data for SYK...
Fetching data for SYMC...


$SYMC: possibly delisted; no timezone found


Fetching data for SYY...
Fetching data for T...
Fetching data for TAP...
Fetching data for TDC...
Fetching data for TE...
Fetching data for TEL...
Fetching data for TER...
Fetching data for TGNA...
Fetching data for TGT...
Fetching data for THC...
Fetching data for TIF...


$TIF: possibly delisted; no timezone found


Fetching data for TJX...
Fetching data for TMK...


$TMK: possibly delisted; no timezone found


Fetching data for TMO...
Fetching data for TMUS...
Fetching data for TPR...
Fetching data for TRIP...
Fetching data for TROW...
Fetching data for TRV...
Fetching data for TSN...
Fetching data for TSS...


$TSS: possibly delisted; no timezone found
$TWC: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for TWC...
Fetching data for TWX...
Fetching data for TXN...
Fetching data for TXT...
Fetching data for UNH...
Fetching data for UNM...
Fetching data for UNP...
Fetching data for UPS...
Fetching data for USB...
Fetching data for UTX...


$UTX: possibly delisted; no timezone found


Fetching data for V...
Fetching data for VAR...


$VAR: possibly delisted; no timezone found


Fetching data for VFC...
Fetching data for VIAB...


$VIAB: possibly delisted; no timezone found


Fetching data for VIAV...
Fetching data for VLO...
Fetching data for VMC...
Fetching data for VNO...
Fetching data for VRSN...
Fetching data for VTR...
Fetching data for VZ...
Fetching data for WAT...
Fetching data for WBA...
Fetching data for WDC...
Fetching data for WEC...
Fetching data for WELL...


$WFM: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for WFC...
Fetching data for WFM...
Fetching data for WHR...
Fetching data for WIN...


$WIN: possibly delisted; no timezone found


Fetching data for WM...
Fetching data for WMB...
Fetching data for WMT...
Fetching data for WPX...


$WPX: possibly delisted; no timezone found


Fetching data for WU...
Fetching data for WY...
Fetching data for WYNN...
Fetching data for X...


$X: possibly delisted; no timezone found


Fetching data for XEL...
Fetching data for XLNX...


$XLNX: possibly delisted; no timezone found


Fetching data for XOM...
Fetching data for XRAY...
Fetching data for XRX...
Fetching data for XYL...
Fetching data for YUM...
Fetching data for ZBH...
Fetching data for ZION...
Fetching data for PVH...
Fetching data for REGN...
Fetching data for MAC...
Fetching data for KSU...


$KSU: possibly delisted; no timezone found


Fetching data for GM...
Fetching data for ZTS...
Fetching data for NWSA...
Fetching data for NLSN...


$NLSN: possibly delisted; no timezone found


Fetching data for DAL...
Fetching data for AME...


$KORS: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for VRTX...
Fetching data for KORS...
Fetching data for ALLE...


$GGP: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for GGP...
Fetching data for ADS...


$ADS: possibly delisted; no timezone found


Fetching data for META...
Fetching data for MHK...
Fetching data for TSCO...


$GMCR: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for GMCR...
Fetching data for ESS...
Fetching data for NAVI...
Fetching data for UAA...
Fetching data for AVGO...
Fetching data for XEC...


$XEC: possibly delisted; no timezone found


Fetching data for AMG...
Fetching data for MLM...
Fetching data for DISCK...


$DISCK: possibly delisted; no timezone found


Fetching data for UHS...
Fetching data for URI...


$LVLT: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for LVLT...
Fetching data for RCL...
Fetching data for ENDP...


$ENDP: possibly delisted; no timezone found


Fetching data for HCA...
Fetching data for SWKS...
Fetching data for HSIC...
Fetching data for AAL...
Fetching data for EQIX...
Fetching data for HBI...
Fetching data for SLG...
Fetching data for O...
Fetching data for QRVO...
Fetching data for BXLT...


$BXLT: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)
$CPGX: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for JBHT...
Fetching data for CPGX...
Fetching data for WRK...


$WRK: possibly delisted; no timezone found


Fetching data for KHC...
Fetching data for AAP...
Fetching data for PYPL...
Fetching data for ATVI...


$ATVI: possibly delisted; no timezone found
$CMCSK: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00)


Fetching data for UAL...
Fetching data for CMCSK...
Fetching data for NWS...
Fetching data for VRSK...
Fetching data for HPE...
Fetching data for FCPT...
Fetching data for SYF...
Fetching data for ILMN...
Fetching data for CSRA...
Fetching data for CHD...
Fetching data for CPRI...
Fetching data for WLTW...


$WLTW: possibly delisted; no timezone found


Fetching data for EXR...
Fetching data for CFG...
Fetching data for FRT...
Fetching data for CXO...


$CXO: possibly delisted; no timezone found


Fetching data for AWK...
Fetching data for UDR...
Fetching data for CNC...
Fetching data for HOLX...
Fetching data for FL...
Fetching data for UA...
Fetching data for ULTA...
Fetching data for GPN...
Fetching data for AYI...
Fetching data for ALK...
Fetching data for DLR...
Fetching data for LKQ...
Fetching data for AJG...
Fetching data for TDG...
Fetching data for FBHS...


$FBHS: possibly delisted; no timezone found


Fetching data for ALB...
Fetching data for LNT...
Fetching data for FTV...
Fetching data for MTD...
Fetching data for CHTR...
Fetching data for COO...
Fetching data for COTY...
Fetching data for EVHC...
Fetching data for MAA...
Fetching data for IDXX...
Fetching data for INCY...
Fetching data for CBOE...
Fetching data for REG...
Fetching data for DISH...


$DISH: possibly delisted; no timezone found


Fetching data for SNPS...
Fetching data for ARE...
Fetching data for RJF...
Fetching data for IT...
Fetching data for INFO...


$INFO: possibly delisted; no price data found  (1d 2012-01-01 -> 2021-01-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1609477200")


Fetching data for ALGN...
Fetching data for ANSS...


$ANSS: possibly delisted; no timezone found


Fetching data for HLT...
Fetching data for RE...


$RE: possibly delisted; no timezone found


Fetching data for AOS...
Fetching data for DRE...


$DRE: possibly delisted; no timezone found


Fetching data for MGM...
Fetching data for PKG...
Fetching data for RMD...
Fetching data for BHF...
Fetching data for IQV...
Fetching data for DWDP...


$DWDP: possibly delisted; no timezone found


Fetching data for SBAC...
Fetching data for CDNS...
Fetching data for NCLH...
Fetching data for HII...
Fetching data for IPGP...
Fetching data for NKTR...
Fetching data for SIVB...


$SIVB: possibly delisted; no timezone found


Fetching data for TTWO...
Fetching data for MSCI...
Fetching data for ABMD...


$ABMD: possibly delisted; no timezone found


Fetching data for EVRG...
Fetching data for BR...
Fetching data for HFC...


$HFC: possibly delisted; no timezone found


Fetching data for FLT...


$FLT: possibly delisted; no timezone found


Fetching data for CPRT...
Fetching data for ANET...
Fetching data for WCG...


$WCG: possibly delisted; no timezone found


Fetching data for ROL...
Fetching data for FTNT...
Fetching data for KEYS...
Fetching data for LIN...
Fetching data for JKHY...
Fetching data for FANG...
Fetching data for LW...
Fetching data for MXIM...


$MXIM: possibly delisted; no timezone found


Fetching data for CE...
Fetching data for FRC...


$FRC: possibly delisted; no timezone found


Fetching data for TFX...
Fetching data for ATO...
Fetching data for WAB...
Fetching data for LHX...
Fetching data for CTVA...
Fetching data for AMCR...
Fetching data for MKTX...
Fetching data for GL...
Fetching data for IEX...
Fetching data for CDW...
Fetching data for NVR...
Fetching data for LVS...
Fetching data for BKR...
Fetching data for NLOK...


$NLOK: possibly delisted; no timezone found


Fetching data for PEAK...


$PEAK: possibly delisted; no timezone found


Fetching data for NOW...
Fetching data for VIAC...


$VIAC: possibly delisted; no timezone found


Fetching data for WRB...
Fetching data for ODFL...
Fetching data for TFC...
Fetching data for J...
Fetching data for LYV...
Fetching data for STE...
Fetching data for ZBRA...
Fetching data for PAYC...
Fetching data for TT...
Fetching data for CARR...
Fetching data for OTIS...
Fetching data for RTX...
Fetching data for HWM...
Fetching data for DPZ...
Fetching data for DXCM...
Fetching data for WST...
Fetching data for BIO...
Fetching data for TDY...
Fetching data for TYL...
Fetching data for LUMN...
Fetching data for CTLT...


$CTLT: possibly delisted; no timezone found


Fetching data for ETSY...
Fetching data for POOL...
Fetching data for VNT...
Fetching data for VTRS...
Fetching data for TSLA...


In [None]:
# combine all individual DataFrames into one big DataFrame
yfinance_df = pd.concat(yfinance_data, ignore_index=True)
yfinance_df["Date"] = pd.to_datetime(yfinance_df["Date"])

# preview result
yfinance_df.drop(columns=['Capital Gains'], inplace=True)
yfinance_df['Date'] = yfinance_df['Date'].dt.tz_localize(None)

yfinance_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,2012-01-03,22.776932,23.507777,22.713380,23.183664,4156394,0.0,0.0,A
1,2012-01-04,22.973933,23.107390,22.618042,22.999352,4651845,0.0,0.0,A
2,2012-01-05,22.802339,23.717482,22.700656,23.514118,6842651,0.0,0.0,A
3,2012-01-06,23.571320,23.870014,23.393377,23.768332,4711400,0.0,0.0,A
4,2012-01-09,23.908135,24.416549,23.812808,24.391129,4429563,0.0,0.0,A
...,...,...,...,...,...,...,...,...,...
1129054,2020-12-24,214.330002,222.029999,213.666672,220.589996,68596800,0.0,0.0,TSLA
1129055,2020-12-28,224.836670,227.133331,220.266663,221.229996,96835800,0.0,0.0,TSLA
1129056,2020-12-29,220.333328,223.300003,218.333328,221.996674,68732400,0.0,0.0,TSLA
1129057,2020-12-30,224.000000,232.199997,222.786667,231.593338,128538000,0.0,0.0,TSLA


In [12]:
# checking for stock split price adjustment, for example apple, check the dates around 2020-08-31 when apple had a 4-for-1 stock split
yfinance_df[yfinance_df['Date'].between('2020-08-24', '2020-09-07') & (yfinance_df['ticker'] == 'AAPL')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
4439,2020-08-24,125.186351,125.271473,120.556219,122.423836,345937600,0.0,0.0,AAPL
4440,2020-08-25,121.295471,121.764805,119.695346,121.419487,211495600,0.0,0.0,AAPL
4441,2020-08-26,122.737516,123.52785,121.669955,123.070671,163022400,0.0,0.0,AAPL
4442,2020-08-27,123.673769,124.006924,120.454069,121.599449,155552400,0.0,0.0,AAPL
4443,2020-08-28,122.574593,122.992862,121.178745,121.402473,187630000,0.0,0.0,AAPL
4444,2020-08-31,124.099341,127.426034,122.562445,125.519501,225702700,0.0,4.0,AAPL
4445,2020-09-01,129.137999,131.122352,126.968843,130.519257,151948100,0.0,0.0,AAPL
4446,2020-09-02,133.836203,134.215562,123.535127,127.815079,200119000,0.0,0.0,AAPL
4447,2020-09-03,123.447617,125.324955,117.212493,117.582123,257599600,0.0,0.0,AAPL
4448,2020-09-04,116.794217,120.325179,107.864668,117.659935,332607200,0.0,0.0,AAPL


In [None]:
#Membership Indicator 

# Create 'month' column 
yfinance_df['month'] = yfinance_df['Date'].dt.to_period('M').dt.to_timestamp()

# Merge with survivors_df on 'month'
merged_df = yfinance_df.merge(
    survivors_df[['month', 'ticker_list']],
    on='month',
    how='left'
)

# Check if ticker is in ticker_list for that month, 1 if yes, 0 if no
merged_df['is_in_sp500'] = merged_df.apply(
    lambda row: int(row['ticker'] in row['ticker_list']) if isinstance(row['ticker_list'], (list, set)) else 0,
    axis=1
)



merged_df.drop(columns=['ticker_list', 'month'], inplace=True)


In [None]:
#just to check
merged_df[merged_df['ticker'] == 'TSLA'].tail(25)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500
1129034,2020-11-25,183.353333,191.333328,181.789993,191.333328,146790600,0.0,0.0,TSLA,False
1129035,2020-11-27,193.720001,199.593338,192.816666,195.253326,112683300,0.0,0.0,TSLA,False
1129036,2020-11-30,200.736664,202.600006,184.83667,189.199997,189009300,0.0,0.0,TSLA,False
1129037,2020-12-01,199.196671,199.28334,190.683334,194.919998,120310500,0.0,0.0,TSLA,True
1129038,2020-12-02,185.479996,190.513336,180.403336,189.606674,143327100,0.0,0.0,TSLA,True
1129039,2020-12-03,196.67334,199.656662,194.143326,197.793335,127656000,0.0,0.0,TSLA,True
1129040,2020-12-04,197.003326,199.679993,195.166672,199.679993,88203900,0.0,0.0,TSLA,True
1129041,2020-12-07,201.639999,216.263336,201.016663,213.919998,168929100,0.0,0.0,TSLA,True
1129042,2020-12-08,208.503326,217.093338,206.166672,216.626663,192795000,0.0,0.0,TSLA,True
1129043,2020-12-09,217.896667,218.106674,196.0,201.493332,213873600,0.0,0.0,TSLA,True


In [21]:
# filtering stocks that entered the stock market after 2016-01-01 to ensure we have enough training data per stock

yfinance_df = merged_df
first_trade = yfinance_df.groupby("ticker")["Date"].min().reset_index()
first_trade["Date"] = pd.to_datetime(first_trade["Date"])
first_trade.columns = ["ticker", "first_trade_date"]

# keep only tickers that started before or on 2015-12-31
eligible_tickers = first_trade[first_trade["first_trade_date"] <= "2015-12-31"]["ticker"]

yfinance_df_filtered_relavant_stocks = filtered_df = yfinance_df[yfinance_df["ticker"].isin(eligible_tickers)].copy()


print(f"Total tickers after filtering: {eligible_tickers.nunique()}, tickers filtered out: {len(unique_tickers) - eligible_tickers.nunique()}")

Total tickers after filtering: 506, tickers filtered out: 133


In [22]:
# getting daily returns, monthly returns and adding to the dataframe
# we take 21 trading days as approximately 1 month

yfinance_df_filtered_relavant_stocks.sort_values(by=['ticker', 'Date'], inplace=True)
yfinance_df_filtered_relavant_stocks['daily_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change()
yfinance_df_filtered_relavant_stocks['monthly_return'] = yfinance_df_filtered_relavant_stocks.groupby('ticker')['Close'].pct_change(periods=21)

In [23]:
# getting monthly standard deviation (volatility) (TODO)
yfinance_df_filtered_relavant_stocks["monthly_var_3"] = (
    yfinance_df_filtered_relavant_stocks.groupby("ticker")["monthly_return"]
      .transform(lambda x: x.rolling(window=3, min_periods=3).var(ddof=1))
)

yfinance_df_filtered_relavant_stocks["daily_vol_20"] = (
    yfinance_df_filtered_relavant_stocks
    .groupby("ticker")["daily_return"]
    .transform(lambda x: x.rolling(window=20, min_periods=20).std(ddof=1))
)

### EDA for new stocks that entered after our 2013 cut off
- the problem with these stocks is that they might not have enough historical data to compute fields such as:
1) monthly returns
2) monthly variance (3 months)
3) EMA/SMA

In [24]:
start_date = pd.Timestamp("2013-01-01")

# ensure Date is Timestamp
yfinance_df_filtered_relavant_stocks["Date"] = pd.to_datetime(yfinance_df_filtered_relavant_stocks["Date"])

# find earliest date per ticker
first_dates = (
    yfinance_df_filtered_relavant_stocks
    .groupby("ticker")["Date"]
    .min()
    .reset_index(name="first_date")
)

# flag tickers that started trading after 2013-01-01
first_dates["entered_after_2013"] = (first_dates["first_date"] > start_date).astype(int)

# count and preview
num_after = first_dates["entered_after_2013"].sum()
num_total = len(first_dates)

print(f"{num_after} out of {num_total} tickers started trading after {start_date.date()} before 2015-12-31.")
print(first_dates.query("entered_after_2013 == 1").head())

25 out of 506 tickers started trading after 2013-01-01 before 2015-12-31.
   ticker first_date  entered_after_2013
4    ABBV 2013-01-02                   1
26   ALLE 2013-11-18                   1
38   ANET 2014-06-06                   1
88    CDW 2013-06-27                   1
91    CFG 2014-09-24                   1


### Creating new features
#### Moving averages
https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp we will use short:20 days, medium: 50 days, long: 100 days moving averages
1) Simple moving average SMA: SMA_20, SMA_50, SMA_100
2) Exponential Moving Average EMA https://www.investopedia.com/terms/e/ema.asp#toc-formula-for-exponential-moving-average-ema: EMA_20, EMA_50, EMA_100

#### RSI and MACD
1) Relative Strength Index 14 days https://www.investopedia.com/terms/r/rsi.asp : RSI
2) Moving Average Convergence/Divergence indicator: https://www.investopedia.com/terms/m/macd.asp: MACD_26, MACD_12, MACD_9 

#### Stock Splits
1) Forward stock split (Stock split > 1): Commonly known as a bullish indicator which provides more liquidity (when fractional shares werent common)
2) Reverse stock split (0 < Stock split < 1): Bearish indicator

In [25]:
def sma(series: pd.Series, window: int) -> pd.Series:
    """Simple Moving Average."""
    return series.rolling(window, min_periods=window).mean()

In [26]:
def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential Moving Average."""
    return series.ewm(span=span, adjust=False, min_periods=span).mean()

In [27]:
def rsi_wilder(series: pd.Series, period: int = 14) -> pd.Series:
    """Wilder's RSI (default 14)."""
    delta = series.diff()
    gain  = delta.clip(lower=0)
    loss  = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    avg_loss = loss.ewm(alpha=1/period, adjust=False, min_periods=period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [28]:
def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """
    MACD parts: returns DataFrame with MACD_Line, MACD_Signal, MACD_Hist.
    """
    ema_fast = ema(series, fast)
    ema_slow = ema(series, slow)
    macd_line = ema_fast - ema_slow
    macd_signal = macd_line.ewm(span=signal, adjust=False, min_periods=signal).mean()
    macd_hist = macd_line - macd_signal
    return pd.DataFrame(
        {"MACD_Line": macd_line, "MACD_Signal": macd_signal, "MACD_Hist": macd_hist},
        index=series.index
    )

In [29]:
temp_df = yfinance_df_filtered_relavant_stocks.copy()

# Ensure types/order; compute per-ticker
temp_df['Date'] = pd.to_datetime(temp_df['Date'])
temp_df = temp_df.drop_duplicates(subset=['ticker','Date']).sort_values(['ticker','Date'])


g = temp_df.groupby('ticker', group_keys=False)

# SMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'SMA_{w}'] = g['Close'].transform(lambda s, w=w: sma(s, w))

# EMA 20/50/100
for w in [20, 50, 100]:
    temp_df[f'EMA_{w}'] = g['Close'].transform(lambda s, w=w: ema(s, w))

# RSI 14
temp_df['RSI'] = g['Close'].transform(rsi_wilder)

# MACD (12,26,9)
macd_df = g['Close'].apply(macd)
temp_df = temp_df.join(macd_df)

# Stock Splits
temp_df["is_forward_split"] = (yfinance_df["Stock Splits"] > 1).astype(int)
temp_df["is_reverse_split"] = ((yfinance_df["Stock Splits"] > 0) & 
                                   (yfinance_df["Stock Splits"] < 1)).astype(int) 

SP500_all_stock_data = temp_df


SP500_all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split
0,2012-01-03,22.776932,23.507777,22.713380,23.183664,4156394,0.0,0.0,A,False,...,,,,,,,,,0,0
1,2012-01-04,22.973933,23.107390,22.618042,22.999352,4651845,0.0,0.0,A,False,...,,,,,,,,,0,0
2,2012-01-05,22.802339,23.717482,22.700656,23.514118,6842651,0.0,0.0,A,False,...,,,,,,,,,0,0
3,2012-01-06,23.571320,23.870014,23.393377,23.768332,4711400,0.0,0.0,A,False,...,,,,,,,,,0,0
4,2012-01-09,23.908135,24.416549,23.812808,24.391129,4429563,0.0,0.0,A,False,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,153.424280,155.295071,153.357131,154.191788,417400,0.0,0.0,ZTS,True,...,154.709181,154.477098,154.663481,151.699308,48.711857,-0.298368,-0.514776,0.216408,0,0
839122,2020-12-28,154.882521,156.216060,153.798415,155.793930,1522400,0.0,0.0,ZTS,True,...,154.755876,154.602511,154.707813,151.780389,52.813339,-0.160573,-0.443935,0.283362,0,0
839123,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,0.0,ZTS,True,...,154.774984,154.782681,154.777871,151.873734,54.525326,0.005086,-0.354131,0.359217,0,0
839124,2020-12-30,156.868441,158.106051,156.532666,157.597580,1009000,0.0,0.0,ZTS,True,...,154.828600,155.050766,154.888448,151.987078,57.162020,0.222829,-0.238739,0.461568,0,0


In [30]:
# we dont want to backward fill the data as it may introduce lookahead bias, 
# nor do we want to delete the entries, so for stocks that newly entered the SP500 with insufficient data we fill with 0
SP500_all_stock_data.fillna(0, inplace=True)
SP500_all_stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,...,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split
0,2012-01-03,22.776932,23.507777,22.71338,23.183664,4156394,0.0,0.0,A,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,2012-01-04,22.973933,23.10739,22.618042,22.999352,4651845,0.0,0.0,A,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2012-01-05,22.802339,23.717482,22.700656,23.514118,6842651,0.0,0.0,A,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2012-01-06,23.57132,23.870014,23.393377,23.768332,4711400,0.0,0.0,A,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,2012-01-09,23.908135,24.416549,23.812808,24.391129,4429563,0.0,0.0,A,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [31]:
# for daily prices we use spreads instead of absolute prices
SP500_all_stock_data['high_low_spread'] = (SP500_all_stock_data['High'] - SP500_all_stock_data['Low'])/SP500_all_stock_data['Close']
SP500_all_stock_data['open_close_spread'] = (SP500_all_stock_data['Open'] - SP500_all_stock_data['Close'])/SP500_all_stock_data['Close']

# for moving averages we use moving averages of returns instead of absolute prices
for w in [20, 50, 100]:
    SP500_all_stock_data[f"SMA_ret_{w}"] = g["daily_return"].transform(lambda s, w=w: s.rolling(window=w, min_periods=w).mean())

# EMA (Exponential Moving Average) of daily returns
for w in [20, 50, 100]:
    SP500_all_stock_data[f"EMA_ret_{w}"] = g["daily_return"].transform(lambda s, w=w: s.ewm(span=w, adjust=False, min_periods=w).mean())

In [32]:
# filter for stocks from 2013-01-01 onwards 
pd.set_option('display.max_columns', None)  # Show all columns

SP500_all_stock_data_final = SP500_all_stock_data[SP500_all_stock_data['Date'] >= pd.to_datetime('2013-01-01')]
SP500_all_stock_data_final

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,is_in_sp500,daily_return,monthly_return,monthly_var_3,daily_vol_20,SMA_20,SMA_50,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split,high_low_spread,open_close_spread,SMA_ret_20,SMA_ret_50,SMA_ret_100,EMA_ret_20,EMA_ret_50,EMA_ret_100
250,2013-01-02,27.067745,27.067745,26.413036,26.881603,8790205,0.0,0.0,A,True,0.022961,0.096440,0.000508,0.016544,25.679973,24.407815,24.462087,25.703243,24.966329,24.735125,64.959757,0.571169,0.559055,0.012114,0,0,0.024355,0.006924,0.005231,0.002119,0.000592,0.004949,0.003209,0.001800
251,2013-01-03,26.920111,27.048486,26.689037,26.977880,5751791,0.0,0.0,A,True,0.003582,0.111103,0.000424,0.016548,25.808131,24.478159,24.472193,25.824637,25.045213,24.779536,65.650147,0.602699,0.567784,0.034915,0,0,0.013324,-0.002141,0.005134,0.002936,0.000526,0.004818,0.003224,0.001835
252,2013-01-04,27.048488,27.568405,26.868764,27.510635,6432897,0.0,0.0,A,True,0.019748,0.126805,0.000231,0.016762,25.945959,24.570555,24.486733,25.985208,25.141896,24.833617,69.259481,0.663032,0.586833,0.076199,0,0,0.025432,-0.016799,0.005426,0.003817,0.000689,0.006240,0.003872,0.002190
253,2013-01-07,27.343750,27.472125,27.202538,27.311657,3589505,0.0,0.0,A,True,-0.007233,0.103319,0.000143,0.016695,26.084723,24.655771,24.501008,26.111536,25.226985,24.882687,66.451028,0.686873,0.606841,0.080032,0,0,0.009871,0.001175,0.005504,0.003533,0.000683,0.004957,0.003436,0.002003
254,2013-01-08,27.260307,27.459288,27.022814,27.093420,3896925,0.0,0.0,A,True,-0.007991,0.104214,0.000177,0.016420,26.228903,24.742896,24.514058,26.205049,25.300178,24.926464,63.413792,0.680316,0.621536,0.058779,0,0,0.016110,0.006160,0.005770,0.003645,0.000640,0.003724,0.002988,0.001805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,153.424280,155.295071,153.357131,154.191788,417400,0.0,0.0,ZTS,True,0.005443,-0.002111,0.000204,0.009383,153.626688,155.672889,154.709181,154.477098,154.663481,151.699308,48.711857,-0.298368,-0.514776,0.216408,0,0,0.012568,-0.004978,0.000207,-0.000220,0.000439,-0.000127,-0.000022,0.000498
839122,2020-12-28,154.882521,156.216060,153.798415,155.793930,1522400,0.0,0.0,ZTS,True,0.010391,0.013734,0.000429,0.009502,153.670818,155.665286,154.755876,154.602511,154.707813,151.780389,52.813339,-0.160573,-0.443935,0.283362,0,0,0.015518,-0.005850,0.000327,0.000070,0.000409,0.000875,0.000387,0.000694
839123,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,0.0,ZTS,True,0.004495,0.010219,0.000069,0.009394,153.802254,155.660959,154.774984,154.782681,154.777871,151.873734,54.525326,0.005086,-0.354131,0.359217,0,0,0.016552,0.000552,0.000889,0.000091,0.000225,0.001220,0.000548,0.000770
839124,2020-12-30,156.868441,158.106051,156.532666,157.597580,1009000,0.0,0.0,ZTS,True,0.007050,0.024255,0.000053,0.009423,153.941843,155.746919,154.828600,155.050766,154.888448,151.987078,57.162020,0.222829,-0.238739,0.461568,0,0,0.009984,-0.004627,0.000936,0.000667,0.000447,0.001775,0.000803,0.000894


#### Stationarity checks
we will use ADF test for stationarity checks and eliminate serial correlation through transformations if required

In [33]:
from statsmodels.tsa.stattools import adfuller
import numpy as np

In [34]:
def check_adf_all_columns(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    results = []

    for col in numeric_cols:
        series = df[col].dropna()
        try:
            p_val = adfuller(series, autolag="AIC")[1]
            verdict = "stationary" if p_val < 0.05 else "non_stationary"
        except Exception as e:
            p_val = np.nan
            verdict = f"error: {e}"

        results.append({
            "column": col,
            "ADF_p": p_val,
            "Verdict": verdict
        })

    return pd.DataFrame(results)

In [35]:
# Using only one stock (appl) for ADF because my kernel keeps crashing and its computationally expensive :"
appl_stock_data = SP500_all_stock_data_final[SP500_all_stock_data_final['ticker'] == 'AAPL']
adf_results = check_adf_all_columns(appl_stock_data)
display(adf_results)

Unnamed: 0,column,ADF_p,Verdict
0,Open,1.0,non_stationary
1,High,1.0,non_stationary
2,Low,1.0,non_stationary
3,Close,1.0,non_stationary
4,Volume,0.0003941899,stationary
5,Stock Splits,0.0,stationary
6,daily_return,8.005653e-27,stationary
7,monthly_return,4.571881e-09,stationary
8,monthly_var_3,9.980672e-05,stationary
9,daily_vol_20,2.819014e-06,stationary


In [36]:
# dropping non-stationary columns
SP500_all_stock_data_final.drop(columns=['Open', 'High', 'Low', 'Close', 'SMA_20', 'SMA_50', 'SMA_100', 'EMA_20', 'EMA_50', 'EMA_100'], inplace=True)
SP500_all_stock_data_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SP500_all_stock_data_final.drop(columns=['Open', 'High', 'Low', 'Close', 'SMA_20', 'SMA_50', 'SMA_100', 'EMA_20', 'EMA_50', 'EMA_100'], inplace=True)


Unnamed: 0,Date,Volume,Dividends,Stock Splits,ticker,is_in_sp500,daily_return,monthly_return,monthly_var_3,daily_vol_20,RSI,MACD_Line,MACD_Signal,MACD_Hist,is_forward_split,is_reverse_split,high_low_spread,open_close_spread,SMA_ret_20,SMA_ret_50,SMA_ret_100,EMA_ret_20,EMA_ret_50,EMA_ret_100
250,2013-01-02,8790205,0.0,0.0,A,True,0.022961,0.096440,0.000508,0.016544,64.959757,0.571169,0.559055,0.012114,0,0,0.024355,0.006924,0.005231,0.002119,0.000592,0.004949,0.003209,0.001800
251,2013-01-03,5751791,0.0,0.0,A,True,0.003582,0.111103,0.000424,0.016548,65.650147,0.602699,0.567784,0.034915,0,0,0.013324,-0.002141,0.005134,0.002936,0.000526,0.004818,0.003224,0.001835
252,2013-01-04,6432897,0.0,0.0,A,True,0.019748,0.126805,0.000231,0.016762,69.259481,0.663032,0.586833,0.076199,0,0,0.025432,-0.016799,0.005426,0.003817,0.000689,0.006240,0.003872,0.002190
253,2013-01-07,3589505,0.0,0.0,A,True,-0.007233,0.103319,0.000143,0.016695,66.451028,0.686873,0.606841,0.080032,0,0,0.009871,0.001175,0.005504,0.003533,0.000683,0.004957,0.003436,0.002003
254,2013-01-08,3896925,0.0,0.0,A,True,-0.007991,0.104214,0.000177,0.016420,63.413792,0.680316,0.621536,0.058779,0,0,0.016110,0.006160,0.005770,0.003645,0.000640,0.003724,0.002988,0.001805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839121,2020-12-24,417400,0.0,0.0,ZTS,True,0.005443,-0.002111,0.000204,0.009383,48.711857,-0.298368,-0.514776,0.216408,0,0,0.012568,-0.004978,0.000207,-0.000220,0.000439,-0.000127,-0.000022,0.000498
839122,2020-12-28,1522400,0.0,0.0,ZTS,True,0.010391,0.013734,0.000429,0.009502,52.813339,-0.160573,-0.443935,0.283362,0,0,0.015518,-0.005850,0.000327,0.000070,0.000409,0.000875,0.000387,0.000694
839123,2020-12-29,1188400,0.0,0.0,ZTS,True,0.004495,0.010219,0.000069,0.009394,54.525326,0.005086,-0.354131,0.359217,0,0,0.016552,0.000552,0.000889,0.000091,0.000225,0.001220,0.000548,0.000770
839124,2020-12-30,1009000,0.0,0.0,ZTS,True,0.007050,0.024255,0.000053,0.009423,57.162020,0.222829,-0.238739,0.461568,0,0,0.009984,-0.004627,0.000936,0.000667,0.000447,0.001775,0.000803,0.000894


In [37]:
SP500_all_stock_data_final.isna().sum()

Date                    0
Volume                  0
Dividends               0
Stock Splits            0
ticker                  0
is_in_sp500             0
daily_return            0
monthly_return          0
monthly_var_3           0
daily_vol_20            0
RSI                     0
MACD_Line               0
MACD_Signal             0
MACD_Hist               0
is_forward_split        0
is_reverse_split        0
high_low_spread         0
open_close_spread       0
SMA_ret_20            500
SMA_ret_50           1250
SMA_ret_100          2547
EMA_ret_20            500
EMA_ret_50           1250
EMA_ret_100          2547
dtype: int64

#### Covid hold out set 
https://en.wikipedia.org/wiki/2020_stock_market_crash#:~:text=Though%20the%20crash%20began%20on,13%25%20in%20most%20global%20markets.

Covid stock crash happened on: 20 Feb
we will use 2013-01-01 to 2020-02-19 data as our training test set and 2020-02-19 to 2020-12-12 as our holdout set

Purging will happen later using 100 days during time-series CV

In [38]:
# splitting the data appropriately into training, test and covid stress test data 
# Covid start https://en.wikipedia.org/wiki/2020_stock_market_crash#:~:text=Though%20the%20crash%20began%20on,13%25%20in%20most%20global%20markets.
COVID_start_date = '2020-02-20'
training_data =  SP500_all_stock_data_final[SP500_all_stock_data_final['Date'] < pd.to_datetime(COVID_start_date)]
covid_stress_test_data = SP500_all_stock_data_final[SP500_all_stock_data_final['Date'] >= pd.to_datetime(COVID_start_date)]