In [1]:
# importing necessary libraries
import pandas as pd
import yfinance as yf
import requests
from datetime import timedelta


In [2]:
df = pd.read_csv(r'raw_data/sp_500_historical_components.csv')

# Data cleaning
In this portion of the code we will:
1) Get the list of every stock that was ever an S&P 500 constituent, from sp_500_historical_components.csv (source: https://github.com/hanshof/sp500_constituents/blob/main/sp_500_historical_components.csv)
2) Use the yfinance library to get historical data (2013-2020) — OHLCV, PE ratio, PB ratio — for the stocks in that list, and indicate their presence in the stock data at any point in time on a daily level
3) Only use stocks that are listed from the start of a month through to the end of that month, e.g. if a stock enters the stock market on 4 Jan 2016, we will only use its data from 1 Feb 2016 onwards
4) Filter out stocks that newly entered the stock market before 2016; this is to ensure that we have sufficient training data for each stock

In [3]:
# Firstly, we will get the list of all stocks that ever existed in the S&P 500 from 2013 to 2020

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Ensure 'tickers' is string
# NOTE(review): astype(str) turns a missing value into the literal string 'nan',
# which parse_ticker_list would then treat as a ticker — confirm the CSV has no
# empty 'tickers' cells.
df['tickers'] = df['tickers'].astype(str)

# Filter date range (2012 is kept as a lead-in year for the indicator warm-up).
# `.copy()` makes `df` an independent frame, consistent with `df2`, so the column
# assignments in later cells do not write into a slice of the raw CSV frame.
df = df[(df['date'] >= '2012-01-01') & (df['date'] < '2021-01-08')].copy()

# For 2013-2020 analysis only. `df` is already bounded above by 2021-01-08,
# so only the lower bound has to be applied here.
df2 = df[df['date'] >= '2013-01-01'].copy()


In [4]:
# Function to parse tickers from CSV string
def parse_ticker_list(s):
    """
    Split one raw 'tickers' cell into a list of ticker symbols.

    Parameters:
        s: a comma-separated string of tickers, optionally wrapped in one pair
           of double quotes; a missing value (None / NaN) is also accepted.

    Returns:
        list[str]: the non-empty, whitespace-stripped symbols in their original
        order; an empty list for a missing value.
    """
    if pd.isna(s):
        return []

    text = s.strip()

    # Drop one pair of enclosing double quotes, if the whole cell is quoted.
    is_quoted = text.startswith('"') and text.endswith('"')
    if is_quoted:
        text = text[1:-1]

    symbols = []
    for piece in text.split(','):
        symbol = piece.strip()
        if symbol != '':
            symbols.append(symbol)
    return symbols

# Apply parsing: turn each raw 'tickers' string into a Python list of symbols
df['ticker_list'] = df['tickers'].apply(parse_ticker_list)
df2['ticker_list'] = df2['tickers'].apply(parse_ticker_list)    #For 2013-2020 analysis only

# Keep only relevant columns.
# `.copy()` detaches the two-column frames from their parents, so the column
# assignments made in the following cells operate on independent frames instead
# of on a column subset (which pandas flags with SettingWithCopyWarning).
df = df[['date', 'ticker_list']].copy()
df2 = df2[['date', 'ticker_list']].copy()  #For 2013-2020 analysis only


In [5]:
def replace_fb_with_meta(ticker_list):
    """Return a new list in which every 'FB' entry is replaced by 'META'; other tickers are kept as-is."""
    renamed = []
    for symbol in ticker_list:
        if symbol == 'FB':
            renamed.append('META')
        else:
            renamed.append(symbol)
    return renamed

# Rename 'FB' to 'META' inside every row's ticker list, for both the full-range
# frame and the 2013-2020 frame.
df['ticker_list'] = df['ticker_list'].apply(replace_fb_with_meta)
df2['ticker_list'] = df2['ticker_list'].apply(replace_fb_with_meta)   #For 2013-2020 analysis only

In [None]:
# # Extract first date of each month
# df['year_month'] = df['date'].dt.to_period('M')  # e.g., 2020-01
# month_starts = df.groupby('year_month')['date'].min().reset_index()
# month_starts.columns = ['year_month', 'start_date']

# # Loop over each month and get tickers that were present throughout the month
# # If the stock was missing on any date in that month, it is excluded from the list of tickers for that month

# results = []

# for i in range(len(month_starts) - 1):
#     start = month_starts.loc[i, 'start_date']
#     end = month_starts.loc[i + 1, 'start_date']

#     # Filter rows from start to just before next month’s start
#     month_df = df[(df['date'] >= start) & (df['date'] < end)]

#     all_tickers = set().union(*month_df['ticker_list'])

#     # Get intersection of ticker_lists in this period
#     if not month_df.empty:
#         surviving = set(month_df.iloc[0]['ticker_list'])
#         for tickers in month_df['ticker_list']:
#             surviving &= set(tickers)

#         not_survived = all_tickers - surviving

#         results.append({
#             'date': start,
#             'ticker_list': sorted(list(surviving)),
#             'not_survived': sorted(list(not_survived)),
#             'all tickers': len(all_tickers)
#         })

#--------------------------------------------FOR DF2-------------------------------------------------
def get_monthly_survivors(df):
    """
    Computes tickers that survived each month and those that didn't.

    The input frame is NOT modified: the month key is kept in a local Series
    instead of being written back as a 'year_month' column.

    Parameters:
        df (pd.DataFrame): DataFrame with columns 'date' (datetime) and 'ticker_list' (list of tickers)

    Returns:
        pd.DataFrame: one row per observed calendar month, except the last
        observed month (which is used only as the end boundary and is not
        reported). Each row contains:
            - 'date': first observed date of the month
            - 'ticker_list': sorted list of tickers present on every observed date in the month
            - 'not_survived': sorted list of tickers missing on at least one observed date of the month
            - 'all_tickers': total number of unique tickers observed in the month
        The four columns are present even when there are no rows.
    """
    output_columns = ['date', 'ticker_list', 'not_survived', 'all_tickers']

    # Calendar month of every row, held locally so the caller's frame stays untouched.
    year_month = df['date'].dt.to_period('M').rename('year_month')

    # groupby sorts the month keys, so the groups come out in chronological order.
    monthly_groups = list(df.groupby(year_month))

    results = []

    # Every month but the last observed one is reported.
    for _, month_df in monthly_groups[:-1]:
        daily_sets = [set(tickers) for tickers in month_df['ticker_list']]

        # All tickers ever observed in this month
        all_tickers = set().union(*daily_sets)

        # Intersection across all days -> surviving tickers
        surviving = set.intersection(*daily_sets)

        results.append({
            'date': month_df['date'].min(),
            'ticker_list': sorted(surviving),
            'not_survived': sorted(all_tickers - surviving),
            'all_tickers': len(all_tickers)
        })

    return pd.DataFrame(results, columns=output_columns)

# Monthly survivor tables: `survivors_df` from the full-range frame `df`,
# `survivors_df2` from the 2013-2020 frame `df2`.
survivors_df = get_monthly_survivors(df)
survivors_df2 = get_monthly_survivors(df2)



Unnamed: 0,date,ticker_list,not_survived,all_tickers
0,2013-01-02,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458
1,2013-02-04,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...","[BIG, PVH]",459
2,2013-03-11,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458
3,2013-04-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458
4,2013-05-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...","[DF, KSU, MAC]",460
...,...,...,...,...
86,2020-06-22,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",[],501
87,2020-09-18,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...","[COTY, CTLT, ETSY, HRB, KSS, TER]",504
88,2020-10-07,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...","[NBL, VNT]",502
89,2020-11-17,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",[],501


In [9]:
# # Create final tickers DataFrame
# survivors_df['num_survivors'] = survivors_df['ticker_list'].apply(len)

# # Reindex to ensure all months are present, forward-fill missing months
# survivors_df['month'] = survivors_df['date'].dt.to_period('M').dt.to_timestamp()
# survivors_df.set_index('month', inplace=True)
# full_month_range = pd.date_range('2012-01-01', '2020-12-01', freq='MS')
# survivors_df = survivors_df.reindex(full_month_range)
# survivors_df.ffill(inplace=True)
# survivors_df.reset_index(inplace=True)
# survivors_df.rename(columns={'index': 'month'}, inplace=True)

#----------------------------
def process_survivors_df(survivors_df, start_date='2012-01-01', end_date='2020-12-01'):
    """
    Put a monthly survivors table onto a complete month-start calendar.

    Steps:
    - add 'num_survivors' (length of each 'ticker_list')
    - key every row by the first day of its month
    - reindex to every month start between start_date and end_date
    - forward-fill months that have no row of their own

    The input frame is left unchanged.

    Parameters:
        survivors_df (pd.DataFrame): must have columns 'date' (datetime) and 'ticker_list' (list)
        start_date (str or pd.Timestamp): first month in the reindexed range
        end_date (str or pd.Timestamp): last month in the reindexed range

    Returns:
        pd.DataFrame: 'month' (datetime) followed by the input columns
        (forward-filled, so 'date' repeats for filled months) and
        'num_survivors'.
    """
    # Month-start timestamp for each row, and the full calendar to align to.
    month_key = survivors_df['date'].dt.to_period('M').dt.to_timestamp()
    all_months = pd.date_range(start=start_date, end=end_date, freq='MS')

    processed = (
        survivors_df
        .assign(
            num_survivors=survivors_df['ticker_list'].apply(len),
            month=month_key,
        )
        .set_index('month')
        .reindex(all_months)   # rows outside the range are dropped, missing months become NaN
        .ffill()               # missing months inherit the previous month's values
        .reset_index()         # the unnamed calendar index comes back as column 'index'
        .rename(columns={'index': 'month'})
    )

    return processed

# Align both survivor tables to a complete monthly calendar (forward-filling
# months without data). Note that this overwrites survivors_df / survivors_df2,
# so the cell is not idempotent: re-run the previous cell before re-running this one.
survivors_df = process_survivors_df(survivors_df, start_date='2012-01-01', end_date='2020-12-01')
survivors_df2 = process_survivors_df(survivors_df2, start_date='2013-01-01', end_date='2020-12-01')



In [10]:
# # Explode the ticker_list so each row has one ticker
# exploded = survivors_df.explode('ticker_list')

# # Assign presence flag
# exploded['value'] = 1

# # Pivot the table
# pivot_df = exploded.pivot_table(
#     index='date',        # Each row is a month
#     columns='ticker_list',
#     values='value',
#     fill_value=0         # If the ticker wasn't present, put 0
# )

# #sort columns (tickers)
# pivot_df = pivot_df.sort_index(axis=1)
# pivot_df = pivot_df.sort_index(axis=0)


#-------------------------------
def create_presence_matrix(survivors_df, date_col='date', ticker_col='ticker_list'):
    """
    Build a date x ticker presence matrix from a survivors table.

    Parameters:
        survivors_df (pd.DataFrame): must contain `date_col` and a list-valued `ticker_col`
        date_col (str): column whose values become the row index
        ticker_col (str): column holding the list of tickers for each row

    Returns:
        pd.DataFrame: pivot table with
            - index = distinct values of `date_col` (sorted)
            - columns = tickers (sorted)
            - values = 1 where the ticker is listed for that date, 0 otherwise
        Rows of the input that share the same `date_col` value are merged into
        a single row of the matrix.
    """
    # One row per (date, ticker) pair, each flagged as present.
    long_form = survivors_df.copy().explode(ticker_col)
    long_form['value'] = 1

    presence = long_form.pivot_table(
        values='value',
        index=date_col,
        columns=ticker_col,
        fill_value=0,
    )

    # Sort rows, then columns.
    presence = presence.sort_index(axis=0)
    return presence.sort_index(axis=1)

# Presence matrices keyed by the default 'date' column.
# NOTE(review): after process_survivors_df, forward-filled months repeat the
# previous month's 'date', so they collapse into one pivot row; this looks like
# why 'months_active' downstream tops out below the number of calendar months.
# Passing date_col='month' would give one row per calendar month — confirm intent.
pivot_df = create_presence_matrix(survivors_df)
pivot_df2 = create_presence_matrix(survivors_df2)


In [None]:
# summary_data = []

# for ticker in pivot_df.columns:
#     series = pivot_df[ticker]
#     active_months = series[series == 1]

#     if not active_months.empty:
#         summary_data.append({
#             'ticker': ticker,
#             'first_seen': active_months.index.min(),
#             'last_seen': active_months.index.max(),
#             'months_active': active_months.count()
#         })

# summary_df = pd.DataFrame(summary_data)
# summary_df = summary_df.sort_values('ticker').reset_index(drop=True)


#-------------------------------------------
def summarize_ticker_activity(pivot_df):
    """
    Summarizes the activity of each ticker from a pivoted presence matrix.

    Parameters:
        pivot_df (pd.DataFrame): rows = months/dates, columns = tickers, values = 1 if present, 0 if not

    Returns:
        pd.DataFrame: summary sorted by ticker, with columns:
            - 'ticker': ticker symbol
            - 'first_seen': first index value at which the ticker was present
            - 'last_seen': last index value at which the ticker was present
            - 'months_active': number of rows in which the ticker was present
        Tickers that are never present are omitted. If no ticker is ever
        present (including an empty `pivot_df`), an empty frame with these
        four columns is returned instead of raising KeyError.
    """
    output_columns = ['ticker', 'first_seen', 'last_seen', 'months_active']
    summary_data = []

    for ticker, series in pivot_df.items():
        active_months = series[series == 1]

        if active_months.empty:
            continue

        summary_data.append({
            'ticker': ticker,
            'first_seen': active_months.index.min(),
            'last_seen': active_months.index.max(),
            'months_active': active_months.count()
        })

    # Passing `columns` guarantees 'ticker' exists even when summary_data is
    # empty, so the sort below cannot fail on a column-less frame.
    summary_df = pd.DataFrame(summary_data, columns=output_columns)
    summary_df = summary_df.sort_values('ticker').reset_index(drop=True)

    return summary_df

# Per-ticker first/last appearance and active-row counts, for the full-range
# and the 2013-2020 presence matrices respectively.
summary_df = summarize_ticker_activity(pivot_df)
summary_df2 = summarize_ticker_activity(pivot_df2)



Unnamed: 0,ticker,first_seen,last_seen,months_active
0,A,2013-01-02,2020-12-21,91
1,AAL,2015-04-07,2020-12-21,64
2,AAP,2015-08-04,2020-12-21,60
3,AAPL,2013-01-02,2020-12-21,91
4,ABBV,2013-01-02,2020-12-21,91
...,...,...,...,...
635,YUM,2013-01-02,2020-12-21,91
636,ZBH,2013-01-02,2020-12-21,91
637,ZBRA,2020-01-28,2020-12-21,9
638,ZION,2013-01-02,2020-12-21,91


In [20]:
def compare_summaries(df_full, df_subset):
    """
    Compares two summary DataFrames and flags tickers as 'Dropped', 'New', or 'Common'.

    Parameters:
        df_full (pd.DataFrame): full summary (e.g., 2012-2020); needs 'ticker' and 'months_active'
        df_subset (pd.DataFrame): subset summary (e.g., 2013-2020); needs 'ticker' and 'months_active'

    Returns:
        pd.DataFrame: outer merge on 'ticker', sorted by ticker, with the
        overlapping columns suffixed '_2012_2020' (from df_full) and
        '_2013_2020' (from df_subset), plus a 'status' column:
            - 'Dropped': ticker has activity only in df_full
            - 'New': ticker has activity only in df_subset
            - 'Common': anything else
    """
    # Merge on ticker
    comparison_df = df_full.merge(
        df_subset,
        on='ticker',
        how='outer',
        suffixes=('_2012_2020', '_2013_2020')
    ).sort_values('ticker')

    # A ticker is "in" a summary when that side's months_active is not missing.
    in_full = comparison_df['months_active_2012_2020'].notna()
    in_subset = comparison_df['months_active_2013_2020'].notna()

    # Vectorised status assignment (replaces a row-wise apply): start from
    # 'Common' and overwrite the two one-sided cases.
    status = pd.Series('Common', index=comparison_df.index, dtype=object)
    status[in_full & ~in_subset] = 'Dropped'
    status[~in_full & in_subset] = 'New'
    comparison_df['status'] = status

    return comparison_df

# Compare the full-range summary against the 2013-2020 summary.
comparison_df = compare_summaries(summary_df, summary_df2)

# Show tickers that appear only in the full-range summary.
comparison_df[comparison_df['status'] == 'Dropped']


Unnamed: 0,ticker,first_seen_2012_2020,last_seen_2012_2020,months_active_2012_2020,first_seen_2013_2020,last_seen_2013_2020,months_active_2013_2020,status
56,ATGE,2012-01-03,2012-09-05,9,NaT,NaT,,Dropped
105,CBE,2012-01-03,2012-11-06,11,NaT,NaT,,Dropped
202,EP,2012-01-03,2012-04-02,4,NaT,NaT,,Dropped
231,FII,2012-01-03,2012-12-03,12,NaT,NaT,,Dropped
266,GR,2012-01-03,2012-06-01,6,NaT,NaT,,Dropped
372,LXK,2012-01-03,2012-09-05,9,NaT,NaT,,Dropped
392,MHS,2012-01-03,2012-03-01,3,NaT,NaT,,Dropped
398,MMI,2012-01-03,2012-04-02,4,NaT,NaT,,Dropped
511,RRD,2012-01-03,2012-11-06,11,NaT,NaT,,Dropped
543,SVU,2012-01-03,2012-04-02,4,NaT,NaT,,Dropped


In [21]:
# Ensure dates are datetime, so that get_month_range can call .to_period on them.
# This rewrites the two columns of summary_df in place.
summary_df['first_seen'] = pd.to_datetime(summary_df['first_seen'])
summary_df['last_seen'] = pd.to_datetime(summary_df['last_seen'])

# Function to get first and last day of the month
def get_month_range(start, end):
    """
    Widen a (start, end) pair of timestamps to whole calendar months.

    Parameters:
        start (pd.Timestamp): any moment in the first month of interest
        end (pd.Timestamp): any moment in the last month of interest

    Returns:
        tuple: (timestamp for the beginning of `start`'s month,
                timestamp of `end`'s month taken at monthly frequency, which
                the notebook uses as the last day of that month)
    """
    first_month = start.to_period('M')
    last_month = end.to_period('M')
    return first_month.to_timestamp(), last_month.to_timestamp('M')

# Function to fetch stock data from Yahoo Finance
def get_stock_data(ticker, start_date, end_date):
    """
    Download the price history of one ticker through yfinance.

    Parameters:
        ticker (str): symbol to request
        start_date (pd.Timestamp): first date requested
        end_date (pd.Timestamp): last date wanted; one day is added before the
            request so that this date itself is included

    Returns:
        pd.DataFrame: the frame returned by `Ticker.history`, with an added
        'ticker' column. Any exception is reported with print() and an empty
        DataFrame is returned instead, so one failing symbol does not stop a
        batch download.
    """
    try:
        history = yf.Ticker(ticker).history(
            start=start_date,
            end=end_date + pd.Timedelta(days=1),  # Add 1 day to include end date
        )
        history['ticker'] = ticker
    except Exception as e:
        # Deliberately broad: report the failure and carry on with the next ticker.
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()
    return history

# Download the price history of every ticker in summary_df, covering the whole
# months between its first and last appearance.
stock_data_list = []
success_count = 0  # number of tickers that returned at least one row

for ticker, first_seen, last_seen in zip(
    summary_df['ticker'], summary_df['first_seen'], summary_df['last_seen']
):
    # Widen to the first and last day of the respective months
    start_date, end_date = get_month_range(first_seen, last_seen)

    stock_data = get_stock_data(ticker, start_date, end_date)

    # Tickers with no data (delisted, renamed, download error) are skipped
    if stock_data.empty:
        continue

    stock_data_list.append(stock_data)
    success_count += 1

# Stack all per-ticker frames and move the date index into a regular column
all_stock_data = pd.concat(stock_data_list).reset_index()

# Print count of tickers successfully fetched
print(f"Number of tickers successfully fetched: {success_count}")

$ABC: possibly delisted; no timezone found
$ABMD: possibly delisted; no timezone found
$ADS: possibly delisted; no timezone found
$ALXN: possibly delisted; no timezone found
$ANSS: possibly delisted; no timezone found
$ANTM: possibly delisted; no timezone found
$APC: possibly delisted; no timezone found
$APOL: possibly delisted; no price data found  (1d 2012-01-01 00:00:00 -> 2013-07-01 00:00:00)
$ARG: possibly delisted; no price data found  (1d 2012-01-01 00:00:00 -> 2016-05-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1325394000, endDate = 1462075200")
$ATVI: possibly delisted; no timezone found
$AVP: possibly delisted; no timezone found
$BCR: possibly delisted; no price data found  (1d 2012-01-01 00:00:00 -> 2017-12-01 00:00:00)
$BIG: possibly delisted; no timezone found
$BLL: possibly delisted; no timezone found
$BRCM: possibly delisted; no price data found  (1d 2012-01-01 00:00:00 -> 2016-02-01 00:00:00)
$BTUUQ: possibly delisted; no price data found  (1d 2012-0

Number of tickers successfully fetched: 521


In [28]:
all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Capital Gains
0,2012-01-03 00:00:00-05:00,22.776919,23.507763,22.713367,23.183651,4156394,0.0,0.0,A,
1,2012-01-04 00:00:00-05:00,22.973940,23.107398,22.618049,22.999359,4651845,0.0,0.0,A,
2,2012-01-05 00:00:00-05:00,22.802342,23.717486,22.700660,23.514122,6842651,0.0,0.0,A,
3,2012-01-06 00:00:00-05:00,23.571320,23.870014,23.393377,23.768332,4711400,0.0,0.0,A,
4,2012-01-09 00:00:00-05:00,23.908148,24.416562,23.812821,24.391142,4429563,0.0,0.0,A,
...,...,...,...,...,...,...,...,...,...,...
922047,2020-12-24 00:00:00-05:00,153.424234,155.295024,153.357085,154.191742,417400,0.0,0.0,ZTS,
922048,2020-12-28 00:00:00-05:00,154.882551,156.216091,153.798445,155.793961,1522400,0.0,0.0,ZTS,
922049,2020-12-29 00:00:00-05:00,156.580649,158.393879,155.803537,156.494293,1188400,0.0,0.0,ZTS,
922050,2020-12-30 00:00:00-05:00,156.868410,158.106020,156.532636,157.597549,1009000,0.0,0.0,ZTS,


In [29]:
# List of tickers to remove as they were not in S&P 500 between 2013-2020
# NOTE(review): this list is hard-coded. Ten of the entries match the tickers
# displayed with status 'Dropped' in comparison_df; "TIE" is not among the rows
# shown there — confirm where it comes from, or derive the list from
# comparison_df so the two cannot drift apart.
tickers_to_remove = [
    "ATGE", "CBE", "EP", "FII", "GR", "LXK", 
    "MHS", "MMI", "RRD", "SVU", "TIE"
]

# Remove rows where 'ticker' is in the list
all_stock_data = all_stock_data[~all_stock_data['ticker'].isin(tickers_to_remove)]


In [31]:
# 'Stock Splits' is not used downstream.
# NOTE(review): the frame has no 'Adj Close' column; the prices come from
# Ticker.history() called with its defaults, which presumably already returns
# split/dividend-adjusted OHLC — confirm against the installed yfinance version.
# The index is reset to a unique 0..n-1 range so that the index-aligned
# assignment of monthly returns below is unambiguous.
SP500_all_stock_data = (
    all_stock_data
    .drop(columns=['Stock Splits'])
    .reset_index(drop=True)
)

#Daily returns (computed within each ticker)
SP500_all_stock_data['daily_return'] = (
    SP500_all_stock_data.groupby('ticker')['Close']
    .pct_change()
)

# Timezone-naive dates and the calendar month (as month-start timestamp) of each row
naive_dates = SP500_all_stock_data['Date'].dt.tz_localize(None)
month_start = naive_dates.dt.to_period('M').dt.to_timestamp()

#Monthly returns
# For every ticker take the close of its last trading day in each month and
# compute the change versus that ticker's previous month-end close.
# Two defects of the earlier resample-based version are avoided:
#   * pct_change() ran over the concatenated (ticker, month) series, so the
#     first month of every ticker was compared with the PREVIOUS ticker's close;
#   * the result was merged on the calendar month-end date, so months whose
#     last calendar day is not a trading day never received a value.
# The return is stored on the last trading row of the month; other rows stay NaN,
# and the first month of each ticker is NaN because it has no predecessor.
month_end_closes = (
    SP500_all_stock_data[['ticker', 'Close']]
    .assign(Date=naive_dates, month=month_start)
    .sort_values(['ticker', 'Date'])
    .drop_duplicates(subset=['ticker', 'month'], keep='last')
)
SP500_all_stock_data['monthly_return'] = (
    month_end_closes.groupby('ticker')['Close']
    .pct_change()            # aligned back onto the daily rows by index
)

#Check index membership, if ticker is in ticker_list for that month
SP500_all_stock_data['Date'] = naive_dates
SP500_all_stock_data['month'] = month_start


SP500_all_stock_data = SP500_all_stock_data.merge(
    survivors_df[['month', 'ticker_list']],
    on='month',
    how='left'
)

# Plain Python iteration is much cheaper than DataFrame.apply(axis=1) on a
# frame of this size; months without a ticker list (NaN after the left merge)
# count as "not a member" instead of raising TypeError.
SP500_all_stock_data['membership_index'] = [
    isinstance(members, list) and ticker in members
    for ticker, members in zip(
        SP500_all_stock_data['ticker'], SP500_all_stock_data['ticker_list']
    )
]

SP500_all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,ticker,Capital Gains,daily_return,monthly_return,month,ticker_list,membership_index
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,A,,,,2012-01-01,"[A, AAPL, ABC, ABT, ACN, ADBE, ADI, ADM, ADP, ...",True
1,2012-01-04,22.973940,23.107398,22.618049,22.999359,4651845,0.0,A,,-0.007949,,2012-01-01,"[A, AAPL, ABC, ABT, ACN, ADBE, ADI, ADM, ADP, ...",True
2,2012-01-05,22.802342,23.717486,22.700660,23.514122,6842651,0.0,A,,0.022382,,2012-01-01,"[A, AAPL, ABC, ABT, ACN, ADBE, ADI, ADM, ADP, ...",True
3,2012-01-06,23.571320,23.870014,23.393377,23.768332,4711400,0.0,A,,0.010811,,2012-01-01,"[A, AAPL, ABC, ABT, ACN, ADBE, ADI, ADM, ADP, ...",True
4,2012-01-09,23.908148,24.416562,23.812821,24.391142,4429563,0.0,A,,0.026203,,2012-01-01,"[A, AAPL, ABC, ABT, ACN, ADBE, ADI, ADM, ADP, ...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921110,2020-12-24,153.424234,155.295024,153.357085,154.191742,417400,0.0,ZTS,,0.005442,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
921111,2020-12-28,154.882551,156.216091,153.798445,155.793961,1522400,0.0,ZTS,,0.010391,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
921112,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,ZTS,,0.004495,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
921113,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,ZTS,,0.007050,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True


In [32]:
# Drop the helper columns used for the membership check, plus 'Capital Gains'.
# errors='ignore' keeps the cell re-runnable and tolerates a download in which
# 'Capital Gains' was never returned (the column only exists if yfinance
# supplied it for at least one ticker).
SP500_all_stock_data = SP500_all_stock_data.drop(
    columns=['ticker_list', 'month', 'Capital Gains'],
    errors='ignore'
)

SP500_all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,ticker,daily_return,monthly_return,membership_index
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,A,,,True
1,2012-01-04,22.973940,23.107398,22.618049,22.999359,4651845,0.0,A,-0.007949,,True
2,2012-01-05,22.802342,23.717486,22.700660,23.514122,6842651,0.0,A,0.022382,,True
3,2012-01-06,23.571320,23.870014,23.393377,23.768332,4711400,0.0,A,0.010811,,True
4,2012-01-09,23.908148,24.416562,23.812821,24.391142,4429563,0.0,A,0.026203,,True
...,...,...,...,...,...,...,...,...,...,...,...
921110,2020-12-24,153.424234,155.295024,153.357085,154.191742,417400,0.0,ZTS,0.005442,,True
921111,2020-12-28,154.882551,156.216091,153.798445,155.793961,1522400,0.0,ZTS,0.010391,,True
921112,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,ZTS,0.004495,,True
921113,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,ZTS,0.007050,,True


### Creating new features
#### Moving averages
Following https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp, we will use short (20-day), medium (50-day) and long (100-day) moving averages, plus two momentum indicators:
1) Simple Moving Average (SMA): SMA_20, SMA_50, SMA_100
2) Exponential Moving Average (EMA), see https://www.investopedia.com/terms/e/ema.asp#toc-formula-for-exponential-moving-average-ema: EMA_20, EMA_50, EMA_100
3) Relative Strength Index (RSI) over 14 days, see https://www.investopedia.com/terms/r/rsi.asp: RSI
4) Moving Average Convergence/Divergence (MACD) indicator with 12-, 26- and 9-day spans, see https://www.investopedia.com/terms/m/macd.asp: MACD_Line, MACD_Signal, MACD_Hist

In [33]:
def sma(series: pd.Series, window: int) -> pd.Series:
    """Simple Moving Average: mean of the trailing `window` values; NaN until `window` values are available."""
    trailing = series.rolling(window=window, min_periods=window)
    return trailing.mean()


In [34]:
def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential Moving Average with the given span (recursive form, adjust=False); NaN until `span` values are available."""
    weighted = series.ewm(span=span, min_periods=span, adjust=False)
    return weighted.mean()

In [35]:
def rsi_wilder(series: pd.Series, period: int = 14) -> pd.Series:
    """
    Wilder's RSI (default 14).

    Gains and losses are the positive and negative parts of the one-step
    difference, each smoothed with an exponential average of alpha = 1/period.
    The result is NaN until `period` differences are available.
    """
    change = series.diff()

    # Shared smoothing settings for the gain and loss averages.
    smoothing = dict(alpha=1/period, adjust=False, min_periods=period)
    avg_gain = change.clip(lower=0).ewm(**smoothing).mean()
    avg_loss = (-change.clip(upper=0)).ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [36]:
def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """
    Moving Average Convergence/Divergence.

    Returns a DataFrame indexed like `series` with:
        - MACD_Line: EMA(fast) minus EMA(slow)
        - MACD_Signal: EMA of MACD_Line over `signal` periods
        - MACD_Hist: MACD_Line minus MACD_Signal
    """
    line = ema(series, fast) - ema(series, slow)
    signal_line = line.ewm(span=signal, adjust=False, min_periods=signal).mean()

    parts = {
        "MACD_Line": line,
        "MACD_Signal": signal_line,
        "MACD_Hist": line - signal_line,
    }
    return pd.DataFrame(parts, index=series.index)

In [37]:
# Build the technical-indicator columns on a copy of the master frame.
# NOTE(review): this rebinds the name `df`, which earlier cells use for the
# S&P 500 constituents table — re-running those cells afterwards will fail
# unless the notebook is run top to bottom.
df = SP500_all_stock_data.copy()

# Ensure types/order; compute per-ticker
df['Date'] = pd.to_datetime(df['Date'])
# One row per (ticker, Date), ordered by date within each ticker, so the
# rolling/exponential windows below run over consecutive trading days.
df = df.drop_duplicates(subset=['ticker','Date']).sort_values(['ticker','Date'])

# Prefer adjusted close if available
price_col = 'Adj Close' if 'Adj Close' in df.columns else 'Close'

# group_keys=False keeps the original row index on apply() results, which the
# join on macd_df below relies on.
g = df.groupby('ticker', group_keys=False)

# SMA 20/50/100
for w in [20, 50, 100]:
    # w=w binds the current window size into the lambda
    df[f'SMA_{w}'] = g[price_col].transform(lambda s, w=w: sma(s, w))

# EMA 20/50/100
for w in [20, 50, 100]:
    df[f'EMA_{w}'] = g[price_col].transform(lambda s, w=w: ema(s, w))

# RSI 14
df['RSI'] = g[price_col].transform(rsi_wilder)

# MACD (12,26,9)
# macd returns three columns per ticker; they are joined back on the row index.
macd_df = g[price_col].apply(macd)
df = df.join(macd_df)

# Hand back to your master frame
SP500_all_stock_data_Final = df

In [38]:
SP500_all_stock_data_Final

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,ticker,daily_return,monthly_return,...,SMA_20,SMA_50,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist
0,2012-01-03,22.776919,23.507763,22.713367,23.183651,4156394,0.0,A,,,...,,,,,,,,,,
1,2012-01-04,22.973940,23.107398,22.618049,22.999359,4651845,0.0,A,-0.007949,,...,,,,,,,,,,
2,2012-01-05,22.802342,23.717486,22.700660,23.514122,6842651,0.0,A,0.022382,,...,,,,,,,,,,
3,2012-01-06,23.571320,23.870014,23.393377,23.768332,4711400,0.0,A,0.010811,,...,,,,,,,,,,
4,2012-01-09,23.908148,24.416562,23.812821,24.391142,4429563,0.0,A,0.026203,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921110,2020-12-24,153.424234,155.295024,153.357085,154.191742,417400,0.0,ZTS,0.005442,,...,153.626685,155.672890,154.709183,154.477097,154.663481,151.699308,48.711728,-0.298369,-0.514775,0.216406
921111,2020-12-28,154.882551,156.216091,153.798445,155.793961,1522400,0.0,ZTS,0.010391,,...,153.670818,155.665287,154.755878,154.602513,154.707813,151.780390,52.813424,-0.160571,-0.443934,0.283363
921112,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,ZTS,0.004495,,...,153.802254,155.660959,154.774986,154.782682,154.777872,151.873735,54.525339,0.005087,-0.354130,0.359217
921113,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,ZTS,0.007050,,...,153.941844,155.746919,154.828602,155.050765,154.888447,151.987078,57.161973,0.222828,-0.238738,0.461566


In [39]:
# Drop the indicator warm-up period: keep rows strictly after 2012-05-23.
# NOTE(review): in the output above, the first kept row for a ticker that starts
# on 2012-01-03 has positional index 99 (2012-05-24), i.e. 2012-05-24 is the
# 100th trading day and 2012-05-23 the 99th.
# NOTE(review): a single date cutoff only removes the warm-up rows of tickers
# whose history starts in January 2012; tickers that start later still carry
# NaN indicators in their first rows — confirm whether those should be dropped too.
final_cleaned_data = SP500_all_stock_data_Final[SP500_all_stock_data_Final['Date'] > '2012-05-23']

final_cleaned_data


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,ticker,daily_return,monthly_return,...,SMA_20,SMA_50,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist
99,2012-05-24,26.013019,26.210475,25.764610,26.127670,3591182,0.0,A,0.006132,,...,25.924481,26.995302,26.976645,25.890111,26.463516,26.338783,50.437394,-0.354859,-0.473274,0.118415
100,2012-05-25,26.127663,26.407921,25.981165,26.121294,2633552,0.0,A,-0.000244,,...,25.875117,26.934831,27.006022,25.912128,26.450095,26.334476,50.374673,-0.294990,-0.437617,0.142627
101,2012-05-29,26.414296,26.911116,26.388818,26.866531,5696570,0.0,A,0.028530,,...,25.875117,26.892317,27.044693,26.003024,26.466426,26.345012,57.090799,-0.185274,-0.387149,0.201875
102,2012-05-30,26.522572,26.522572,25.987534,26.337856,4297871,0.0,A,-0.019678,,...,25.823525,26.842533,27.072931,26.034913,26.461384,26.344870,51.741093,-0.139375,-0.337594,0.198219
103,2012-05-31,26.407923,26.509836,25.649952,25.898363,5887258,0.0,A,-0.016687,-0.036036,...,25.747727,26.790696,27.094231,26.021908,26.439305,26.336028,47.736466,-0.136886,-0.297452,0.160566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921110,2020-12-24,153.424234,155.295024,153.357085,154.191742,417400,0.0,ZTS,0.005442,,...,153.626685,155.672890,154.709183,154.477097,154.663481,151.699308,48.711728,-0.298369,-0.514775,0.216406
921111,2020-12-28,154.882551,156.216091,153.798445,155.793961,1522400,0.0,ZTS,0.010391,,...,153.670818,155.665287,154.755878,154.602513,154.707813,151.780390,52.813424,-0.160571,-0.443934,0.283363
921112,2020-12-29,156.580649,158.393879,155.803537,156.494293,1188400,0.0,ZTS,0.004495,,...,153.802254,155.660959,154.774986,154.782682,154.777872,151.873735,54.525339,0.005087,-0.354130,0.359217
921113,2020-12-30,156.868410,158.106020,156.532636,157.597549,1009000,0.0,ZTS,0.007050,,...,153.941844,155.746919,154.828602,155.050765,154.888447,151.987078,57.161973,0.222828,-0.238738,0.461566


In [43]:
final_cleaned_data[final_cleaned_data['ticker'] == 'GME']

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,ticker,daily_return,monthly_return,...,SMA_20,SMA_50,SMA_100,EMA_20,EMA_50,EMA_100,RSI,MACD_Line,MACD_Signal,MACD_Hist
367794,2012-05-24,3.260227,3.279013,3.202161,3.273889,19172800,0.0375,GME,0.012679,,...,3.526386,3.709473,3.861936,3.458634,3.648406,3.780963,34.715898,-0.147554,-0.121189,-0.026365
367795,2012-05-25,3.284136,3.371235,3.282428,3.333663,12567200,0.0,GME,0.018258,,...,3.500925,3.695459,3.855228,3.446732,3.636063,3.772105,39.244309,-0.139058,-0.124762,-0.014295
367796,2012-05-29,3.347325,3.378066,3.296090,3.366111,10799200,0.0,GME,0.009733,,...,3.476409,3.682941,3.848273,3.439054,3.625477,3.764066,41.611959,-0.128229,-0.125456,-0.002773
367797,2012-05-30,3.331955,3.610329,3.261934,3.357572,26932000,0.0,GME,-0.002537,,...,3.450619,3.668931,3.839329,3.431294,3.614971,3.756016,41.157430,-0.118964,-0.124157,0.005193
367798,2012-05-31,3.325123,3.371234,3.243148,3.275597,11325600,0.0,GME,-0.024415,-0.150616,...,3.421492,3.653079,3.830223,3.416465,3.601662,3.746503,36.980956,-0.116889,-0.122704,0.005815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368757,2016-03-24,5.920847,5.952071,5.778388,5.907187,17952400,0.0,GME,-0.003293,,...,6.023418,5.533993,6.114308,5.933349,5.846805,6.173149,50.912328,0.093434,0.136730,-0.043295
368758,2016-03-28,5.649589,6.039889,5.542256,5.872060,32988800,0.0,GME,-0.005947,,...,6.022398,5.547477,6.085053,5.927512,5.847795,6.167187,49.684164,0.077790,0.124942,-0.047152
368759,2016-03-29,5.889624,6.231136,5.854497,6.149173,15210000,0.0,GME,0.047192,,...,6.032533,5.571751,6.058360,5.948622,5.859614,6.166830,58.242133,0.086753,0.117304,-0.030551
368760,2016-03-30,6.160882,6.238942,6.049646,6.197960,10088400,0.0,GME,0.007934,,...,6.043949,5.596769,6.031066,5.972369,5.872883,6.167447,59.546658,0.096678,0.113179,-0.016501
