In [3]:
import pandas as pd
from tqdm import tqdm
import json

df = pd.read_csv("S&P 500 Historical Components & Changes(08-17-2024).csv")
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,tickers
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."


In [4]:
# Get Unique Tickers
unique_tickers = set()
transformed_data = []
for i, row in df.iterrows():
    tickers = row['tickers'].split(',')
    unique_tickers = unique_tickers.union(set(tickers))

In [5]:
# Convert Point in Time to Ticker Start/End Records
ticker_start_end_records = []
for ticker in tqdm(unique_tickers):
    ticker_record = None
    for i, row in df.iterrows():
        period_tickers = set(row['tickers'].split(','))
        if ticker in period_tickers:
            if ticker_record is None:
                ticker_record = {"ticker": ticker, "start_date": row["date"]}
                continue
        elif ticker_record is not None:
            ticker_record.update({"end_date": row["date"]})
            ticker_start_end_records.append(ticker_record)
            ticker_record = None

    if ticker_record is not None:
        ticker_start_end_records.append(ticker_record)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1168/1168 [05:19<00:00,  3.66it/s]


In [6]:
# Convert back to dataframe
ticker_record_df = pd.DataFrame(ticker_start_end_records)
ticker_record_df.head()

Unnamed: 0,ticker,start_date,end_date
0,A,2000-06-05,NaT
1,FRC,2019-01-02,2023-05-04
2,FBF,1996-01-02,2004-04-01
3,CMCSK,2015-09-21,2015-12-14
4,PCAR,1996-01-02,NaT


In [7]:
# Look at top tickers entering and leaving S&P 500
(
    ticker_record_df.groupby("ticker")["start_date"]
    .count().sort_values(ascending=False)
    .head(15)
)

ticker
COV     3
MEE     2
DOW     2
GGP     2
MXIM    2
IR      2
CBE     2
FL      2
DXC     2
CCI     2
TT      2
AMD     2
AMP     2
CE      2
CEG     2
Name: start_date, dtype: int64

In [8]:
# Record to CSV
(
    ticker_record_df.sort_values(["ticker", "start_date"])
    .to_csv("sp500_ticker_start_end.csv", index=False)
)

In [9]:
ticker_record_df.ticker.nunique()

1168

In [10]:
len(ticker_record_df)

1216

In [11]:
# Record list of tickers to JSON (Optional)
# with open("sp_500_full.json", "w") as f:
#     json.dump(ticker_record_df.ticker.str.replace(".", " ").to_list(), f)