In [1]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("S&P 500 Historical Components & Changes(04-08-2024).csv")
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,tickers
0,1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1,1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
2,1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
3,1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
4,1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."


In [2]:
# Get Unique Tickers
unique_tickers = set()
transformed_data = []
for i, row in df.iterrows():
    tickers = row['tickers'].split(',')
    unique_tickers = unique_tickers.union(set(tickers))

In [3]:
# Convert Point in Time to Ticker Start/End Records
ticker_start_end_records = []
for ticker in tqdm(unique_tickers):
    ticker_record = None
    for i, row in df.iterrows():
        period_tickers = set(row['tickers'].split(','))
        if ticker in period_tickers:
            if ticker_record is None:
                ticker_record = {"ticker": ticker, "start_date": row["date"]}
                continue
        elif ticker_record is not None:
            ticker_record.update({"end_date": row["date"]})
            ticker_start_end_records.append(ticker_record)
            ticker_record = None

100%|██████████████████████████████████████████████████████████████████████████████| 1163/1163 [03:26<00:00,  5.63it/s]


In [4]:
# Convert back to dataframe
ticker_record_df = pd.DataFrame(ticker_start_end_records)
ticker_record_df.head()

Unnamed: 0,ticker,start_date,end_date
0,MON,2002-08-14,2018-06-07
1,EVHC,2016-12-02,2018-10-11
2,ASC,1996-01-02,1999-06-24
3,PHA,1996-01-02,2003-04-16
4,CAM,2008-01-29,2016-04-04


In [5]:
# Look at top tickers entereing and leaving S&P 500
(
    ticker_record_df.groupby("ticker")["start_date"]
    .count().sort_values(ascending=False)
    .head(15)
)

ticker
COV     3
MIR     2
NE      2
MXIM    2
AN      2
FL      2
CBE     2
HRS     2
OI      2
MEE     2
DXC     2
HP      2
GGP     2
H       2
GAS     2
Name: start_date, dtype: int64

In [6]:
# Record to CSV
(
    ticker_record_df.sort_values(["ticker", "start_date"])
    .to_csv("sp500_ticker_start_end.csv", index=False)
)