In [1]:
import pandas as pd

df = pd.read_csv(r'C:\Users\harry\Downloads\sp_500_historical_components.csv')

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Ensure 'tickers' is string
df['tickers'] = df['tickers'].astype(str)

# Filter date range
df = df[(df['date'] >= '2013-01-01') & (df['date'] < '2021-01-08')]

# Function to parse tickers from CSV string
def parse_ticker_list(s):
    if pd.isna(s): 
        return []
    s = s.strip()
    if s.startswith('"') and s.endswith('"'):
        s = s[1:-1]
    items = [t.strip() for t in s.split(',') if t.strip() != '']
    return items

# Apply parsing
df['ticker_list'] = df['tickers'].apply(parse_ticker_list)

# Keep only relevant columns
df = df[['date', 'ticker_list']]

def replace_fb_with_meta(ticker_list):
    return ['META' if ticker == 'FB' else ticker for ticker in ticker_list]

df['ticker_list'] = df['ticker_list'].apply(replace_fb_with_meta)



In [2]:
# Extract first date of each month
df['year_month'] = df['date'].dt.to_period('M')  # e.g., 2020-01
month_starts = df.groupby('year_month')['date'].min().reset_index()
month_starts.columns = ['year_month', 'start_date']

print(month_starts)
# Loop over each month and get tickers that were present throughout the month
# If the stock was missing on any date in that month, it is excluded from the list of tickers for that month

results = []

for i in range(len(month_starts) - 1):
    start = month_starts.loc[i, 'start_date']
    end = month_starts.loc[i + 1, 'start_date']

    # Filter rows from start to just before next month’s start
    month_df = df[(df['date'] >= start) & (df['date'] < end)]

    all_tickers = set().union(*month_df['ticker_list'])

    # Get intersection of ticker_lists in this period
    if not month_df.empty:
        surviving = set(month_df.iloc[0]['ticker_list'])
        for tickers in month_df['ticker_list']:
            surviving &= set(tickers)

        not_survived = all_tickers - surviving

        results.append({
            'date': start,
            'ticker_list': sorted(list(surviving)),
            'not_survived': sorted(list(not_survived)),
            'all tickers': len(all_tickers)
        })

# Create final tickers DataFrame
survivors_df = pd.DataFrame(results)
survivors_df['num_survivors'] = survivors_df['ticker_list'].apply(len)

# Reindex to ensure all months are present, forward-fill missing months
survivors_df['month'] = survivors_df['date'].dt.to_period('M').dt.to_timestamp()
survivors_df.set_index('month', inplace=True)
full_month_range = pd.date_range('2013-01-01', '2020-12-01', freq='MS')
survivors_df = survivors_df.reindex(full_month_range)
survivors_df.ffill(inplace=True)
survivors_df.reset_index(inplace=True)
survivors_df.rename(columns={'index': 'month'}, inplace=True)

survivors_df



   year_month start_date
0     2013-01 2013-01-02
1     2013-02 2013-02-04
2     2013-03 2013-03-11
3     2013-04 2013-04-01
4     2013-05 2013-05-01
..        ...        ...
87    2020-09 2020-09-18
88    2020-10 2020-10-07
89    2020-11 2020-11-17
90    2020-12 2020-12-21
91    2021-01 2021-01-07

[92 rows x 2 columns]


Unnamed: 0,month,date,ticker_list,not_survived,all tickers,num_survivors
0,2013-01-01,2013-01-02,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458.0,458.0
1,2013-02-01,2013-02-04,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...","[BIG, PVH]",459.0,457.0
2,2013-03-01,2013-03-11,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458.0,458.0
3,2013-04-01,2013-04-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",[],458.0,458.0
4,2013-05-01,2013-05-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...","[DF, KSU, MAC]",460.0,457.0
...,...,...,...,...,...,...
91,2020-08-01,2020-06-22,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",[],501.0,501.0
92,2020-09-01,2020-09-18,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...","[COTY, CTLT, ETSY, HRB, KSS, TER]",504.0,498.0
93,2020-10-01,2020-10-07,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...","[NBL, VNT]",502.0,500.0
94,2020-11-01,2020-11-17,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",[],501.0,501.0


In [3]:
# Explode the ticker_list so each row has one ticker
exploded = survivors_df.explode('ticker_list')

# Assign presence flag
exploded['value'] = 1

# Pivot the table
pivot_df = exploded.pivot_table(
    index='date',        # Each row is a month
    columns='ticker_list',
    values='value',
    fill_value=0         # If the ticker wasn't present, put 0
)

#sort columns (tickers)
pivot_df = pivot_df.sort_index(axis=1)
pivot_df = pivot_df.sort_index(axis=0)


pivot_df

ticker_list,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2013-02-04,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2013-03-11,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2013-04-01,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2013-05-01,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-22,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-09-18,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-10-07,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-11-17,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
summary_data = []

for ticker in pivot_df.columns:
    series = pivot_df[ticker]
    active_months = series[series == 1]

    if not active_months.empty:
        summary_data.append({
            'ticker': ticker,
            'first_seen': active_months.index.min(),
            'last_seen': active_months.index.max(),
            'months_active': active_months.count()
        })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('ticker').reset_index(drop=True)


summary_df

Unnamed: 0,ticker,first_seen,last_seen,months_active
0,A,2013-01-02,2020-12-21,91
1,AAL,2015-04-07,2020-12-21,64
2,AAP,2015-08-04,2020-12-21,60
3,AAPL,2013-01-02,2020-12-21,91
4,ABBV,2013-01-02,2020-12-21,91
...,...,...,...,...
635,YUM,2013-01-02,2020-12-21,91
636,ZBH,2013-01-02,2020-12-21,91
637,ZBRA,2020-01-28,2020-12-21,9
638,ZION,2013-01-02,2020-12-21,91


In [5]:
import yfinance as yf


# Ensure dates are datetime 
summary_df['first_seen'] = pd.to_datetime(summary_df['first_seen'])
summary_df['last_seen'] = pd.to_datetime(summary_df['last_seen'])

# Function to get first and last day of the month
def get_month_range(start, end):
    start_of_month = start.to_period('M').to_timestamp()       # first day of month
    end_of_month = end.to_period('M').to_timestamp('M')        # last day of month
    return start_of_month, end_of_month

# Function to fetch stock data from Yahoo Finance
def get_stock_data(ticker, start_date, end_date):
    try:
        stock = yf.Ticker(ticker)
        stock_data = stock.history(start=start_date, end=end_date + pd.Timedelta(days=1))  # Add 1 day to include end date
        stock_data['ticker'] = ticker
        return stock_data
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

# Loop through summary_df and collect stock data
stock_data_list = []
success_count = 0  # Counter for successful tickers

for idx, row in summary_df.iterrows():
    ticker = row['ticker']
    first_seen = row['first_seen']
    last_seen = row['last_seen']
    
    # Adjust to first and last day of the month
    start_date, end_date = get_month_range(first_seen, last_seen)
    
    stock_data = get_stock_data(ticker, start_date, end_date)
    
    if not stock_data.empty:
        stock_data_list.append(stock_data)
        success_count += 1  # Increment if data fetched successfully

# Combine all stock data
all_stock_data = pd.concat(stock_data_list)
all_stock_data.reset_index(inplace=True)

# Print count of tickers successfully fetched
print(f"Number of tickers successfully fetched: {success_count}")

all_stock_data


$ABC: possibly delisted; no timezone found
$ABMD: possibly delisted; no timezone found
$ADS: possibly delisted; no timezone found
$ALXN: possibly delisted; no timezone found
$ANSS: possibly delisted; no timezone found
$ANTM: possibly delisted; no timezone found
$APC: possibly delisted; no timezone found
$APOL: possibly delisted; no price data found  (1d 2013-01-01 00:00:00 -> 2013-07-01 00:00:00)
$ARG: possibly delisted; no price data found  (1d 2013-01-01 00:00:00 -> 2016-05-01 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1357016400, endDate = 1462075200")
$ATVI: possibly delisted; no timezone found
$AVP: possibly delisted; no timezone found
$BCR: possibly delisted; no price data found  (1d 2013-01-01 00:00:00 -> 2017-12-01 00:00:00)
$BIG: possibly delisted; no timezone found
$BLL: possibly delisted; no timezone found
$BRCM: possibly delisted; no price data found  (1d 2013-01-01 00:00:00 -> 2016-02-01 00:00:00)
$BTUUQ: possibly delisted; no price data found  (1d 2013-0

Number of tickers successfully fetched: 515


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Capital Gains
0,2013-01-02 00:00:00-05:00,27.067743,27.067743,26.413034,26.881601,8790205,0.0,0.0,A,
1,2013-01-03 00:00:00-05:00,26.920117,27.048492,26.689042,26.977886,5751791,0.0,0.0,A,
2,2013-01-04 00:00:00-05:00,27.048485,27.568401,26.868760,27.510632,6432897,0.0,0.0,A,
3,2013-01-07 00:00:00-05:00,27.343746,27.472121,27.202534,27.311653,3589505,0.0,0.0,A,
4,2013-01-08 00:00:00-05:00,27.260305,27.459286,27.022812,27.093418,3896925,0.0,0.0,A,
...,...,...,...,...,...,...,...,...,...,...
831335,2020-12-24 00:00:00-05:00,153.424264,155.295055,153.357115,154.191772,417400,0.0,0.0,ZTS,
831336,2020-12-28 00:00:00-05:00,154.882490,156.216030,153.798385,155.793900,1522400,0.0,0.0,ZTS,
831337,2020-12-29 00:00:00-05:00,156.580603,158.393833,155.803492,156.494247,1188400,0.0,0.0,ZTS,
831338,2020-12-30 00:00:00-05:00,156.868471,158.106082,156.532696,157.597610,1009000,0.0,0.0,ZTS,


In [6]:
SP500_all_stock_data = all_stock_data.drop(columns=['Dividends', 'Stock Splits','Capital Gains'])

#Daily returns
SP500_all_stock_data['daily_return'] = (
    SP500_all_stock_data.groupby('ticker')['Close']
    .pct_change()
)

#Monthly returns
monthly_returns = (
    SP500_all_stock_data
    .set_index('Date')
    .groupby('ticker')['Close']
    .resample('ME')
    .ffill()             # forward-fill missing days within month
    .pct_change()        # monthly percentage change
    .reset_index(name='monthly_return')
)

SP500_all_stock_data = SP500_all_stock_data.merge(
    monthly_returns[['Date', 'ticker', 'monthly_return']],
    on=['Date', 'ticker'],
    how='left'
)

#Check index membership, if tikcer is in ticker_list for that month
SP500_all_stock_data['Date'] = SP500_all_stock_data['Date'].dt.tz_localize(None)
SP500_all_stock_data['month'] = SP500_all_stock_data['Date'].dt.to_period('M').dt.to_timestamp()


SP500_all_stock_data = SP500_all_stock_data.merge(
    survivors_df[['month', 'ticker_list']],
    on='month',
    how='left'
)

SP500_all_stock_data['membership_index'] = SP500_all_stock_data.apply(
    lambda row: row['ticker'] in row['ticker_list'],
    axis=1
)

SP500_all_stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,ticker,daily_return,monthly_return,month,ticker_list,membership_index
0,2013-01-02,27.067743,27.067743,26.413034,26.881601,8790205,A,,,2013-01-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",True
1,2013-01-03,26.920117,27.048492,26.689042,26.977886,5751791,A,0.003582,,2013-01-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",True
2,2013-01-04,27.048485,27.568401,26.868760,27.510632,6432897,A,0.019747,,2013-01-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",True
3,2013-01-07,27.343746,27.472121,27.202534,27.311653,3589505,A,-0.007233,,2013-01-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",True
4,2013-01-08,27.260305,27.459286,27.022812,27.093418,3896925,A,-0.007991,,2013-01-01,"[A, AAPL, ABBV, ABC, ABT, ACN, ADBE, ADI, ADM,...",True
...,...,...,...,...,...,...,...,...,...,...,...,...
831335,2020-12-24,153.424264,155.295055,153.357115,154.191772,417400,ZTS,0.005443,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
831336,2020-12-28,154.882490,156.216030,153.798385,155.793900,1522400,ZTS,0.010390,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
831337,2020-12-29,156.580603,158.393833,155.803492,156.494247,1188400,ZTS,0.004495,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True
831338,2020-12-30,156.868471,158.106082,156.532696,157.597610,1009000,ZTS,0.007051,,2020-12-01,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...",True


In [7]:
SP500_all_stock_data_Final = SP500_all_stock_data.drop(columns=['ticker_list','month'])

SP500_all_stock_data_Final

Unnamed: 0,Date,Open,High,Low,Close,Volume,ticker,daily_return,monthly_return,membership_index
0,2013-01-02,27.067743,27.067743,26.413034,26.881601,8790205,A,,,True
1,2013-01-03,26.920117,27.048492,26.689042,26.977886,5751791,A,0.003582,,True
2,2013-01-04,27.048485,27.568401,26.868760,27.510632,6432897,A,0.019747,,True
3,2013-01-07,27.343746,27.472121,27.202534,27.311653,3589505,A,-0.007233,,True
4,2013-01-08,27.260305,27.459286,27.022812,27.093418,3896925,A,-0.007991,,True
...,...,...,...,...,...,...,...,...,...,...
831335,2020-12-24,153.424264,155.295055,153.357115,154.191772,417400,ZTS,0.005443,,True
831336,2020-12-28,154.882490,156.216030,153.798385,155.793900,1522400,ZTS,0.010390,,True
831337,2020-12-29,156.580603,158.393833,155.803492,156.494247,1188400,ZTS,0.004495,,True
831338,2020-12-30,156.868471,158.106082,156.532696,157.597610,1009000,ZTS,0.007051,,True


In [8]:
# import pandas as pd

# # Show all rows
# pd.set_option('display.max_rows', None)

# # Show all columns
# pd.set_option('display.max_columns', None)

# # Show entire width
# pd.set_option('display.width', None)

# # Do not truncate column content
# pd.set_option('display.max_colwidth', None)

SP500_all_stock_data_Final[SP500_all_stock_data_Final['ticker'] == 'META']


Unnamed: 0,Date,Open,High,Low,Close,Volume,ticker,daily_return,monthly_return,membership_index
500501,2014-01-02,54.494062,54.881671,53.857980,54.374794,43195500,META,,,True
500502,2014-01-03,54.682896,55.309037,54.195897,54.225716,38246200,META,-0.002742,,True
500503,2014-01-06,54.086567,56.909167,53.718835,56.849537,68852600,META,0.048387,,True
500504,2014-01-07,57.346479,58.191269,56.869420,57.565128,77207400,META,0.012587,,True
500505,2014-01-08,57.247085,58.052124,56.879353,57.873226,56682400,META,0.005352,,True
...,...,...,...,...,...,...,...,...,...,...
502259,2020-12-24,267.232539,268.743215,264.568967,265.761597,6702000,META,-0.002648,,True
502260,2020-12-28,267.093395,275.600945,264.032280,275.302795,23299700,META,0.035901,,True
502261,2020-12-29,275.253170,278.791356,274.587261,275.084198,16383000,META,-0.000794,,True
502262,2020-12-30,276.247019,276.376197,270.045231,270.204254,11803800,META,-0.017740,,True
