In [None]:
import pandas as pd

# Fetch every HTML table found on the Wikipedia S&P 500 page. The script
# treats tables[0] as the current constituent list and tables[1] as the
# log of historical changes.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(url)

# Current S&P 500 snapshot (2024): keep only the four columns used
# downstream and give them the shorter names the rest of the script expects.
current_sp500 = tables[0][['Symbol', 'Security', 'CIK', 'GICS Sector']].rename(
    columns={
        'Symbol': 'Ticker',
        'Security': 'Company',
        'GICS Sector': 'Sector',
    }
)

# Historical changes: relabel the six columns of the change-log table with
# flat names, parse the dates (unparseable values become NaT), then keep
# only dated rows from 2018 through 2024, in chronological order.
changes = tables[1].copy()
changes.columns = ['Date', 'Added Ticker', 'Added Company', 'Removed Ticker', 'Removed Company', 'Reason']
changes['Date'] = pd.to_datetime(changes['Date'], errors='coerce')
has_date = changes['Date'].notna()
in_window = changes['Date'].dt.year.between(2018, 2024)
changes = changes.loc[has_date & in_window].sort_values('Date')

# Calendar years covered by the analysis: 2018 through 2024 inclusive.
years = [*range(2018, 2025)]
# Per-year roster, filled in later: year -> {company name -> details}.
membership = dict((y, {}) for y in years)
# Per-company record, filled in later: company name -> start/end year,
# ticker, CIK and sector.
company_lifetime = dict()

# Replay the change log row by row, keeping one record per company name.
# Each record stores the first ('start') and last ('end') year of
# membership; 'end' stays None until a removal row for that name is seen.
# 'cik' and 'sector' are not in the change log and are left as None here.
for _, change in changes.iterrows():
    change_year = change['Date'].year

    joiner = change['Added Company']
    if pd.notna(joiner):
        joiner_ticker = change['Added Ticker']
        record = company_lifetime.get(joiner)
        if record is None:
            company_lifetime[joiner] = {
                'start': change_year,
                'end': None,
                'ticker': joiner_ticker,
                'cik': None,
                'sector': None,
            }
        else:
            # Already known: keep the earliest start, adopt this row's ticker.
            record['start'] = min(record['start'], change_year)
            record['ticker'] = joiner_ticker

    leaver = change['Removed Company']
    if pd.notna(leaver):
        leaver_ticker = change['Removed Ticker']
        record = company_lifetime.get(leaver)
        if record is None:
            # First seen as a removal, so no addition row fell inside the
            # window; membership is taken to start in the first year, 2018.
            company_lifetime[leaver] = {
                'start': 2018,
                'end': change_year,
                'ticker': leaver_ticker,
                'cik': None,
                'sector': None,
            }
        else:
            record['end'] = change_year
            # Only overwrite the stored ticker when this row provides one.
            if pd.notna(leaver_ticker):
                record['ticker'] = leaver_ticker

# Merge the current snapshot (2024) into the per-company records. Names not
# seen in the change log get a fresh record spanning from 2018 with no end
# year; names already present only have their missing CIK / sector filled.
for _, member in current_sp500.iterrows():
    name = member['Company']
    raw_cik = member['CIK']
    # CIK is stored as a plain integer string; a missing value becomes None.
    cik_text = None if pd.isna(raw_cik) else str(int(raw_cik))

    record = company_lifetime.get(name)
    if record is None:
        company_lifetime[name] = dict(
            start=2018,
            end=None,
            ticker=member['Ticker'],
            cik=cik_text,
            sector=member['Sector'],
        )
        continue

    # Existing record: never overwrite a value that is already set.
    if record.get('cik') is None and cik_text is not None:
        record['cik'] = cik_text

    if record.get('sector') is None:
        record['sector'] = member['Sector']



# Build yearly membership: a company is listed for a year when that year
# lies within its [start, end] span (both ends inclusive, open-ended when
# 'end' is None). Companies whose known sector is 'financials' (compared
# case-insensitively) are left out; companies with no sector are kept.
for year in years:
    roster = membership[year]
    for company, info in company_lifetime.items():
        started = info['start'] <= year
        still_in = info['end'] is None or year <= info['end']
        if not (started and still_in):
            continue

        sector = info.get('sector')
        if sector is not None and sector.lower() == 'financials':
            continue

        roster[company] = {
            'Ticker': info['ticker'],
            'CIK': info['cik'],
            'Sector': sector,
        }

# Write one CSV per year, with rows ordered by company name.
csv_columns = ['Company', 'Ticker', 'CIK', 'Sector']
for year in years:
    data = [
        {'Company': company, 'Ticker': info['Ticker'], 'CIK': info['CIK'], 'Sector': info['Sector']}
        for company, info in sorted(membership[year].items())
    ]
    # Passing the columns explicitly keeps the header row even if a year
    # ends up with no companies (an empty list would give a column-less frame).
    df = pd.DataFrame(data, columns=csv_columns)
    # Build the file name once so the log line always names the file that
    # was actually written (the message previously printed "._1csv").
    filename = f"sp500_full_{year}_1.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename} with {len(df)} companies.")

Saved sp500_full_2018.csv with 486 companies.
Saved sp500_full_2019.csv with 484 companies.
Saved sp500_full_2020.csv with 478 companies.
Saved sp500_full_2021.csv with 478 companies.
Saved sp500_full_2022.csv with 474 companies.
Saved sp500_full_2023.csv with 471 companies.
Saved sp500_full_2024.csv with 470 companies.
