In [1]:
import time
from pathlib import Path

import pandas as pd
import requests

# Upper-cased ticker -> raw (unpadded) CIK string. Populated by the first
# successful download so repeated lookups do not hit the network again.
_TICKER_CIK_CACHE = {}


def _load_ticker_cik_map():
    """Return the ticker -> CIK mapping, downloading and parsing it at most once."""
    if not _TICKER_CIK_CACHE:
        url = "https://www.sec.gov/include/ticker.txt"
        # NOTE(review): SEC.gov's fair-access guidance asks automated clients to
        # identify themselves with a descriptive User-Agent -- confirm whether a
        # header is required for this endpoint.
        response = requests.get(url, timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently returning None for every ticker.
        response.raise_for_status()

        parsed = {}
        for line in response.text.splitlines():
            parts = line.strip().split('\t')
            if len(parts) == 2:
                symbol, cik = parts
                # setdefault keeps the first occurrence, matching a top-down scan.
                parsed.setdefault(symbol.upper(), cik)
        _TICKER_CIK_CACHE.update(parsed)
    return _TICKER_CIK_CACHE


def get_cik(ticker):
    """Look up the SEC CIK for a ticker symbol.

    The SEC ticker list is fetched once and cached in memory, so calling this
    for many tickers (e.g. via ``Series.apply``) costs a single download.

    Args:
        ticker: Ticker symbol; matching is case-insensitive.

    Returns:
        The CIK as a string left-padded with zeros to 10 characters, or
        ``None`` when the ticker is not present in the list.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    mapping = _load_ticker_cik_map()
    key = ticker.upper()

    cik = mapping.get(key)
    if cik is None:
        # Fallback for share-class tickers: other sources write "BRK.B" where
        # the SEC list is expected to use "BRK-B" (TODO confirm against the file).
        cik = mapping.get(key.replace('.', '-'))

    return None if cik is None else cik.zfill(10)

# Read S&P 500 components. pd.read_html returns every table found on the page;
# the first one is used as the constituents list.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(url)
df = tables[0]

# Rename columns to the shorter names used in the output file
df = df.rename(columns={
    'Symbol': 'Ticker',
    'Security': 'Company',
    'GICS Sector': 'Sector',
    'GICS Sub-Industry': 'Sub-Industry',
    'Headquarters Location': 'Headquarters',
    'Date added': 'Date_Added',
    'Founded': 'Founded_Year'
})

# Add CIK column (None where get_cik finds no match for the ticker)
df['CIK'] = df['Ticker'].apply(get_cik)

# Reorder columns; any column not listed here is dropped from the output
columns_order = ['Ticker', 'CIK', 'Company', 'Sector', 'Sub-Industry', 'Headquarters', 'Date_Added', 'Founded_Year']
df = df[columns_order]

# Save to CSV. to_csv does not create missing directories, so make sure the
# target folder exists first instead of failing on a fresh checkout.
output_path = Path('data/sp500_companies.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print("CSV file 'sp500_companies.csv' has been created.")

CSV file 'sp500_companies.csv' has been created.
