In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import pickle
import os

# Shared data folder, expressed relative to the current working directory.
DATA_DIR = os.path.join('..', 'data')

# Echo the resolved absolute location so a wrong working directory is obvious.
data_dir_abs = os.path.abspath(DATA_DIR)
print(f'Data directory: {data_dir_abs}')

In [None]:
# Cell 2: Scrape S&P 500 Sector Data from Wikipedia
# Downloads the Wikipedia constituents page, parses its HTML tables with pandas
# and keeps the table that carries the ticker / GICS classification columns.
import requests
from io import StringIO

url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
headers = {'User-Agent': 'Mozilla/5.0 (stock-prediction-ml project)'}
# requests applies no timeout by default; without one a stalled connection
# would block this cell (and the kernel) indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

tables = pd.read_html(StringIO(response.text))

# Pick the first table that actually has the columns used by the later cells
# instead of trusting its position on the page; fail loudly if none matches.
required_columns = {'Symbol', 'GICS Sector', 'GICS Sub-Industry'}
sp500_df = next(
    (table for table in tables if required_columns.issubset(table.columns)),
    None,
)
if sp500_df is None:
    raise ValueError(
        f'No table with columns {sorted(required_columns)} found at {url}; '
        'the page layout may have changed'
    )

print(f'Total companies: {len(sp500_df)}')
print(f'Columns: {list(sp500_df.columns)}')
print(f'\nUnique sectors: {sp500_df["GICS Sector"].nunique()}')
print(f'Unique sub-industries: {sp500_df["GICS Sub-Industry"].nunique()}')

sp500_df.head()

In [None]:
# Cell 3: Build Sector Dictionary
# Every company is registered under its Wikipedia symbol (dot-separated,
# e.g. BRK.B) and under the yfinance spelling (dash-separated, e.g. BRK-B).
sector_data = {}

for record in sp500_df.to_dict('records'):
    symbol_dotted = str(record['Symbol']).strip()
    symbol_dashed = symbol_dotted.replace('.', '-')

    info = {
        'sector': record['GICS Sector'],
        'sub_industry': record['GICS Sub-Industry'],
        'company': record.get('Security', ''),
    }

    # Register both spellings; they share one dict object. When the symbol
    # has no dot the two keys coincide and the second assignment is a no-op.
    for key in (symbol_dotted, symbol_dashed):
        sector_data[key] = info

# Count includes the extra dash-format aliases, so it can exceed the row count.
print(f'Sector entries: {len(sector_data)}')

# Show sector distribution
sector_counts = sp500_df['GICS Sector'].value_counts()
print('\nSector Distribution:')
for sector_name, n_companies in sector_counts.items():
    print(f'  {sector_name}: {n_companies}')

In [None]:
# Cell 4: Cross-check with Price Data
# Compares the tickers in price_data.pkl (if present) against the sector
# dictionary and reports which price tickers have no sector entry.
price_path = os.path.join(DATA_DIR, 'price_data.pkl')
if not os.path.exists(price_path):
    print('price_data.pkl not found - run notebook 01c first')
else:
    # NOTE: unpickling runs code embedded in the file; only load pickles
    # produced by this project.
    with open(price_path, 'rb') as fh:
        price_data = pickle.load(fh)

    price_tickers = set(price_data.keys())
    sector_tickers = set(sector_data)

    matched = price_tickers.intersection(sector_tickers)
    missing = price_tickers.difference(sector_tickers)

    print(f'Price data tickers: {len(price_tickers)}')
    print(f'Sector data tickers: {len(sector_tickers)}')
    print(f'Matched: {len(matched)}')
    if missing:
        missing_sorted = sorted(missing)
        # Long lists are truncated to the first 20 tickers.
        if len(missing) > 20:
            print(f'Missing sector data for: {missing_sorted[:20]}...')
        else:
            print(f'Missing sector data for: {missing_sorted}')

In [None]:
# Cell 5: Save Sector Data
# Pickles the sector dictionary into DATA_DIR and prints a short summary.

# The cross-check cell tolerates missing price data, so the data directory may
# not exist yet; create it rather than failing with FileNotFoundError on open().
os.makedirs(DATA_DIR, exist_ok=True)

output_path = os.path.join(DATA_DIR, 'sector_data.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(sector_data, f)

print(f'Saved sector_data.pkl: {len(sector_data)} entries')
print(f'File size: {os.path.getsize(output_path) / 1024:.1f} KB')

# Sample entries (tickers absent from the dictionary are skipped silently)
print(f'\nSample entries:')
for ticker in ['AAPL', 'JPM', 'JNJ']:
    if ticker in sector_data:
        print(f'  {ticker}: {sector_data[ticker]}')

In [None]:
# Cell 6: Verification
# Reloads the saved pickle and checks that it is a non-empty dict whose
# entries all carry the keys the sector dictionary is built with.
with open(os.path.join(DATA_DIR, 'sector_data.pkl'), 'rb') as f:
    loaded = pickle.load(f)

assert isinstance(loaded, dict), 'Expected dict'
assert loaded, 'sector_data.pkl contains no entries'

# Validate every entry rather than a single hard-coded ticker, so a failure
# names the offending ticker instead of blaming a key on a missing company.
required_keys = ('sector', 'sub_industry')
for ticker, entry in loaded.items():
    assert isinstance(entry, dict), f'Entry for {ticker} is not a dict'
    for key in required_keys:
        assert key in entry, f'Missing {key} key for {ticker}'

# Prefer AAPL for the printed sample; fall back to the first entry if that
# ticker is not in the file.
sample = loaded['AAPL'] if 'AAPL' in loaded else next(iter(loaded.values()))

print('Verification passed!')
print(f'  Entries: {len(loaded)}')
print(f'  Format: Dict[str, Dict[str, str]]')
print(f'  Keys per entry: {list(sample.keys())}')
print(f'\nReady for notebook 02.')