In [1]:
# Cell 1: Imports and Configuration
import os
import pickle
import shutil
import time
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests
import yfinance as yf
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# --- Run-wide settings ---
DATA_DIR = os.path.join('..', 'data')  # resolved relative to the working directory
START_DATE = '2009-01-01'              # lower bound of the download range
SLEEP_TIME = 0.5  # seconds between requests

# Echo the effective configuration so the run is self-describing.
print(
    f'yfinance version: {yf.__version__}',
    f'Data directory: {os.path.abspath(DATA_DIR)}',
    f'Download range: {START_DATE} to present',
    sep='\n',
)

yfinance version: 0.2.35
Data directory: c:\Users\chris\stock-prediction-ml\data
Download range: 2009-01-01 to present


In [2]:
# Cell 2: Get S&P 500 Ticker List from Wikipedia
#
# Fetches the constituents page, parses its first HTML table into `sp500_df`,
# and derives `sp500_tickers` (a list of symbols with '.' replaced by '-').
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
headers = {'User-Agent': 'Mozilla/5.0 (stock-prediction-ml project)'}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

tables = pd.read_html(StringIO(response.text))
sp500_df = tables[0]

# Fail loudly if the page layout changed and the first table is no longer
# the constituents table, instead of failing later with an opaque KeyError.
missing_cols = {'Symbol', 'GICS Sector'} - set(sp500_df.columns)
if missing_cols:
    raise ValueError(
        f'Unexpected table layout from {url}: missing columns {sorted(missing_cols)}'
    )

print(f'Columns: {list(sp500_df.columns)}')
print(f'Total companies: {len(sp500_df)}')

# Extract tickers - handle special characters
sp500_tickers = sp500_df['Symbol'].str.replace('.', '-', regex=False).tolist()

# Show sector breakdown
print(f'\nSectors:')
for sector, count in sp500_df['GICS Sector'].value_counts().items():
    print(f'  {sector}: {count}')

print(f'\nFirst 10 tickers: {sp500_tickers[:10]}')

Columns: ['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'Headquarters Location', 'Date added', 'CIK', 'Founded']
Total companies: 503

Sectors:
  Industrials: 79
  Financials: 76
  Information Technology: 71
  Health Care: 60
  Consumer Discretionary: 48
  Consumer Staples: 36
  Utilities: 31
  Real Estate: 31
  Materials: 26
  Communication Services: 23
  Energy: 22

First 10 tickers: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


In [3]:
# Cell 3: Download Price Data (batch mode)
#
# Downloads `sp500_tickers` in batches and fills two module-level results:
#   price_data     - dict mapping ticker -> DataFrame of PRICE_COLUMNS
#   failed_tickers - tickers that produced no usable frame (retried in Cell 4)
PRICE_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
MIN_ROWS = 100            # a ticker is kept only if it has MORE rows than this
BATCH_SIZE = 50           # tickers per yf.download call
BATCH_PAUSE_SECONDS = 2   # pause between consecutive batches


def _extract_ticker_frame(batch_df, ticker, single):
    """Pull one ticker's cleaned price frame out of a batch download result.

    Args:
        batch_df: DataFrame returned by ``yf.download`` for the batch.
        ticker: Symbol to extract.
        single: True when the batch contained exactly one ticker.

    Returns:
        A copy of the ticker's PRICE_COLUMNS with all-NaN rows dropped and a
        timezone-naive index, or None when it has MIN_ROWS rows or fewer.

    Raises:
        KeyError: if the ticker or a required column is absent, or if a
            multi-ticker batch unexpectedly came back with flat columns.
    """
    # Detect the layout from the frame itself rather than from the batch size.
    # NOTE(review): whether a one-ticker download returns flat or MultiIndex
    # columns appears to depend on the yfinance version - confirm.
    if isinstance(batch_df.columns, pd.MultiIndex):
        ticker_df = batch_df[ticker]
    elif single:
        ticker_df = batch_df
    else:
        # Flat columns cannot be attributed to one ticker of many.
        raise KeyError(ticker)

    ticker_df = ticker_df[PRICE_COLUMNS].dropna(how='all').copy()
    if len(ticker_df) <= MIN_ROWS:
        return None
    if ticker_df.index.tz:
        ticker_df.index = ticker_df.index.tz_localize(None)
    return ticker_df


# Reset results so re-running this cell starts from a clean slate.
price_data = {}
failed_tickers = []

total = len(sp500_tickers)
print(f'Downloading {total} tickers in batches of {BATCH_SIZE}...\n')

for batch_start in range(0, total, BATCH_SIZE):
    batch = sp500_tickers[batch_start:batch_start + BATCH_SIZE]
    batch_num = batch_start // BATCH_SIZE + 1
    print(f'  Batch {batch_num}: tickers {batch_start+1}-{batch_start+len(batch)}...', end=' ')

    try:
        batch_df = yf.download(
            batch,
            start=START_DATE,
            auto_adjust=False,
            group_by='ticker',
            threads=False,
            progress=False
        )
    except Exception as e:
        print(f'BATCH FAILED: {e}')
        failed_tickers.extend(batch)
    else:
        is_single = len(batch) == 1
        for ticker in batch:
            try:
                ticker_df = _extract_ticker_frame(batch_df, ticker, is_single)
            except Exception:
                ticker_df = None
            # Every ticker ends up in exactly one of the two result containers,
            # including a one-ticker batch that returned too few rows.
            if ticker_df is None:
                failed_tickers.append(ticker)
            else:
                price_data[ticker] = ticker_df
        print(f'{len(price_data)} total successful')

    # No need to wait after the final batch.
    if batch_start + BATCH_SIZE < total:
        time.sleep(BATCH_PAUSE_SECONDS)  # pause between batches

print(f'\n=== Download Complete ===')
print(f'Successful: {len(price_data)}')
print(f'Failed: {len(failed_tickers)}')
if failed_tickers:
    print(f'Failed tickers: {failed_tickers[:20]}')

Downloading 503 tickers in batches of 50...

  Batch 1: tickers 1-50... 

Failed to get ticker 'APH' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AOS' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ARE' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'APD' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ALLE' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AKAM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'APTV' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AIG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ANET' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AIZ' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ALGN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AWK' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'AMZN' reason: Expecting value: line 1 column 1 (c

0 total successful
  Batch 2: tickers 51-100... 

Failed to get ticker 'ADP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BBY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BLK' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CNP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CARR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BRO' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CAH' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CRL' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CHTR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CPB' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CVNA' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BIIB' reason: Expecting value: line 1 column 1 (char

0 total successful
  Batch 3: tickers 101-150... 

Failed to get ticker 'COP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'KO' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CSCO' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DXCM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ED' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DELL' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CINF' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CPRT' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CIEN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CLX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'STZ' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DVA' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'CAG' reason: Expecting value: line 1 column 1 (cha

0 total successful
  Batch 4: tickers 151-200... 

Failed to get ticker 'ETN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'EMR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'FDS' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'FAST' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DUK' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'EL' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'FITB' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'EXC' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'DHI' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'EW' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ELV' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'EPAM' reason: Expecting value: line 1 column 1 (char 0)

0 total successful
  Batch 5: tickers 201-250... 

Failed to get ticker 'IBM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'GEHC' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'GDDY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'HPE' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'IEX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'HIG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'GS' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'HBAN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'HSY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'GNRC' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'BEN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'GPC' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'INCY' reason: Expecting value: line 1 column 1 (cha

0 total successful
  Batch 6: tickers 251-300... 

Failed to get ticker 'KDP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'KMB' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'LHX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'LII' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'LLY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MAR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'LULU' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'IP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'IVZ' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'LYB' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'JKHY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ISRG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'IRM' reason: Expecting value: line 1 column 1 (char 

0 total successful
  Batch 7: tickers 301-350... 

Failed to get ticker 'MTCH' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MTD' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MRK' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MGM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'TAP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'NXPI' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'NKE' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MCK' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MA' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MSI' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MOS' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'MLM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'META' reason: Expecting value: line 1 column 1 (char 

0 total successful
  Batch 8: tickers 351-400... 

Failed to get ticker 'PM' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PNR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'RTX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'RMD' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PEP' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'REGN' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PAYX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PSX' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PCG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PFG' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ON' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'PCAR' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'ROK' reason: Expecting value: line 1 column 1 (char 0

KeyboardInterrupt: 

In [None]:
# Cell 4: Retry Failed Tickers (one at a time using Ticker object)
#
# Walks `failed_tickers` from the batch download, fetches each symbol on its
# own, and adds any frame with more than 100 rows to `price_data`. Symbols
# that still cannot be fetched are collected in `still_failed`.
if failed_tickers:
    print(f'Retrying {len(failed_tickers)} failed tickers individually...\n')
    still_failed = []
    for ticker in failed_tickers:
        try:
            time.sleep(1)  # throttle: one request per second
            t = yf.Ticker(ticker)
            df = t.history(start=START_DATE, auto_adjust=False)
            if len(df) > 100:
                df.index = df.index.tz_localize(None) if df.index.tz else df.index
                # Keep only the price columns that are actually present.
                cols = [c for c in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'] if c in df.columns]
                price_data[ticker] = df[cols].copy()
                print(f'  Recovered: {ticker}')
            else:
                still_failed.append(ticker)
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop; report the reason instead of hiding it.
            print(f'  Retry failed: {ticker} ({type(exc).__name__}: {exc})')
            still_failed.append(ticker)
    
    print(f'\nRecovered: {len(failed_tickers) - len(still_failed)}')
    if still_failed:
        print(f'Still failed ({len(still_failed)}): {still_failed[:20]}')
else:
    print('No failed tickers to retry.')

print(f'\nTotal tickers with price data: {len(price_data)}')

In [None]:
# Cell 5: Data Quality Report
#
# Summarises `price_data`: per-ticker date coverage and a sample of one frame.
print('=== DATA QUALITY REPORT ===')
print(f'Total tickers: {len(price_data)}')

if not price_data:
    # With nothing downloaded the summary below would raise KeyError /
    # IndexError on the empty frame, so report the situation instead.
    print('\nNo price data available - re-run the download cells before continuing.')
else:
    # Date range summary: one record per ticker, built in a single pass.
    dr = pd.DataFrame([
        {
            'ticker': ticker,
            'start': df.index.min(),
            'end': df.index.max(),
            'trading_days': len(df),
        }
        for ticker, df in price_data.items()
    ])
    print(f'\nDate Coverage:')
    print(f'  Earliest start: {dr["start"].min().strftime("%Y-%m-%d")}')
    print(f'  Latest end:     {dr["end"].max().strftime("%Y-%m-%d")}')
    print(f'  Avg trading days: {dr["trading_days"].mean():.0f}')
    print(f'  Min trading days: {dr["trading_days"].min()} ({dr.loc[dr["trading_days"].idxmin(), "ticker"]})')
    print(f'  Max trading days: {dr["trading_days"].max()}')

    # Sample data: the first ticker in insertion order.
    sample_ticker = next(iter(price_data))
    print(f'\nSample ({sample_ticker}):')
    print(price_data[sample_ticker].tail(3))
    print(f'\nColumns: {list(price_data[sample_ticker].columns)}')

In [None]:
# Cell 6: Backup Old Data and Save
#
# Copies an existing price_data.pkl to a one-time backup, then pickles the
# in-memory `price_data` dict over it.
old_path = os.path.join(DATA_DIR, 'price_data.pkl')
backup_path = os.path.join(DATA_DIR, 'price_data_kaggle_backup.pkl')

# Never replace a previously saved dataset with an empty download result.
if not price_data:
    raise RuntimeError(
        'price_data is empty - refusing to overwrite price_data.pkl. '
        'Re-run the download cells first.'
    )

os.makedirs(DATA_DIR, exist_ok=True)

# Backup existing file (only the first time, so the original is never clobbered)
if os.path.exists(old_path) and not os.path.exists(backup_path):
    shutil.copy2(old_path, backup_path)
    print(f'Backed up old price data to {backup_path}')

# Save new price data: write to a temp file and swap it in, so an interrupted
# dump cannot leave a truncated price_data.pkl behind.
tmp_path = old_path + '.tmp'
with open(tmp_path, 'wb') as f:
    pickle.dump(price_data, f)
os.replace(tmp_path, old_path)

print(f'Saved price_data.pkl: {len(price_data)} tickers')
print(f'File size: {os.path.getsize(old_path) / 1024 / 1024:.1f} MB')

In [None]:
# Cell 7: Verification
#
# Reloads price_data.pkl and checks that it is a non-empty dict whose values
# are all DataFrames with a DatetimeIndex and the expected price columns.
# NOTE: pickle.load executes arbitrary code from the file; only load a
# price_data.pkl that this project produced.
with open(os.path.join(DATA_DIR, 'price_data.pkl'), 'rb') as f:
    loaded = pickle.load(f)

assert isinstance(loaded, dict), 'Expected dict'
assert loaded, 'Expected at least one ticker in price_data.pkl'

expected_cols = {'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'}

# Check every entry, not just the first, so one malformed frame cannot slip
# through to later notebooks.
for ticker, frame in loaded.items():
    assert isinstance(frame, pd.DataFrame), f'{ticker}: Expected DataFrame'
    assert isinstance(frame.index, pd.DatetimeIndex), f'{ticker}: Expected DatetimeIndex'
    assert expected_cols.issubset(set(frame.columns)), (
        f'{ticker}: Missing columns: {expected_cols - set(frame.columns)}'
    )

sample_key = next(iter(loaded))
sample_df = loaded[sample_key]

print('Verification passed!')
print(f'  Tickers: {len(loaded)}')
print(f'  Format: Dict[str, DataFrame]')
print(f'  Columns: {list(sample_df.columns)}')
print(f'  Index type: DatetimeIndex')
print(f'\nReady for notebook 02.')