In [4]:
# Cell 1: Imports and Setup
import yfinance as yf
import pandas as pd
import numpy as np
import time
import pickle
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("✓ Imports successful")
print(f"Working directory: {os.getcwd()}")

✓ Imports successful
Working directory: c:\Users\chris\stock-prediction-ml\notebooks


In [13]:
# Cell 2: Get Company List
"""
Using companies available in the Kaggle dataset.
Ticker names match the CSV filenames exactly.
"""

# Companies that exist in the Kaggle individual_stocks_5yr folder
tickers = [
    'AAPL',     # Apple - AAPL_data.csv
    'MSFT',     # Microsoft - MSFT_data.csv
    'GOOGL',    # Google - GOOGL_data.csv (you have this!)
    'AMZN',     # Amazon - AMZN_data.csv
    'FB',       # Facebook - FB_data.csv (was Facebook 2013-2018)
    'INTC',     # Intel - INTC_data.csv
    'NVDA',     # Nvidia - NVDA_data.csv
    'V',        # Visa - V_data.csv
    'JPM',      # JPMorgan - JPM_data.csv
    'UNH'       # UnitedHealth - UNH_data.csv
]

print(f"✓ Using {len(tickers)} tickers for prototype")
print(f"Companies: {', '.join(tickers)}")

# Create data directory if it doesn't exist
os.makedirs('../data', exist_ok=True)

# Save ticker list
with open('../data/tickers.pkl', 'wb') as f:
    pickle.dump(tickers, f)

print("\n✓ Ticker list saved to data/tickers.pkl")

✓ Using 10 tickers for prototype
Companies: AAPL, MSFT, GOOGL, AMZN, FB, INTC, NVDA, V, JPM, UNH

✓ Ticker list saved to data/tickers.pkl


In [14]:
# Cell 3: Load Stock Price Data from Kaggle Dataset
"""
Loading historical stock price data from Kaggle S&P 500 dataset.
Source: https://www.kaggle.com/datasets/camnugent/sandp500
"""

import os
import glob

print("Loading stock price data from Kaggle CSV files...\n")

# Path to the CSV files - use os.path.join for Windows compatibility
base_dir = os.path.join('..', 'data', 'raw', 'kaggle', 'individual_stocks_5yr')

# Verify directory exists
if not os.path.exists(base_dir):
    print(f"❌ Directory not found: {base_dir}")
    raise FileNotFoundError(f"Data directory not found: {base_dir}")

print(f"✓ Found data directory: {base_dir}")

# Check for CSV files
csv_files = glob.glob(os.path.join(base_dir, '*.csv'))
print(f"✓ Found {len(csv_files)} CSV files\n")

all_price_data = {}
failed = []

for ticker in tickers:
    # File naming pattern: AAPL_data.csv - use os.path.join
    filepath = os.path.join(base_dir, f'{ticker}_data.csv')
    
    if not os.path.exists(filepath):
        print(f"  ⚠ {ticker}: File not found ({ticker}_data.csv)")
        failed.append(ticker)
        continue
    
    try:
        print(f"  Loading {ticker}...", end=" ")
        
        # Read CSV
        df = pd.read_csv(filepath)
        
        # Debug: Show columns for first file
        if ticker == tickers[0]:
            print(f"\n    Columns: {list(df.columns)}")
        
        # This dataset uses 'date' column (lowercase)
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
            df = df.set_index('date')
        else:
            print(f"✗ No date column. Columns: {list(df.columns)}")
            failed.append(ticker)
            continue
        
        # Standardize to title case
        df.columns = [col.strip().title() for col in df.columns]
        
        # Check for required columns
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_cols = [col for col in required_cols if col not in df.columns]
        
        if missing_cols:
            print(f"✗ Missing: {missing_cols}. Has: {list(df.columns)}")
            failed.append(ticker)
            continue
        
        # Add Adj Close (this dataset doesn't have it, use Close)
        df['Adj Close'] = df['Close']
        
        # Keep only needed columns
        keep_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
        df = df[keep_cols]
        
        # Convert to numeric
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # Remove NaN rows
        df = df.dropna()
        
        # Sort by date
        df = df.sort_index()
        
        if len(df) > 100:  # At least 100 days
            all_price_data[ticker] = df
            print(f"✓ ({len(df)} days, {df.index[0].date()} to {df.index[-1].date()})")
        else:
            print(f"⚠ Only {len(df)} days")
            failed.append(ticker)
    
    except Exception as e:
        print(f"✗ Error: {str(e)[:50]}")
        import traceback
        print(traceback.format_exc()[:200])
        failed.append(ticker)

print(f"\n{'='*60}")
print(f"✓ Successfully loaded: {len(all_price_data)} companies")
print(f"⚠ Failed to load: {len(failed)} companies")

if failed:
    print(f"\nFailed tickers: {failed}")

if len(all_price_data) > 0:
    # Save to pickle
    with open('../data/price_data.pkl', 'wb') as f:
        pickle.dump(all_price_data, f)
    
    print(f"\n✓ Price data saved to data/price_data.pkl")
    print(f"Total daily records: {sum(len(df) for df in all_price_data.values()):,}")
    
    # Show sample
    sample_ticker = list(all_price_data.keys())[0]
    print(f"\nSample data ({sample_ticker}):")
    print(all_price_data[sample_ticker].head())
    print(f"\nDate range: {all_price_data[sample_ticker].index.min().date()} to {all_price_data[sample_ticker].index.max().date()}")
else:
    print("\n❌ No data loaded successfully!")

Loading stock price data from Kaggle CSV files...

✓ Found data directory: ..\data\raw\kaggle\individual_stocks_5yr
✓ Found 505 CSV files

  Loading AAPL... 
    Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'Name']
✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading MSFT... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading GOOGL... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading AMZN... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading FB... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading INTC... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading NVDA... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading V... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading JPM... ✓ (1259 days, 2013-02-08 to 2018-02-07)
  Loading UNH... ✓ (1259 days, 2013-02-08 to 2018-02-07)

✓ Successfully loaded: 10 companies
⚠ Failed to load: 0 companies

✓ Price data saved to data/price_data.pkl
Total daily records: 12,590

Sample data (AAPL):
               Open     High      Low    Close     Volu

In [15]:
# Cell 4: Generate Synthetic Quarterly Financial Data
"""
Generating realistic quarterly financial statements.
Note: Real financial data would come from SEC EDGAR or premium APIs.
This demonstrates the feature engineering methodology.
"""

print("Generating synthetic quarterly financial data...")
print("(Kaggle dataset contains prices only, not financial statements)\n")

financial_data = []

# Financial parameters for each company (realistic ranges)
financial_params = {
    'AAPL': {'revenue': 90e9, 'margin': 0.25, 'debt_ratio': 0.3, 'growth': 0.05},
    'MSFT': {'revenue': 50e9, 'margin': 0.35, 'debt_ratio': 0.2, 'growth': 0.08},
    'GOOGL': {'revenue': 70e9, 'margin': 0.28, 'debt_ratio': 0.1, 'growth': 0.12},  # GOOGL
    'AMZN': {'revenue': 120e9, 'margin': 0.05, 'debt_ratio': 0.4, 'growth': 0.15},
    'NVDA': {'revenue': 8e9, 'margin': 0.30, 'debt_ratio': 0.1, 'growth': 0.25},
    'FB': {'revenue': 30e9, 'margin': 0.32, 'debt_ratio': 0.05, 'growth': 0.18},     # FB
    'INTC': {'revenue': 15e9, 'margin': 0.22, 'debt_ratio': 0.2, 'growth': 0.04},
    'JPM': {'revenue': 25e9, 'margin': 0.25, 'debt_ratio': 0.2, 'growth': 0.06},     # JPM
    'V': {'revenue': 7e9, 'margin': 0.50, 'debt_ratio': 0.3, 'growth': 0.10},
    'UNH': {'revenue': 70e9, 'margin': 0.08, 'debt_ratio': 0.25, 'growth': 0.12}
}

# Generate 10 years of quarterly data (40 quarters)
num_quarters = 40
np.random.seed(42)  # For reproducibility

for ticker in tickers:
    # Skip if we don't have price data for this ticker
    if ticker not in all_price_data:
        continue
    
    print(f"  Generating {ticker}...", end=" ")
    
    params = financial_params.get(ticker, {'revenue': 50e9, 'margin': 0.20, 'debt_ratio': 0.25, 'growth': 0.10})
    
    base_revenue = params['revenue']
    base_margin = params['margin']
    base_debt_ratio = params['debt_ratio']
    growth_rate = params['growth']
    
    # Create quarterly dates (going backwards from now)
    end_date = datetime.now()
    quarter_dates = []
    for i in range(num_quarters):
        date = end_date - timedelta(days=90*i)
        # Snap to quarter end
        quarter_end = pd.Timestamp(date.year, ((date.month-1)//3 + 1)*3, 1) + pd.offsets.QuarterEnd(0)
        quarter_dates.append(quarter_end)
    
    quarter_dates = sorted(quarter_dates)
    
    # Generate financial data for each quarter
    income_data = {}
    balance_data = {}
    
    for i, date in enumerate(quarter_dates):
        # Revenue grows over time with some noise
        quarters_from_start = i
        revenue = base_revenue * ((1 + growth_rate/4) ** quarters_from_start) * (1 + np.random.randn() * 0.05)
        
        # Profit margin varies slightly
        margin = base_margin * (1 + np.random.randn() * 0.1)
        margin = max(0.01, min(0.6, margin))  # Keep realistic
        
        gross_profit = revenue * (margin + 0.2)  # Gross margin higher than net
        net_income = revenue * margin
        
        # Balance sheet
        assets = revenue * 3 * (1 + np.random.randn() * 0.1)  # Assets ~3x quarterly revenue
        debt = assets * base_debt_ratio * (1 + np.random.randn() * 0.15)
        
        income_data[date] = {
            'Total Revenue': revenue,
            'Gross Profit': gross_profit,
            'Net Income': net_income
        }
        
        balance_data[date] = {
            'Total Assets': assets,
            'Total Debt': debt
        }
    
    # Convert to DataFrames (transpose so dates are columns, matching yfinance format)
    quarterly_income = pd.DataFrame(income_data)
    quarterly_balance = pd.DataFrame(balance_data)
    
    financial_data.append({
        'ticker': ticker,
        'quarterly_income': quarterly_income,
        'quarterly_balance': quarterly_balance,
        'info': {'symbol': ticker}
    })
    
    print(f"✓ ({len(quarter_dates)} quarters)")

print(f"\n{'='*60}")
print(f"✓ Generated financial data for {len(financial_data)} companies")

# Save
with open('../data/financial_data.pkl', 'wb') as f:
    pickle.dump(financial_data, f)

print(f"\n✓ Financial data saved to data/financial_data.pkl")

# Show sample
if financial_data:
    print(f"\nSample quarterly income ({financial_data[0]['ticker']}):")
    print(financial_data[0]['quarterly_income'].iloc[:, :5])  # Show first 5 quarters

Generating synthetic quarterly financial data...
(Kaggle dataset contains prices only, not financial statements)

  Generating AAPL... ✓ (40 quarters)
  Generating MSFT... ✓ (40 quarters)
  Generating GOOGL... ✓ (40 quarters)
  Generating AMZN... ✓ (40 quarters)
  Generating FB... ✓ (40 quarters)
  Generating INTC... ✓ (40 quarters)
  Generating NVDA... ✓ (40 quarters)
  Generating V... ✓ (40 quarters)
  Generating JPM... ✓ (40 quarters)
  Generating UNH... ✓ (40 quarters)

✓ Generated financial data for 10 companies

✓ Financial data saved to data/financial_data.pkl

Sample quarterly income (AAPL):
                 2016-06-30    2016-09-30    2016-12-31    2017-03-31  \
Total Revenue  9.223521e+10  9.005814e+10  9.009828e+10  9.454754e+10   
Gross Profit   4.118703e+10  3.999901e+10  4.176632e+10  3.802399e+10   
Net Income     2.273998e+10  2.198739e+10  2.374666e+10  1.911449e+10   

                 2017-06-30  
Total Revenue  8.979514e+10  
Gross Profit   4.111326e+10  
Net Income