In [None]:
"""
Complete Stock Data Downloader for S&P 500 & NASDAQ
Saves organized CSV files to Google Drive
"""


In [1]:
# Environment Detection and Setup
import os
import sys
from pathlib import Path

def detect_environment():
    """
    Detects whether the code is running locally, on Kaggle, or on Google Colab.
    Returns: 'local', 'kaggle', or 'colab'
    """
    # Check for Google Colab
    try:
        import google.colab
        return 'colab'
    except ImportError:
        pass
    
    # Check for Kaggle
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        return 'kaggle'
    
    # Default to local
    return 'local'

def setup_environment():
    """
    Sets up the environment based on where the code is running.
    Returns the output directory path.
    """
    env = detect_environment()
    print(f"üîç Detected environment: {env.upper()}")
    
    if env == 'colab':
        # Mount Google Drive
        print("üìÇ Mounting Google Drive...")
        from google.colab import drive
        drive.mount('/content/drive')
        
        # Set output path to Google Drive
        output_dir = Path('/content/drive/MyDrive/Stocks_Data')
        print(f"‚úÖ Google Drive mounted successfully")
        
    elif env == 'kaggle':
        # Kaggle output directory
        output_dir = Path('/kaggle/working')
        print(f"‚úÖ Using Kaggle working directory")
        
    else:  # local
        # Local output directory (same as notebook location)
        output_dir = Path.cwd() / 'output'
        print(f"‚úÖ Using local directory")
    
    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"üìÅ Output directory: {output_dir}")
    
    return env, output_dir

# Run setup
ENVIRONMENT, OUTPUT_DIR = setup_environment()

üîç Detected environment: LOCAL
‚úÖ Using local directory
üìÅ Output directory: c:\Users\Ferhat\Documents\GitHub\Stocks\output


In [2]:
# Install required packages (if not already installed)
import subprocess
import sys

def install_package(package):
    """Install a package using pip if not already installed."""
    try:
        __import__(package.split('[')[0])
        print(f"‚úì {package} already installed")
    except ImportError:
        print(f"üì¶ Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        print(f"‚úì {package} installed successfully")

# Install required packages
packages = ['yfinance', 'pandas', 'numpy']

for package in packages:
    install_package(package)

‚úì yfinance already installed
‚úì pandas already installed
‚úì numpy already installed


In [4]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import time
import requests
from bs4 import BeautifulSoup

In [5]:
# Configuration - using OUTPUT_DIR from environment setup
BASE_PATH = str(OUTPUT_DIR)
START_DATE = '2020-01-01'
END_DATE = datetime.today().strftime('%Y-%m-%d')

# Create directory structure
BASE_PATH_PATH = Path(BASE_PATH)
(BASE_PATH_PATH / 'SP500').mkdir(parents=True, exist_ok=True)
(BASE_PATH_PATH / 'NASDAQ').mkdir(parents=True, exist_ok=True)
(BASE_PATH_PATH / 'Combined').mkdir(parents=True, exist_ok=True)

print(f"\nüìÅ Data will be saved to: {BASE_PATH}")
print(f"üìÖ Date range: {START_DATE} to {END_DATE}\n")


üìÅ Data will be saved to: c:\Users\Ferhat\Documents\GitHub\Stocks\output
üìÖ Date range: 2020-01-01 to 2026-02-02



In [6]:
# ============================================================================
# FUNCTION: Get S&P 500 Tickers
# ============================================================================
def get_sp500_tickers():
    """Scrape S&P 500 tickers from Wikipedia with User-Agent headers"""
    try:
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

        # Add a headers dictionary to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers)

        # Check if the request was successful
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'constituents'})

        if table is None:
            raise ValueError("Could not find the 'constituents' table on the page.")

        # Pass the HTML string to read_html
        df = pd.read_html(str(table))[0]

        # Clean tickers (Wikipedia uses '.' for some classes, Yahoo Finance uses '-')
        tickers = [ticker.replace('.', '-') for ticker in df['Symbol'].tolist()]

        print(f"‚úÖ Retrieved {len(tickers)} S&P 500 tickers")
        return tickers, df[['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry']]

    except Exception as e:
        print(f"‚ùå Error getting S&P 500 tickers: {e}")
        return [], pd.DataFrame()

# ============================================================================
# FUNCTION: Get NASDAQ 100 Tickers (as proxy for NASDAQ)
# ============================================================================
def get_nasdaq_tickers():
    """Scrape NASDAQ-100 tickers from Wikipedia"""
    try:
        url = 'https://en.wikipedia.org/wiki/Nasdaq-100'

        # Adding the same User-Agent header to bypass blocks
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # The NASDAQ-100 table ID is usually 'constituents', but if that fails,
        # we target the first wikitable on the page.
        table = soup.find('table', {'id': 'constituents'})
        if table is None:
            table = soup.find('table', {'class': 'wikitable'})

        df = pd.read_html(str(table))[0]

        # Clean the tickers
        tickers = [ticker.replace('.', '-') for ticker in df['Ticker'].tolist()]

        print(f"‚úÖ Retrieved {len(tickers)} NASDAQ-100 tickers")

        # Note: NASDAQ table uses 'Company' instead of 'Security'
        return tickers, df[['Ticker', 'Company', 'GICS Sector', 'GICS Sub-Industry']]

    except Exception as e:
        print(f"‚ùå Error getting NASDAQ tickers: {e}")
        return [], pd.DataFrame()

# ============================================================================
# FUNCTION: Download Data with Error Handling
# ============================================================================
def download_ticker_data(ticker, start_date, end_date, retry=3):
    """Download data for a single ticker with retry logic"""
    for attempt in range(retry):
        try:
            data = yf.download(ticker, start=start_date, end=end_date,
                             progress=False, show_errors=False)
            if not data.empty:
                return data
            time.sleep(0.5)  # Rate limiting
        except Exception as e:
            if attempt == retry - 1:
                return pd.DataFrame()
            time.sleep(1)
    return pd.DataFrame()

# ============================================================================
# FUNCTION: Batch Download with Progress Tracking
# ============================================================================
def batch_download(tickers, index_name, start_date, end_date):
    """Download data for multiple tickers with progress tracking"""

    all_data = {}
    failed_tickers = []

    print(f"\nüìä Downloading {index_name} data for {len(tickers)} tickers...")

    for ticker in tqdm(tickers, desc=f"{index_name} Progress"):
        data = download_ticker_data(ticker, start_date, end_date)

        if not data.empty:
            all_data[ticker] = data
        else:
            failed_tickers.append(ticker)

        # Rate limiting - be nice to yfinance
        time.sleep(0.2)

    print(f"‚úÖ Successfully downloaded: {len(all_data)}/{len(tickers)}")
    if failed_tickers:
        print(f"‚ö†Ô∏è  Failed tickers: {failed_tickers[:10]}{'...' if len(failed_tickers) > 10 else ''}")

    return all_data, failed_tickers

# ============================================================================
# FUNCTION: Create Combined DataFrames
# ============================================================================
def create_combined_dataframes(all_data):
    """Create combined DataFrames for different price types"""

    if not all_data:
        return {}

    combined = {}

    # Get all tickers
    tickers = list(all_data.keys())

    # Extract Close prices
    close_df = pd.DataFrame({ticker: all_data[ticker]['Close']
                            for ticker in tickers if 'Close' in all_data[ticker].columns})

    # Extract Adjusted Close
    adj_close_df = pd.DataFrame({ticker: all_data[ticker]['Adj Close']
                                for ticker in tickers if 'Adj Close' in all_data[ticker].columns})

    # Extract Volume
    volume_df = pd.DataFrame({ticker: all_data[ticker]['Volume']
                             for ticker in tickers if 'Volume' in all_data[ticker].columns})

    # Extract Open, High, Low
    open_df = pd.DataFrame({ticker: all_data[ticker]['Open']
                           for ticker in tickers if 'Open' in all_data[ticker].columns})

    high_df = pd.DataFrame({ticker: all_data[ticker]['High']
                           for ticker in tickers if 'High' in all_data[ticker].columns})

    low_df = pd.DataFrame({ticker: all_data[ticker]['Low']
                          for ticker in tickers if 'Low' in all_data[ticker].columns})

    combined['Close'] = close_df
    combined['Adj_Close'] = adj_close_df
    combined['Volume'] = volume_df
    combined['Open'] = open_df
    combined['High'] = high_df
    combined['Low'] = low_df

    return combined

# ============================================================================
# FUNCTION: Calculate Returns and Additional Metrics
# ============================================================================
def calculate_metrics(price_df):
    """Calculate daily and cumulative returns"""

    # Daily returns
    daily_returns = price_df.pct_change()

    # Cumulative returns
    cumulative_returns = (1 + daily_returns).cumprod() - 1

    return daily_returns, cumulative_returns

In [7]:
# ============================================================================
# MAIN EXECUTION
# ============================================================================

print("="*70)
print("STARTING DATA DOWNLOAD")
print("="*70)

# Get tickers
sp500_tickers, sp500_info = get_sp500_tickers()
nasdaq_tickers, nasdaq_info = get_nasdaq_tickers()

# Save ticker lists
if not sp500_info.empty:
    sp500_info.to_csv(f'{BASE_PATH}/SP500/sp500_constituents.csv', index=False)
    print(f"üíæ Saved S&P 500 constituent info")

if not nasdaq_info.empty:
    nasdaq_info.to_csv(f'{BASE_PATH}/NASDAQ/nasdaq100_constituents.csv', index=False)
    print(f"üíæ Saved NASDAQ-100 constituent info")

# Download S&P 500 data
if sp500_tickers:
    sp500_data, sp500_failed = batch_download(sp500_tickers, "S&P 500", START_DATE, END_DATE)

    if sp500_data:
        # Create combined DataFrames
        sp500_combined = create_combined_dataframes(sp500_data)

        # Save each type
        for data_type, df in sp500_combined.items():
            df.to_csv(f'{BASE_PATH}/SP500/sp500_{data_type.lower()}.csv')
            print(f"üíæ Saved S&P 500 {data_type}: {df.shape}")

        # Calculate and save returns
        sp500_returns, sp500_cum_returns = calculate_metrics(sp500_combined['Adj_Close'])
        sp500_returns.to_csv(f'{BASE_PATH}/SP500/sp500_daily_returns.csv')
        sp500_cum_returns.to_csv(f'{BASE_PATH}/SP500/sp500_cumulative_returns.csv')
        print(f"üíæ Saved S&P 500 returns data")

# Download NASDAQ data
if nasdaq_tickers:
    nasdaq_data, nasdaq_failed = batch_download(nasdaq_tickers, "NASDAQ-100", START_DATE, END_DATE)

    if nasdaq_data:
        # Create combined DataFrames
        nasdaq_combined = create_combined_dataframes(nasdaq_data)

        # Save each type
        for data_type, df in nasdaq_combined.items():
            df.to_csv(f'{BASE_PATH}/NASDAQ/nasdaq_{data_type.lower()}.csv')
            print(f"üíæ Saved NASDAQ-100 {data_type}: {df.shape}")

        # Calculate and save returns
        nasdaq_returns, nasdaq_cum_returns = calculate_metrics(nasdaq_combined['Adj_Close'])
        nasdaq_returns.to_csv(f'{BASE_PATH}/NASDAQ/nasdaq_daily_returns.csv')
        nasdaq_cum_returns.to_csv(f'{BASE_PATH}/NASDAQ/nasdaq_cumulative_returns.csv')
        print(f"üíæ Saved NASDAQ-100 returns data")

# Create combined universe (unique tickers from both)
all_tickers = list(set(sp500_tickers + nasdaq_tickers))
print(f"\nüìä Total unique tickers: {len(all_tickers)}")

# Save combined ticker list
pd.DataFrame({
    'Ticker': all_tickers,
    'In_SP500': [t in sp500_tickers for t in all_tickers],
    'In_NASDAQ100': [t in nasdaq_tickers for t in all_tickers]
}).to_csv(f'{BASE_PATH}/Combined/all_tickers.csv', index=False)

print("\n" + "="*70)
print("‚úÖ DOWNLOAD COMPLETE!")
print("="*70)
print(f"\nFiles saved to: {BASE_PATH}")
print("\nFile structure:")
print("‚îú‚îÄ‚îÄ SP500/")
print("‚îÇ   ‚îú‚îÄ‚îÄ sp500_constituents.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ sp500_close.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ sp500_adj_close.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ sp500_volume.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ sp500_daily_returns.csv")
print("‚îÇ   ‚îî‚îÄ‚îÄ sp500_cumulative_returns.csv")
print("‚îú‚îÄ‚îÄ NASDAQ/")
print("‚îÇ   ‚îú‚îÄ‚îÄ nasdaq100_constituents.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_close.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_adj_close.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_volume.csv")
print("‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_daily_returns.csv")
print("‚îÇ   ‚îî‚îÄ‚îÄ nasdaq_cumulative_returns.csv")
print("‚îî‚îÄ‚îÄ Combined/")
print("    ‚îî‚îÄ‚îÄ all_tickers.csv")
print("\nüîó You can now access these files through Claude via Google Drive!")

STARTING DATA DOWNLOAD


  df = pd.read_html(str(table))[0]


‚úÖ Retrieved 503 S&P 500 tickers


  df = pd.read_html(str(table))[0]


‚úÖ Retrieved 101 NASDAQ-100 tickers
‚ùå Error getting NASDAQ tickers: "['GICS Sector', 'GICS Sub-Industry'] not in index"
üíæ Saved S&P 500 constituent info

üìä Downloading S&P 500 data for 503 tickers...


S&P 500 Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 503/503 [18:29<00:00,  2.21s/it]

‚úÖ Successfully downloaded: 0/503
‚ö†Ô∏è  Failed tickers: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']...

üìä Total unique tickers: 503

‚úÖ DOWNLOAD COMPLETE!

Files saved to: c:\Users\Ferhat\Documents\GitHub\Stocks\output

File structure:
‚îú‚îÄ‚îÄ SP500/
‚îÇ   ‚îú‚îÄ‚îÄ sp500_constituents.csv
‚îÇ   ‚îú‚îÄ‚îÄ sp500_close.csv
‚îÇ   ‚îú‚îÄ‚îÄ sp500_adj_close.csv
‚îÇ   ‚îú‚îÄ‚îÄ sp500_volume.csv
‚îÇ   ‚îú‚îÄ‚îÄ sp500_daily_returns.csv
‚îÇ   ‚îî‚îÄ‚îÄ sp500_cumulative_returns.csv
‚îú‚îÄ‚îÄ NASDAQ/
‚îÇ   ‚îú‚îÄ‚îÄ nasdaq100_constituents.csv
‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_close.csv
‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_adj_close.csv
‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_volume.csv
‚îÇ   ‚îú‚îÄ‚îÄ nasdaq_daily_returns.csv
‚îÇ   ‚îî‚îÄ‚îÄ nasdaq_cumulative_returns.csv
‚îî‚îÄ‚îÄ Combined/
    ‚îî‚îÄ‚îÄ all_tickers.csv

üîó You can now access these files through Claude via Google Drive!





In [None]:
sp500_returns

In [None]:
nasdaq_returns