In [4]:
# Install required packages
!pip install yfinance pandas pyarrow tqdm pandas-market-calendars azure-storage-blob azure-ai-ml azure-identity azure-keyvault-secrets lxml

# Force reinstall lxml to ensure it's available
!pip install lxml --force-reinstall

# Verify lxml installation
import pkg_resources
try:
    lxml_version = pkg_resources.get_distribution("lxml").version
    print(f"lxml version {lxml_version} installed successfully")
except pkg_resources.DistributionNotFound:
    raise ImportError("lxml is not installed. Please run '!pip install lxml --force-reinstall' and restart the kernel.")

import yfinance as yf
import pandas as pd
import requests
from datetime import datetime
import os
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from multiprocessing import Pool, cpu_count
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.keyvault.secrets import SecretClient
import time

# Azure Key Vault setup
KEY_VAULT_URL = "https://stockpredictionvault2025.vault.azure.net/"
SECRET_NAME = "AzureBlobConnStr"
credential = DefaultAzureCredential()

# Fetch connection string from Key Vault
try:
    secret_client = SecretClient(vault_url=KEY_VAULT_URL, credential=credential)
    AZURE_CONN_STR = secret_client.get_secret(SECRET_NAME).value
    print("Successfully retrieved Blob Storage connection string from Key Vault")
except Exception as e:
    raise Exception(f"Failed to retrieve connection string from Key Vault: {e}")

# Azure Blob Storage setup
try:
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONN_STR)
    print("Connected to Azure Blob Storage")
except Exception as e:
    raise Exception(f"Failed to connect to Blob Storage: {e}")

CONTAINER_NAME = "stock-data"
BLOB_FOLDER = "stockdata_us_v2/"  # New folder for Parquet files

# Azure ML workspace setup
try:
    ml_client = MLClient.from_config(credential=credential)
    print("Connected to Azure ML workspace")
except Exception as e:
    raise Exception(f"Failed to connect to Azure ML workspace: {e}")

# Parameters
MIN_YEARS = 20
VERSION_TIMESTAMP = datetime.now().strftime("%Y%m%d")
CUTOFF_DATE = pd.Timestamp(datetime.now() - pd.DateOffset(years=MIN_YEARS)).tz_localize('UTC')

# Create local directory for temporary storage
os.makedirs("stockdata_us_v2", exist_ok=True)

def get_sp500():
    """Fetch S&P 500 tickers from Wikipedia."""
    try:
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        tables = pd.read_html(response.text)
        for df in tables:
            for col in df.columns:
                if "Symbol" in str(col):
                    tickers = df[col].astype(str).str.replace(".", "-", regex=False).tolist()
                    print(f"Fetched {len(tickers)} S&P 500 tickers")
                    return tickers
        raise ValueError("Couldn't find S&P 500 tickers")
    except Exception as e:
        raise Exception(f"Failed to fetch S&P 500 tickers: {e}")

def get_nasdaq100():
    """Fetch NASDAQ-100 tickers from Wikipedia."""
    try:
        url = "https://en.wikipedia.org/wiki/NASDAQ-100"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        tables = pd.read_html(response.text)
        for df in tables:
            for col in df.columns:
                if "Ticker" in str(col) or "Symbol" in str(col):
                    tickers = df[col].astype(str).str.replace(".", "-", regex=False).tolist()
                    print(f"Fetched {len(tickers)} NASDAQ-100 tickers")
                    return tickers
        raise ValueError("Couldn't find NASDAQ-100 tickers")
    except Exception as e:
        raise Exception(f"Failed to fetch NASDAQ-100 tickers: {e}")

def get_dowjones():
    """Fetch Dow Jones tickers from Wikipedia."""
    try:
        url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        tables = pd.read_html(response.text)
        for df in tables:
            for col in df.columns:
                if "Symbol" in str(col) or "Ticker" in str(col):
                    tickers = df[col].astype(str).str.replace(".", "-", regex=False).tolist()
                    print(f"Fetched {len(tickers)} Dow Jones tickers")
                    return tickers
        raise ValueError("Couldn't find Dow Jones tickers")
    except Exception as e:
        raise Exception(f"Failed to fetch Dow Jones tickers: {e}")

def validate_stock_data(df, ticker):
    """Validate stock data quality (minimal validation like original code)."""
    try:
        # Ensure required columns exist
        required_columns = ["Open", "High", "Low", "Close", "Volume", "Date"]
        if not all(col in df.columns for col in required_columns):
            return False, "Missing required columns"

        # Convert Date to datetime
        df["Date"] = pd.to_datetime(df["Date"])
        # Check for sufficient history
        if df["Date"].min() > CUTOFF_DATE:
            return False, "Insufficient history"

        print(f"{ticker}: Validation passed with {len(df)} rows")
        return True, ""
    except Exception as e:
        print(f"{ticker}: Validation error: {e}")
        return False, str(e)

def fetch_ticker_data(ticker):
    """Fetch and validate data for a single ticker."""
    try:
        stock = yf.Ticker(ticker)
        df = stock.history(period="max")
        if df.empty:
            print(f"{ticker}: Empty data")
            return None, None

        df.reset_index(inplace=True)
        df["Ticker"] = ticker

        # Validate data
        is_valid, reason = validate_stock_data(df, ticker)
        if not is_valid:
            return None, None

        # Fetch sector
        sector = stock.info.get("sector", "Unknown")
        print(f"{ticker}: Fetched {len(df)} rows, start date: {df['Date'].min()}, sector: {sector}")
        return df, sector
    except Exception as e:
        print(f"{ticker}: Error fetching data: {e}")
        return None, None

def process_ticker(ticker):
    """Process a ticker and save to Blob Storage."""
    try:
        df, sector = fetch_ticker_data(ticker)
        if df is None:
            return ticker, None, sector

        # Save to local Parquet
        local_path = f"stockdata_us_v2/{ticker}_{VERSION_TIMESTAMP}.parquet"
        df.to_parquet(local_path, index=False)
        print(f"{ticker}: Saved local Parquet to {local_path}")

        # Upload to Blob Storage
        blob_client = blob_service_client.get_container_client(CONTAINER_NAME).get_blob_client(f"{BLOB_FOLDER}{ticker}_{VERSION_TIMESTAMP}.parquet")
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        # Verify upload
        blob_properties = blob_client.get_blob_properties()
        if blob_properties.size > 0:
            print(f"{ticker}: Successfully uploaded {len(df)} rows to Blob Storage")
        else:
            print(f"{ticker}: Uploaded file is empty")
            return ticker, None, sector

        return ticker, local_path, sector
    except Exception as e:
        print(f"{ticker}: Processing error: {e}")
        return ticker, None, None

def main():
    # Fetch tickers
    try:
        sp500 = get_sp500()
        nasdaq100 = get_nasdaq100()
        dowjones = get_dowjones()
        tickers = sorted(set(sp500 + nasdaq100 + dowjones))
        print(f"Total unique tickers: {len(tickers)}")
    except Exception as e:
        raise Exception(f"Failed to fetch tickers: {e}")

    # Parallel processing with up to 12 cores
    num_cores = min(4, cpu_count())
    print(f"Using {num_cores} CPU cores for parallel processing")
    try:
        with Pool(num_cores) as pool:
            results = list(tqdm(pool.imap(process_ticker, tickers), total=len(tickers)))
    except Exception as e:
        raise Exception(f"Parallel processing failed: {e}")

    # Collect qualified tickers and sectors
    qualified_tickers = []
    sector_data = []
    for ticker, local_path, sector in results:
        if local_path:
            qualified_tickers.append(ticker)
            sector_data.append({"Ticker": ticker, "Sector": sector})

    # Save qualified tickers
    try:
        tickers_df = pd.Series(qualified_tickers)
        tickers_csv_path = f"qualified_us_tickers_{VERSION_TIMESTAMP}.csv"
        tickers_df.to_csv(tickers_csv_path, index=False)
        print(f"Saved qualified tickers to local file: {tickers_csv_path}")
        blob_client = blob_service_client.get_container_client(CONTAINER_NAME).get_blob_client(tickers_csv_path)
        with open(tickers_csv_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
        print(f"Uploaded {len(qualified_tickers)} qualified tickers to Blob Storage")
    except Exception as e:
        raise Exception(f"Failed to save qualified tickers: {e}")

    # Save sector data
    try:
        sectors_df = pd.DataFrame(sector_data)
        sectors_csv_path = f"sectors_{VERSION_TIMESTAMP}.csv"
        sectors_df.to_csv(sectors_csv_path, index=False)
        print(f"Saved sector data to local file: {sectors_csv_path}")
        blob_client = blob_service_client.get_container_client(CONTAINER_NAME).get_blob_client(sectors_csv_path)
        with open(sectors_csv_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
        print(f"Uploaded sector data for {len(sectors_df)} tickers")
    except Exception as e:
        raise Exception(f"Failed to save sector data: {e}")

    # Register dataset in Azure ML
    try:
        data_asset = Data(
            path=f"azureml://datastores/workspaceblobstore/paths/{CONTAINER_NAME}/{BLOB_FOLDER}",
            type=AssetTypes.URI_FOLDER,
            description="US stock data with 20+ years history (v2)",
            name=f"us_stock_data_v2_{VERSION_TIMESTAMP}"
        )
        ml_client.data.create_or_update(data_asset)
        print(f"Registered dataset us_stock_data_v2_{VERSION_TIMESTAMP} in Azure ML")
    except Exception as e:
        raise Exception(f"Failed to register dataset in Azure ML: {e}")

if __name__ == "__main__":
    main()

Collecting lxml
  Using cached lxml-6.0.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Using cached lxml-6.0.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
Installing collected packages: lxml
  Attempting uninstall: lxml
    Found existing installation: lxml 6.0.1
    Uninstalling lxml-6.0.1:
      Successfully uninstalled lxml-6.0.1
Successfully installed lxml-6.0.1
Connected to Azure ML workspace


Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Exception: Failed to fetch tickers: Failed to fetch S&P 500 tickers: 403 Client Error: Forbidden for url: https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

In [11]:
df, sector = fetch_ticker_data("AAPL")
print(df.head(), df.columns, sector)

AAPL: Validation passed with 11255 rows
AAPL: Fetched 11255 rows, start date: 1980-12-12 00:00:00-05:00, sector: Technology
                       Date      Open      High       Low     Close  \
0 1980-12-12 00:00:00-05:00  0.098597  0.099025  0.098597  0.098597   
1 1980-12-15 00:00:00-05:00  0.093881  0.093881  0.093453  0.093453   
2 1980-12-16 00:00:00-05:00  0.087022  0.087022  0.086594  0.086594   
3 1980-12-17 00:00:00-05:00  0.088737  0.089165  0.088737  0.088737   
4 1980-12-18 00:00:00-05:00  0.091309  0.091738  0.091309  0.091309   

      Volume  Dividends  Stock Splits Ticker  
0  469033600        0.0           0.0   AAPL  
1  175884800        0.0           0.0   AAPL  
2  105728000        0.0           0.0   AAPL  
3   86441600        0.0           0.0   AAPL  
4   73449600        0.0           0.0   AAPL   Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Ticker'],
      dtype='object') Technology


In [3]:
# ========================
# US Stock Data Collection
# ========================
import yfinance as yf
import pandas as pd
import requests, os, time
from datetime import datetime
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from multiprocessing import Pool, cpu_count

# --- CONFIG ---
KEY_VAULT_URL = "https://stockpredictionvault2025.vault.azure.net/"
SECRET_NAME   = "AzureBlobConnStr"
CONTAINER_NAME = "stock-data"
BLOB_FOLDER    = "stockdata_us_adjclose/"
MIN_YEARS      = 20
VERSION_TIMESTAMP = datetime.now().strftime("%Y%m%d")
CUTOFF_DATE = pd.Timestamp(datetime.now() - pd.DateOffset(years=MIN_YEARS)).tz_localize('UTC')
os.makedirs("stockdata_us_adjclose", exist_ok=True)

# --- AZURE CONNECTIONS ---
cred = DefaultAzureCredential()
secret_client = SecretClient(vault_url=KEY_VAULT_URL, credential=cred)
AZURE_CONN_STR = secret_client.get_secret(SECRET_NAME).value
blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONN_STR)
ml_client = MLClient.from_config(credential=cred)

# --- FETCH TICKERS ---
def get_wiki_tickers(url, col_keyword):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    tables = pd.read_html(resp.text)
    for df in tables:
        for col in df.columns:
            if col_keyword in str(col):
                return df[col].astype(str).str.replace(".", "-", regex=False).tolist()
    raise ValueError(f"No tickers found at {url}")

def get_all_tickers():
    sp500   = get_wiki_tickers("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies", "Symbol")
    nasdaq  = get_wiki_tickers("https://en.wikipedia.org/wiki/NASDAQ-100", "Ticker")
    dow     = get_wiki_tickers("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average", "Symbol")
    return sorted(set(sp500 + nasdaq + dow))

# --- VALIDATION ---
def validate_stock_data(df):
    required_cols = {"Open", "High", "Low", "Close", "Adj Close", "Volume", "Date"}
    if not required_cols.issubset(df.columns):
        return False
    if df["Date"].min() > CUTOFF_DATE:
        return False
    return True

# --- FETCH FUNCTION ---
def fetch_and_save(ticker):
    try:
        stock = yf.Ticker(ticker)
        df = stock.history(period="max", auto_adjust=False)  # keep both Close & Adj Close
        if df.empty: return ticker, None, "Empty data"

        df.reset_index(inplace=True)
        df["Date"] = pd.to_datetime(df["Date"], utc=True)
        df["Ticker"] = ticker

        if not validate_stock_data(df):
            return ticker, None, "Validation failed"

        sector = stock.info.get("sector", "Unknown")
        local_path = f"stockdata_us_adjclose/{ticker}_{VERSION_TIMESTAMP}.parquet"
        df.to_parquet(local_path, index=False)

        blob_path = f"{BLOB_FOLDER}{ticker}_{VERSION_TIMESTAMP}.parquet"
        blob_client = blob_service_client.get_container_client(CONTAINER_NAME).get_blob_client(blob_path)
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        return ticker, sector, "OK"
    except Exception as e:
        return ticker, None, str(e)

# --- MAIN ---
def main():
    tickers = get_all_tickers()
    print(f"Total tickers: {len(tickers)}")
    num_cores = min(12, cpu_count())

    results = []
    with Pool(num_cores) as pool:
        for res in tqdm(pool.imap(fetch_and_save, tickers), total=len(tickers)):
            results.append(res)

    # Save sector mapping
    sectors = [{"Ticker": t, "Sector": s} for t, s, status in results if status == "OK"]
    pd.DataFrame(sectors).to_csv(f"sectors_{VERSION_TIMESTAMP}.csv", index=False)

    # Register dataset
    data_asset = Data(
        path=f"azureml://datastores/workspaceblobstore/paths/{CONTAINER_NAME}/{BLOB_FOLDER}",
        type=AssetTypes.URI_FOLDER,
        description="US stock data (Adj Close included, 20+ years history)",
        name=f"us_stock_data_adjclose_{VERSION_TIMESTAMP}"
    )
    ml_client.data.create_or_update(data_asset)

if __name__ == "__main__":
    main()


Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


HTTPError: 403 Client Error: Forbidden for url: https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

In [2]:
import pandas as pd

# Example: load one local file after data collection
df = pd.read_parquet("stockdata_us_adjclose/AAPL_20250811.parquet")

print("HEAD (first 5 rows):")
print(df.head())

print("\nTAIL (last 5 rows):")
print(df.tail())

print("\nColumns:", df.columns.tolist())
print("Rows:", len(df))


HEAD (first 5 rows):
                       Date      Open      High       Low     Close  \
0 1980-12-12 05:00:00+00:00  0.128348  0.128906  0.128348  0.128348   
1 1980-12-15 05:00:00+00:00  0.122210  0.122210  0.121652  0.121652   
2 1980-12-16 05:00:00+00:00  0.113281  0.113281  0.112723  0.112723   
3 1980-12-17 05:00:00+00:00  0.115513  0.116071  0.115513  0.115513   
4 1980-12-18 05:00:00+00:00  0.118862  0.119420  0.118862  0.118862   

   Adj Close     Volume  Dividends  Stock Splits Ticker  
0   0.098597  469033600        0.0           0.0   AAPL  
1   0.093453  175884800        0.0           0.0   AAPL  
2   0.086594  105728000        0.0           0.0   AAPL  
3   0.088737   86441600        0.0           0.0   AAPL  
4   0.091310   73449600        0.0           0.0   AAPL  

TAIL (last 5 rows):
                           Date        Open        High         Low  \
11250 2025-08-04 04:00:00+00:00  204.509995  207.880005  201.679993   
11251 2025-08-05 04:00:00+00:00  203.3999

In [1]:
# ========================
# US Stock Data Collection (Final Single-Core Version)
# ========================

# --- DEPENDENCIES ---
!pip install yfinance pandas pyarrow requests tqdm azure-storage-blob azure-identity azure-keyvault-secrets azure-ai-ml lxml

import yfinance as yf
import pandas as pd
import requests
import os
import time
import random
from datetime import datetime, timezone
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from requests.exceptions import HTTPError

# --- CONFIG ---
KEY_VAULT_URL = "https://stockpredictionvault2025.vault.azure.net/"
SECRET_NAME   = "AzureBlobConnStr"
CONTAINER_NAME = "stock-data"
BLOB_FOLDER    = "stockdata_us_adjclose/"
MIN_YEARS      = 20
VERSION_TIMESTAMP = datetime.now().strftime("%Y%m%d")
CUTOFF_DATE = pd.Timestamp(datetime.now() - pd.DateOffset(years=MIN_YEARS)).tz_localize('UTC')
os.makedirs("stockdata_us_adjclose", exist_ok=True)

# --- GLOBAL REQUESTS SESSION ---
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
session.headers.update(headers)

# --- AZURE CONNECTIONS ---
try:
    print("Connecting to Azure services...")
    cred = DefaultAzureCredential()
    secret_client = SecretClient(vault_url=KEY_VAULT_URL, credential=cred)
    AZURE_CONN_STR = secret_client.get_secret(SECRET_NAME).value
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONN_STR)
    ml_client = MLClient.from_config(credential=cred)
    print("Successfully connected to Azure.")
except Exception as e:
    raise Exception(f"Failed to connect to Azure services: {e}")

# --- FETCH TICKERS ---
def get_wiki_tickers(url, col_keyword):
    try:
        resp = session.get(url, timeout=15)
        resp.raise_for_status()
        tables = pd.read_html(resp.text)
        for df in tables:
            for col in df.columns:
                if col_keyword in str(col):
                    tickers = df[col].astype(str).str.replace(".", "-", regex=False).tolist()
                    print(f"Found {len(tickers)} tickers from {url.split('/')[-1]}")
                    return tickers
        raise ValueError(f"Column keyword '{col_keyword}' not found in any table at {url}")
    except Exception as e:
        raise Exception(f"Failed to process tickers from {url}: {e}")

def get_all_tickers():
    print("Fetching ticker lists...")
    sp500   = get_wiki_tickers("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies", "Symbol")
    nasdaq  = get_wiki_tickers("https://en.wikipedia.org/wiki/NASDAQ-100", "Ticker")
    dow     = get_wiki_tickers("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average", "Symbol")
    all_tickers = sorted(set(sp500 + nasdaq + dow))
    print(f"Total unique tickers to process: {len(all_tickers)}")
    return all_tickers

# --- VALIDATION ---
def validate_stock_data(df):
    required_cols = {"Open", "High", "Low", "Close", "Adj Close", "Volume", "Date"}
    if not required_cols.issubset(df.columns):
        return False, "Missing required columns"
    if df.empty or df["Date"].min() > CUTOFF_DATE:
        return False, "Insufficient history"
    return True, "OK"

# --- FETCH FUNCTION ---
def fetch_and_save(ticker):
    """
    Fetches data for a single ticker with a mandatory delay.
    """
    # Increased mandatory delay before every single request
    time.sleep(random.uniform(1, 3))

    try:
        # 1. Fetch historical data
        url = f"https://query1.finance.yahoo.com/v8/finance/chart/{ticker}"
        params = {
            "period1": 0, "period2": int(datetime.now(timezone.utc).timestamp()),
            "interval": "1d", "includeAdjustedClose": "true", "events": "div,split"
        }
        response = session.get(url, params=params, timeout=20)
        response.raise_for_status()
        data = response.json()

        # 2. Parse historical data
        chart_data = data.get("chart", {}).get("result", [{}])[0]
        timestamps = chart_data.get("timestamp")
        if not timestamps:
            return ticker, None, chart_data.get("error", {}).get("description", "No timestamp data")

        indicators = chart_data.get("indicators", {}).get("quote", [{}])[0]
        adjclose_data = chart_data.get("indicators", {}).get("adjclose", [{}])[0].get("adjclose")
        df = pd.DataFrame({
            'Date': pd.to_datetime(timestamps, unit='s', utc=True),
            'Open': indicators.get('open'), 'High': indicators.get('high'),
            'Low': indicators.get('low'), 'Close': indicators.get('close'),
            'Volume': indicators.get('volume'), 'Adj Close': adjclose_data
        })
        
        df = df.dropna().reset_index(drop=True)
        df["Ticker"] = ticker
        is_valid, reason = validate_stock_data(df)
        if not is_valid:
            return ticker, None, f"Validation failed: {reason}"

        # 3. Fetch sector info
        stock_info = yf.Ticker(ticker, session=session).info
        sector = stock_info.get("sector", "Unknown")
        
        # 4. Save and upload
        local_path = f"stockdata_us_adjclose/{ticker}_{VERSION_TIMESTAMP}.parquet"
        df.to_parquet(local_path, index=False)
        blob_path = f"{BLOB_FOLDER}{ticker}_{VERSION_TIMESTAMP}.parquet"
        blob_client = blob_service_client.get_container_client(CONTAINER_NAME).get_blob_client(blob_path)
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        return ticker, sector, "OK"

    except HTTPError as e:
        if e.response.status_code == 429:
            return ticker, None, "Error: Rate limited even on single thread. IP range likely blocked."
        return ticker, None, f"HTTP Error: {e.response.status_code}"
    except Exception as e:
        return ticker, None, f"General Error: {str(e).splitlines()[0]}"

# --- MAIN (MODIFIED FOR SINGLE-CORE EXECUTION) ---
def main():
    """Main execution function, runs sequentially."""
    tickers = get_all_tickers()
    
    print("Starting data download in single-threaded mode (slow but safe)...")

    results = []
    # Remove the multiprocessing Pool and use a simple for loop
    for ticker in tqdm(tickers):
        res = fetch_and_save(ticker)
        results.append(res)
    
    successful_results = [res for res in results if res[2] == "OK"]
    failed_results = [res for res in results if res[2] != "OK"]

    print(f"\nProcessing complete. Success: {len(successful_results)}, Failed: {len(failed_results)}")
    if failed_results:
        print("Sample of failed tickers:")
        for ticker, _, reason in failed_results[:15]:
            print(f"  - {ticker}: {reason}")

    if successful_results:
        sectors = [{"Ticker": t, "Sector": s} for t, s, status in successful_results]
        sectors_df = pd.DataFrame(sectors)
        sectors_csv_path = f"sectors_{VERSION_TIMESTAMP}.csv"
        sectors_df.to_csv(sectors_csv_path, index=False)
        print(f"Sector mapping for {len(sectors_df)} tickers saved to {sectors_csv_path}")

        print("Registering data asset in Azure ML...")
        data_asset = Data(
            path=f"azureml://datastores/workspaceblobstore/paths/{CONTAINER_NAME}/{BLOB_FOLDER}",
            type=AssetTypes.URI_FOLDER,
            description=f"US stock data (Adj Close included, {MIN_YEARS}+ years history)",
            name=f"us_stock_data_adjclose_{VERSION_TIMESTAMP}"
        )
        ml_client.data.create_or_update(data_asset)
        print(f"Successfully registered dataset '{data_asset.name}' in Azure ML.")

if __name__ == "__main__":
    main()

Connecting to Azure services...
Successfully connected to Azure.
Fetching ticker lists...


  0%|          | 0/517 [00:00<?, ?it/s]429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/A?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=A&crumb=Edge%3A+Too+Many+Requests
  0%|          | 1/517 [00:02<24:58,  2.90s/it]429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/AAPL?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=AAPL&crumb=Edge%3A+Too+Many+Requests
  1%|          | 4/517 [00:09<18:07,  2.12s/it]429 Client Error: Too Many Requests for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/ABT?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=ABT&crumb=Edge%3A+Too+Many+Requests
  1%|          | 5/5

KeyboardInterrupt: 