In [None]:
!pip install requests yfinance azure-eventhub pandas requests azure-data-tables azure-kusto-data azure-kusto-ingest
!pip install --upgrade azure-identity azure-core azure-kusto-data azure-kusto-ingest

StatementMeta(, 8803a690-7a36-4a3f-8053-ad48cbf30914, 9, Finished, Available, Finished)



In [None]:
import json
import logging
import time
import zlib
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests
from azure.eventhub import EventData, EventHubProducerClient
from azure.storage.filedatalake import DataLakeServiceClient
from azureml.core import Workspace
from azureml.core.keyvault import Keyvault

StatementMeta(, 8803a690-7a36-4a3f-8053-ad48cbf30914, 10, Finished, Available, Finished)

In [None]:
# ENVIRONMENT CONFIGURATIONS
# Secrets are read from the workspace's default Key Vault so that no credential
# is hardcoded in the notebook.
ws = Workspace.from_config()
key_vault = ws.get_default_keyvault()
# NOTE(review): `secret_value` is not referenced by any code visible in this
# cell; kept in case a later cell reads it - confirm and remove if unused.
secret_value = key_vault.get_secret("my-secret-name")
api_key = key_vault.get_secret("api_key")
conn_string = key_vault.get_secret("conn_string")


# Configuration
API_KEY = api_key                   # sent as the X-API-KEY header on price requests
EVENT_HUB_CONN_STR = conn_string    # connection string used to build the Event Hub producer
# Symbols processed by main(). Defined here so the notebook runs on a fresh
# kernel; the list matches the tickers shown in the recorded run output.
TICKERS = ["AAPL", "MSFT", "GOOGL", "TSLA", "NVDA"]
MAX_WORKERS = 5                     # thread pool size for fetching tickers in parallel
DAYS_HISTORY = 100                  # look-back window, in days, for the price request
REQUEST_TIMEOUT = 15                # timeout passed to requests.get

# Configure logging: every message goes both to the cell output (stream
# handler) and to a log file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('formatted_financial_pipeline.log')
    ]
)
logger = logging.getLogger(__name__)

def fetch_ticker_data(ticker: str) -> list:
    """Fetch daily price records for one ticker from api.financialdatasets.ai.

    The request covers the last ``DAYS_HISTORY`` days, expressed as UTC
    calendar dates, at a one-day interval.

    Args:
        ticker: Symbol sent as the ``ticker`` query parameter.

    Returns:
        The value stored under ``prices`` when the response is a JSON object
        containing that key; the payload itself when it is already a list;
        otherwise an empty list. Request and parsing failures are logged and
        also yield an empty list instead of raising.
    """
    url = "https://api.financialdatasets.ai/prices/"

    # Read the clock once so both ends of the window come from the same
    # instant (two separate reads could straddle midnight UTC).
    today_utc = datetime.now(timezone.utc)
    params = {
        "ticker": ticker,
        "interval": "day",
        "interval_multiplier": 1,
        "start_date": (today_utc - timedelta(days=DAYS_HISTORY)).strftime('%Y-%m-%d'),
        "end_date": today_utc.strftime('%Y-%m-%d')
    }

    headers = {
        "X-API-KEY": API_KEY,
        "Accept": "application/json"
    }

    try:
        response = requests.get(
            url,
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT
        )
        # Turn 4xx/5xx responses into exceptions handled below.
        response.raise_for_status()

        data = response.json()
        if isinstance(data, dict) and 'prices' in data:
            return data['prices']
        return data if isinstance(data, list) else []

    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed for {ticker}: {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error fetching {ticker}: {str(e)}")

    # Reached only after a logged failure.
    return []

def format_record(record: dict, ticker: str) -> dict:
    """Convert one raw price record into the flat event payload.

    Args:
        record: Raw price mapping. The keys read are ``open``, ``close``,
            ``high``, ``low``, ``volume`` and ``time``; a missing numeric key
            defaults to 0 and a missing ``time`` defaults to the current UTC
            time.
        ticker: Symbol stamped on the event and used to derive ``PartitionId``.

    Returns:
        The formatted event dict, or ``None`` when a value cannot be converted
        (the error is logged rather than raised).
    """
    try:
        # Single clock reading per record so all generated timestamps agree.
        # tzinfo is stripped to keep the naive ISO format that gets the "Z"
        # suffix appended below.
        now_iso = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        # Daily change is close minus open; 0 when either value is absent.
        if 'close' in record and 'open' in record:
            daily_change = float(record['close']) - float(record['open'])
        else:
            daily_change = 0

        # str hashes from hash() are salted per interpreter process, so they
        # would map the same ticker to different partitions on each run.
        # CRC32 gives a stable bucket in the range 0-3.
        partition_id = str(zlib.crc32(ticker.encode('utf-8')) % 4)

        return {
            "ticker": ticker,
            "price": float(record.get('close', 0)),
            "timestamp": record.get('time', now_iso),
            "volume": int(record.get('volume', 0)),
            "open": float(record.get('open', 0)),
            "high": float(record.get('high', 0)),
            "low": float(record.get('low', 0)),
            "change": round(daily_change, 2),
            "EventProcessedUtcTime": now_iso + "Z",
            "PartitionId": partition_id,  # Distribute across partitions
            "EventEnqueuedUtcTime": now_iso + "Z"
        }
    except Exception as e:
        logger.error(f"Formatting error for {ticker}: {str(e)}")
        return None

def process_ticker(ticker: str) -> list:
    """Fetch and format every price record for a single ticker.

    Args:
        ticker: Symbol to fetch and format.

    Returns:
        The formatted event dicts for the ticker. Records that
        ``format_record`` could not convert (it returned ``None``) are left
        out; the list is empty when nothing was fetched.
    """
    logger.info(f"Processing {ticker}")
    records = fetch_ticker_data(ticker)
    # Format each record exactly once: calling format_record a second time for
    # the filter doubles the work and any error logs, and the object tested
    # would not be the one kept.
    formatted = (format_record(r, ticker) for r in records)
    return [event for event in formatted if event is not None]

def send_to_event_hub(records: list) -> bool:
    """Send formatted records to Event Hub as JSON events, in batches.

    Records are added to a batch until it rejects one with ``ValueError``;
    the batch is then sent and a new one is started with that record.

    Args:
        records: JSON-serializable event payloads.

    Returns:
        ``True`` when at least one batch was sent and no error occurred.
        ``False`` when ``records`` is empty or when any step fails; the
        failure is logged, and batches sent before it are not rolled back.
    """
    if not records:
        return False

    try:
        producer = EventHubProducerClient.from_connection_string(
            EVENT_HUB_CONN_STR,
            retry_total=3
        )

        # The context manager closes the client on exit, so no separate
        # close() call is needed.
        with producer:
            sent_count = 0
            batch = producer.create_batch()
            for record in records:
                # Serialize once; the payload is reused if the batch overflows.
                payload = json.dumps(record).encode('utf-8')
                try:
                    batch.add(EventData(payload))
                except ValueError:
                    # Batch full, send and create new
                    producer.send_batch(batch)
                    sent_count += len(batch)
                    batch = producer.create_batch()
                    batch.add(EventData(payload))

            if len(batch) > 0:
                producer.send_batch(batch)
                sent_count += len(batch)

            if sent_count > 0:
                # Report the total across all batches, not just the last one.
                logger.info(f"Sent {sent_count} records")
                return True

            return False

    except Exception as e:
        logger.error(f"Event Hub send failed: {str(e)}")
        return False

def main():
    """Run the pipeline: fetch tickers concurrently, then publish the events.

    Every ticker in ``TICKERS`` is handled by ``process_ticker`` on a thread
    pool of ``MAX_WORKERS`` threads. The per-ticker lists are merged and handed
    to ``send_to_event_hub``. Any exception is logged instead of propagated,
    and the elapsed time is always logged at the end.
    """
    logger.info("Starting formatted financial pipeline")
    started_at = time.time()

    try:
        # Fan out: one process_ticker call per symbol.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            per_ticker = list(pool.map(process_ticker, TICKERS))

        # Merge the per-ticker lists into one flat list of events.
        merged = []
        for ticker_records in per_ticker:
            merged.extend(ticker_records)

        if not merged:
            logger.warning("No records processed")
            return

        outcome = 'succeeded' if send_to_event_hub(merged) else 'failed'
        logger.info(f"Pipeline {outcome}. Processed {len(merged)} records")

    except Exception as exc:
        logger.error(f"Pipeline failed: {str(exc)}")
    finally:
        # Runs on every path, including the early return above.
        logger.info(f"Execution time: {time.time() - started_at:.2f}s")

if __name__ == "__main__":
    main()

StatementMeta(, 8803a690-7a36-4a3f-8053-ad48cbf30914, 11, Finished, Available, Finished)

2025-04-08 23:48:18,789 - INFO - Starting formatted financial pipeline
2025-04-08 23:48:18,798 - INFO - Processing AAPL
2025-04-08 23:48:18,800 - INFO - Processing MSFT
2025-04-08 23:48:18,800 - INFO - Processing GOOGL
2025-04-08 23:48:18,801 - INFO - Processing TSLA
2025-04-08 23:48:18,802 - INFO - Processing NVDA
2025-04-08 23:48:21,081 - INFO - Connection state changed: None -> <ConnectionState.START: 0>
2025-04-08 23:48:21,521 - INFO - Connection state changed: <ConnectionState.START: 0> -> <ConnectionState.HDR_SENT: 2>
2025-04-08 23:48:21,522 - INFO - Connection state changed: <ConnectionState.HDR_SENT: 2> -> <ConnectionState.HDR_SENT: 2>
2025-04-08 23:48:21,522 - INFO - Connection state changed: <ConnectionState.HDR_SENT: 2> -> <ConnectionState.OPEN_PIPE: 4>
2025-04-08 23:48:21,523 - INFO - Session state changed: <SessionState.UNMAPPED: 0> -> <SessionState.BEGIN_SENT: 1>
2025-04-08 23:48:21,524 - INFO - Link state changed: <LinkState.DETACHED: 0> -> <LinkState.ATTACH_SENT: 1>
202