# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Log Storage and Analysis Patterns
# MAGIC 
# MAGIC Let's explore how to store logs in a structured way and analyze them for monitoring and debugging purposes.

In [None]:
# COMMAND ----------

# PySpark exception handling and debugging

try:
    # Try to import PySpark-specific exceptions
    from pyspark.errors import PySparkException, AnalysisException
    PYSPARK_EXCEPTIONS_AVAILABLE = True
except ImportError:
    # Fallback for older Spark versions
    from pyspark.sql.utils import AnalysisException
    PySparkException = Exception  # Fallback
    PYSPARK_EXCEPTIONS_AVAILABLE = False

def extract_spark_error_info(exception: Exception) -> Dict[str, Any]:
    """
    Extract structured information from Spark exceptions

    The exception's type name, message and a capture timestamp
    (``datetime.now().isoformat()``) are always returned. When the exception
    exposes ``getErrorClass`` and ``PYSPARK_EXCEPTIONS_AVAILABLE`` is true,
    the error class, SQL state and message parameters are added as well.

    Args:
        exception: Any exception; it does not have to originate from Spark.

    Returns:
        Dict with keys ``exception_type``, ``message`` and ``timestamp``, plus
        ``error_class``, ``sql_state`` and ``message_parameters`` when the
        structured getters are present and all of them succeed.
    """
    error_info = {
        "exception_type": type(exception).__name__,
        "message": str(exception),
        "timestamp": datetime.now().isoformat()
    }

    # Extract additional info for newer PySpark versions.
    # The duck-typing check runs first so ordinary exceptions skip this
    # branch without consulting the module-level flag.
    if hasattr(exception, 'getErrorClass') and PYSPARK_EXCEPTIONS_AVAILABLE:
        try:
            error_info.update({
                "error_class": exception.getErrorClass(),
                "sql_state": exception.getSqlState() if hasattr(exception, 'getSqlState') else None,
                "message_parameters": exception.getMessageParameters() if hasattr(exception, 'getMessageParameters') else {}
            })
        except Exception:
            # Best effort: the basic fields above are still returned. Unlike a
            # bare ``except``, KeyboardInterrupt/SystemExit keep propagating,
            # and the failure stays visible at debug level.
            logging.getLogger(__name__).debug(
                "Could not read structured details from %s",
                type(exception).__name__,
                exc_info=True,
            )

    return error_info

def create_spark_error_demonstrations():
    """
    Create various Spark errors for demonstration purposes

    Nothing is executed when this function is called; it only builds the
    registry. Each registered callable takes no arguments, runs one Spark
    statement and returns the dict produced by ``extract_spark_error_info``
    if that statement raised, or ``None`` if it completed without raising.

    Returns:
        Dict mapping a demonstration name to its zero-argument callable, in
        the order table_not_found, column_not_found, type_mismatch,
        division_by_zero.
    """
    def _capture(action):
        """Run ``action`` and return structured error info if it raises, else None."""
        try:
            action()
        except Exception as e:
            return extract_spark_error_info(e)
        return None

    # 1. Table not found error
    def demo_table_not_found():
        return _capture(lambda: spark.sql("SELECT * FROM non_existent_table").show())

    # 2. Column not found error
    def demo_column_not_found():
        return _capture(lambda: test_df.select("nonexistent_column").show())

    # 3. Type mismatch error
    def demo_type_mismatch():
        return _capture(lambda: spark.sql("SELECT 'text' / 5").show())

    # 4. Division by zero error
    def demo_division_by_zero():
        return _capture(lambda: spark.sql("SELECT 10 / 0").show())

    return {
        "table_not_found": demo_table_not_found,
        "column_not_found": demo_column_not_found,
        "type_mismatch": demo_type_mismatch,
        "division_by_zero": demo_division_by_zero
    }

print("=== Spark Exception Handling Demonstrations ===")

# Build the name -> callable registry once, then execute every entry in order
demonstrations = create_spark_error_demonstrations()

for demo_name, demo_func in demonstrations.items():
    try:
        print(f"\n--- {demo_name.replace('_', ' ').title()} Error ---")
        error_info = demo_func()

        # A demonstration that raised nothing yields no info; nothing to report
        if not error_info:
            continue

        functional_logger.error(
            f"Demonstration error: {demo_name}",
            function_name="error_demonstration",
            context=error_info,
        )

        print(f"Error Type: {error_info['exception_type']}")
        print(f"Message: {error_info['message'][:100]}...")

        # Structured fields are printed only when present and non-empty
        if error_info.get('error_class'):
            print(f"Error Class: {error_info['error_class']}")
        if error_info.get('sql_state'):
            print(f"SQL State: {error_info['sql_state']}")
    except Exception as e:
        # Keep the loop alive so one broken demonstration cannot hide the rest
        print(f"Unexpected error in demonstration: {e}")

# Advanced error handling with recovery strategies
print("\n=== Advanced Error Handling with Recovery ===")

def safe_sql_execution(sql_query: str,
                      recovery_strategies: Optional[List[Callable[[], DataFrame]]] = None) -> Result[DataFrame, PipelineError]:
    """
    Execute SQL with recovery strategies for common errors

    The query is run through ``spark.sql``. If that (or the success logging)
    raises, each recovery strategy is tried in list order and the first one
    that returns without raising wins. Failures are reported as values, not
    raised.

    Args:
        sql_query: SQL text to execute; only its first 100 characters are logged.
        recovery_strategies: Optional zero-argument callables, each returning a
            replacement DataFrame. ``None`` or an empty list disables recovery.

    Returns:
        ``Ok`` wrapping the query result or the first successful recovery
        result; otherwise ``Err`` wrapping a ``PipelineError`` that describes
        the ORIGINAL failure (recovery failures are only logged).
    """
    try:
        result_df = spark.sql(sql_query)

        functional_logger.info(
            "SQL execution successful",
            context={"query": sql_query[:100], "columns": result_df.columns},
            function_name="safe_sql_execution"
        )

        return Ok(result_df)

    except Exception as e:
        error_info = extract_spark_error_info(e)

        functional_logger.warning(
            "SQL execution failed, attempting recovery",
            context=error_info,
            function_name="safe_sql_execution"
        )

        # Try recovery strategies; messages count from 1, the logged
        # strategy_index stays 0-based.
        for attempt, recovery_func in enumerate(recovery_strategies or [], start=1):
            strategy_index = attempt - 1
            try:
                functional_logger.info(
                    f"Attempting recovery strategy {attempt}",
                    context={"strategy_index": strategy_index},
                    function_name="safe_sql_execution"
                )

                recovery_df = recovery_func()

                functional_logger.info(
                    f"Recovery strategy {attempt} successful",
                    context={"strategy_index": strategy_index, "result_columns": recovery_df.columns},
                    function_name="safe_sql_execution"
                )

                return Ok(recovery_df)

            except Exception as recovery_error:
                # A failed strategy is logged and the next one is tried
                functional_logger.warning(
                    f"Recovery strategy {attempt} failed",
                    context={"strategy_index": strategy_index, "recovery_error": str(recovery_error)},
                    function_name="safe_sql_execution"
                )

        # All recovery attempts failed (or none were supplied)
        pipeline_error = PipelineError(
            error_type=error_info["exception_type"],
            message=error_info["message"],
            context=error_info,
            function_name="safe_sql_execution",
            timestamp=error_info["timestamp"],
            original_exception=e
        )

        return Err(pipeline_error)

# Test SQL execution with recovery
print("\nTesting SQL execution with recovery strategies...")

# Recovery strategies for table not found
def create_fallback_table():
    """Recovery strategy: build an empty DataFrame with a single nullable string column named "fallback"."""
    fallback_schema = StructType([StructField("fallback", StringType(), True)])
    no_rows = []
    return spark.createDataFrame(no_rows, fallback_schema)

def use_existing_table():
    """Recovery strategy: use an existing table

    Returns the module-level ``test_df`` DataFrame unchanged; no table lookup
    or query is performed here.
    """
    return test_df

# Test with recovery
# Strategies are tried in list order: the existing DataFrame first, the empty
# fallback second. The table name suggests the query is meant to fail.
sql_result = safe_sql_execution(
    "SELECT * FROM non_existent_table",
    recovery_strategies=[use_existing_table, create_fallback_table]
)

# An Ok result exposes the DataFrame as .value; an Err exposes the
# PipelineError as .error
if sql_result.is_ok():
    print("✅ SQL execution successful (possibly with recovery)")
    sql_result.value.show(3)
else:
    print(f"❌ SQL execution failed: {sql_result.error.message}")

print("\n🔧 Advanced error handling patterns demonstrated")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. PySpark Exception Handling and Debugging
# MAGIC 
# MAGIC Let's explore how to handle specific PySpark exceptions and extract useful debugging information.

In [None]:
# COMMAND ----------

# Test the functional error handling pipeline

print("=== Testing Safe Transformation Pipeline ===")

# Test successful transformations
# Each step is only attempted when the previous one returned Ok; the Ok value
# of one step is the input of the next.
print("\n1. Testing successful transformations:")
result1 = safe_validate_ages(test_df)
if result1.is_ok():
    print(f"✅ Age validation successful: {result1.value.count()} records")

    result2 = safe_add_age_category(result1.value)
    if result2.is_ok():
        print(f"✅ Age categorization successful: {result2.value.count()} records")
        result2.value.show()
    else:
        print(f"❌ Age categorization failed: {result2.error.message}")
else:
    print(f"❌ Age validation failed: {result1.error.message}")

# Test transformation that might fail
# Runs on the unfiltered test_df, independently of the results above.
print("\n2. Testing transformation with null handling:")
result3 = safe_calculate_bonus(test_df)
if result3.is_ok():
    print(f"✅ Bonus calculation successful: {result3.value.count()} records")
    result3.value.show()
else:
    print(f"❌ Bonus calculation failed: {result3.error.message}")
    # to_dict() omits the exception object itself and reports only its type name
    print(f"   Error details: {json.dumps(result3.error.to_dict(), indent=2)}")

# Composition of safe transformations
def compose_safe_transformations(
    df: DataFrame,
    transformations: Optional[List[Callable[[DataFrame], Result[DataFrame, PipelineError]]]] = None,
    critical_error_types: Tuple[str, ...] = ("AnalysisException", "PySparkException"),
) -> Result[DataFrame, List[PipelineError]]:
    """
    Compose multiple safe transformations
    Returns either the final result or all accumulated errors

    Transformations run in order. A successful step feeds its DataFrame to the
    next step. A failed step is recorded, and the following step receives the
    last successfully produced DataFrame, unless the failure is critical, in
    which case the pipeline stops immediately.

    Args:
        df: Input DataFrame for the first transformation.
        transformations: Result-returning transformations to chain. Defaults to
            ``safe_validate_ages``, ``safe_add_age_category`` and
            ``safe_calculate_bonus``, in that order.
        critical_error_types: ``error_type`` names that abort the pipeline.

    Returns:
        ``Ok`` with the final DataFrame when every step succeeded, otherwise
        ``Err`` with the list of collected ``PipelineError`` objects (even if
        later steps succeeded).
    """
    if transformations is None:
        # Resolved at call time so the default always reflects the current wrappers
        transformations = [
            safe_validate_ages,
            safe_add_age_category,
            safe_calculate_bonus
        ]

    errors = []
    current_df = df

    # Chain transformations, collecting errors
    for transform in transformations:
        result = transform(current_df)
        if result.is_ok():
            current_df = result.value
            continue

        errors.append(result.error)
        # Stop on critical errors; otherwise keep going with the last good DataFrame
        if result.error.error_type in critical_error_types:
            return Err(errors)

    return Err(errors) if errors else Ok(current_df)

print("\n3. Testing composed safe transformations:")
composed_result = compose_safe_transformations(test_df)

# On failure, .error is a list of PipelineError objects (one per failed step),
# not a single error
if composed_result.is_ok():
    print("✅ All transformations completed successfully")
    composed_result.value.show()
else:
    print(f"❌ Pipeline failed with {len(composed_result.error)} errors:")
    for i, error in enumerate(composed_result.error, 1):
        print(f"   {i}. {error.function_name}: {error.message}")

print("\n🔄 Pipeline execution completed with comprehensive logging")

In [None]:
# COMMAND ----------

# Functional error handling with Result types

from abc import ABC, abstractmethod
from typing import Generic, TypeVar, Union

# T: type of the value carried on success; E: type of the error carried on failure
T = TypeVar('T')
E = TypeVar('E')


class Result(Generic[T, E], ABC):
    """
    Abstract base of the success/failure pair used for functional error handling.

    Modelled on Rust's Result<T, E>: concrete subclasses report which of the
    two states they represent through ``is_ok`` / ``is_err``.
    """

    @abstractmethod
    def is_err(self) -> bool:
        """Return True when this result represents a failure."""

    @abstractmethod
    def is_ok(self) -> bool:
        """Return True when this result represents a success."""

@dataclass
class Ok(Result[T, E]):
    """Successful outcome; the produced value is available as ``value``."""
    value: T

    def is_err(self) -> bool:
        """Always False: an Ok never represents a failure."""
        return False

    def is_ok(self) -> bool:
        """Always True."""
        return True

@dataclass
class Err(Result[T, E]):
    """Failed outcome; the failure description is available as ``error``."""
    error: E

    def is_err(self) -> bool:
        """Always True."""
        return True

    def is_ok(self) -> bool:
        """Always False: an Err never represents a success."""
        return False

@dataclass
class PipelineError:
    """
    Structured description of a failure inside a pipeline step.

    Carries the error's type name and message, a free-form context dict, the
    name of the step that failed, a timestamp string and, optionally, the
    exception object that caused it.
    """
    error_type: str
    message: str
    context: Dict[str, Any]
    function_name: str
    timestamp: str
    original_exception: Optional[Exception] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a dict for logging; the exception object is reduced to its type name (or None)."""
        cause = self.original_exception
        if cause:
            cause_name = type(cause).__name__
        else:
            cause_name = None

        copied_fields = ("error_type", "message", "context", "function_name", "timestamp")
        payload = {field_name: getattr(self, field_name) for field_name in copied_fields}
        payload["exception_type"] = cause_name
        return payload

def safe_transform(func: Callable[[DataFrame], DataFrame],
                  function_name: str,
                  logger: Optional[FunctionalLogger] = None,
                  log_row_counts: bool = True) -> Callable[[DataFrame], Result[DataFrame, PipelineError]]:
    """
    Higher-order function that wraps DataFrame transformations with error handling
    Returns a Result type instead of raising exceptions

    Args:
        func: Transformation taking and returning a DataFrame.
        function_name: Label used in log entries and in the PipelineError.
        logger: Optional logger; when omitted nothing is logged.
        log_row_counts: When True (default, the original behaviour) the start
            and completion log entries include row counts obtained with
            ``count()`` on the input and output DataFrames. Pass False to log
            only column names and skip those two extra calls.

    Returns:
        A callable that applies ``func`` and returns ``Ok(result)`` on success
        or ``Err(PipelineError)`` when ``func`` or the logging around it raises.
    """
    def _frame_context(frame, prefix):
        """Build the ``<prefix>_columns`` / ``<prefix>_rows`` log context for a DataFrame."""
        context = {f"{prefix}_columns": frame.columns}
        if log_row_counts:
            context[f"{prefix}_rows"] = frame.count()
        return context

    def _columns_or_empty(frame):
        """Return ``frame.columns``, or [] when the input is None or has no usable columns."""
        # The error path must never raise, whatever was passed in
        try:
            return frame.columns if frame is not None else []
        except Exception:
            return []

    def wrapper(df: DataFrame) -> Result[DataFrame, PipelineError]:
        try:
            if logger:
                logger.info(f"Starting transformation: {function_name}",
                            context=_frame_context(df, "input"),
                            function_name=function_name)

            result_df = func(df)

            if logger:
                logger.info(f"Transformation completed successfully: {function_name}",
                            context=_frame_context(result_df, "output"),
                            function_name=function_name)

            return Ok(result_df)

        except Exception as e:
            error = PipelineError(
                error_type=type(e).__name__,
                message=str(e),
                context={"input_columns": _columns_or_empty(df), "function": function_name},
                function_name=function_name,
                timestamp=datetime.now().isoformat(),
                original_exception=e
            )

            if logger:
                logger.error(f"Transformation failed: {function_name}",
                             context=error.to_dict(),
                             function_name=function_name)

            return Err(error)

    return wrapper

# Example transformation functions wrapped with error handling
print("=== Functional Error Handling Examples ===")

# Create sample data for testing
# Tuple positions follow the schema below: (id, name, age, amount)
sample_data = [
    (1, "Alice", 25, 1000.0),
    (2, "Bob", 30, 1500.0),
    (3, "Charlie", 35, 2000.0),
    (4, "Diana", -5, 800.0),  # Invalid age for testing
    (5, "Eve", 40, None)      # Null amount for testing
]

# Every field is declared nullable (third argument True), which the None
# amount above relies on
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("amount", DoubleType(), True)
])

# Shared fixture: other cells in this notebook read test_df directly
test_df = spark.createDataFrame(sample_data, schema)

# Pure transformation functions
def validate_ages(df: DataFrame) -> DataFrame:
    """Keep only the rows whose "age" lies in the inclusive range 0..120."""
    age = F.col("age")
    is_plausible = (age >= 0) & (age <= 120)
    return df.filter(is_plausible)

def calculate_bonus(df: DataFrame) -> DataFrame:
    """Add a "bonus" column equal to 10% of "amount".

    NOTE(review): the original note said this "might fail with null amounts";
    Spark column arithmetic normally yields NULL for NULL input rather than
    raising. Confirm which behaviour the demo intends to show.
    """
    bonus_rate = 0.1
    bonus = F.col("amount") * bonus_rate
    return df.withColumn("bonus", bonus)

def add_age_category(df: DataFrame) -> DataFrame:
    """Add an "age_category" column: "Young" below 30, "Middle" below 50, otherwise "Senior"."""
    age = F.col("age")
    # Branches are evaluated in order, so "Middle" effectively means 30 <= age < 50
    category = (
        F.when(age < 30, "Young")
         .when(age < 50, "Middle")
         .otherwise("Senior")
    )
    return df.withColumn("age_category", category)

# Wrap functions with error handling
# Each safe_* name is the Result-returning wrapper of the pure function above;
# all three share functional_logger and log under the function's own name.
safe_validate_ages = safe_transform(validate_ages, "validate_ages", functional_logger)
safe_calculate_bonus = safe_transform(calculate_bonus, "calculate_bonus", functional_logger)
safe_add_age_category = safe_transform(add_age_category, "add_age_category", functional_logger)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Functional Error Handling Patterns
# MAGIC 
# MAGIC Let's explore how to handle errors in a functional way, using Result types and composable error handling patterns.

In [None]:
# COMMAND ----------

# Structured logging utilities for functional pipelines

class LogLevel(Enum):
    """Enum for log levels"""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

@dataclass
class LogEntry:
    """
    Log entry structure for functional logging

    Entries are meant to be treated as immutable values, but the dataclass is
    not frozen, so this is a convention rather than something enforced.
    """
    timestamp: str
    level: LogLevel
    logger_name: str
    message: str
    context: Dict[str, Any]
    function_name: Optional[str] = None
    execution_id: Optional[str] = None

    def to_json(self) -> str:
        """
        Convert log entry to JSON string

        The level is written as its string value. Context values that
        ``json`` cannot encode natively (datetimes, exceptions, ...) are
        written as ``str(value)`` so that serialising a log entry does not
        raise for them.
        """
        return json.dumps({
            "timestamp": self.timestamp,
            "level": self.level.value,
            "logger_name": self.logger_name,
            "message": self.message,
            "context": self.context,
            "function_name": self.function_name,
            "execution_id": self.execution_id
        }, default=str)

class FunctionalLogger:
    """
    Logger designed for functional programming patterns
    Maintains separation between logging and business logic

    Every level method (debug/info/warning/error/critical) builds a LogEntry,
    forwards it to a standard Python logger and returns the entry so callers
    can keep or inspect it.
    """

    # Translation from the pipeline's LogLevel to the stdlib logging levels
    _STANDARD_LEVELS = {
        LogLevel.DEBUG: logging.DEBUG,
        LogLevel.INFO: logging.INFO,
        LogLevel.WARNING: logging.WARNING,
        LogLevel.ERROR: logging.ERROR,
        LogLevel.CRITICAL: logging.CRITICAL,
    }

    def __init__(self, name: str, execution_id: Optional[str] = None):
        """
        Args:
            name: Logger name, also stored on every LogEntry.
            execution_id: Identifier stamped on every entry; when omitted (or
                empty) one is derived from the current time, ``exec_YYYYmmdd_HHMMSS``.
        """
        self.name = name
        self.execution_id = execution_id or f"exec_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        self._standard_logger = setup_standard_logger(name)

    def create_log_entry(self, level: LogLevel, message: str,
                         context: Optional[Dict[str, Any]] = None,
                         function_name: Optional[str] = None) -> LogEntry:
        """
        Build a log entry without emitting it

        Has no side effects, but is not strictly pure: the timestamp is read
        from the clock. A missing context becomes an empty dict.
        """
        return LogEntry(
            timestamp=datetime.now().isoformat(),
            level=level,
            logger_name=self.name,
            message=message,
            context=context or {},
            function_name=function_name,
            execution_id=self.execution_id
        )

    def log_entry(self, entry: LogEntry) -> None:
        """
        Log an entry using the standard logger

        The line is ``[<function_name or 'unknown'>] <message>`` followed by
        `` | Context: <json>`` when the context is non-empty. Entries whose
        level is not a known LogLevel are ignored.
        """
        log_message = f"[{entry.function_name or 'unknown'}] {entry.message}"
        if entry.context:
            # default=str: a logging call must not raise just because a context
            # value (datetime, exception, ...) has no native JSON encoding
            log_message += f" | Context: {json.dumps(entry.context, default=str)}"

        standard_level = self._STANDARD_LEVELS.get(entry.level)
        if standard_level is not None:
            self._standard_logger.log(standard_level, log_message)

    def _emit(self, level: LogLevel, message: str,
              context: Optional[Dict[str, Any]], function_name: Optional[str]) -> LogEntry:
        """Create an entry at ``level``, log it and return it."""
        entry = self.create_log_entry(level, message, context, function_name)
        self.log_entry(entry)
        return entry

    def debug(self, message: str, context: Optional[Dict[str, Any]] = None,
              function_name: Optional[str] = None) -> LogEntry:
        """Log a debug message"""
        return self._emit(LogLevel.DEBUG, message, context, function_name)

    def info(self, message: str, context: Optional[Dict[str, Any]] = None,
             function_name: Optional[str] = None) -> LogEntry:
        """Log an info message"""
        return self._emit(LogLevel.INFO, message, context, function_name)

    def warning(self, message: str, context: Optional[Dict[str, Any]] = None,
                function_name: Optional[str] = None) -> LogEntry:
        """Log a warning message"""
        return self._emit(LogLevel.WARNING, message, context, function_name)

    def error(self, message: str, context: Optional[Dict[str, Any]] = None,
              function_name: Optional[str] = None) -> LogEntry:
        """Log an error message"""
        return self._emit(LogLevel.ERROR, message, context, function_name)

    def critical(self, message: str, context: Optional[Dict[str, Any]] = None,
                 function_name: Optional[str] = None) -> LogEntry:
        """Log a critical message"""
        return self._emit(LogLevel.CRITICAL, message, context, function_name)
# Create functional logger instance
# Shared logger instance: other cells in this notebook use functional_logger
# directly. No execution_id is passed, so one is generated from the current time.
functional_logger = FunctionalLogger("functional_pipeline")

# Test the functional logger
print("=== Testing Functional Logger ===")

# Example log entries
# Each call both emits the message and returns the LogEntry it created
info_entry = functional_logger.info(
    "Pipeline started successfully",
    context={"input_path": "/data/input", "records_expected": 1000},
    function_name="start_pipeline"
)

warning_entry = functional_logger.warning(
    "Performance degradation detected",
    context={"memory_usage": 85, "cpu_usage": 90, "task_duration": 45.2},
    function_name="monitor_performance"
)

print(f"\nJSON Log Entry Example:\n{info_entry.to_json()}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Structured Logging for Functional Pipelines
# MAGIC 
# MAGIC Let's create structured logging utilities that work well with functional programming patterns.

In [None]:
# COMMAND ----------

# Comparison of logging approaches

print("=== Logging Approach Comparison ===")

# 1. Basic print statements (NOT recommended for production)
print("\n❌ ANTI-PATTERN: Using print statements")
# NOTE: this function is deliberately written badly and is only defined here,
# never called in this cell. Do not "fix" it.
def bad_logging_example(df):
    """
    Anti-pattern: Using print statements for logging
    Problems: No log levels, no structured format, performance overhead

    Filters ``df`` to rows with amount > 0, but also prints progress and calls
    ``count()`` on both the input and the filtered result purely for reporting.
    """
    print("Starting data processing...")  # Side effect in transformation
    print(f"Input record count: {df.count()}")  # Action in transformation!

    result = df.filter(F.col("amount") > 0)
    print(f"After filtering: {result.count()} records")  # Another action!

    return result

print("Print statements create side effects and performance issues")

# 2. Standard Python logging (Recommended for most cases)
print("\n✅ BETTER: Standard Python logging")

# Configure Python logger
def setup_standard_logger(name: str, level: int = logging.INFO) -> logging.Logger:
    """
    Set up (or reuse) a standard Python logger

    Not a pure function: it configures the process-wide logger registered
    under ``name``.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        level: Threshold applied to the logger on every call, including calls
            that reuse an already configured logger.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)

    # Avoid duplicate handlers when re-running cells. hasHandlers() also sees
    # handlers on ancestor loggers, so no handler is attached when, for
    # example, the root logger is already configured.
    if not logger.hasHandlers():
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Apply the level unconditionally: previously it was set only together
    # with the handler, so the requested level was silently ignored whenever
    # any handler already existed in the hierarchy.
    logger.setLevel(level)

    return logger

# Create a standard logger
# Uses the default level (logging.INFO) of setup_standard_logger
standard_logger = setup_standard_logger("pyspark_pipeline")

print("Standard Python logger configured")
# One sample message per severity to show the configured output format
standard_logger.info("This is an info message")
standard_logger.warning("This is a warning message")
standard_logger.error("This is an error message")

# 3. PySpark's built-in logger (Spark 4+ / DBR 17.0+)
print("\n✅ NEWEST: PySpark structured logger")

# The import is attempted inside try so the notebook keeps running on runtimes
# where pyspark.logger does not exist
try:
    # For newer Databricks Runtime versions
    from pyspark.logger import PySparkLogger

    pyspark_logger = PySparkLogger.getLogger("functional_pipeline")

    print("PySpark structured logger configured")
    # Values are passed as keyword arguments next to a message template that
    # names them in braces
    pyspark_logger.info("Data processing started for {file_name}", file_name="input.csv")
    pyspark_logger.warning("Low memory warning: {memory_usage}%", memory_usage=85)
    pyspark_logger.error("Failed to process record {record_id} due to {error_msg}", 
                        record_id=123, error_msg="Invalid format")

except ImportError:
    # Only a missing module is handled; any other failure above propagates
    print("PySpark structured logger not available (requires Spark 4+ / DBR 17.0+)")
    print("Using standard Python logging for compatibility")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Logging Approaches: PySpark Logger vs Standard Python Logging
# MAGIC 
# MAGIC Let's compare different logging approaches available in PySpark and understand when to use each one.

In [None]:
# COMMAND ----------

# Essential imports for logging and error handling
import json
import logging
import traceback
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from functools import wraps
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import pyspark.sql.functions as F
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *

# Initialize Spark session
# getOrCreate() is used rather than constructing a session directly; every
# other cell refers to this module-level `spark`
spark = SparkSession.builder.appName("FunctionalLoggingErrorHandling").getOrCreate()

print("✅ Setup complete - Ready for logging and error handling patterns!")

# Databricks notebook source
# MAGIC %md
# MAGIC # 5.2 Logging and Error Reporting in Functional PySpark Pipelines
# MAGIC 
# MAGIC This notebook demonstrates how to implement effective logging and error reporting in PySpark applications while maintaining functional programming principles. We'll explore structured logging, error handling patterns, and debugging strategies that align with functional design.
# MAGIC 
# MAGIC ## Learning Objectives
# MAGIC 
# MAGIC By the end of this notebook, you will understand how to:
# MAGIC - Implement structured logging in PySpark applications
# MAGIC - Use PySpark's built-in logger vs standard Python logging
# MAGIC - Handle errors functionally without breaking pure function principles
# MAGIC - Create logging utilities that integrate with functional pipelines
# MAGIC - Debug PySpark applications using log analysis
# MAGIC - Store and analyze logs in Unity Catalog volumes
# MAGIC 
# MAGIC ## Prerequisites
# MAGIC 
# MAGIC - Understanding of functional programming principles
# MAGIC - Experience with PySpark transformations and actions
# MAGIC - Knowledge of error handling in distributed systems
# MAGIC - Familiarity with JSON and structured data