In [None]:
# PipelineBuilder & LogWriter v2.9.0 - Standalone Notebook
#
# This notebook contains the complete PipelineBuilder and LogWriter implementation
# as a standalone, executable notebook. All dependencies are included as cells
# in the correct order.
#
# Usage:
# 1. Run all cells from top to bottom
# 2. The PipelineBuilder and LogWriter classes will be available after all cells execute
# 3. Use PipelineBuilder to build and execute data pipelines
# 4. Use LogWriter to log and analyze pipeline execution results
#
# Note: This is generated from version 2.9.0. Module dependencies are
# resolved automatically from source code analysis.

In [None]:
# External imports (PySpark, standard library)
from __future__ import annotations

import logging
import sys
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Dict, Generator, List, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union, cast

# PySpark imports
from pyspark.sql import Column, DataFrame, SparkSession, functions as F
from pyspark.sql.types import (
    BooleanType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Delta Lake imports
try:
    from delta.tables import DeltaTable
except ImportError:
    print("⚠️  Delta Lake not available. Some features may not work.")
    DeltaTable = None

# Optional imports
try:
    import psutil
except ImportError:
    print("⚠️  psutil not available. Memory monitoring disabled.")
    psutil = None

In [None]:
# Module: pipeline_builder_base.logging (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Simplified logging system for the framework.

This module provides a clean, focused logging system for pipeline operations
without the complexity of the previous over-engineered system.
"""

import logging
import sys
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Dict, Generator, Optional, Union

class PipelineLogger:
    """
    Thin convenience wrapper around a named ``logging.Logger``.

    Features:
    - Basic logging levels (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    - Console (stdout) and optional file output
    - Keyword arguments appended to messages as ``key=value`` pairs
    - Named timers and a timing context manager
    - Start/complete/failed logging around a block of code

    Note:
        ``logging.getLogger(name)`` returns one shared logger per name, so
        two ``PipelineLogger`` objects created with the same ``name`` share
        handlers. Creating a new instance closes and replaces the handlers
        that were attached to that name before.
    """

    def __init__(
        self,
        name: str = "PipelineRunner",
        level: int = logging.INFO,
        log_file: Optional[str] = None,
        verbose: bool = True,
    ):
        """
        Configure the named logger.

        Args:
            name: Name of the underlying ``logging.Logger``.
            level: Logging level applied to the logger and its handlers.
            log_file: Optional path of a file to append log records to.
            verbose: When True, records are also written to ``sys.stdout``.
        """
        self.name = name
        self.level = level
        self.log_file = log_file
        self.verbose = verbose

        # Create logger
        self.logger = logging.getLogger(name)
        self.logger.setLevel(level)

        # Drop handlers left on this (shared) logger by earlier setup.
        # They are closed as well as removed: merely clearing the list
        # would leave previously opened log files open.
        self.close()

        # Setup handlers
        self._setup_handlers()

        # Performance tracking: timer name -> UTC start time
        self._timers: Dict[str, datetime] = {}

    def _setup_handlers(self) -> None:
        """Attach the console and/or file handler selected in ``__init__``."""
        # Console handler
        if self.verbose:
            console_handler = logging.StreamHandler(sys.stdout)
            console_formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%H:%M:%S",
            )
            console_handler.setFormatter(console_formatter)
            console_handler.setLevel(self.level)
            self.logger.addHandler(console_handler)

        # File handler
        if self.log_file:
            # Explicit UTF-8: step_start/step_complete emit non-ASCII symbols
            # that a locale-dependent default encoding may be unable to write.
            file_handler = logging.FileHandler(self.log_file, encoding="utf-8")
            file_formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
            file_handler.setFormatter(file_formatter)
            file_handler.setLevel(self.level)
            self.logger.addHandler(file_handler)

    # Basic logging methods
    def debug(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log *message* at DEBUG level; kwargs are appended as ``k=v``."""
        self.logger.debug(self._format_message(message, kwargs))

    def info(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log *message* at INFO level; kwargs are appended as ``k=v``."""
        self.logger.info(self._format_message(message, kwargs))

    def warning(
        self, message: str, **kwargs: Union[str, int, float, bool, None]
    ) -> None:
        """Log *message* at WARNING level; kwargs are appended as ``k=v``."""
        self.logger.warning(self._format_message(message, kwargs))

    def error(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log *message* at ERROR level; kwargs are appended as ``k=v``."""
        self.logger.error(self._format_message(message, kwargs))

    def critical(
        self, message: str, **kwargs: Union[str, int, float, bool, None]
    ) -> None:
        """Log *message* at CRITICAL level; kwargs are appended as ``k=v``."""
        self.logger.critical(self._format_message(message, kwargs))

    def _format_message(
        self, message: str, kwargs: Dict[str, Union[str, int, float, bool, None]]
    ) -> str:
        """
        Append keyword arguments to a message.

        Returns:
            ``message`` unchanged when *kwargs* is empty, otherwise
            ``"message (k1=v1, k2=v2)"`` in keyword order.
        """
        if not kwargs:
            return message
        kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
        return f"{message} ({kwargs_str})"

    # Performance timing
    @contextmanager
    def time_operation(self, operation_name: str) -> Generator[None, None, None]:
        """
        Time the enclosed block and log its duration at INFO level.

        The duration is logged whether the block finishes normally or
        raises. While the block runs, the start time is also visible through
        ``get_timer_duration(operation_name)``.
        """
        start_time = datetime.now(timezone.utc)
        self._timers[operation_name] = start_time
        try:
            yield
        finally:
            end_time = datetime.now(timezone.utc)
            duration = (end_time - start_time).total_seconds()
            self.info(f"Operation '{operation_name}' took {duration:.2f}s")
            # Clean up timer after operation completes; tolerate the entry
            # having been removed already (e.g. by stop_timer in the block).
            self._timers.pop(operation_name, None)

    def start_timer(self, timer_name: str) -> None:
        """Start (or restart) a named timer at the current UTC time."""
        self._timers[timer_name] = datetime.now(timezone.utc)

    def stop_timer(self, timer_name: str) -> float:
        """
        Stop a named timer and return its duration in seconds.

        Returns:
            Elapsed seconds, or ``0.0`` (after logging a warning) when no
            timer with that name is running.
        """
        if timer_name not in self._timers:
            self.warning(f"Timer '{timer_name}' was not started")
            return 0.0
        start_time = self._timers.pop(timer_name)
        end_time = datetime.now(timezone.utc)
        return (end_time - start_time).total_seconds()

    def get_timer_duration(self, timer_name: str) -> float:
        """
        Get current duration of a running timer without stopping it.

        Returns:
            Elapsed seconds, or ``0.0`` when the timer is not running.
        """
        if timer_name not in self._timers:
            return 0.0
        start_time = self._timers[timer_name]
        end_time = datetime.now(timezone.utc)
        return (end_time - start_time).total_seconds()

    # Context management
    @contextmanager
    def log_context(self, context_name: str) -> Generator[None, None, None]:
        """
        Log "Starting"/"Completed" around the enclosed block.

        If the block raises an ``Exception``, a "Failed" message carrying the
        error text is logged at ERROR level and the exception is re-raised.
        """
        self.info(f"Starting: {context_name}")
        try:
            yield
            self.info(f"Completed: {context_name}")
        except Exception as e:
            self.error(f"Failed: {context_name}", error=str(e))
            raise

    # Step execution logging
    def step_start(self, step_type: str, step_name: str) -> None:
        """Log the start of a step; *step_type* is upper-cased in the message."""
        self.info(f"▶️ Starting {step_type.upper()} step: {step_name}")

    def step_complete(
        self,
        step_type: str,
        step_name: str,
        duration: float,
        rows_processed: int = 0,
        rows_written: int = 0,
        invalid_rows: int = 0,
        validation_rate: float = 100.0,
    ) -> None:
        """
        Log the completion of a step with its row counts.

        Args:
            step_type: Step type label; upper-cased in the message.
            step_name: Name of the step.
            duration: Duration in seconds (printed with two decimals).
            rows_processed: Number of rows processed.
            rows_written: Number of rows written.
            invalid_rows: Number of invalid rows.
            validation_rate: Percentage of valid rows (one decimal).
        """
        self.info(
            f"✅ Completed {step_type.upper()} step: {step_name} ({duration:.2f}s) - "
            f"{rows_processed} rows processed, {rows_written} rows written, "
            f"{invalid_rows} invalid, {validation_rate:.1f}% valid"
        )

    # Utility methods
    def set_level(self, level: int) -> None:
        """Set the logging level on the logger and every attached handler."""
        self.level = level
        self.logger.setLevel(level)
        for handler in self.logger.handlers:
            handler.setLevel(level)

    def add_handler(self, handler: logging.Handler) -> None:
        """Add a custom logging handler."""
        self.logger.addHandler(handler)

    def remove_handler(self, handler: logging.Handler) -> None:
        """Remove a logging handler (the handler itself is not closed)."""
        self.logger.removeHandler(handler)

    def clear_handlers(self) -> None:
        """Detach all handlers without closing them; see ``close``."""
        self.logger.handlers.clear()

    def close(self) -> None:
        """Close and detach all logging handlers, especially file handlers."""
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in list(self.logger.handlers):
            handler.close()
            self.logger.removeHandler(handler)

In [None]:
# Module: pipeline_builder_base.errors (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Simplified error handling system for the framework.

This module provides a clean, consolidated error handling system
with just the essential error types needed for the project.
"""

from __future__ import annotations

from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional, Union

class ErrorSeverity(Enum):
    """Severity levels for errors.

    The string value of a member is what ``SparkForgeError.to_dict()``
    emits under the ``"severity"`` key.
    """

    LOW = "low"  # used by PerformanceError
    MEDIUM = "medium"  # default severity of SparkForgeError
    HIGH = "high"  # used by ExecutionError and ResourceError
    CRITICAL = "critical"  # used by SystemError

class ErrorCategory(Enum):
    """Categories of errors.

    Each concrete exception class in this module passes one fixed category
    to ``SparkForgeError``; the string value is what
    ``SparkForgeError.to_dict()`` emits under the ``"category"`` key.
    """

    CONFIGURATION = "configuration"  # ConfigurationError
    VALIDATION = "validation"  # ValidationError and its subclasses
    EXECUTION = "execution"  # ExecutionError
    DATA = "data"  # DataError
    SYSTEM = "system"  # SystemError
    PERFORMANCE = "performance"  # PerformanceError
    RESOURCE = "resource"  # ResourceError

# Type definitions for error context
# ErrorContextValue: value types permitted inside an error's context mapping.
ErrorContextValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
# ErrorContext: the ``context`` argument/attribute of SparkForgeError.
ErrorContext = Dict[str, ErrorContextValue]
# ErrorSuggestions: the ``suggestions`` argument/attribute of SparkForgeError.
ErrorSuggestions = List[str]

class SparkForgeError(Exception):
    """
    Base exception for all framework errors.

    This is the root exception class that all other framework exceptions
    inherit from, providing consistent error handling patterns and rich context.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: Optional[str] = None,
        category: Optional[ErrorCategory] = None,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        context: Optional[ErrorContext] = None,
        suggestions: Optional[ErrorSuggestions] = None,
        timestamp: Optional[datetime] = None,
        cause: Optional[Exception] = None,
    ):
        """
        Initialize a framework error.

        Args:
            message: Human-readable error message
            error_code: Optional error code for programmatic handling
            category: Error category for classification
            severity: Error severity level
            context: Additional context information (copied, not aliased)
            suggestions: Suggested actions to resolve the error (copied)
            timestamp: When the error occurred (defaults to now, in UTC)
            cause: The underlying exception that caused this error
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.category = category
        self.severity = severity
        # Shallow-copy the caller's containers: subclasses add keys to
        # ``self.context`` after construction, which must not leak back into
        # a dict (or list) the caller still owns and may reuse.
        self.context = dict(context) if context else {}
        self.suggestions = list(suggestions) if suggestions else []
        self.timestamp = timestamp or datetime.now(timezone.utc)
        self.cause = cause

    def __str__(self) -> str:
        """
        Return string representation of the error.

        The message is followed, when present, by the bracketed error code,
        the context as ``k=v`` pairs and the suggestions, all joined with
        ``" | "``.
        """
        parts = [self.message]

        if self.error_code:
            parts.append(f"[{self.error_code}]")

        if self.context:
            context_str = ", ".join(f"{k}={v}" for k, v in self.context.items())
            parts.append(f"Context: {context_str}")

        if self.suggestions:
            parts.append(f"Suggestions: {'; '.join(self.suggestions)}")

        return " | ".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert error to dictionary for serialization.

        Enum members are reduced to their string values, the timestamp to
        ISO-8601 text and the cause to ``str(cause)``; absent values are
        ``None``.
        """
        return {
            "message": self.message,
            "error_code": self.error_code,
            "category": self.category.value if self.category else None,
            "severity": self.severity.value if self.severity else None,
            "context": self.context,
            "suggestions": self.suggestions,
            "timestamp": self.timestamp.isoformat() if self.timestamp else None,
            "cause": str(self.cause) if self.cause else None,
        }

class ValidationError(SparkForgeError):
    """Raised when validation fails.

    Always uses ``ErrorCategory.VALIDATION``. Severity defaults to
    ``ErrorSeverity.MEDIUM`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.
    """

    def __init__(
        self,
        message: str,
        *,
        field: Optional[str] = None,
        value: Any = None,
        **kwargs: Any,
    ):
        """
        Args:
            message: Human-readable error message.
            field: Name of the offending field; recorded in ``context`` when
                truthy.
            value: Offending value; its ``str()`` is recorded in ``context``
                when not ``None``.
            **kwargs: Forwarded to ``SparkForgeError`` (``error_code``,
                ``severity``, ``context``, ``suggestions``, ...).
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(
            message,
            category=ErrorCategory.VALIDATION,
            **kwargs,
        )
        self.field = field
        self.value = value
        if field:
            self.context["field"] = field
        if value is not None:
            self.context["value"] = str(value)

class PipelineValidationError(ValidationError):
    """Validation failure that is tied to a pipeline step and/or phase."""

    def __init__(
        self,
        message: str,
        *,
        step_name: Optional[str] = None,
        phase: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Args:
            message: Human-readable error message.
            step_name: Name of the step being validated, if any.
            phase: Pipeline phase being validated, if any.
            **kwargs: Forwarded unchanged to ``ValidationError``.
        """
        super().__init__(message, **kwargs)
        self.step_name = step_name
        self.phase = phase
        # Only truthy values are mirrored into the error context.
        for key, provided in (("step_name", step_name), ("phase", phase)):
            if provided:
                self.context[key] = provided

class ConfigurationError(SparkForgeError):
    """Signals an invalid configuration (category ``CONFIGURATION``)."""

    def __init__(self, message: str, **kwargs: Any):
        """
        Args:
            message: Human-readable error message.
            **kwargs: Forwarded to ``SparkForgeError``. ``severity`` falls
                back to ``ErrorSeverity.MEDIUM`` when the caller gives none.
        """
        # A caller-provided severity is kept; MEDIUM is only the fallback.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(message, category=ErrorCategory.CONFIGURATION, **kwargs)

class ExecutionError(SparkForgeError):
    """Raised when execution fails.

    Always uses ``ErrorCategory.EXECUTION``. Severity defaults to
    ``ErrorSeverity.HIGH`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.
    """

    def __init__(
        self,
        message: str,
        *,
        step_name: Optional[str] = None,
        phase: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Args:
            message: Human-readable error message.
            step_name: Name of the failing step; recorded in ``context``
                when truthy.
            phase: Pipeline phase of the failure; recorded in ``context``
                when truthy.
            **kwargs: Forwarded to ``SparkForgeError``.
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.HIGH)
        super().__init__(
            message,
            category=ErrorCategory.EXECUTION,
            **kwargs,
        )
        self.step_name = step_name
        self.phase = phase
        if step_name:
            self.context["step_name"] = step_name
        if phase:
            self.context["phase"] = phase

class DataError(SparkForgeError):
    """Raised when data operations fail.

    Always uses ``ErrorCategory.DATA``. Severity defaults to
    ``ErrorSeverity.MEDIUM`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.
    """

    def __init__(self, message: str, **kwargs: Any):
        """
        Args:
            message: Human-readable error message.
            **kwargs: Forwarded to ``SparkForgeError``.
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(
            message,
            category=ErrorCategory.DATA,
            **kwargs,
        )

class SystemError(SparkForgeError):
    """Raised when system operations fail.

    Always uses ``ErrorCategory.SYSTEM``. Severity defaults to
    ``ErrorSeverity.CRITICAL`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.

    Note:
        Within this module the name shadows Python's built-in
        ``SystemError``; code after this definition that needs the built-in
        must reach it through ``builtins.SystemError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        """
        Args:
            message: Human-readable error message.
            **kwargs: Forwarded to ``SparkForgeError``.
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.CRITICAL)
        super().__init__(
            message,
            category=ErrorCategory.SYSTEM,
            **kwargs,
        )

class PerformanceError(SparkForgeError):
    """Raised when performance issues are detected.

    Always uses ``ErrorCategory.PERFORMANCE``. Severity defaults to
    ``ErrorSeverity.LOW`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.
    """

    def __init__(self, message: str, **kwargs: Any):
        """
        Args:
            message: Human-readable error message.
            **kwargs: Forwarded to ``SparkForgeError``.
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.LOW)
        super().__init__(
            message,
            category=ErrorCategory.PERFORMANCE,
            **kwargs,
        )

class ResourceError(SparkForgeError):
    """Raised when resource operations fail.

    Always uses ``ErrorCategory.RESOURCE``. Severity defaults to
    ``ErrorSeverity.HIGH`` but, as with ``ConfigurationError``, a
    ``severity=`` keyword supplied by the caller takes precedence.
    """

    def __init__(self, message: str, **kwargs: Any):
        """
        Args:
            message: Human-readable error message.
            **kwargs: Forwarded to ``SparkForgeError``.
        """
        # setdefault rather than a fixed keyword: passing severity both
        # explicitly and through **kwargs would raise TypeError.
        kwargs.setdefault("severity", ErrorSeverity.HIGH)
        super().__init__(
            message,
            category=ErrorCategory.RESOURCE,
            **kwargs,
        )

In [None]:
# Module: pipeline_builder_base.config.validators (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.models

"""
Configuration validation functions.

This module provides validation functions for pipeline configurations.
"""

from __future__ import annotations

from typing import List

# from ..models import PipelineConfig, ValidationThresholds  # Removed: defined in notebook cells above

def validate_pipeline_config(config: PipelineConfig) -> List[str]:
    """
    Check a pipeline configuration and collect every problem found.

    The schema name must be a non-blank string of at most 128 characters;
    the thresholds are checked by ``validate_thresholds``.

    Args:
        config: Pipeline configuration to validate

    Returns:
        Human-readable problems in the order found (empty if valid). A
        non-string schema short-circuits: only that one problem is returned
        and the thresholds are not examined.
    """
    # The annotation promises a str, but nothing enforces that at runtime,
    # so guard before calling string methods on it.
    if not isinstance(config.schema, str):  # type: ignore[unreachable]
        return ["Pipeline schema must be a string"]

    problems: List[str] = []
    schema_is_blank = not config.schema.strip()  # type: ignore[unreachable]
    if schema_is_blank:
        problems.append("Pipeline schema cannot be empty")
    if len(config.schema) > 128:
        problems.append("Pipeline schema name is too long (max 128 characters)")

    # Threshold problems follow the schema problems.
    problems.extend(validate_thresholds(config.thresholds))
    return problems

def validate_thresholds(thresholds: ValidationThresholds) -> List[str]:
    """
    Check the bronze/silver/gold validation thresholds.

    Each threshold must lie within 0-100 inclusive, and the three must be
    non-decreasing from bronze through silver to gold.

    Args:
        thresholds: Validation thresholds to validate

    Returns:
        Problems found (empty if valid): range problems first, in
        bronze/silver/gold order, then ordering problems.
    """
    bronze = thresholds.bronze
    silver = thresholds.silver
    gold = thresholds.gold

    # Range check, one message per offending layer.
    problems: List[str] = [
        f"{label} threshold must be between 0 and 100, got {rate}"
        for label, rate in (("Bronze", bronze), ("Silver", silver), ("Gold", gold))
        if not (0.0 <= rate <= 100.0)
    ]

    # Ordering check: bronze <= silver <= gold.
    if bronze > silver:
        problems.append(
            f"Bronze threshold ({bronze}) should not exceed silver threshold ({silver})"
        )
    if silver > gold:
        problems.append(
            f"Silver threshold ({silver}) should not exceed gold threshold ({gold})"
        )

    return problems

In [None]:
# Module: pipeline_builder_base.config.factories (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.models

"""
Configuration factory functions for creating preset configurations.

This module provides factory functions for creating PipelineConfig instances
with preset configurations for different environments.
"""

from __future__ import annotations

from typing import Any

# from ..models import PipelineConfig, ValidationThresholds  # Removed: defined in notebook cells above

def create_development_config(schema: str, **overrides: Any) -> PipelineConfig:
    """
    Build a development ``PipelineConfig`` with relaxed validation.

    Defaults: bronze 80.0, silver 85.0, gold 90.0, ``verbose=True``.

    Args:
        schema: Database schema name
        **overrides: ``min_bronze_rate`` / ``min_silver_rate`` /
            ``min_gold_rate`` replace the threshold defaults and ``verbose``
            replaces the verbosity default; every remaining keyword is
            passed straight through to ``PipelineConfig``.

    Returns:
        PipelineConfig instance with development-optimized settings

    Example:
        >>> config = create_development_config("dev_schema", verbose=True)
    """
    # Consume the threshold keywords so they are not forwarded below.
    bronze_rate = overrides.pop("min_bronze_rate", 80.0)
    silver_rate = overrides.pop("min_silver_rate", 85.0)
    gold_rate = overrides.pop("min_gold_rate", 90.0)
    thresholds = ValidationThresholds(
        bronze=bronze_rate, silver=silver_rate, gold=gold_rate
    )

    verbose = overrides.pop("verbose", True)
    return PipelineConfig(
        schema=schema, thresholds=thresholds, verbose=verbose, **overrides
    )

def create_production_config(schema: str, **overrides: Any) -> PipelineConfig:
    """
    Build a production ``PipelineConfig`` with strict validation.

    Defaults: bronze 95.0, silver 98.0, gold 99.5, ``verbose=False``.

    Args:
        schema: Database schema name
        **overrides: ``min_bronze_rate`` / ``min_silver_rate`` /
            ``min_gold_rate`` replace the threshold defaults and ``verbose``
            replaces the verbosity default; every remaining keyword is
            passed straight through to ``PipelineConfig``.

    Returns:
        PipelineConfig instance with production-optimized settings

    Example:
        >>> config = create_production_config("prod_schema", verbose=False)
    """
    # Consume the threshold keywords so they are not forwarded below.
    bronze_rate = overrides.pop("min_bronze_rate", 95.0)
    silver_rate = overrides.pop("min_silver_rate", 98.0)
    gold_rate = overrides.pop("min_gold_rate", 99.5)
    thresholds = ValidationThresholds(
        bronze=bronze_rate, silver=silver_rate, gold=gold_rate
    )

    verbose = overrides.pop("verbose", False)
    return PipelineConfig(
        schema=schema, thresholds=thresholds, verbose=verbose, **overrides
    )

def create_test_config(schema: str, **overrides: Any) -> PipelineConfig:
    """
    Build a test ``PipelineConfig`` with minimal validation.

    Defaults: bronze, silver and gold all 50.0, ``verbose=False``.

    Args:
        schema: Database schema name
        **overrides: ``min_bronze_rate`` / ``min_silver_rate`` /
            ``min_gold_rate`` replace the threshold defaults and ``verbose``
            replaces the verbosity default; every remaining keyword is
            passed straight through to ``PipelineConfig``.

    Returns:
        PipelineConfig instance with test-optimized settings

    Example:
        >>> config = create_test_config("test_schema")
    """
    # Consume the threshold keywords so they are not forwarded below.
    bronze_rate = overrides.pop("min_bronze_rate", 50.0)
    silver_rate = overrides.pop("min_silver_rate", 50.0)
    gold_rate = overrides.pop("min_gold_rate", 50.0)
    thresholds = ValidationThresholds(
        bronze=bronze_rate, silver=silver_rate, gold=gold_rate
    )

    verbose = overrides.pop("verbose", False)
    return PipelineConfig(
        schema=schema, thresholds=thresholds, verbose=verbose, **overrides
    )

In [None]:
# Module: pipeline_builder_base.runner.execution_helpers (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.models

"""
Execution helper functions for pipeline runners.

This module provides utility functions for execution mode determination,
source validation, and execution preparation.
"""

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Optional

# from ..models import ExecutionMode, PipelineConfig  # Removed: defined in notebook cells above

def determine_execution_mode(
    config: PipelineConfig,
    bronze_sources: Optional[Dict[str, Any]] = None,
    last_run: Optional[datetime] = None,
) -> ExecutionMode:
    """
    Choose between an initial and an incremental run.

    Args:
        config: Pipeline configuration
        bronze_sources: Optional dictionary of bronze sources
        last_run: Optional datetime of last pipeline run

    Returns:
        ``ExecutionMode.INCREMENTAL`` only when bronze sources were supplied
        and ``should_run_incremental`` agrees; ``ExecutionMode.INITIAL`` in
        every other case.
    """
    # Missing or empty sources mean an initial load; the incremental check
    # is only consulted when there is at least one source.
    if (
        bronze_sources
        and len(bronze_sources) > 0
        and should_run_incremental(config, last_run)
    ):
        return ExecutionMode.INCREMENTAL

    return ExecutionMode.INITIAL

def should_run_incremental(
    config: PipelineConfig, last_run: Optional[datetime] = None
) -> bool:
    """
    Report whether the pipeline should run incrementally.

    Args:
        config: Pipeline configuration. Accepted for future use; the current
            placeholder logic does not read it.
        last_run: Optional datetime of last pipeline run

    Returns:
        False when there is no previous run (an initial load is required),
        True whenever a previous run time is known.
    """
    # Placeholder rule: any recorded previous run enables incremental mode.
    return last_run is not None

def validate_bronze_sources(
    sources: Dict[str, Any],
    expected_bronze_steps: Dict[str, Any],
    source_validator: Optional[Any] = None,
) -> None:
    """
    Validate bronze sources match expected bronze steps.

    The key sets of *sources* and *expected_bronze_steps* must be identical.
    When a validator is given, every source value must also satisfy it.

    Args:
        sources: Dictionary of bronze sources
        expected_bronze_steps: Dictionary of expected bronze steps
        source_validator: Optional callable taking a source and returning a
            truthy value when the source is acceptable

    Raises:
        ValueError: If sources don't match expected steps or are invalid.
            Step names in the message are listed in sorted order.
    """
    # Names are sorted (and stringified) so the message is identical from
    # run to run; joining the raw set would expose arbitrary set ordering.

    # Check that all expected bronze steps have sources
    missing_sources = sorted(
        str(name) for name in expected_bronze_steps.keys() - sources.keys()
    )
    if missing_sources:
        raise ValueError(
            f"Missing bronze sources for steps: {', '.join(missing_sources)}"
        )

    # Check that all sources have corresponding expected steps
    unexpected_sources = sorted(
        str(name) for name in sources.keys() - expected_bronze_steps.keys()
    )
    if unexpected_sources:
        raise ValueError(
            f"Unexpected bronze sources (no corresponding step): {', '.join(unexpected_sources)}"
        )

    # Validate each source if validator provided
    if source_validator:
        for step_name, source in sources.items():
            if not source_validator(source):
                raise ValueError(
                    f"Invalid bronze source for step '{step_name}': {type(source)}"
                )

def prepare_sources_for_execution(
    sources: Dict[str, Any],
    step_type: str,
    step_name: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Hand back the sources a step of the given type should receive.

    No entries are filtered out for any step type; the step type only
    decides whether the caller gets a copy or the original mapping.

    Args:
        sources: Dictionary of sources
        step_type: Type of step (bronze/silver/gold)
        step_name: Optional step name; currently not used

    Returns:
        For ``"silver"`` and ``"gold"``, a new dict holding every entry of
        *sources* (a shallow copy). For ``"bronze"`` and any other value,
        the *sources* object itself.
    """
    if step_type in ("silver", "gold"):
        # Shallow copy: same keys and values, distinct dict object.
        return dict(sources)

    return sources

In [None]:
# Module: pipeline_builder_base.dependencies.graph (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Dependency graph representation for the framework pipelines.

This module provides a clean, efficient representation of pipeline dependencies
that can be used for analysis and optimization.
"""

from __future__ import annotations

import logging
from collections import defaultdict, deque
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, Optional

# Module-level logger named after the defining module's ``__name__``.
logger = logging.getLogger(__name__)

class StepType(Enum):
    """Types of pipeline steps.

    ``DependencyGraph.get_stats`` counts nodes per member using the string
    value as the key.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

@dataclass
class StepNode:
    """Represents a single step in the dependency graph.

    Attributes:
        name: Unique identifier for this step; used as the key in
            ``DependencyGraph.nodes``.
        step_type: Type of step (BRONZE, SILVER, or GOLD).
        dependencies: Set of step names that this step depends on.
            Extended by ``DependencyGraph.add_dependency``.
        dependents: Set of step names that depend on this step.
            Extended by ``DependencyGraph.add_dependency``.
        execution_group: (Deprecated) Legacy field, no longer used. Execution
            order is determined by topological sort.
        estimated_duration: Estimated execution duration in seconds.
        metadata: Dictionary for storing custom metadata about the step.
    """

    name: str
    step_type: StepType
    # default_factory gives every node its own empty containers.
    dependencies: set[str] = field(default_factory=set)
    dependents: set[str] = field(default_factory=set)
    execution_group: int = 0
    estimated_duration: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)

class DependencyGraph:
    """
    Represents the dependency graph of a pipeline.

    This class provides efficient operations for dependency analysis,
    cycle detection, and execution planning.

    Edges are recorded by ``add_dependency(A, B)``, meaning "A depends on
    B": ``_adjacency_list[A]`` then contains B and
    ``_reverse_adjacency_list[B]`` contains A.
    """

    def __init__(self) -> None:
        """Create an empty graph."""
        self.nodes: Dict[str, StepNode] = {}
        # step -> steps it depends on
        self._adjacency_list: Dict[str, set[str]] = defaultdict(set)
        # step -> steps that depend on it
        self._reverse_adjacency_list: Dict[str, set[str]] = defaultdict(set)

    def add_node(self, node: StepNode) -> None:
        """Add a node to the dependency graph.

        The node starts with no recorded edges; adding a node whose name is
        already present replaces it and resets its own edge sets.
        """
        self.nodes[node.name] = node
        self._adjacency_list[node.name] = set()
        self._reverse_adjacency_list[node.name] = set()

    def add_dependency(self, from_step: str, to_step: str) -> None:
        """Record that *from_step* depends on *to_step*.

        Raises:
            ValueError: If either step has not been added to the graph.
        """
        if from_step not in self.nodes or to_step not in self.nodes:
            raise ValueError(f"Steps {from_step} or {to_step} not found in graph")

        self._adjacency_list[from_step].add(to_step)
        self._reverse_adjacency_list[to_step].add(from_step)

        # Update node dependencies
        self.nodes[from_step].dependencies.add(to_step)
        self.nodes[to_step].dependents.add(from_step)

    def get_dependencies(self, step_name: str) -> set[str]:
        """Return a copy of the names *step_name* depends on.

        Unknown steps yield an empty set.
        """
        node = self.nodes.get(step_name)
        if node is None:
            return set()
        return node.dependencies.copy()

    def get_dependents(self, step_name: str) -> set[str]:
        """Return a copy of the names that depend on *step_name*.

        Unknown steps yield an empty set.
        """
        node = self.nodes.get(step_name)
        if node is None:
            return set()
        return node.dependents.copy()

    def detect_cycles(self) -> list[list[str]]:
        """Detect cycles in the dependency graph using DFS.

        Returns:
            A list of cycles, each given as the path of step names that
            starts and ends on the same step (``[a, b, a]``). Empty when
            no cycle is found. Neighbours are visited in sorted name order
            so the result is the same on every run.
        """
        visited: set[str] = set()
        rec_stack: set[str] = set()  # nodes on the current DFS path
        cycles: list[list[str]] = []

        def dfs(node: str, path: list[str]) -> None:
            if node in rec_stack:
                # Found a cycle: the path from the first visit of `node`
                # back to `node` itself.
                cycle_start = path.index(node)
                cycles.append(path[cycle_start:] + [node])
                return

            if node in visited:
                return

            visited.add(node)
            rec_stack.add(node)
            path.append(node)

            for neighbor in sorted(self._adjacency_list[node]):
                dfs(neighbor, path)

            rec_stack.remove(node)
            path.pop()

        for node in self.nodes:
            if node not in visited:
                dfs(node, [])

        return cycles

    def topological_sort(
        self, creation_order: Optional[Dict[str, int]] = None
    ) -> list[str]:
        """
        Perform topological sort of the dependency graph.

        Returns nodes in an order such that dependencies come before dependents.
        Uses reverse adjacency list since add_dependency(A, B) means A depends on B,
        so B must come before A in the sort.

        **Explicit dependencies (e.g., source_silvers) always override creation order.**
        Among the steps that are ready at the same time, the one with the
        lowest ``creation_order`` is emitted first; steps without an entry
        sort after all steps that have one. Remaining ties are broken by the
        order in which the steps became ready, and steps that become ready
        together are considered in sorted name order, so the result does not
        depend on set iteration order.

        Steps that are part of a dependency cycle (or depend on one) never
        become ready and are omitted from the result; use ``detect_cycles``
        or ``validate`` to find them.

        Args:
            creation_order: Optional dictionary mapping step names to creation order
                (lower number = created earlier). Used as tie-breaker for deterministic
                ordering when steps have no explicit dependencies. Explicit dependencies
                (via source_silvers, source_bronze, etc.) always take precedence.

        Returns:
            Step names in dependency-respecting order.
        """
        import heapq  # local import: only this method needs it

        in_degree = dict.fromkeys(self.nodes, 0)

        # Calculate in-degrees using reverse adjacency
        # If A depends on B, then B->A edge exists in reverse list
        for node in self.nodes:
            for dependent in self._reverse_adjacency_list[node]:
                in_degree[dependent] += 1

        order = creation_order or {}
        unranked = 2**31 - 1  # steps without a creation order sort last

        # Priority queue of ready steps keyed by (creation order, arrival
        # sequence). With no creation order every key starts with the same
        # value, so the queue degenerates to plain first-in-first-out.
        ready: list[tuple[int, int, str]] = []
        sequence = 0
        for name, degree in in_degree.items():
            if degree == 0:
                heapq.heappush(ready, (order.get(name, unranked), sequence, name))
                sequence += 1

        result: list[str] = []
        while ready:
            _, _, name = heapq.heappop(ready)
            result.append(name)

            # Release the steps that were waiting on this one.
            for dependent in sorted(self._reverse_adjacency_list[name]):
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    heapq.heappush(
                        ready, (order.get(dependent, unranked), sequence, dependent)
                    )
                    sequence += 1

        return result

    def validate(self) -> list[str]:
        """Validate the dependency graph and return any issues.

        Returns:
            One message per detected cycle, followed by one message per
            dependency name recorded on a node that is not itself a node of
            the graph. Empty when no issue is found.
        """
        issues = []

        # Check for cycles
        for cycle in self.detect_cycles():
            issues.append(f"Circular dependency detected: {' -> '.join(cycle)}")

        # Check for missing dependencies
        for node_name, node in self.nodes.items():
            for dep in node.dependencies:
                if dep not in self.nodes:
                    issues.append(f"Node {node_name} depends on missing node {dep}")

        return issues

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the dependency graph.

        Returns:
            A dict with ``total_nodes``, ``total_edges``, ``type_counts``
            (step type value -> node count), ``average_dependencies``
            (edges per node, ``0`` for an empty graph) and ``has_cycles``.
        """
        total_nodes = len(self.nodes)
        total_edges = sum(len(deps) for deps in self._adjacency_list.values())

        # Count by step type
        type_counts: Dict[str, int] = defaultdict(int)
        for node in self.nodes.values():
            type_counts[node.step_type.value] += 1

        # Calculate average dependencies
        avg_dependencies = total_edges / total_nodes if total_nodes > 0 else 0

        return {
            "total_nodes": total_nodes,
            "total_edges": total_edges,
            "type_counts": dict(type_counts),
            "average_dependencies": avg_dependencies,
            "has_cycles": len(self.detect_cycles()) > 0,
        }

In [None]:
# Module: pipeline_builder_base.dependencies.exceptions (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Dependency analysis exceptions for the framework.

This module defines exceptions specific to dependency analysis operations.
"""

from typing import List, Optional

class DependencyError(Exception):
    """
    Base exception for dependency-related errors.

    Attributes:
        step_name: Name of the step the error relates to, or None when the
            caller did not supply one.
    """

    def __init__(self, message: str, step_name: Optional[str] = None):
        """
        Args:
            message: Error description forwarded to ``Exception``.
            step_name: Optional name of the related step.
        """
        self.step_name = step_name
        super().__init__(message)

class DependencyAnalysisError(DependencyError):
    """
    Raised when dependency analysis fails.

    Attributes:
        analysis_step: The analysis step that failed, or None if not given.
            The same value is also stored as ``step_name`` by the base class.
    """

    def __init__(self, message: str, analysis_step: Optional[str] = None):
        """
        Args:
            message: Error description.
            analysis_step: Optional identifier of the failing analysis step.
        """
        self.analysis_step = analysis_step
        # The analysis step doubles as the base class's step_name.
        super().__init__(message, step_name=analysis_step)

class CircularDependencyError(DependencyError):
    """
    Raised when circular dependencies are detected.

    Attributes:
        cycle: The list passed by the caller describing the cycle; it is
            stored by reference, not copied.
    """

    def __init__(self, message: str, cycle: List[str]):
        """
        Args:
            message: Error description.
            cycle: Names making up the detected cycle.
        """
        self.cycle = cycle
        super().__init__(message)

class InvalidDependencyError(DependencyError):
    """
    Raised when invalid dependencies are detected.

    Attributes:
        invalid_dependencies: The list of offending dependency names as
            passed by the caller (stored by reference).
    """

    def __init__(self, message: str, invalid_dependencies: List[str]):
        """
        Args:
            message: Error description.
            invalid_dependencies: Names of the invalid dependencies.
        """
        self.invalid_dependencies = invalid_dependencies
        super().__init__(message)

class DependencyConflictError(DependencyError):
    """
    Raised when dependency conflicts are detected.

    Attributes:
        conflicting_steps: The list of conflicting step names as passed by
            the caller (stored by reference).
    """

    def __init__(self, message: str, conflicting_steps: List[str]):
        """
        Args:
            message: Error description.
            conflicting_steps: Names of the steps that conflict.
        """
        self.conflicting_steps = conflicting_steps
        super().__init__(message)

In [None]:
# Module: pipeline_builder_base.models.steps (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Step model protocols for pipeline builders.

This module defines Protocol classes that step implementations should follow.
These protocols allow the base package to work with any step implementation
without knowing the specific details of Spark or SQL steps.
"""

from typing import Any, Dict, List, Optional, Protocol

class StepProtocol(Protocol):
    """
    Protocol for all pipeline steps.

    This is a structural type: any object that exposes the attributes and
    the ``validate`` method below satisfies it, without inheriting from it.
    """

    # Step name.
    name: str
    # Rules mapping for the step (presumably validation rules keyed by
    # column name -- confirm against the concrete step classes).
    rules: Dict[str, Any]

    def validate(self) -> None:
        """Validate the step configuration."""
        ...

class BronzeStepProtocol(StepProtocol, Protocol):
    """
    Protocol for bronze layer steps.

    Extends StepProtocol with one optional attribute.
    """

    # Name of the column used for incremental processing, or None
    # (semantics inferred from the name -- confirm against implementations).
    incremental_col: Optional[str]

class SilverStepProtocol(StepProtocol, Protocol):
    """
    Protocol for silver layer steps.

    Extends StepProtocol with the upstream bronze step and the target table.
    """

    # Name of the bronze step this silver step reads from.
    source_bronze: str
    # Target table name.
    table_name: str

class GoldStepProtocol(StepProtocol, Protocol):
    """
    Protocol for gold layer steps.

    Extends StepProtocol with the upstream silver steps and the target table.
    """

    # Names of the silver steps this gold step reads from; may be None.
    source_silvers: Optional[List[str]]
    # Target table name.
    table_name: str

In [None]:
# Module: pipeline_builder_base.models.enums (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Enums for the Pipeline Builder models.
"""

from enum import Enum

class PipelinePhase(Enum):
    """
    Enumeration of pipeline phases.

    Each member's value is the lowercase phase name.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

class ExecutionMode(Enum):
    """
    Enumeration of execution modes.

    The member values are the same four strings accepted by the ``run_mode``
    field of the writer's ``LogRow`` TypedDict.
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"
    FULL_REFRESH = "full_refresh"
    VALIDATION_ONLY = "validation_only"

class WriteMode(Enum):
    """
    Enumeration of write modes.

    NOTE(review): a later cell of this notebook (writer.models) defines
    another ``WriteMode`` enum that also has MERGE and IGNORE members. When
    all cells run in one namespace, that later definition rebinds the name
    ``WriteMode``, so code executed afterwards sees the writer version.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"

class ValidationResult(Enum):
    """
    Enumeration of validation results.

    This is a plain three-state outcome; it carries no error or warning
    messages of its own.
    """

    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"

In [None]:
# Module: pipeline_builder_base.models.types (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Type definitions and protocols for the Pipeline Builder models.
"""

from typing import Dict, List, Protocol, TypeVar, Union

# Specific types for model values instead of Any.
# ModelValue and ResourceValue admit the same scalar/list/dict shapes; the
# only difference is that ModelValue additionally allows None.
ModelValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
ResourceValue = Union[str, int, float, bool, List[str], Dict[str, str]]

# Generic type for pipeline results (unconstrained TypeVar).
T = TypeVar("T")

class Validatable(Protocol):
    """
    Protocol for objects that can be validated.

    Structural type: any object with a matching ``validate`` method conforms.
    """

    def validate(self) -> None:
        """Validate the object and raise ValidationError if invalid."""
        ...

class Serializable(Protocol):
    """
    Protocol for objects that can be serialized.

    Structural type: conforming objects provide both a dictionary and a JSON
    string representation.
    """

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert object to dictionary whose values are ModelValue-typed."""
        ...

    def to_json(self) -> str:
        """Convert object to JSON string."""
        ...

In [None]:
# Module: pipeline_builder_base.models.exceptions (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Custom exceptions for the Pipeline Builder models.
"""

class PipelineConfigurationError(ValueError):
    """
    Raised when pipeline configuration is invalid.

    Subclasses ValueError, so handlers catching ValueError also catch it.
    """

class PipelineExecutionError(RuntimeError):
    """
    Raised when pipeline execution fails.

    Subclasses RuntimeError, so handlers catching RuntimeError also catch it.
    """

In [None]:
# Module: pipeline_builder_base.writer.models (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.models

"""
Writer-specific models and type definitions.

This module contains all the TypedDict definitions and type aliases
used by the writer module. It is engine-agnostic.
"""

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Literal, Optional, TypedDict

# from ..models import ExecutionResult  # Removed: defined in notebook cells above

# ============================================================================
# Enums
# ============================================================================

class WriteMode(Enum):
    """
    Write mode for log operations.

    NOTE(review): an earlier cell of this notebook (models.enums) defines a
    two-member ``WriteMode`` (OVERWRITE/APPEND). In a shared notebook
    namespace this definition replaces that one from this cell onward.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"
    MERGE = "merge"
    IGNORE = "ignore"

# ============================================================================
# TypedDict Definitions
# ============================================================================

class LogRow(TypedDict):
    """
    Enhanced log row with full type safety and framework integration.

    This is an engine-agnostic log row structure that can be used
    by both Spark and SQL implementations.

    The TypedDict is declared without ``total=False``, so type checkers
    treat every key below as required.
    """

    # Run-level information
    run_id: str
    # Same four strings as the values of the ExecutionMode enum.
    run_mode: Literal["initial", "incremental", "full_refresh", "validation_only"]
    run_started_at: Optional[datetime]
    run_ended_at: Optional[datetime]

    # Execution context
    execution_id: str
    pipeline_id: str
    schema: str

    # Step-level information
    # "pipeline" is used for the summary row describing the whole execution.
    phase: Literal["bronze", "silver", "gold", "pipeline"]
    step_name: str
    step_type: str

    # Timing information
    start_time: Optional[datetime]
    end_time: Optional[datetime]
    duration_secs: float

    # Table information
    table_fqn: Optional[str]
    # Only "overwrite"/"append" are accepted by this type, although the
    # WriteMode enum in this module also defines "merge" and "ignore".
    write_mode: Optional[Literal["overwrite", "append"]]

    # Data metrics
    input_rows: Optional[int]
    output_rows: Optional[int]
    rows_written: Optional[int]
    rows_processed: int
    table_total_rows: Optional[int]  # Total rows in table after this write

    # Validation metrics
    valid_rows: int
    invalid_rows: int
    validation_rate: float

    # Execution status
    success: bool
    error_message: Optional[str]

    # Performance metrics
    memory_usage_mb: Optional[float]
    cpu_usage_percent: Optional[float]

    # Metadata
    metadata: Dict[str, Any]

class WriterMetrics(TypedDict):
    """
    Metrics for writer operations.

    This TypedDict only declares the shape of the metrics mapping; all keys
    are required because ``total=False`` is not used.
    """

    # Write-attempt counters.
    total_writes: int
    successful_writes: int
    failed_writes: int
    # Durations (seconds, per the field names).
    total_duration_secs: float
    avg_write_duration_secs: float
    # Volume and memory figures.
    total_rows_written: int
    memory_usage_peak_mb: float

# ============================================================================
# Configuration Models
# ============================================================================

@dataclass
class WriterConfig:
    """
    Configuration for the LogWriter.

    Holds the target table identity, the write mode, feature flags and
    batching/retry settings. Values are not checked on construction; call
    ``validate`` to check them.
    """

    table_schema: str
    table_name: str
    write_mode: WriteMode = WriteMode.APPEND
    enable_analytics: bool = True
    enable_monitoring: bool = True
    enable_quality_checks: bool = True
    batch_size: int = 1000
    max_retries: int = 3
    retry_delay_secs: float = 1.0

    def validate(self) -> None:
        """
        Check the configuration values.

        Checks run in a fixed order and stop at the first failure.

        Raises:
            ValueError: If ``table_schema`` or ``table_name`` is empty or not
                a string, ``batch_size`` is below 1, or ``max_retries`` /
                ``retry_delay_secs`` is negative.
        """
        # Table identity: both parts must be non-empty strings.
        if not (self.table_schema and isinstance(self.table_schema, str)):
            raise ValueError("table_schema must be a non-empty string")
        if not (self.table_name and isinstance(self.table_name, str)):
            raise ValueError("table_name must be a non-empty string")

        # Numeric tuning knobs.
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if self.max_retries < 0:
            raise ValueError("max_retries must be non-negative")
        if self.retry_delay_secs < 0:
            raise ValueError("retry_delay_secs must be non-negative")

# ============================================================================
# Utility Functions
# ============================================================================

def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: Literal[
        "initial", "incremental", "full_refresh", "validation_only"
    ] = "initial",
    metadata: Optional[Dict[str, Any]] = None,
) -> list[LogRow]:
    """
    Build log rows describing an execution result.

    The first row summarizes the whole pipeline execution (phase
    ``"pipeline"``); it is followed by one row per entry of
    ``execution_result.step_results``, in order.

    Args:
        execution_result: The execution result; its ``context``,
            ``step_results`` and ``success`` attributes are read.
        run_id: Run identifier copied onto every row.
        run_mode: Mode of the run copied onto every row.
        metadata: Additional metadata attached to the pipeline-level row
            only; step rows always carry an empty metadata dict.

    Returns:
        List of log rows, pipeline-level row first.
    """
    ctx = execution_result.context

    # Pipeline-level summary row: no table or row metrics are available here.
    pipeline_row: LogRow = {
        "run_id": run_id,
        "run_mode": run_mode,
        "run_started_at": ctx.start_time,
        "run_ended_at": ctx.end_time,
        "execution_id": ctx.execution_id,
        "pipeline_id": ctx.pipeline_id,
        "schema": ctx.schema,
        "phase": "pipeline",
        "step_name": "pipeline_execution",
        "step_type": "pipeline",
        "start_time": ctx.start_time,
        "end_time": ctx.end_time,
        # A missing (falsy) duration is recorded as 0.0.
        "duration_secs": ctx.duration_secs or 0.0,
        "table_fqn": None,
        "write_mode": None,
        "input_rows": None,
        "output_rows": None,
        "rows_written": None,
        "rows_processed": 0,
        "table_total_rows": None,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "success": execution_result.success,
        "error_message": None,
        "memory_usage_mb": None,
        "cpu_usage_percent": None,
        "metadata": metadata or {},
    }

    # One row per step; run-level fields repeat the execution context.
    per_step_rows: list[LogRow] = [
        {
            "run_id": run_id,
            "run_mode": run_mode,
            "run_started_at": ctx.start_time,
            "run_ended_at": ctx.end_time,
            "execution_id": ctx.execution_id,
            "pipeline_id": ctx.pipeline_id,
            "schema": ctx.schema,
            "phase": result.phase.value,
            "step_name": result.step_name,
            "step_type": result.step_type or "unknown",
            "start_time": result.start_time,
            "end_time": result.end_time,
            "duration_secs": result.duration_secs,
            "table_fqn": result.table_fqn,
            "write_mode": result.write_mode,  # type: ignore[typeddict-item]
            "input_rows": result.input_rows,
            # output_rows mirrors rows_written.
            "output_rows": result.rows_written,
            "rows_written": result.rows_written,
            "rows_processed": result.rows_processed,
            "table_total_rows": None,
            # All processed rows are reported as valid; invalid_rows is 0.
            "valid_rows": result.rows_processed,
            "invalid_rows": 0,
            "validation_rate": result.validation_rate,
            "success": result.success,
            "error_message": result.error_message,
            "memory_usage_mb": None,
            "cpu_usage_percent": None,
            "metadata": {},
        }
        for result in execution_result.step_results
    ]

    return [pipeline_row, *per_step_rows]

def validate_log_data(log_rows: list[LogRow]) -> None:
    """
    Check that every log row carries the minimal set of required keys.

    Only the presence of ``run_id``, ``phase`` and ``step_name`` is checked;
    values are not inspected. An empty (or otherwise falsy) input passes.

    Args:
        log_rows: List of log rows to validate

    Raises:
        ValueError: On the first row that lacks a required key; the message
            names the row index and the missing keys.
    """
    required = {"run_id", "phase", "step_name"}

    for index, row in enumerate(log_rows or ()):
        absent = required.difference(row.keys())
        if absent:
            raise ValueError(f"Log row {index} missing required fields: {absent}")

In [None]:
# Module: pipeline_builder_base.steps.utils (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Step utility functions.

This module provides utility functions for working with pipeline steps.
"""

from __future__ import annotations

from typing import Any, List

def classify_step_type(step: Any) -> str:
    """
    Work out which medallion layer a step object belongs to.

    A truthy ``type`` attribute wins when its lowercased string form is one
    of the three layer names. Otherwise the step's class name is searched
    for ``Bronze``, ``Silver`` or ``Gold`` (case-sensitive, in that order).

    Args:
        step: Step object to classify

    Returns:
        Step type: 'bronze', 'silver', 'gold', or 'unknown'
    """
    # Explicit type attribute takes precedence over the class name.
    declared = getattr(step, "type", None)
    if declared:
        lowered = str(declared).lower()
        if lowered in ("bronze", "silver", "gold"):
            return lowered

    # Fall back to a marker embedded in the class name.
    cls_name = step.__class__.__name__
    for marker, layer in (("Bronze", "bronze"), ("Silver", "silver"), ("Gold", "gold")):
        if marker in cls_name:
            return layer

    return "unknown"

def extract_step_dependencies(step: Any) -> List[str]:
    """
    Extract dependencies from a step.

    Dependencies are gathered, in this order, from the ``source_bronze``
    attribute (silver steps), the ``source_silvers`` attribute (gold steps;
    a list or a single string) and the legacy ``source`` attribute (a string
    or a list). Names coming from ``source`` are only added when they are not
    already present, so a legacy ``source`` that repeats ``source_bronze`` or
    ``source_silvers`` entries no longer produces duplicates.

    Args:
        step: Step object to analyze

    Returns:
        List of dependency step names
    """
    dependencies: List[str] = []

    # Check for source_bronze (silver steps)
    source_bronze = getattr(step, "source_bronze", None)
    if source_bronze:
        dependencies.append(source_bronze)

    # Check for source_silvers (gold steps)
    source_silvers = getattr(step, "source_silvers", None)
    if source_silvers:
        if isinstance(source_silvers, list):
            dependencies.extend(source_silvers)
        elif isinstance(source_silvers, str):
            dependencies.append(source_silvers)

    # Check for source attribute (backward compatibility).
    # Normalize to a list first so the "already present" check is applied to
    # each name; testing a whole list for membership never matches.
    source = getattr(step, "source", None)
    if isinstance(source, str):
        legacy_sources = [source] if source else []
    elif isinstance(source, list):
        legacy_sources = source
    else:
        legacy_sources = []

    for dep_name in legacy_sources:
        if dep_name not in dependencies:
            dependencies.append(dep_name)

    return dependencies

def get_step_target(step: Any) -> str:
    """
    Look up the target table name of a step.

    The ``table_name`` attribute is preferred; ``target`` is used when
    ``table_name`` is missing or falsy.

    Args:
        step: Step object

    Returns:
        The first truthy candidate converted with ``str``, or an empty
        string if neither attribute yields a value
    """
    for attribute in ("table_name", "target"):
        candidate = getattr(step, attribute, None)
        if candidate:
            return str(candidate)

    return ""

def normalize_step_name(name: str) -> str:
    """
    Return the canonical form of a step name.

    Surrounding whitespace is stripped and the result is lowercased. A falsy
    input (empty string or None) yields an empty string.

    Args:
        name: Step name to normalize

    Returns:
        Normalized step name
    """
    return name.strip().lower() if name else ""

In [None]:
# Module: pipeline_builder_base.errors.context (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Error context builders and suggestion generators.

This module provides utilities for building structured error context
and generating helpful error suggestions.
"""

from __future__ import annotations

from typing import Any, Dict, List, Union

# Type alias for error context: a string-keyed mapping whose values are
# scalars, a list of strings, a string-to-string dict, or None.
ErrorContextType = Dict[
    str, Union[str, int, float, bool, List[str], Dict[str, str], None]
]

class ErrorContext:
    """
    Mutable bag of key/value pairs describing the circumstances of an error.

    The pairs live in the ``context`` dictionary attribute.
    """

    def __init__(self, **kwargs: Any):
        """
        Create a context seeded with the given keyword arguments.

        Args:
            **kwargs: Context key-value pairs
        """
        self.context: ErrorContextType = {**kwargs}

    def add(self, key: str, value: Any) -> None:
        """
        Store ``value`` under ``key``, replacing any previous entry.

        Args:
            key: Context key
            value: Context value
        """
        self.context[key] = value

    def to_dict(self) -> ErrorContextType:
        """
        Return a shallow copy of the stored context.

        Returns:
            New dictionary; mutating it does not affect this object
        """
        return dict(self.context)

    def __repr__(self) -> str:
        """Return ``ErrorContext(<context dict>)``."""
        return f"ErrorContext({self.context})"

class SuggestionGenerator:
    """
    Collection of static helpers that turn a known error situation into a
    list of human-readable fix suggestions.
    """

    @staticmethod
    def suggest_fix_for_missing_dependency(
        step_name: str, missing: str, step_type: str = "step"
    ) -> List[str]:
        """
        Suggest fixes for a step whose dependency does not exist.

        Three generic suggestions are always produced; for ``"silver"`` and
        ``"gold"`` steps a fourth, layer-specific hint is appended.

        Args:
            step_name: Name of the step with missing dependency
            missing: Name of the missing dependency
            step_type: Type of step (bronze/silver/gold)

        Returns:
            List of suggestion strings
        """
        generic = [
            f"Add {step_type} step '{missing}' before '{step_name}'",
            f"Check spelling of '{missing}'",
            f"Ensure '{missing}' is defined in the pipeline",
        ]

        # Layer-specific hint naming the builder call for the upstream layer.
        if step_type == "silver":
            return [
                *generic,
                f"Bronze step '{missing}' must be added with with_bronze_rules()",
            ]
        if step_type == "gold":
            return [
                *generic,
                f"Silver step '{missing}' must be added with add_silver_transform()",
            ]
        return generic

    @staticmethod
    def suggest_fix_for_duplicate_name(name: str, step_type: str = "step") -> List[str]:
        """
        Suggest fixes for a step name that is already in use.

        Args:
            name: Duplicate step name
            step_type: Type of step (bronze/silver/gold)

        Returns:
            List of suggestion strings
        """
        rename_hint = f"Use a different name for this {step_type} step"
        remove_hint = f"Remove the existing {step_type} step '{name}' first"
        check_hint = f"Check if '{name}' was already added to the pipeline"
        return [rename_hint, remove_hint, check_hint]

    @staticmethod
    def suggest_fix_for_invalid_schema(schema: str) -> List[str]:
        """
        Suggest fixes for an invalid schema name.

        Args:
            schema: Invalid schema name

        Returns:
            List of suggestion strings; the last one echoes the given name
        """
        requirements = [
            "Schema name must be a non-empty string",
            "Schema name must be 128 characters or less",
            "Schema name cannot contain only whitespace",
        ]
        requirements.append(f"Check the schema name: '{schema}'")
        return requirements

    @staticmethod
    def suggest_fix_for_missing_rules(
        step_name: str, step_type: str = "step"
    ) -> List[str]:
        """
        Suggest fixes for a step that has no validation rules.

        Args:
            step_name: Name of step missing rules
            step_type: Type of step (bronze/silver/gold); capitalized in the
                first suggestion

        Returns:
            List of suggestion strings
        """
        headline = f"{step_type.capitalize()} step '{step_name}' requires validation rules"
        return [
            headline,
            "Add rules dictionary with column validation rules",
            "Rules cannot be empty",
        ]

def build_validation_context(step: Any, step_type: str) -> ErrorContextType:
    """
    Assemble the error context attached to a validation failure.

    The result always contains ``step_type``. ``step_name`` is added when the
    step has a truthy ``name``. Silver steps additionally contribute
    ``source_bronze`` and gold steps ``source_silvers`` (always as a list),
    each only when the attribute is truthy.

    Args:
        step: Step object
        step_type: Type of step (bronze/silver/gold)

    Returns:
        Error context dictionary
    """
    context: ErrorContextType = {"step_type": step_type}

    name = getattr(step, "name", None)
    if name:
        context["step_name"] = name

    # Layer-specific upstream references.
    if step_type == "silver":
        bronze_source = getattr(step, "source_bronze", None)
        if bronze_source:
            context["source_bronze"] = bronze_source
        return context

    if step_type == "gold":
        silver_sources = getattr(step, "source_silvers", None)
        if silver_sources:
            # A single non-list value is wrapped so the entry is always a list.
            if not isinstance(silver_sources, list):
                silver_sources = [silver_sources]
            context["source_silvers"] = silver_sources

    return context

def build_execution_context(step: Any, error: Exception) -> ErrorContextType:
    """
    Assemble the error context attached to an execution failure.

    The result always contains the exception's class name and message. The
    step's ``name`` and ``type`` attributes are copied in as ``step_name``
    and ``step_type`` when present and truthy.

    Args:
        step: Step object that failed
        error: Exception that occurred

    Returns:
        Error context dictionary
    """
    context: ErrorContextType = {
        "error_type": error.__class__.__name__,
        "error_message": str(error),
    }

    # (attribute on the step, key in the context)
    for attribute, key in (("name", "step_name"), ("type", "step_type")):
        value = getattr(step, attribute, None)
        if value:
            context[key] = value

    return context

In [None]:
# Module: pipeline_builder_base.builder.helpers (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Builder helper functions for creating step dictionaries.

This module provides helper functions for creating step configuration dictionaries.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

def create_bronze_step_dict(
    name: str,
    rules: Dict[str, Any],
    incremental_col: Optional[str] = None,
    schema: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """
    Build the configuration dictionary for a bronze step.

    The four named arguments are always present as keys (even when None);
    any extra keyword arguments are merged in after them.

    Args:
        name: Step name
        rules: Validation rules dictionary
        incremental_col: Optional incremental column name
        schema: Optional schema name
        **kwargs: Additional step attributes

    Returns:
        Dictionary with bronze step configuration

    Example:
        >>> step_dict = create_bronze_step_dict(
        ...     name="events",
        ...     rules={"user_id": ["not_null"]},
        ...     incremental_col="timestamp"
        ... )
    """
    config: Dict[str, Any] = {
        "name": name,
        "rules": rules,
        "incremental_col": incremental_col,
        "schema": schema,
    }
    config.update(kwargs)
    return config

def create_silver_step_dict(
    name: str,
    source_bronze: str,
    transform: Any,
    rules: Dict[str, Any],
    table_name: str,
    watermark_col: Optional[str] = None,
    schema: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """
    Build the configuration dictionary for a silver step.

    The seven named arguments are always present as keys (even when None);
    any extra keyword arguments are merged in after them.

    Args:
        name: Step name
        source_bronze: Source bronze step name
        transform: Transformation function
        rules: Validation rules dictionary
        table_name: Target table name
        watermark_col: Optional watermark column name
        schema: Optional schema name
        **kwargs: Additional step attributes

    Returns:
        Dictionary with silver step configuration

    Example:
        >>> step_dict = create_silver_step_dict(
        ...     name="clean_events",
        ...     source_bronze="events",
        ...     transform=clean_func,
        ...     rules={"user_id": ["not_null"]},
        ...     table_name="clean_events"
        ... )
    """
    config: Dict[str, Any] = {
        "name": name,
        "source_bronze": source_bronze,
        "transform": transform,
        "rules": rules,
        "table_name": table_name,
        "watermark_col": watermark_col,
        "schema": schema,
    }
    config.update(kwargs)
    return config

def create_gold_step_dict(
    name: str,
    transform: Any,
    rules: Dict[str, Any],
    table_name: str,
    source_silvers: Optional[List[str]] = None,
    schema: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """
    Build the configuration dictionary for a gold step.

    The six named arguments are always present as keys; a missing or empty
    ``source_silvers`` is stored as an empty list. Any extra keyword
    arguments are merged in after them.

    Args:
        name: Step name
        transform: Transformation function
        rules: Validation rules dictionary
        table_name: Target table name
        source_silvers: Optional list of source silver step names
        schema: Optional schema name
        **kwargs: Additional step attributes

    Returns:
        Dictionary with gold step configuration

    Example:
        >>> step_dict = create_gold_step_dict(
        ...     name="daily_metrics",
        ...     transform=metrics_func,
        ...     rules={"metric": ["not_null"]},
        ...     table_name="daily_metrics",
        ...     source_silvers=["clean_events"]
        ... )
    """
    config: Dict[str, Any] = {
        "name": name,
        "transform": transform,
        "rules": rules,
        "table_name": table_name,
        "source_silvers": source_silvers or [],
        "schema": schema,
    }
    config.update(kwargs)
    return config

In [None]:
# Module: pipeline_builder_base.validation.protocols (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.models

"""
Protocol definitions for validator interfaces.

This module defines Protocol classes that specify the expected interfaces
for validators, ensuring consistent return types and method signatures
across different validator implementations.
"""

from __future__ import annotations

from typing import Any, Dict, List, Protocol

# from ..models import PipelineConfig  # Removed: defined in notebook cells above

class PipelineValidatorProtocol(Protocol):
    """
    Protocol defining the interface for pipeline validators.

    This protocol ensures that all pipeline validators have consistent
    method signatures and return types, preventing type mismatch bugs.

    Note: ``validate_pipeline`` is deliberately loosely typed because
    implementations differ in what they return:
    - some return a plain List[str] of error messages
    - others return a ValidationResult-style object
    Callers must check which form they received before using the result.
    """

    def validate_pipeline(
        self,
        config: PipelineConfig,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> List[str] | Any:
        """
        Validate entire pipeline configuration.

        Args:
            config: Pipeline configuration
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            List[str] or ValidationResult depending on implementation
        """
        ...

    def validate_schema(self, schema: str) -> List[str]:
        """
        Validate schema name.

        Args:
            schema: Schema name to validate

        Returns:
            List of validation errors (empty if valid)
        """
        ...

class ValidationResultProtocol(Protocol):
    """
    Protocol defining the interface for validation results.

    This protocol ensures that ValidationResult objects have consistent
    attributes that can be safely accessed.

    NOTE(review): the ``ValidationResult`` enum defined in the models.enums
    cell (PASSED/FAILED/WARNING) does not expose these attributes, so this
    protocol presumably targets a different result class -- confirm.
    """

    @property
    def errors(self) -> List[str]:
        """List of validation error messages."""
        ...

    @property
    def warnings(self) -> List[str]:
        """List of validation warnings."""
        ...

    @property
    def is_valid(self) -> bool:
        """Whether validation passed."""
        ...

In [None]:
# Module: pipeline_builder_base.validation.utils (pipeline_builder_base)
#
# Dependencies: None (base module)

"""
Utility functions for the framework validation.

This module provides utility functions for data analysis and validation operations.
"""

from __future__ import annotations

from typing import Any, Dict, List, Set

def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide ``numerator`` by ``denominator`` without raising on bad operands.

    Args:
        numerator: The numerator
        denominator: The denominator
        default: Value returned when the division cannot be performed

    Returns:
        ``numerator / denominator``, or ``default`` when either operand is
        None or the denominator equals zero
    """
    # Missing operands: nothing to divide.
    if numerator is None or denominator is None:
        return default
    # Division by zero.
    if denominator == 0:
        return default
    return numerator / denominator

def validate_step_name(name: str) -> bool:
    """
    Check whether ``name`` is an acceptable step name.

    A valid name is a string that contains at least one non-whitespace
    character and is at most 128 characters long (length is measured on the
    unstripped value).

    Args:
        name: Step name to validate

    Returns:
        True if valid, False otherwise
    """
    return isinstance(name, str) and bool(name.strip()) and len(name) <= 128

def validate_schema_name(schema: str) -> bool:
    """
    Check whether ``schema`` is an acceptable schema name.

    A valid name is a string that contains at least one non-whitespace
    character and is at most 128 characters long (length is measured on the
    unstripped value).

    Args:
        schema: Schema name to validate

    Returns:
        True if valid, False otherwise
    """
    return isinstance(schema, str) and bool(schema.strip()) and len(schema) <= 128

def check_duplicate_names(items: List[Any], name_attr: str = "name") -> List[str]:
    """
    Find names that occur more than once among ``items``.

    Items whose name attribute is missing or falsy are ignored.

    Args:
        items: List of items to check
        name_attr: Attribute name to use for getting the name

    Returns:
        Each duplicated name once, ordered by where its first repetition
        was encountered
    """
    encountered = set()
    repeated: List[str] = []

    for item in items:
        name = getattr(item, name_attr, None)
        if not name:
            continue
        if name not in encountered:
            # First sighting: remember it and move on.
            encountered.add(name)
            continue
        if name not in repeated:
            repeated.append(name)

    return repeated

def validate_dependency_chain(steps: Dict[str, Any]) -> List[str]:
    """
    Validate dependency chain and detect circular dependencies.

    A dependency graph is built from each step's ``source_bronze`` and
    ``source_silvers`` attributes (the latter may be a list or a single
    string) and searched depth-first for cycles.

    Steps are examined in the insertion order of ``steps``, so the result is
    deterministic. Each cycle is reported once, against the step from which
    the search first reached it; that step may be upstream of the cycle
    rather than a member of it. Steps that merely depend on an already
    reported cycle do not produce additional errors.

    Args:
        steps: Dictionary of steps with their dependencies

    Returns:
        List of validation errors (empty if valid)
    """
    errors: List[str] = []

    # Build dependency graph: step name -> names it depends on.
    dependencies: Dict[str, List[str]] = {}
    for step_name, step in steps.items():
        deps: List[str] = []
        source_bronze = getattr(step, "source_bronze", None)
        if source_bronze:
            deps.append(source_bronze)
        source_silvers = getattr(step, "source_silvers", None)
        if isinstance(source_silvers, list):
            deps.extend(source_silvers)
        elif isinstance(source_silvers, str) and source_silvers:
            # A single silver source given as a plain string.
            deps.append(source_silvers)
        if deps:
            dependencies[step_name] = deps

    # Check for circular dependencies using DFS
    visited: Set[str] = set()
    rec_stack: Set[str] = set()

    def has_cycle(node: str) -> bool:
        """Check if there's a cycle reachable from node."""
        visited.add(node)
        rec_stack.add(node)
        try:
            for neighbor in dependencies.get(node, []):
                if neighbor not in visited:
                    if has_cycle(neighbor):
                        return True
                elif neighbor in rec_stack:
                    # Found a back edge, cycle exists
                    return True
            return False
        finally:
            # Always unwind, including on the cycle-found path; stale entries
            # would make later, unrelated searches see phantom back edges.
            rec_stack.discard(node)

    # Iterate the dict itself (insertion order) rather than a set, so the
    # reported step does not depend on hash ordering.
    for node in dependencies:
        if node not in visited and has_cycle(node):
            errors.append(f"Circular dependency detected involving step '{node}'")

    return errors

In [None]:
# Module: pipeline_builder.typing_stubs (pipeline_builder)
#
# Dependencies: None (base module)

"""
Type stubs and Protocol classes for dynamic PySpark/mock-spark attributes.

This module provides Protocol classes that define the expected interface
for DataFrame, Column, and SparkSession objects, allowing mypy to understand
dynamic attributes that are added at runtime.
"""

from typing import Any, Protocol

class DataFrameProtocol(Protocol):
    """Protocol for DataFrame-like objects with dynamic attributes.

    ``write`` is declared as a read-only property rather than a method:
    on a PySpark DataFrame it is an attribute that yields the writer
    (``df.write.mode(...)``), so a method declaration would make that
    access pattern fail static type checking.
    """

    @property
    def write(self) -> Any:
        """Get DataFrameWriter for writing data."""
        ...

    def filter(self, condition: Any) -> Any:
        """Filter rows based on condition."""
        ...

    def select(self, *cols: Any) -> Any:
        """Select columns."""
        ...

    def withColumn(self, colName: str, col: Any) -> Any:
        """Add or replace a column."""
        ...

    def withColumnRenamed(self, existing: str, new: str) -> Any:
        """Rename a column."""
        ...

    def count(self) -> int:
        """Count rows."""
        ...

    def collect(self) -> list[Any]:
        """Collect rows to driver."""
        ...

    def cache(self) -> Any:
        """Cache the DataFrame."""
        ...

    def show(self, n: int = 20, truncate: bool = True) -> None:
        """Show DataFrame contents."""
        ...

class ColumnProtocol(Protocol):
    """Protocol for Column-like objects with dynamic attributes.

    Declares the null checks, rich comparisons, arithmetic operators and
    ``cast`` that column expressions are expected to support. Every member
    returns ``Any`` because the concrete result type is engine specific.
    """

    def isNotNull(self) -> Any:
        """Check if column is not null."""
        ...

    def isNull(self) -> Any:
        """Check if column is null."""
        ...

    def __eq__(self, other: Any) -> Any:
        """Equality comparison."""
        ...

    def __ne__(self, other: Any) -> Any:
        """Inequality comparison."""
        ...

    def __gt__(self, other: Any) -> Any:
        """Greater than comparison."""
        ...

    def __lt__(self, other: Any) -> Any:
        """Less than comparison."""
        ...

    def __ge__(self, other: Any) -> Any:
        """Greater than or equal comparison."""
        ...

    def __le__(self, other: Any) -> Any:
        """Less than or equal comparison."""
        ...

    def __add__(self, other: Any) -> Any:
        """Addition operator."""
        ...

    def __sub__(self, other: Any) -> Any:
        """Subtraction operator."""
        ...

    def __mul__(self, other: Any) -> Any:
        """Multiplication operator."""
        ...

    def __truediv__(self, other: Any) -> Any:
        """Division operator (``/`` on Python 3)."""
        ...

    def __div__(self, other: Any) -> Any:
        """Division operator (legacy Python 2 name).

        Python 3 never dispatches ``/`` to ``__div__``; the member is kept
        only so existing references to it continue to type-check.
        """
        ...

    def cast(self, dataType: Any) -> Any:
        """Cast column to different type."""
        ...

class SparkSessionProtocol(Protocol):
    """Protocol for SparkSession-like objects with dynamic attributes.

    ``sparkContext`` and ``conf`` are declared as read-only properties
    because a PySpark session exposes them as attributes
    (``spark.conf.get(...)``, ``spark.sparkContext.appName``), not as
    methods to be called.
    """

    # Catalog for accessing databases and tables.
    catalog: Any

    def table(self, name: str) -> Any:
        """Get a table as DataFrame."""
        ...

    def createDataFrame(
        self, data: Any, schema: Any = None, samplingRatio: Any = None
    ) -> Any:
        """Create DataFrame from data."""
        ...

    def sql(self, sqlQuery: str) -> Any:
        """Execute SQL query and return DataFrame."""
        ...

    def stop(self) -> None:
        """Stop the SparkSession."""
        ...

    @property
    def sparkContext(self) -> Any:
        """Get SparkContext."""
        ...

    @property
    def conf(self) -> Any:
        """Get the session's configuration object."""
        ...

In [None]:
# Module: pipeline_builder.protocols (pipeline_builder)
#
# Dependencies: None (base module)

"""
Protocol definitions for engine-agnostic Spark interfaces.

These protocols represent the minimal surface area used by pipeline_builder.
Any engine (PySpark, sparkless, or other) must satisfy these to work with
pipeline_builder. Keep these protocols lean—only include members actually
consumed in src.
"""

from __future__ import annotations

from typing import Any, Protocol, runtime_checkable

@runtime_checkable
class ColumnProtocol(Protocol):
    """Structural type for column expressions.

    Covers null checks, casting, the six rich comparisons and the four
    basic arithmetic operators. Results are typed ``Any`` because each
    engine returns its own expression type.
    """

    # --- null checks and conversion ---
    def isNull(self) -> Any:
        """Check if column is null."""

    def isNotNull(self) -> Any:
        """Check if column is not null."""

    def cast(self, dataType: Any) -> Any:
        """Cast column to different type."""

    # --- arithmetic ---
    def __add__(self, other: Any) -> Any:
        """Addition operator."""

    def __sub__(self, other: Any) -> Any:
        """Subtraction operator."""

    def __mul__(self, other: Any) -> Any:
        """Multiplication operator."""

    def __truediv__(self, other: Any) -> Any:
        """Division operator."""

    # --- rich comparisons ---
    def __eq__(self, other: Any) -> Any:
        """Equality comparison."""

    def __ne__(self, other: Any) -> Any:
        """Inequality comparison."""

    def __lt__(self, other: Any) -> Any:
        """Less than comparison."""

    def __le__(self, other: Any) -> Any:
        """Less than or equal comparison."""

    def __gt__(self, other: Any) -> Any:
        """Greater than comparison."""

    def __ge__(self, other: Any) -> Any:
        """Greater than or equal comparison."""

@runtime_checkable
class DataFrameProtocol(Protocol):
    """DataFrame-like object used throughout pipeline_builder.

    ``schema``, ``columns`` and ``write`` are declared as read-only
    properties. On a PySpark DataFrame all three are attributes
    (``df.schema.fields``, ``df.columns``, ``df.write.mode(...)``), so
    declaring ``schema`` as a method would make attribute-style use fail
    static type checking. Runtime ``isinstance`` checks look only at
    member presence and are unaffected by the property/method distinction.
    """

    # Core accessors
    @property
    def schema(self) -> Any: ...
    @property
    def columns(self) -> list[str]: ...

    # Actions
    def count(self) -> int: ...
    def collect(self) -> list[Any]: ...
    def show(self, n: int = 20, truncate: bool = True) -> None: ...

    # Transformations
    def filter(self, condition: Any) -> Any: ...
    def select(self, *cols: Any) -> Any: ...
    def withColumn(self, colName: str, col: Any) -> Any: ...
    def withColumnRenamed(self, existing: str, new: str) -> Any: ...
    def groupBy(self, *cols: Any) -> Any: ...
    def agg(self, *exprs: Any, **kwargs: Any) -> Any: ...
    def limit(self, num: int) -> Any: ...
    def cache(self) -> Any: ...

    # SQL helpers
    def createOrReplaceTempView(self, name: str) -> None: ...

    # Writer
    @property
    def write(self) -> Any: ...

@runtime_checkable
class FunctionsProtocol(Protocol):
    """Structural type for an engine's functions namespace.

    Each member yields a ``ColumnProtocol`` expression. Members are grouped
    below by purpose; the grouping has no effect on the protocol itself.
    """

    # Expression builders
    def col(self, col_name: str) -> ColumnProtocol: ...
    def lit(self, value: Any) -> ColumnProtocol: ...
    def expr(self, expr: str) -> ColumnProtocol: ...
    def when(self, condition: ColumnProtocol, value: Any) -> ColumnProtocol: ...

    # Aggregations
    def count(self, col: Any = "*") -> ColumnProtocol: ...
    def countDistinct(self, *cols: Any) -> ColumnProtocol: ...
    def sum(self, col: Any) -> ColumnProtocol: ...
    def avg(self, col: Any) -> ColumnProtocol: ...
    def min(self, col: Any) -> ColumnProtocol: ...
    def max(self, col: Any) -> ColumnProtocol: ...

    # String helpers
    def length(self, col: Any) -> ColumnProtocol: ...

    # Date and time helpers
    def current_timestamp(self) -> ColumnProtocol: ...
    def date_trunc(self, fmt: str, col: Any) -> ColumnProtocol: ...
    def dayofweek(self, col: Any) -> ColumnProtocol: ...

@runtime_checkable
class TypesProtocol(Protocol):
    """Structural type for an engine's data-types namespace.

    Lists the type constructors a conforming namespace must expose as
    attributes. They are annotated ``Any`` because each engine supplies
    its own classes.
    """

    # Schema building blocks
    StructField: Any
    StructType: Any

    # Text and boolean
    BooleanType: Any
    StringType: Any

    # Numeric
    IntegerType: Any
    LongType: Any
    FloatType: Any
    DoubleType: Any

    # Temporal
    TimestampType: Any

@runtime_checkable
class WindowProtocol(Protocol):
    """Structural type for a window-specification builder."""

    def partitionBy(self, *cols: Any) -> Any:
        """Declare the partitioning columns of the window."""

    def orderBy(self, *cols: Any, **kwargs: Any) -> Any:
        """Declare the ordering columns of the window."""

@runtime_checkable
class AnalysisExceptionProtocol(Protocol):
    """Structural type for analysis-error exceptions.

    The only required member is a readable ``desc`` string attribute.
    """

    @property
    def desc(self) -> str:
        """Textual description carried by the exception."""

@runtime_checkable
class SparkSessionProtocol(Protocol):
    """Structural type for the session object handed to pipeline_builder.

    NOTE(review): ``_jsparkSession`` is commented as optional, but as a
    declared member it is required for a runtime ``isinstance`` check
    against this protocol to succeed.
    """

    # Data access
    def table(self, name: str) -> DataFrameProtocol:
        """Get a table as DataFrame."""

    def sql(self, sqlQuery: str) -> DataFrameProtocol:
        """Execute SQL query and return DataFrame."""

    def createDataFrame(
        self, data: Any, schema: Any = None, samplingRatio: Any = None
    ) -> DataFrameProtocol:
        """Create DataFrame from data."""

    # Lifecycle
    def stop(self) -> None:
        """Stop the session."""

    # Read-only attributes
    @property
    def catalog(self) -> Any:
        """Catalog object of the session."""

    @property
    def conf(self) -> Any:
        """Configuration object of the session."""

    @property
    def _jsparkSession(self) -> Any:
        """Underlying session handle; optional, used for ids."""

# Public API of the protocols module: exactly the seven Protocol classes
# defined above. Controls what ``from ... import *`` exports.
__all__ = [
    "ColumnProtocol",
    "DataFrameProtocol",
    "FunctionsProtocol",
    "TypesProtocol",
    "WindowProtocol",
    "AnalysisExceptionProtocol",
    "SparkSessionProtocol",
]

In [None]:
# Module: pipeline_builder.constants (pipeline_builder)
#
# Dependencies: None (base module)

"""
Constants and configuration values for the framework.

This module contains all magic numbers, default values, and configuration
constants used throughout the codebase.
"""

# --- Binary size units (bytes) ---
BYTES_PER_KB = 2**10
BYTES_PER_MB = BYTES_PER_KB * BYTES_PER_KB
BYTES_PER_GB = BYTES_PER_MB * BYTES_PER_KB

# --- Memory limits (megabytes) ---
DEFAULT_MAX_MEMORY_MB = 1024
DEFAULT_CACHE_MEMORY_MB = 512

# --- File size limit (megabytes) and backup count ---
DEFAULT_MAX_FILE_SIZE_MB = 10
DEFAULT_BACKUP_COUNT = 5

# --- Partition counts ---
DEFAULT_CACHE_PARTITIONS = 200
DEFAULT_SHUFFLE_PARTITIONS = 200

# --- Validation thresholds, one per layer ---
DEFAULT_BRONZE_THRESHOLD = 95.0
DEFAULT_SILVER_THRESHOLD = 98.0
DEFAULT_GOLD_THRESHOLD = 99.0

# --- Timeouts (seconds) ---
DEFAULT_TIMEOUT_SECONDS = 300
DEFAULT_RETRY_TIMEOUT_SECONDS = 60

# --- Logging defaults ---
DEFAULT_LOG_LEVEL = "INFO"
DEFAULT_VERBOSE = True

# --- Schema names ---
DEFAULT_SCHEMA = "default"
TEST_SCHEMA = "test_schema"

# --- Error reporting limits ---
MAX_ERROR_MESSAGE_LENGTH = 1000
MAX_STACK_TRACE_LINES = 50

# --- Performance monitoring ---
DEFAULT_METRICS_INTERVAL_SECONDS = 30
DEFAULT_ALERT_THRESHOLD_PERCENT = 80.0

In [None]:
# Module: pipeline_builder.dependencies.graph (pipeline_builder)
#
# Dependencies: None (base module)

"""
Dependency graph representation for the framework pipelines.

This module provides a clean, efficient representation of pipeline dependencies
that can be used for dependency analysis, cycle detection, execution planning,
and optimization. The graph supports topological sorting and validation.

**Key Features:**
    - **Dependency Tracking**: Track dependencies and dependents for each step
    - **Cycle Detection**: Detect circular dependencies in the pipeline
    - **Topological Sort**: Order steps by dependency requirements for sequential execution
    - **Validation**: Validate graph structure and detect issues

**Common Use Cases:**
    - Analyze pipeline dependencies before execution
    - Detect circular dependencies that would cause execution failures
    - Determine execution order for sequential processing using topological sort

Example:
    >>> from pipeline_builder.dependencies.graph import (
    ...     DependencyGraph,
    ...     StepNode,
    ...     StepType
    ... )
    >>>
    >>> # Create dependency graph
    >>> graph = DependencyGraph()
    >>>
    >>> # Add nodes
    >>> bronze = StepNode("bronze_step", StepType.BRONZE)
    >>> silver = StepNode("silver_step", StepType.SILVER)
    >>> graph.add_node(bronze)
    >>> graph.add_node(silver)
    >>>
    >>> # Add dependency (silver depends on bronze)
    >>> graph.add_dependency("silver_step", "bronze_step")
    >>>
    >>> # Validate and get execution order
    >>> issues = graph.validate()
    >>> execution_order = graph.topological_sort()
    >>> print(execution_order)  # ["bronze_step", "silver_step"]
"""

from __future__ import annotations

import logging
from collections import defaultdict, deque
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, Optional

# Module-level logger, named after the defining module via ``__name__``.
logger = logging.getLogger(__name__)

class StepType(Enum):
    """Layer a pipeline step belongs to in the Medallion Architecture.

    Members (each value is the lowercase member name):
        BRONZE: Raw data ingestion and validation layer.
        SILVER: Cleaned and enriched data layer.
        GOLD: Business-ready analytics and reporting layer.

    Example:
        >>> StepType.BRONZE.value
        'bronze'
        >>> StepType("gold") is StepType.GOLD
        True
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

@dataclass
class StepNode:
    """One pipeline step as stored in the dependency graph.

    Only ``name`` and ``step_type`` are required. Every other field has a
    default, and the mutable ones use a factory so that instances never
    share sets or dicts.

    Attributes:
        name: Unique identifier for this step.
        step_type: Type of step (BRONZE, SILVER, or GOLD).
        dependencies: Names of the steps this step depends on.
        dependents: Names of the steps that depend on this step.
        execution_group: (Deprecated) Legacy field, no longer used.
            Execution order is determined by topological sort.
        estimated_duration: Estimated execution duration in seconds.
            Defaults to 0.0.
        metadata: Free-form key/value metadata about the step.

    Example:
        >>> node = StepNode(
        ...     name="user_events",
        ...     step_type=StepType.BRONZE,
        ...     estimated_duration=10.5,
        ...     metadata={"source": "kafka", "partition_count": 4},
        ... )
    """

    # Required identity
    name: str
    step_type: StepType

    # Graph links, each a fresh empty set per instance
    dependencies: set[str] = field(default_factory=set)
    dependents: set[str] = field(default_factory=set)

    # Scheduling information
    execution_group: int = 0
    estimated_duration: float = 0.0

    # Caller-defined extras, a fresh empty dict per instance
    metadata: Dict[str, Any] = field(default_factory=dict)

class DependencyGraph:
    """Represents the dependency graph of a pipeline.

    This class provides operations for dependency analysis, cycle
    detection, and execution planning. It maintains both forward and
    reverse adjacency lists for traversal in both directions, and mirrors
    every edge into the ``dependencies`` / ``dependents`` sets of the
    affected ``StepNode`` objects.

    **Key Operations:**
        - Add nodes and dependencies
        - Detect circular dependencies
        - Perform topological sort for execution order
        - Validate graph structure

    Attributes:
        nodes: Dictionary mapping step names to StepNode instances.
        _adjacency_list: Forward adjacency list (step -> its dependencies).
        _reverse_adjacency_list: Reverse adjacency list (step -> its
            dependents).

    Example:
        >>> graph = DependencyGraph()
        >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
        >>> graph.add_node(StepNode("silver", StepType.SILVER))
        >>> graph.add_dependency("silver", "bronze")
        >>> execution_order = graph.topological_sort()
    """

    def __init__(self) -> None:
        """Initialize an empty dependency graph."""
        self.nodes: Dict[str, StepNode] = {}
        self._adjacency_list: Dict[str, set[str]] = defaultdict(set)
        self._reverse_adjacency_list: Dict[str, set[str]] = defaultdict(set)

    def add_node(self, node: StepNode) -> None:
        """Add a node to the dependency graph.

        Adds a StepNode to the graph and initializes its adjacency list
        entries to empty. If a node with the same name already exists it is
        replaced, and every edge that touched the old node is removed from
        both adjacency lists and from the neighbouring nodes'
        ``dependencies`` / ``dependents`` sets, so the two indexes stay
        consistent with each other.

        Args:
            node: StepNode instance to add to the graph.

        Example:
            >>> graph = DependencyGraph()
            >>> node = StepNode("bronze_step", StepType.BRONZE)
            >>> graph.add_node(node)
        """
        if node.name in self.nodes:
            # Resetting only this node's own entries would leave dangling
            # half-edges on its neighbours.
            self._detach_edges(node.name)

        self.nodes[node.name] = node
        self._adjacency_list[node.name] = set()
        self._reverse_adjacency_list[node.name] = set()

    def _detach_edges(self, step_name: str) -> None:
        """Remove every edge touching ``step_name`` from neighbouring entries."""
        # Iterate over snapshots: a self-dependency would otherwise mutate
        # a set that is related to the one being iterated.
        for dependency in tuple(self._adjacency_list.get(step_name, ())):
            self._reverse_adjacency_list[dependency].discard(step_name)
            if dependency in self.nodes:
                self.nodes[dependency].dependents.discard(step_name)

        for dependent in tuple(self._reverse_adjacency_list.get(step_name, ())):
            self._adjacency_list[dependent].discard(step_name)
            if dependent in self.nodes:
                self.nodes[dependent].dependencies.discard(step_name)

    def add_dependency(self, from_step: str, to_step: str) -> None:
        """Add a dependency from one step to another.

        Creates a dependency relationship where `from_step` depends on
        `to_step`. This means `to_step` must complete before `from_step`
        can execute. Adding the same dependency twice has no further
        effect because edges are stored in sets.

        Args:
            from_step: Name of the step that depends on `to_step`.
            to_step: Name of the step that `from_step` depends on.

        Raises:
            ValueError: If either step is not found in the graph.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
            >>> graph.add_node(StepNode("silver", StepType.SILVER))
            >>> # Silver depends on bronze
            >>> graph.add_dependency("silver", "bronze")
        """
        if from_step not in self.nodes or to_step not in self.nodes:
            raise ValueError(f"Steps {from_step} or {to_step} not found in graph")

        self._adjacency_list[from_step].add(to_step)
        self._reverse_adjacency_list[to_step].add(from_step)

        # Mirror the edge on the node objects
        self.nodes[from_step].dependencies.add(to_step)
        self.nodes[to_step].dependents.add(from_step)

    def get_dependencies(self, step_name: str) -> set[str]:
        """Get all dependencies for a step.

        Args:
            step_name: Name of the step to get dependencies for.

        Returns:
            A new set with the names of the steps that `step_name` depends
            on. Returns an empty set if the step is not found in the graph.
            Mutating the returned set does not affect the graph.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
            >>> graph.add_node(StepNode("silver", StepType.SILVER))
            >>> graph.add_dependency("silver", "bronze")
            >>> deps = graph.get_dependencies("silver")
            >>> print(deps)  # {"bronze"}
        """
        node = self.nodes.get(step_name)
        return set(node.dependencies) if node is not None else set()

    def get_dependents(self, step_name: str) -> set[str]:
        """Get all dependents for a step.

        Args:
            step_name: Name of the step to get dependents for.

        Returns:
            A new set with the names of the steps that depend on
            `step_name`. Returns an empty set if the step is not found in
            the graph. Mutating the returned set does not affect the graph.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
            >>> graph.add_node(StepNode("silver", StepType.SILVER))
            >>> graph.add_dependency("silver", "bronze")
            >>> dependents = graph.get_dependents("bronze")
            >>> print(dependents)  # {"silver"}
        """
        node = self.nodes.get(step_name)
        return set(node.dependents) if node is not None else set()

    def detect_cycles(self) -> list[list[str]]:
        """Detect cycles in the dependency graph using DFS.

        A cycle indicates a circular dependency that would prevent
        execution (e.g., A depends on B, B depends on A). Start nodes are
        taken in insertion order and neighbours in sorted name order, so
        the reported cycles are the same on every run.

        Returns:
            List of cycles, where each cycle is a list of step names that
            starts and ends with the same step. Returns an empty list if no
            cycles are found.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("step_a", StepType.BRONZE))
            >>> graph.add_node(StepNode("step_b", StepType.SILVER))
            >>> graph.add_dependency("step_a", "step_b")
            >>> graph.add_dependency("step_b", "step_a")  # Creates cycle
            >>> cycles = graph.detect_cycles()
            >>> print(cycles)  # [["step_a", "step_b", "step_a"]]
        """
        visited: set[str] = set()
        rec_stack: set[str] = set()  # nodes on the current DFS path
        cycles: list[list[str]] = []

        def dfs(node: str, path: list[str]) -> None:
            if node in rec_stack:
                # Back edge: the cycle is the tail of the path from the
                # first occurrence of ``node``, closed by ``node`` itself.
                cycle_start = path.index(node)
                cycles.append(path[cycle_start:] + [node])
                return

            if node in visited:
                return

            visited.add(node)
            rec_stack.add(node)
            path.append(node)

            # Sorted so the output does not depend on set iteration order.
            for neighbor in sorted(self._adjacency_list[node]):
                dfs(neighbor, path)

            rec_stack.remove(node)
            path.pop()

        for node in self.nodes:
            if node not in visited:
                dfs(node, [])

        return cycles

    def topological_sort(
        self, creation_order: Optional[Dict[str, int]] = None
    ) -> list[str]:
        """Perform topological sort of the dependency graph.

        Returns nodes in an order such that all dependencies come before
        their dependents. This provides a valid execution order for the
        pipeline steps.

        **Algorithm:**
            Uses Kahn's algorithm with in-degree counting. Steps with no
            dependencies (in-degree 0) are processed first, then their
            dependents are processed when all their dependencies are
            satisfied.

            **Explicit dependencies always override creation order.**
            When several steps are ready at the same time, `creation_order`
            is used as a tie-breaker. Remaining ties, and all ties when
            `creation_order` is not given, follow the order in which the
            steps became ready, with simultaneously released steps taken in
            the order they were added to the graph.

        Args:
            creation_order: Optional dictionary mapping step names to
                creation order (lower number = created earlier). Steps
                missing from the mapping sort after all listed steps.

        Returns:
            List of step names in topological order. This method does not
            raise on cyclic graphs: steps that are part of a cycle, or that
            depend on one, are omitted from the result. Use
            `detect_cycles()` or `validate()` to check for cycles first.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
            >>> graph.add_node(StepNode("silver", StepType.SILVER))
            >>> graph.add_node(StepNode("gold", StepType.GOLD))
            >>> graph.add_dependency("silver", "bronze")
            >>> graph.add_dependency("gold", "silver")
            >>> order = graph.topological_sort()
            >>> print(order)  # ["bronze", "silver", "gold"]
        """
        unordered_rank = 2**31 - 1  # sorts steps without a creation index last

        # in_degree[step] = number of dependencies not yet emitted.
        # If A depends on B, A appears in the reverse list of B.
        in_degree = dict.fromkeys(self.nodes, 0)
        for node in self.nodes:
            for dependent in self._reverse_adjacency_list[node]:
                in_degree[dependent] += 1

        # Position at which each step was added, used to iterate dependents
        # deterministically instead of in set order.
        insertion_rank = {name: index for index, name in enumerate(self.nodes)}

        def get_sort_key(node_name: str) -> tuple[int, int]:
            """Return sort key: (in_degree, creation_order)."""
            creation_ord: int = (
                creation_order.get(node_name, unordered_rank)
                if creation_order
                else unordered_rank
            )
            return (in_degree[node_name], creation_ord)

        # Start with the steps that have no dependencies.
        ready_nodes = [node for node, degree in in_degree.items() if degree == 0]
        if creation_order:
            ready_nodes.sort(key=get_sort_key)
        queue = deque(ready_nodes)
        result: list[str] = []

        while queue:
            node = queue.popleft()
            result.append(node)

            # Release the dependents whose last dependency was just emitted.
            newly_ready = []
            for dependent in sorted(
                self._reverse_adjacency_list[node],
                key=lambda name: insertion_rank[name],
            ):
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    newly_ready.append(dependent)

            if newly_ready:
                queue.extend(newly_ready)
                # One stable sort after all additions yields the same queue
                # as re-sorting after each individual append.
                if creation_order and len(queue) > 1:
                    queue = deque(sorted(queue, key=get_sort_key))

        return result

    def validate(self) -> list[str]:
        """Validate the dependency graph and return any issues.

        Checks the graph for:
        - Circular dependencies (cycles)
        - Missing dependencies (a node whose ``dependencies`` set names a
          step that is not in the graph)

        Returns:
            List of validation issue messages. Returns an empty list if the
            graph is valid.

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("step_a", StepType.BRONZE))
            >>> graph.add_node(StepNode("step_b", StepType.SILVER))
            >>> # Add invalid dependency
            >>> graph.nodes["step_b"].dependencies.add("missing_step")
            >>> issues = graph.validate()
            >>> print(issues)  # ["Node step_b depends on missing node missing_step"]
        """
        issues = [
            f"Circular dependency detected: {' -> '.join(cycle)}"
            for cycle in self.detect_cycles()
        ]

        # Check for missing dependencies
        for node_name, node in self.nodes.items():
            for dep in node.dependencies:
                if dep not in self.nodes:
                    issues.append(f"Node {node_name} depends on missing node {dep}")

        return issues

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the dependency graph.

        Returns:
            Dictionary containing statistics with the following keys:
                - `total_nodes`: Total number of nodes in the graph
                - `total_edges`: Total number of dependency edges
                - `type_counts`: Dictionary mapping step type values to
                  counts
                - `average_dependencies`: Average number of dependencies
                  per node, as a float (0.0 for an empty graph)
                - `has_cycles`: Boolean indicating if cycles are detected

        Example:
            >>> graph = DependencyGraph()
            >>> graph.add_node(StepNode("bronze", StepType.BRONZE))
            >>> graph.add_node(StepNode("silver", StepType.SILVER))
            >>> stats = graph.get_stats()
            >>> print(f"Total nodes: {stats['total_nodes']}")  # 2
            >>> print(f"Has cycles: {stats['has_cycles']}")  # False
        """
        total_nodes = len(self.nodes)
        total_edges = sum(len(deps) for deps in self._adjacency_list.values())

        # Count by step type
        type_counts: Dict[str, int] = defaultdict(int)
        for node in self.nodes.values():
            type_counts[node.step_type.value] += 1

        # Always a float, so the value's type does not depend on graph size.
        avg_dependencies = total_edges / total_nodes if total_nodes > 0 else 0.0

        return {
            "total_nodes": total_nodes,
            "total_edges": total_edges,
            "type_counts": dict(type_counts),
            "average_dependencies": avg_dependencies,
            "has_cycles": len(self.detect_cycles()) > 0,
        }

In [None]:
# Module: pipeline_builder.dependencies.analyzer (pipeline_builder)
#
# Dependencies: pipeline_builder_base.dependencies

"""
Unified dependency analyzer for the framework pipelines.

This module re-exports the base DependencyAnalyzer which works with Spark steps
via protocol-based typing.
"""

from __future__ import annotations

# Re-export from base - the base analyzer uses protocols so it works with Spark steps
# from .dependencies import (  # Removed: defined in notebook cells above
    # AnalysisStrategy,
    # DependencyAnalysisResult,
    # DependencyAnalyzer,
    # DependencyError,
    # DependencyGraph,
    # StepNode,
    # StepType,
# )

# Keep for backward compatibility - the base analyzer works with any step type via protocols.
# Nothing is imported in this cell (the re-export above is commented out), so
# every name listed here must already be bound by another notebook cell.
# NOTE(review): DependencyAnalyzer, DependencyAnalysisResult and
# AnalysisStrategy are not defined in this part of the file - confirm that an
# earlier cell provides them.
__all__ = [
    "DependencyAnalyzer",
    "DependencyAnalysisResult",
    "AnalysisStrategy",
    "DependencyGraph",
    "StepNode",
    "StepType",
    "DependencyError",
]

In [None]:
# Module: pipeline_builder.dependencies.exceptions (pipeline_builder)
#
# Dependencies: None (base module)

"""
Dependency analysis exceptions for the framework.

This module defines exceptions specific to dependency analysis operations.
"""

from typing import List, Optional

class DependencyError(Exception):
    """Base exception for dependency-related errors.

    Attributes:
        step_name: Name of the step the error relates to, or ``None``.
    """

    def __init__(self, message: str, step_name: Optional[str] = None):
        super().__init__(message)
        self.step_name = step_name

    def _reduce_with(self, second_arg: object) -> tuple:
        """Build a ``__reduce__`` result for subclasses with a required 2nd argument.

        Only ``message`` is stored in ``self.args``, so the default
        exception reconstruction calls ``cls(message)`` and fails for
        subclasses whose constructor requires a second positional argument.
        Supplying both constructor arguments explicitly makes ``copy.copy``,
        ``copy.deepcopy`` and pickling work. The instance ``__dict__`` is
        passed as state so any extra attributes are restored as well.
        """
        message = self.args[0] if self.args else ""
        return (type(self), (message, second_arg), dict(vars(self)))

class DependencyAnalysisError(DependencyError):
    """Raised when dependency analysis fails.

    Attributes:
        analysis_step: Analysis step that failed, or ``None``. The same
            value is also stored as ``step_name``.
    """

    def __init__(self, message: str, analysis_step: Optional[str] = None):
        super().__init__(message, analysis_step)
        self.analysis_step = analysis_step

class CircularDependencyError(DependencyError):
    """Raised when circular dependencies are detected.

    Attributes:
        cycle: Step names forming the circular dependency.
    """

    def __init__(self, message: str, cycle: List[str]):
        super().__init__(message)
        self.cycle = cycle

    def __reduce__(self) -> tuple:
        """Support copying and pickling despite the required ``cycle`` argument."""
        return self._reduce_with(self.cycle)

class InvalidDependencyError(DependencyError):
    """Raised when invalid dependencies are detected.

    Attributes:
        invalid_dependencies: Names of the offending dependencies.
    """

    def __init__(self, message: str, invalid_dependencies: List[str]):
        super().__init__(message)
        self.invalid_dependencies = invalid_dependencies

    def __reduce__(self) -> tuple:
        """Support copying and pickling despite the required second argument."""
        return self._reduce_with(self.invalid_dependencies)

class DependencyConflictError(DependencyError):
    """Raised when dependency conflicts are detected.

    Attributes:
        conflicting_steps: Names of the steps that conflict.
    """

    def __init__(self, message: str, conflicting_steps: List[str]):
        super().__init__(message)
        self.conflicting_steps = conflicting_steps

    def __reduce__(self) -> tuple:
        """Support copying and pickling despite the required second argument."""
        return self._reduce_with(self.conflicting_steps)

In [None]:
# Module: pipeline_builder.models.enums (pipeline_builder)
#
# Dependencies: None (base module)

"""
Enums for the Pipeline Builder models.

This module provides enumeration classes for pipeline phases, execution modes,
write modes, and validation results. These enums ensure type safety and
provide clear constants for use throughout the pipeline system.

Key Components:
    - **PipelinePhase**: Medallion Architecture layers (BRONZE, SILVER, GOLD)
    - **ExecutionMode**: Pipeline execution modes (INITIAL, INCREMENTAL, etc.)
    - **WriteMode**: Data write modes (OVERWRITE, APPEND)
    - **ValidationResult**: Validation outcomes (PASSED, FAILED, WARNING)

Example:
    >>> from pipeline_builder.models.enums import (
    ...     PipelinePhase,
    ...     ExecutionMode,
    ...     WriteMode,
    ...     ValidationResult
    ... )
    >>>
    >>> # Use pipeline phase
    >>> phase = PipelinePhase.BRONZE
    >>> print(phase.value)  # "bronze"
    >>>
    >>> # Use execution mode
    >>> mode = ExecutionMode.INITIAL
    >>> print(mode.value)  # "initial"
"""

from enum import Enum

class PipelinePhase(Enum):
    """Medallion Architecture layer that a pipeline phase corresponds to.

    Members (each value is the lowercase member name):
        BRONZE: Raw data ingestion and validation layer.
        SILVER: Cleaned and enriched data layer.
        GOLD: Business-ready analytics and reporting layer.

    Example:
        >>> PipelinePhase.BRONZE.value
        'bronze'
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

class ExecutionMode(Enum):
    """How a pipeline run should be carried out.

    Members:
        INITIAL: First-time execution with full data processing.
        INCREMENTAL: Process only new data based on watermark columns.
        FULL_REFRESH: Reprocess all data, overwriting existing results.
        VALIDATION_ONLY: Validate data without writing results.

    Example:
        >>> ExecutionMode.INCREMENTAL.value
        'incremental'
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"
    FULL_REFRESH = "full_refresh"
    VALIDATION_ONLY = "validation_only"

class WriteMode(Enum):
    """How data should be written to a target table.

    Members:
        OVERWRITE: Replace all existing data in the table.
        APPEND: Add new data to the data already in the table.

    Every member's ``value`` is the lowercase spelling of its name.

    Example:
        >>> from pipeline_builder.models.enums import WriteMode
        >>> WriteMode.APPEND.value
        'append'
        >>> WriteMode("overwrite") is WriteMode.OVERWRITE
        True
    """

    OVERWRITE = "overwrite"  # replace table contents
    APPEND = "append"  # keep existing rows, add new ones

class ValidationResult(Enum):
    """Outcome of a data validation check.

    Members:
        PASSED: Validation succeeded; data meets quality requirements.
        FAILED: Validation failed; data does not meet quality requirements.
        WARNING: Validation passed but with warnings (e.g., low validation
            rate).

    Every member's ``value`` is the lowercase spelling of its name.

    Example:
        >>> from pipeline_builder.models.enums import ValidationResult
        >>> ValidationResult.WARNING.value
        'warning'
        >>> ValidationResult("passed") is ValidationResult.PASSED
        True
    """

    PASSED = "passed"  # quality requirements met
    FAILED = "failed"  # quality requirements not met
    WARNING = "warning"  # passed, with caveats

In [None]:
# Module: pipeline_builder.models.exceptions (pipeline_builder)
#
# Dependencies: None (base module)

"""
Custom exceptions for the Pipeline Builder models.

This module provides custom exception classes for pipeline configuration
and execution errors. These exceptions provide clearer error semantics
than generic Python exceptions.

Key Components:
    - **PipelineConfigurationError**: Raised when pipeline configuration
      is invalid (e.g., missing required fields, invalid values)
    - **PipelineExecutionError**: Raised when pipeline execution fails
      (e.g., step execution errors, validation failures)

Example:
    >>> from pipeline_builder.models.exceptions import (
    ...     PipelineConfigurationError,
    ...     PipelineExecutionError
    ... )
    >>>
    >>> # Raise configuration error
    >>> if not schema:
    ...     raise PipelineConfigurationError("Schema name is required")
    >>>
    >>> # Raise execution error
    >>> if step_failed:
    ...     raise PipelineExecutionError("Step execution failed")
"""

class PipelineConfigurationError(ValueError):
    """Signal that a pipeline configuration object is invalid.

    Use this for problems with the configuration itself (for example a
    PipelineConfig or a step configuration), as opposed to failures that
    happen while the pipeline is running. Because it derives from
    ``ValueError``, handlers written for ``ValueError`` also catch it.

    Typical triggers:
        - A required field is missing.
        - A field holds an invalid value (e.g., a negative threshold).
        - The configuration is inconsistent (e.g., an invalid schema name).

    Example:
        >>> from pipeline_builder.models.exceptions import PipelineConfigurationError
        >>> if not schema:
        ...     raise PipelineConfigurationError("Schema name is required")
    """

class PipelineExecutionError(RuntimeError):
    """Signal that a pipeline failed while it was running.

    Use this for runtime failures, as opposed to problems with the
    configuration. Because it derives from ``RuntimeError``, handlers written
    for ``RuntimeError`` also catch it.

    Typical triggers:
        - A step fails during execution.
        - Data validation fails.
        - A write operation fails.
        - Resource constraints are hit (memory, disk space).

    Example:
        >>> from pipeline_builder.models.exceptions import PipelineExecutionError
        >>> if validation_rate < threshold:
        ...     raise PipelineExecutionError(
        ...         f"Validation rate {validation_rate}% below threshold {threshold}%"
        ...     )
    """

In [None]:
# Module: pipeline_builder.storage.schema_utils (pipeline_builder)
#
# Dependencies: None (base module)

"""
Schema utility functions.

This module provides utility functions for schema operations that are used
by both execution and storage modules.
"""

from __future__ import annotations

from typing import Any, Optional

def get_existing_schema_safe(spark: Any, table_name: str) -> Optional[Any]:
    """
    Best-effort lookup of an existing table's schema; never raises.

    Lookup strategy:
    1. Read the schema from ``spark.table(table_name)``. If it has at least
       one field, return it.
    2. Otherwise (empty schema, e.g. a catalog sync issue) run
       ``DESCRIBE TABLE``. If that yields any rows, run
       ``SELECT * ... LIMIT 1`` and return that result's schema when it has
       at least one field.
    3. If the fallback fails or produces nothing usable, return the original
       (empty) schema from step 1.

    Args:
        spark: Spark session
        table_name: Fully qualified table name (interpolated verbatim into
            the fallback SQL statements)

    Returns:
        The schema object if the table could be read (it may be an empty
        struct<>, which callers are expected to handle), or None if reading
        the table raised any exception.
    """

    def _has_columns(candidate: Any) -> bool:
        # A schema counts as populated only when its field list is truthy
        # and reports a positive length.
        return bool(candidate.fields and len(candidate.fields) > 0)

    try:
        catalog_schema = spark.table(table_name).schema
        if _has_columns(catalog_schema):
            return catalog_schema

        # Empty schema: probe the table via SQL. Any failure in here is
        # deliberately ignored so we still fall through to the empty schema.
        try:
            described_rows = spark.sql(f"DESCRIBE TABLE {table_name}").collect()
            if described_rows and len(described_rows) > 0:
                sampled_schema = spark.sql(
                    f"SELECT * FROM {table_name} LIMIT 1"
                ).schema
                if _has_columns(sampled_schema):
                    return sampled_schema
        except Exception:
            pass

        # Hand back the empty struct<> so the caller can special-case it.
        return catalog_schema
    except Exception:
        # Table missing or unreadable.
        return None

def schemas_match(existing_schema: Any, output_schema: Any) -> tuple[bool, list[str]]:
    """
    Compare two schemas and report whether they are compatible.

    Differences fall into two groups:

    * Blocking (make ``matches`` False): columns missing from the output,
      columns that are new in the output, and data type mismatches on
      columns present in both schemas.
    * Informational (reported, but do not affect ``matches``): nullable
      changes and column order differences.

    Args:
        existing_schema: Schema of the existing table. Must expose ``fields``
            (an iterable of objects with ``name`` and ``dataType``, and
            optionally ``nullable``), or a falsy ``fields`` for an empty schema.
        output_schema: Schema of the output DataFrame, same shape as above.

    Returns:
        Tuple of (matches: bool, differences: list[str]).
        ``differences`` contains a human-readable description of every
        difference found, blocking or informational. Per-column details are
        listed in the existing schema's column order.
    """
    differences: list[str] = []
    # Whether a difference is blocking is tracked explicitly instead of being
    # inferred from the message text: column names and type names end up in
    # the messages, so a column called e.g. "informational_flag" must not be
    # able to turn a real mismatch into an "informational" one.
    has_blocking_difference = False

    # Index fields by column name; a falsy ``fields`` means an empty schema.
    existing_fields = {f.name: f for f in (existing_schema.fields or [])}
    output_fields = {f.name: f for f in (output_schema.fields or [])}

    existing_columns = set(existing_fields)
    output_columns = set(output_fields)

    # Columns the existing table has but the output lacks.
    missing_in_output = existing_columns - output_columns
    if missing_in_output:
        has_blocking_difference = True
        differences.append(f"Missing columns in output: {sorted(missing_in_output)}")

    # Columns the output introduces.
    new_in_output = output_columns - existing_columns
    if new_in_output:
        has_blocking_difference = True
        differences.append(
            f"New columns in output (not in existing table): {sorted(new_in_output)}"
        )

    # Compare columns present in both schemas. Walking the existing schema's
    # field order (rather than a set) keeps the messages deterministic.
    type_mismatches: list[str] = []
    nullable_changes: list[str] = []
    for col, existing_field in existing_fields.items():
        if col not in output_fields:
            continue
        output_field = output_fields[col]

        if existing_field.dataType != output_field.dataType:
            type_mismatches.append(
                f"{col}: existing={existing_field.dataType}, "
                f"output={output_field.dataType}"
            )

        # Fields without a ``nullable`` attribute are treated as nullable.
        existing_nullable = getattr(existing_field, "nullable", True)
        output_nullable = getattr(output_field, "nullable", True)
        if existing_nullable != output_nullable:
            if not existing_nullable and output_nullable:
                # non-nullable -> nullable: more lenient
                nullable_changes.append(
                    f"{col}: nullable changed from False to True (more lenient - usually OK)"
                )
            else:
                # nullable -> non-nullable: stricter
                nullable_changes.append(
                    f"{col}: nullable changed from True to False (stricter - may cause issues if data has nulls)"
                )

    if type_mismatches:
        has_blocking_difference = True
        differences.append(f"Type mismatches: {', '.join(type_mismatches)}")

    if nullable_changes:
        # Reported for visibility only; never fails the comparison.
        differences.append(
            f"Nullable changes (informational): {', '.join(nullable_changes)}"
        )

    # Column order is only reported when both schemas have exactly the same
    # set of columns; it never fails the comparison.
    existing_order = list(existing_fields)
    output_order = list(output_fields)
    if existing_columns == output_columns and existing_order != output_order:
        differences.append(
            f"Column order differs (informational - order doesn't affect functionality): "
            f"existing={existing_order}, output={output_order}"
        )

    return not has_blocking_difference, differences

In [None]:
# Module: pipeline_builder.writer.exceptions (pipeline_builder)
#
# Dependencies: None (base module)

"""
Writer-specific exceptions.

This module contains all the custom exceptions used by the writer module,
providing clear error handling and debugging information.
"""

from __future__ import annotations

from typing import Any, Dict, Optional

class WriterError(Exception):
    """
    Root of the writer exception hierarchy.

    In addition to the message, each instance carries an optional context
    dictionary, a list of remediation suggestions and the underlying
    exception (if any). Context and suggestions are included in ``str(error)``.
    """

    def __init__(
        self,
        message: str,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
        cause: Optional[Exception] = None,
    ) -> None:
        """
        Initialize the writer error.

        Args:
            message: Error message (also the only entry in ``args``)
            context: Additional context information; defaults to an empty dict
            suggestions: Suggestions to resolve the error; defaults to an
                empty list
            cause: The underlying exception that caused this error. It is
                stored on ``self.cause`` only; it is not rendered by
                ``__str__``.
        """
        super().__init__(message)
        self.message = message
        self.context = context if context else {}
        self.suggestions = suggestions if suggestions else []
        self.cause = cause

    def __str__(self) -> str:
        """Return the message followed by context and suggestions, one per line, when present."""
        context_part = f"\nContext: {self.context}" if self.context else ""
        suggestions_part = (
            f"\nSuggestions: {'; '.join(self.suggestions)}"
            if self.suggestions
            else ""
        )
        return self.message + context_part + suggestions_part

class WriterValidationError(WriterError):
    """
    Validation failure raised by the writer.

    Intended for data validation problems encountered while writing, such as
    invalid log rows or schema mismatches. The individual problems are kept
    on ``validation_errors``.
    """

    def __init__(
        self,
        message: str,
        validation_errors: Optional[list[str]] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
    ) -> None:
        """
        Initialize validation error.

        Args:
            message: Error message
            validation_errors: Specific validation errors; defaults to an
                empty list
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.validation_errors = validation_errors if validation_errors else []

class WriterConfigurationError(WriterError):
    """
    Invalid writer configuration.

    Intended for a WriterConfig that contains invalid values or conflicting
    settings. The individual problems are kept on ``config_errors``.
    """

    def __init__(
        self,
        message: str,
        config_errors: Optional[list[str]] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
    ) -> None:
        """
        Initialize configuration error.

        Args:
            message: Error message
            config_errors: Specific configuration errors; defaults to an
                empty list
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.config_errors = config_errors if config_errors else []

class WriterTableError(WriterError):
    """
    Failed table operation.

    Intended for problems with Delta table operations such as table creation,
    writing, or schema evolution. The affected table and the failed operation
    are kept on ``table_name`` and ``operation``.
    """

    def __init__(
        self,
        message: str,
        table_name: Optional[str] = None,
        operation: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
        cause: Optional[Exception] = None,
    ) -> None:
        """
        Initialize table error.

        Args:
            message: Error message
            table_name: Name of the table that caused the error
            operation: The operation that failed
            context: Additional context information
            suggestions: List of suggestions to resolve the error
            cause: The underlying exception that caused this error
        """
        super().__init__(
            message,
            context=context,
            suggestions=suggestions,
            cause=cause,
        )
        self.table_name = table_name
        self.operation = operation

class WriterPerformanceError(WriterError):
    """
    Exceeded performance threshold.

    Intended for operations that take longer than expected or consume more
    resources than the configured limits. The measured and expected figures
    are kept on the instance for inspection.
    """

    def __init__(
        self,
        message: str,
        actual_duration: Optional[float] = None,
        expected_duration: Optional[float] = None,
        actual_memory: Optional[float] = None,
        expected_memory: Optional[float] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
    ) -> None:
        """
        Initialize performance error.

        Args:
            message: Error message
            actual_duration: Actual duration in seconds
            expected_duration: Expected duration in seconds
            actual_memory: Actual memory usage in MB
            expected_memory: Expected memory usage in MB
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        # Timing figures
        self.actual_duration = actual_duration
        self.expected_duration = expected_duration
        # Memory figures
        self.actual_memory = actual_memory
        self.expected_memory = expected_memory

class WriterSchemaError(WriterError):
    """
    Failed schema operation.

    Intended for problems with schema validation, evolution, or
    compatibility. The individual problems are kept on ``schema_errors``,
    alongside the expected and actual schema definitions.
    """

    def __init__(
        self,
        message: str,
        schema_errors: Optional[list[str]] = None,
        expected_schema: Optional[str] = None,
        actual_schema: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
    ) -> None:
        """
        Initialize schema error.

        Args:
            message: Error message
            schema_errors: Specific schema errors; defaults to an empty list
            expected_schema: Expected schema definition
            actual_schema: Actual schema definition
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.schema_errors = schema_errors if schema_errors else []
        self.expected_schema = expected_schema
        self.actual_schema = actual_schema

class WriterDataQualityError(WriterError):
    """
    Failed data quality check.

    Intended for data quality validation failures, such as validation rates
    that are too low or detected data anomalies. The individual issues are
    kept on ``quality_issues``, alongside the observed rate and the threshold.
    """

    def __init__(
        self,
        message: str,
        quality_issues: Optional[list[str]] = None,
        validation_rate: Optional[float] = None,
        threshold: Optional[float] = None,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[list[str]] = None,
    ) -> None:
        """
        Initialize data quality error.

        Args:
            message: Error message
            quality_issues: Specific quality issues; defaults to an empty list
            validation_rate: Actual validation rate
            threshold: Expected validation threshold
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.quality_issues = quality_issues if quality_issues else []
        self.validation_rate = validation_rate
        self.threshold = threshold

In [None]:
# Module: abstracts.rules (abstracts)
#
# Dependencies: None (base module)

from typing import Protocol

# Rules Protocol is compatible with ColumnRules (Dict[str, List[Union[str, Column]]])
# Any dictionary mapping column names to rule lists satisfies this Protocol.
class Rules(Protocol):
    """
    Structural type for validation rules.

    Meant to be satisfied by ColumnRules (Dict[str, List[Union[str, Column]]])
    and, more generally, by any dictionary mapping column names to rule lists.
    The Protocol declares no members of its own.
    """

In [None]:
# Module: abstracts.transformer (abstracts)
#
# Dependencies: None (base module)

from typing import Protocol

# Transformer Protocol is compatible with SilverTransformFunction and GoldTransformFunction
# Any callable that transforms data satisfies this Protocol.
class Transformer(Protocol):
    """
    Structural type for transformation functions.

    Meant to be satisfied by:
    - SilverTransformFunction: Callable[[SparkSession, DataFrame, Dict[str, DataFrame]], DataFrame]
    - GoldTransformFunction: Callable[[SparkSession, Dict[str, DataFrame]], DataFrame]
    - Any other callable that transforms data sources

    The Protocol declares no members of its own.
    """

In [None]:
# Module: abstracts.source (abstracts)
#
# Dependencies: None (base module)

from typing import Protocol

# Source Protocol is compatible with DataFrame and any object that can be used as a data source.
# DataFrame satisfies this Protocol because Protocols are structural types and Source declares no required members.
class Source(Protocol):
    """
    Structural type for data sources in the pipeline.

    Meant to be satisfied by DataFrame and by any other object that can be
    used as a data source. The Protocol declares no members of its own, so
    conformance is purely structural (duck typing).
    """

In [None]:
# Module: abstracts.reports.run (abstracts)
#
# Dependencies: None (base module)

from __future__ import annotations

from datetime import datetime
from typing import List, Optional, Protocol

class Report(Protocol):
    """
    Structural type for pipeline execution reports.

    Meant to be satisfied by PipelineReport and by any other object that
    exposes the execution results and metrics declared below.
    """

    # Identification
    pipeline_id: str
    execution_id: str
    # Either a plain string or an enum; an enum exposes its string via .value.
    status: str
    # Timing
    start_time: datetime
    end_time: Optional[datetime]
    duration_seconds: float
    # Error messages collected for the run
    errors: List[str]

    @property
    def success(self) -> bool:
        """Whether the pipeline executed successfully."""

In [None]:
# Module: pipeline_builder_base.runner.base_runner (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.logging, pipeline_builder_base.models

"""
Base runner with common runner patterns.

This module provides a base BaseRunner class that can be used
by all pipeline runner implementations to reduce code duplication.
"""

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, List, Optional

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..models import ExecutionResult, PipelineConfig, StepResult  # Removed: defined in notebook cells above

class BaseRunner:
    """
    Shared building blocks for pipeline runners.

    Concrete runner implementations can reuse the error logging and the
    result/report assembly helpers defined here instead of duplicating them.
    """

    def __init__(
        self,
        config: PipelineConfig,
        logger: Optional[PipelineLogger] = None,
    ):
        """
        Initialize the base runner.

        Args:
            config: Pipeline configuration
            logger: Logger to use; a new PipelineLogger is created when this
                is omitted (or falsy)
        """
        self.config = config
        self.logger = logger if logger else PipelineLogger()

    def _handle_step_error(
        self, step: Any, error: Exception, step_type: str = "step"
    ) -> None:
        """
        Log a failed step: one error-level summary, one debug-level detail line.

        Args:
            step: Step object that failed; its ``name`` attribute is used in
                the message ("unknown" when it has none)
            error: Exception that occurred
            step_type: Type of step (bronze/silver/gold) for logging; it is
                capitalized in the message
        """
        failed_step = getattr(step, "name", "unknown")
        label = step_type.capitalize()
        self.logger.error(f"❌ {label} step '{failed_step}' failed: {error!s}")
        self.logger.debug(f"Error details: {error}", exc_info=True)

    def _collect_step_results(self, results: List[StepResult]) -> ExecutionResult:
        """
        Wrap a list of step results in an ExecutionResult.

        Metrics are aggregated with ``PipelineMetrics.from_step_results`` and
        the run counts as successful only when every step result does. The
        attached execution context always uses ``ExecutionMode.INITIAL`` and
        takes both its start and end time from the current UTC time at the
        moment of collection.

        Args:
            results: List of step results

        Returns:
            ExecutionResult with aggregated results
        """
        from datetime import datetime, timezone

        # from ..models import ExecutionContext, ExecutionMode, PipelineMetrics  # Removed: defined in notebook cells above

        aggregated_metrics = PipelineMetrics.from_step_results(results)
        every_step_ok = all(step_result.success for step_result in results)

        execution_context = ExecutionContext(
            mode=ExecutionMode.INITIAL,
            start_time=datetime.now(timezone.utc),
            end_time=datetime.now(timezone.utc),
        )

        return ExecutionResult(
            context=execution_context,
            step_results=results,
            metrics=aggregated_metrics,
            success=every_step_ok,
        )

    def _create_pipeline_report(
        self,
        status: str,
        start_time: datetime,
        end_time: datetime,
        results: Optional[ExecutionResult] = None,
        errors: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Build a dictionary describing one pipeline execution.

        The report always contains ``status``, ``start_time``, ``end_time``
        and ``duration_seconds``. When ``results`` is truthy it also contains
        ``results``, ``step_count`` and ``success_count``; when ``errors`` is
        non-empty it also contains ``errors`` and ``error_count``.

        Args:
            status: Pipeline execution status
            start_time: Pipeline start time
            end_time: Pipeline end time
            results: Optional execution results
            errors: Optional list of error messages

        Returns:
            Dictionary representing pipeline report
        """
        elapsed = end_time - start_time
        report: Dict[str, Any] = {
            "status": status,
            "start_time": start_time,
            "end_time": end_time,
            "duration_seconds": elapsed.total_seconds(),
        }

        if results:
            report["results"] = results
            # A missing/empty step list counts as zero steps.
            if results.step_results:
                report["step_count"] = len(results.step_results)
            else:
                report["step_count"] = 0
            if results.step_results:
                report["success_count"] = len(
                    [r for r in results.step_results if r.success]
                )
            else:
                report["success_count"] = 0

        if errors:
            report["errors"] = errors
            report["error_count"] = len(errors)

        return report

    def _aggregate_step_reports(self, reports: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Roll several step reports up into a single summary report.

        A step report counts as successful when its ``status`` is exactly
        "success". The summary status is "success" when every step report
        succeeded, "partial_failure" otherwise, and "unknown" when no reports
        were given.

        Args:
            reports: List of step reports

        Returns:
            Aggregated report dictionary
        """
        if not reports:
            return {
                "status": "unknown",
                "step_count": 0,
                "success_count": 0,
            }

        succeeded = sum(1 for entry in reports if entry.get("status") == "success")
        overall = "success" if succeeded == len(reports) else "partial_failure"
        # Reports without a duration contribute 0.0 seconds.
        combined_duration = sum(entry.get("duration_seconds", 0.0) for entry in reports)

        return {
            "status": overall,
            "step_count": len(reports),
            "success_count": succeeded,
            "total_duration_seconds": combined_duration,
            "step_reports": reports,
        }

In [None]:
# Module: pipeline_builder_base.validation.pipeline_validator (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.logging, pipeline_builder_base.models

"""
Base pipeline validator with common validation logic.

This module provides a base UnifiedValidator class that can be used
by all pipeline builder implementations to validate pipeline configurations.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional, Set

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..models import PipelineConfig  # Removed: defined in notebook cells above

class UnifiedValidator:
    """
    Base pipeline validator with common validation logic.

    This class provides shared validation patterns that can be used
    by all pipeline builder implementations.
    """

    def __init__(self, logger: Optional[PipelineLogger] = None):
        """
        Initialize the pipeline validator.

        Args:
            logger: Optional logger instance for validation messages
        """
        self.logger = logger or PipelineLogger()

    def validate_step_names(
        self,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Check that step names are valid and unique across all three layers.

        A name is invalid when it is falsy or not a string. Layers are
        checked in bronze, silver, gold order, so a name reused in a later
        layer is reported as a duplicate of the earlier one.

        Args:
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            List of validation errors (empty if valid)
        """
        problems: List[str] = []
        seen: Set[str] = set()

        layers = (
            ("bronze", bronze_steps),
            ("silver", silver_steps),
            ("gold", gold_steps),
        )
        for layer, steps in layers:
            for name in steps:
                if not (name and isinstance(name, str)):
                    problems.append(f"Invalid {layer} step name: {name}")
                    continue
                if name in seen:
                    problems.append(f"Duplicate step name found: {name}")
                    continue
                seen.add(name)

        return problems

    def validate_bronze_steps(
        self,
        bronze_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Check that every bronze step has validation rules and a name.

        An attribute that is absent and one that is present but falsy are
        treated the same way.

        Args:
            bronze_steps: Dictionary of bronze steps

        Returns:
            List of validation errors (empty if valid)
        """
        problems: List[str] = []

        for step_name, step in bronze_steps.items():
            if not getattr(step, "rules", None):
                problems.append(f"Bronze step '{step_name}' missing validation rules")

            # Note: this message intentionally carries no step name.
            if not getattr(step, "name", None):
                problems.append("Bronze step missing name attribute")

        return problems

    def validate_silver_steps(
        self,
        silver_steps: Dict[str, Any],
        bronze_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Check silver step configuration and bronze dependencies.

        Every silver step must have validation rules and a table_name. In
        addition, a step must name an existing bronze step in
        ``source_bronze`` unless it is either:

        * validation-only (``existing`` is truthy and ``transform`` is None), or
        * SQL-sourced (``sql_source`` is not None).

        Args:
            silver_steps: Dictionary of silver steps
            bronze_steps: Dictionary of bronze steps

        Returns:
            List of validation errors (empty if valid)
        """
        problems: List[str] = []

        for step_name, step in silver_steps.items():
            validation_only = (
                getattr(step, "existing", False)
                and getattr(step, "transform", None) is None
            )
            # Validation-only and SQL-source steps have no bronze dependency.
            bronze_exempt = (
                validation_only or getattr(step, "sql_source", None) is not None
            )

            if not bronze_exempt:
                bronze_name = getattr(step, "source_bronze", None)
                if not bronze_name:
                    problems.append(f"Silver step '{step_name}' missing source_bronze")
                elif bronze_name not in bronze_steps:
                    problems.append(
                        f"Silver step '{step_name}' depends on non-existent bronze step '{bronze_name}'"
                    )

            # Checks common to every kind of silver step.
            if not getattr(step, "rules", None):
                problems.append(f"Silver step '{step_name}' missing validation rules")
            if not getattr(step, "table_name", None):
                problems.append(f"Silver step '{step_name}' missing table_name")

        return problems

    def validate_gold_steps(
        self,
        gold_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Check gold step configuration and silver dependencies.

        Every gold step must have validation rules and a table_name.
        ``source_silvers`` is optional; when it is set it must be a list, and
        each entry must name an existing silver step.

        Args:
            gold_steps: Dictionary of gold steps
            silver_steps: Dictionary of silver steps

        Returns:
            List of validation errors (empty if valid)
        """
        problems: List[str] = []

        for step_name, step in gold_steps.items():
            sources = getattr(step, "source_silvers", None)
            if sources and not isinstance(sources, list):
                problems.append(
                    f"Gold step '{step_name}' source_silvers must be a list"
                )
            elif sources:
                # One error per referenced silver step that does not exist.
                problems.extend(
                    f"Gold step '{step_name}' depends on non-existent silver step '{silver_name}'"
                    for silver_name in sources
                    if silver_name not in silver_steps
                )

            if not getattr(step, "rules", None):
                problems.append(f"Gold step '{step_name}' missing validation rules")

            if not getattr(step, "table_name", None):
                problems.append(f"Gold step '{step_name}' missing table_name")

        return problems

    def validate_dependencies(
        self,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Validate step dependencies and detect circular dependencies.

        A dependency graph is built from each silver step's ``source_bronze``
        and each gold step's ``source_silvers`` (only when it is a non-empty
        list), then searched depth-first for cycles. Steps are visited in
        insertion order (silver steps first, then gold steps), so the result
        is the same on every run.

        Args:
            bronze_steps: Dictionary of bronze steps (accepted for interface
                symmetry; not consulted when building the graph)
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            List of validation errors (empty if valid). Each message names
            the step the search started from when a cycle was reached.
        """
        errors: List[str] = []

        # Build dependency graph: step name -> names it depends on.
        dependencies: Dict[str, List[str]] = {}

        # Silver steps depend on bronze steps
        for step_name, step in silver_steps.items():
            source_bronze = getattr(step, "source_bronze", None)
            if source_bronze:
                dependencies[step_name] = [source_bronze]

        # Gold steps depend on silver steps
        for step_name, step in gold_steps.items():
            source_silvers = getattr(step, "source_silvers", None)
            if source_silvers and isinstance(source_silvers, list):
                dependencies[step_name] = source_silvers

        visited: Set[str] = set()
        rec_stack: Set[str] = set()  # nodes on the current DFS path only

        def has_cycle(node: str) -> bool:
            """Check if there's a cycle reachable from node."""
            visited.add(node)
            rec_stack.add(node)
            try:
                for neighbor in dependencies.get(node, []):
                    if neighbor in rec_stack:
                        # Back edge to a node on the current path: cycle.
                        return True
                    if neighbor not in visited and has_cycle(neighbor):
                        return True
                return False
            finally:
                # Always unwind, including when a cycle was found. Leaving
                # nodes behind would make a later, unrelated search mistake
                # an already-finished node for one on its own path and
                # report a cycle that is not there.
                rec_stack.discard(node)

        # Iterate the dict itself (insertion order) rather than a set of its
        # keys, whose order varies with string hash randomisation.
        for node in dependencies:
            if node not in visited and has_cycle(node):
                errors.append(
                    f"Circular dependency detected involving step '{node}'"
                )

        return errors

    def validate_schema(self, schema: Any) -> List[str]:
        """
        Check that a schema name is a usable, reasonably short string.

        The checks are applied in order and only the first failure is
        reported: falsy value, non-string, whitespace-only, longer than
        128 characters.

        Args:
            schema: Schema name to validate

        Returns:
            A one-element list with the failure message, or an empty list
            if the name is acceptable
        """
        if not schema:
            return ["Schema name cannot be empty"]
        if not isinstance(schema, str):
            return ["Schema name must be a string"]
        if not schema.strip():
            return ["Schema name cannot be whitespace only"]
        if len(schema) > 128:  # Reasonable limit
            return ["Schema name is too long (max 128 characters)"]
        return []

    def validate_pipeline(
        self,
        config: PipelineConfig,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Run every pipeline-level check and gather the messages.

        The checks run in a fixed order (schema, step names, bronze steps,
        silver steps, gold steps, dependencies) and their messages are
        concatenated in that same order.

        Args:
            config: Pipeline configuration; its ``schema`` is validated
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            All validation errors found (empty if valid)
        """
        results = (
            self.validate_schema(config.schema),
            self.validate_step_names(bronze_steps, silver_steps, gold_steps),
            self.validate_bronze_steps(bronze_steps),
            self.validate_silver_steps(silver_steps, bronze_steps),
            self.validate_gold_steps(gold_steps, silver_steps),
            self.validate_dependencies(bronze_steps, silver_steps, gold_steps),
        )

        collected: List[str] = []
        for messages in results:
            collected.extend(messages)
        return collected

In [None]:
# Module: pipeline_builder_base.validation.step_validator (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.logging

"""
Base step validator with common step validation patterns.

This module provides a base StepValidator class that can be used
by all pipeline builder implementations to validate individual steps.
"""

from __future__ import annotations

from typing import Any, List, Optional

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above

class StepValidator:
    """
    Reusable checks for a single pipeline step.

    Bundles the name, rules and dependency checks shared by the pipeline
    builder implementations. Every check returns a list of messages
    rather than raising, leaving the reporting decision to the caller.
    """

    def __init__(self, logger: Optional[PipelineLogger] = None):
        """
        Create a step validator.

        Args:
            logger: Logger to keep on the instance; a new PipelineLogger
                is created when none (or a falsy value) is given
        """
        self.logger = logger if logger else PipelineLogger()

    def validate_step_name(self, name: Any, step_type: str = "step") -> List[str]:
        """
        Check that a step name is a non-blank string of at most 128 chars.

        Only the first failing check is reported.

        Args:
            name: Candidate step name (any type is accepted and checked)
            step_type: Step type (bronze/silver/gold), capitalized and used
                as the prefix of the message

        Returns:
            A one-element list with the failure, or an empty list if valid
        """
        if not isinstance(name, str):
            return [f"{step_type.capitalize()} step name must be a string"]
        if not name:
            return [f"{step_type.capitalize()} step name cannot be empty"]
        if not name.strip():
            return [f"{step_type.capitalize()} step name cannot be whitespace only"]
        if len(name) > 128:  # Reasonable limit
            return [
                f"{step_type.capitalize()} step name is too long (max 128 characters)"
            ]
        return []

    def validate_step_rules(self, step: Any, step_type: str = "step") -> List[str]:
        """
        Check that a step carries a non-empty ``rules`` dictionary.

        Args:
            step: Step object to inspect
            step_type: Step type (bronze/silver/gold), capitalized and used
                as the prefix of the message

        Returns:
            A one-element list with the failure, or an empty list if valid
        """
        if not hasattr(step, "rules"):
            return [f"{step_type.capitalize()} step missing 'rules' attribute"]

        # Emptiness is tested before the type, so an empty list is reported
        # as "empty" rather than "not a dictionary".
        if not step.rules:
            label = getattr(step, "name", "unknown")
            return [
                f"{step_type.capitalize()} step '{label}' has empty validation rules"
            ]
        if not isinstance(step.rules, dict):
            label = getattr(step, "name", "unknown")
            return [
                f"{step_type.capitalize()} step '{label}' rules must be a dictionary"
            ]
        return []

    def classify_step_type(self, step: Any) -> str:
        """
        Work out which layer a step belongs to.

        A truthy ``type`` attribute wins when its lower-cased string form is
        one of the known layers; otherwise the step's class name is searched
        for "Bronze", "Silver" and "Gold", in that order.

        Args:
            step: Step object to classify

        Returns:
            'bronze', 'silver', 'gold', or 'unknown'
        """
        declared = getattr(step, "type", None)
        if declared:
            normalized = str(declared).lower()
            if normalized in ("bronze", "silver", "gold"):
                return normalized

        # Fall back to the class name.
        class_name = step.__class__.__name__
        for layer in ("Bronze", "Silver", "Gold"):
            if layer in class_name:
                return layer.lower()

        return "unknown"

    def validate_step_dependencies(
        self, step: Any, available_sources: List[str], step_type: str = "step"
    ) -> List[str]:
        """
        Check that the sources a step refers to are available.

        Silver steps are checked through ``source_bronze`` and gold steps
        through ``source_silvers``; any other step type yields no errors.

        Args:
            step: Step object to inspect
            available_sources: Names of the steps that may be referenced
            step_type: Step type (bronze/silver/gold) selecting the check

        Returns:
            List of validation errors (empty if valid)
        """
        label = getattr(step, "name", "unknown")

        if step_type == "silver":
            # Silver steps depend on a single bronze step.
            bronze_ref = getattr(step, "source_bronze", None)
            if bronze_ref and bronze_ref not in available_sources:
                return [
                    f"Silver step '{label}' references unknown bronze source '{bronze_ref}'"
                ]
            return []

        if step_type != "gold":
            return []

        # Gold steps depend on a list of silver steps.
        silver_refs = getattr(step, "source_silvers", None)
        if not silver_refs:
            return []
        if not isinstance(silver_refs, list):
            return [f"Gold step '{label}' source_silvers must be a list"]
        return [
            f"Gold step '{label}' references unknown silver source '{silver_ref}'"
            for silver_ref in silver_refs
            if silver_ref not in available_sources
        ]

    def validate_step(
        self, step: Any, available_sources: Optional[List[str]] = None
    ) -> List[str]:
        """
        Run the name, rules and (optionally) dependency checks on one step.

        Args:
            step: Step object to validate
            available_sources: Names of steps that may be referenced; the
                dependency check is skipped when this is None

        Returns:
            List of validation errors (empty if valid)
        """
        layer = self.classify_step_type(step)
        findings: List[str] = []

        # A falsy name is reported as a missing attribute.
        candidate_name = getattr(step, "name", None)
        if not candidate_name:
            findings.append(f"{layer.capitalize()} step missing 'name' attribute")
        else:
            findings += self.validate_step_name(candidate_name, layer)

        findings += self.validate_step_rules(step, layer)

        if available_sources is not None:
            findings += self.validate_step_dependencies(
                step, available_sources, layer
            )

        return findings

In [None]:
# Module: pipeline_builder.logging (pipeline_builder)
#
# Dependencies: pipeline_builder_base.logging

"""
Simplified logging system for the framework.

This module re-exports logging classes from pipeline_builder_base
for backward compatibility.
"""

from __future__ import annotations

# Re-export from base for backward compatibility
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# Export list kept from the original module. PipelineLogger itself is not
# defined in this cell; per the note above it comes from an earlier cell.
__all__ = ["PipelineLogger"]

In [None]:
# Module: pipeline_builder_base.writer.exceptions (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors

"""
Writer-specific exceptions.
"""

# from ..errors import SparkForgeError  # Removed: defined in notebook cells above

class WriterError(SparkForgeError):
    """Root exception for writer errors; extends SparkForgeError."""

class WriterConfigurationError(WriterError):
    """Writer error signalling an invalid writer configuration."""

class WriterValidationError(WriterError):
    """Writer error signalling that writer validation failed."""

class WriterTableError(WriterError):
    """Writer error signalling that a table operation failed."""

class WriterDataQualityError(WriterError):
    """Writer error signalling that a data quality check failed."""

class WriterPerformanceError(WriterError):
    """Writer error signalling that a performance issue was detected."""

In [None]:
# Module: pipeline_builder_base.steps.manager (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors, pipeline_builder_base.validation

"""
Step manager for managing step collections.

This module provides a StepManager class for managing pipeline step collections
with validation and query capabilities.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

# from ..errors import ValidationError  # Removed: defined in notebook cells above
# from ..validation import StepValidator  # Removed: defined in notebook cells above

class StepManager:
    """
    Manager for pipeline step collections.

    Steps are kept in three name-keyed dictionaries (``bronze_steps``,
    ``silver_steps`` and ``gold_steps``). The collection for a step type is
    looked up by attribute name, ``f"{step_type}_steps"``.
    """

    def __init__(self) -> None:
        """Initialize empty step collections and a default StepValidator."""
        self.bronze_steps: Dict[str, Any] = {}
        self.silver_steps: Dict[str, Any] = {}
        self.gold_steps: Dict[str, Any] = {}
        self.validator = StepValidator()

    def add_step(self, step: Any, step_type: str) -> None:
        """
        Add a step to the appropriate collection.

        Args:
            step: Step object to add
            step_type: Type of step (bronze/silver/gold)

        Raises:
            ValidationError: If the step has no name, ``step_type`` does not
                name a step collection, the name already exists in that
                collection, or the validator reports an error (only the
                first reported error is included in the message)
        """
        step_name = getattr(step, "name", None)
        if not step_name:
            raise ValidationError(
                f"{step_type.capitalize()} step missing 'name' attribute"
            )

        # Resolve the target collection. Falling back to a throwaway dict for
        # an unrecognised type would "store" the step in an object nobody
        # keeps, silently losing it, so such a type is rejected instead. The
        # isinstance check also rejects types whose "<type>_steps" attribute
        # exists but is not a collection (e.g. "validate_all").
        step_dict = getattr(self, f"{step_type}_steps", None)
        if not isinstance(step_dict, dict):
            raise ValidationError(
                f"Unknown step type '{step_type}' for step '{step_name}'",
                context={"step_name": step_name, "step_type": step_type},
            )

        # Check for duplicate name
        if step_name in step_dict:
            raise ValidationError(
                f"{step_type.capitalize()} step '{step_name}' already exists"
            )

        # Validate step (no dependency check: no sources are passed)
        errors = self.validator.validate_step(step)
        if errors:
            raise ValidationError(
                f"Invalid {step_type} step '{step_name}': {errors[0]}",
                context={"step_name": step_name, "step_type": step_type},
            )

        # Add step
        step_dict[step_name] = step

    def get_step(self, name: str, step_type: Optional[str] = None) -> Optional[Any]:
        """
        Get a step by name and optional type.

        Args:
            name: Step name
            step_type: Optional step type (bronze/silver/gold). If None (or
                otherwise falsy), bronze, silver and gold are searched in
                that order and the first match is returned.

        Returns:
            Step object if found, None otherwise
        """
        if step_type:
            step_dict = getattr(self, f"{step_type}_steps", {})
            return step_dict.get(name)

        # Search all step types; a separate loop variable keeps the
        # ``step_type`` argument intact.
        for candidate_type in ("bronze", "silver", "gold"):
            step_dict = getattr(self, f"{candidate_type}_steps", {})
            if name in step_dict:
                return step_dict[name]
        return None

    def get_all_steps(self) -> Dict[str, Dict[str, Any]]:
        """
        Get all steps grouped by type.

        Returns:
            Dictionary mapping step types to the manager's own step
            dictionaries (not copies)
        """
        return {
            "bronze": self.bronze_steps,
            "silver": self.silver_steps,
            "gold": self.gold_steps,
        }

    def get_steps_by_type(self, step_type: str) -> Dict[str, Any]:
        """
        Get all steps of a specific type.

        Args:
            step_type: Type of steps to get (bronze/silver/gold)

        Returns:
            Shallow copy of the steps of the specified type; empty for a
            type that has no collection
        """
        step_dict = getattr(self, f"{step_type}_steps", {})
        return step_dict.copy()

    def validate_all_steps(self) -> List[str]:
        """
        Validate all steps in the manager.

        Bronze steps are validated without dependency checks, silver steps
        against the bronze step names, and gold steps against the silver
        step names. Each message is prefixed with the step's type and name.

        Returns:
            List of validation errors (empty if all valid)
        """
        errors: List[str] = []

        # Validate bronze steps
        for step_name, step in self.bronze_steps.items():
            step_errors = self.validator.validate_step(step)
            errors.extend([f"Bronze step '{step_name}': {e}" for e in step_errors])

        # Validate silver steps
        bronze_names = list(self.bronze_steps.keys())
        for step_name, step in self.silver_steps.items():
            step_errors = self.validator.validate_step(
                step, available_sources=bronze_names
            )
            errors.extend([f"Silver step '{step_name}': {e}" for e in step_errors])

        # Validate gold steps
        silver_names = list(self.silver_steps.keys())
        for step_name, step in self.gold_steps.items():
            step_errors = self.validator.validate_step(
                step, available_sources=silver_names
            )
            errors.extend([f"Gold step '{step_name}': {e}" for e in step_errors])

        return errors

In [None]:
# Module: pipeline_builder_base.builder.base_builder (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors, pipeline_builder_base.logging, pipeline_builder_base.models, pipeline_builder_base.validation

"""
Base pipeline builder with common builder patterns.

This module provides a base BasePipelineBuilder class that can be used
by all pipeline builder implementations to reduce code duplication.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

# from ..errors import ValidationError  # Removed: defined in notebook cells above
# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..models import PipelineConfig  # Removed: defined in notebook cells above
# from ..validation import UnifiedValidator, StepValidator  # Removed: defined in notebook cells above

class BasePipelineBuilder:
    """
    Shared foundation for pipeline builder implementations.

    Holds the pipeline configuration, a logger, the validators and the
    three step dictionaries, and provides helper checks that raise
    ValidationError as soon as a problem is found.
    """

    def __init__(
        self,
        config: PipelineConfig,
        logger: Optional[PipelineLogger] = None,
    ):
        """
        Set up configuration, logging, validators and empty step storage.

        Args:
            config: Pipeline configuration
            logger: Logger shared with both validators; a new
                PipelineLogger is created when none (or a falsy value)
                is given
        """
        self.config = config
        self.logger = logger if logger else PipelineLogger()
        self.validator = UnifiedValidator(self.logger)
        self.step_validator = StepValidator(self.logger)

        # Step storage - subclasses should initialize these
        self.bronze_steps: Dict[str, Any] = {}
        self.silver_steps: Dict[str, Any] = {}
        self.gold_steps: Dict[str, Any] = {}

    def _check_duplicate_step_name(self, name: str, step_type: str) -> None:
        """
        Reject a step name that is already registered for its type.

        The collection is looked up as ``f"{step_type}_steps"``; a type
        without such an attribute is treated as having no steps.

        Args:
            name: Step name to check
            step_type: Type of step (bronze/silver/gold)

        Raises:
            ValidationError: If the name is already present
        """
        registered = getattr(self, f"{step_type}_steps", {})
        if name not in registered:
            return

        raise ValidationError(
            f"{step_type.capitalize()} step '{name}' already exists",
            context={"step_name": name, "step_type": step_type},
            suggestions=[
                "Use a different step name",
                "Remove the existing step first",
            ],
        )

    def _validate_step_dependencies(
        self,
        step: Any,
        step_type: str,
        available_sources: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Ensure the upstream steps a step refers to are known.

        Silver steps are checked through ``source_bronze`` and gold steps
        through ``source_silvers``; other step types are not checked. The
        first missing dependency aborts the check.

        Args:
            step: Step object to validate
            step_type: Type of step (bronze/silver/gold)
            available_sources: Steps that may be referenced; when None the
                builder's own bronze (for silver) or silver (for gold)
                steps are used

        Raises:
            ValidationError: If a referenced step is unknown, or a gold
                step's ``source_silvers`` is not a list
        """
        if step_type == "silver":
            bronze_ref = getattr(step, "source_bronze", None)
            if not bronze_ref:
                return

            known = self.bronze_steps if available_sources is None else available_sources
            if bronze_ref in known:
                return

            raise ValidationError(
                f"Bronze step '{bronze_ref}' not found",
                context={
                    "step_name": getattr(step, "name", "unknown"),
                    "step_type": step_type,
                    "missing_dependency": bronze_ref,
                },
                suggestions=[
                    f"Add bronze step '{bronze_ref}' first",
                    f"Check spelling of '{bronze_ref}'",
                ],
            )

        if step_type != "gold":
            return

        silver_refs = getattr(step, "source_silvers", None)
        if not silver_refs:
            return

        if not isinstance(silver_refs, list):
            raise ValidationError(
                "Gold step source_silvers must be a list",
                context={
                    "step_name": getattr(step, "name", "unknown"),
                    "step_type": step_type,
                },
            )

        known = self.silver_steps if available_sources is None else available_sources
        for silver_ref in silver_refs:
            if silver_ref in known:
                continue
            raise ValidationError(
                f"Silver step '{silver_ref}' not found",
                context={
                    "step_name": getattr(step, "name", "unknown"),
                    "step_type": step_type,
                    "missing_dependency": silver_ref,
                },
                suggestions=[
                    f"Add silver step '{silver_ref}' first",
                    f"Check spelling of '{silver_ref}'",
                ],
            )

    def _validate_schema(self, schema: str) -> None:
        """
        Raise if the unified validator reports a problem with the schema.

        Args:
            schema: Schema name to validate

        Raises:
            ValidationError: Carrying the first reported problem
        """
        problems = self.validator.validate_schema(schema)
        if not problems:
            return

        raise ValidationError(
            problems[0],
            context={"schema": schema},
            suggestions=[
                "Schema name must be a non-empty string",
                "Schema name must be 128 characters or less",
            ],
        )

    def validate_pipeline(self) -> List[str]:
        """
        Validate the builder's configuration and registered steps.

        Delegates to the unified validator with this builder's config and
        step dictionaries.

        Returns:
            List of validation errors (empty if valid)
        """
        validator = self.validator
        return validator.validate_pipeline(
            self.config, self.bronze_steps, self.silver_steps, self.gold_steps
        )

In [None]:
# Module: pipeline_builder.errors (pipeline_builder)
#
# Dependencies: pipeline_builder_base.errors

"""
Simplified error handling system for the framework.

This module re-exports error classes from pipeline_builder_base
for backward compatibility.
"""

from __future__ import annotations

# Re-export from base for backward compatibility
# from .errors import (  # Removed: defined in notebook cells above
    # ConfigurationError,
    # DataError,
    # ErrorCategory,
    # ErrorContext,
    # ErrorContextValue,
    # ErrorSeverity,
    # ErrorSuggestions,
    # ExecutionError,
    # PerformanceError,
    # PipelineValidationError,
    # ResourceError,
    # SparkForgeError,
    # SystemError,
    # ValidationError,
# )

# Names this module declares as public.
# NOTE(review): the backward-compatibility aliases assigned below
# (PipelineConfigurationError, PipelineExecutionError, TableOperationError,
# DependencyError, StepError, PipelineError) are not listed here - confirm
# whether that omission is intentional.
__all__ = [
    "SparkForgeError",
    "ValidationError",
    "PipelineValidationError",
    "ConfigurationError",
    "ExecutionError",
    "DataError",
    "SystemError",
    "PerformanceError",
    "ResourceError",
    "ErrorSeverity",
    "ErrorCategory",
    "ErrorContext",
    "ErrorContextValue",
    "ErrorSuggestions",
]

# Backward compatibility aliases: each legacy exception name is bound to an
# existing error class, so the alias and its target are the same class object.
# PipelineValidationError gets no alias here: it appears in the (commented-out)
# import list above and is expected to be defined already.
# PipelineValidationError = ValidationError  # Already defined in import
PipelineConfigurationError = ConfigurationError
PipelineExecutionError = ExecutionError
TableOperationError = DataError
DependencyError = ValidationError
StepError = ExecutionError
PipelineError = ExecutionError

In [None]:
# Module: pipeline_builder.sql_source.models (pipeline_builder)
#
# Dependencies: pipeline_builder_base.errors

"""
SQL source configuration models for pipeline_builder.

JdbcSource: read via PySpark spark.read.jdbc() (no extra deps; JDBC driver JAR on classpath).
SqlAlchemySource: read via SQLAlchemy + pandas then spark.createDataFrame (optional [sql] extra).
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Optional
# from .errors import ValidationError  # Removed: defined in notebook cells above

def _validate_table_or_query(
    table: Optional[str],
    query: Optional[str],
    source_type: str,
) -> None:
    """Raise ValidationError unless exactly one of table/query is set.

    A value counts as set when it is not None and not blank after
    stripping whitespace. ``source_type`` only prefixes the message.
    """
    table_given = not (table is None or table.strip() == "")
    query_given = not (query is None or query.strip() == "")

    if table_given and query_given:
        raise ValidationError(
            f"{source_type} requires exactly one of 'table' or 'query'; got both."
        )
    if not (table_given or query_given):
        raise ValidationError(
            f"{source_type} requires exactly one of 'table' or 'query'; got neither."
        )

@dataclass
class JdbcSource:
    """
    SQL source for pipeline steps that is read over JDBC.

    Reading goes through spark.read.jdbc(), so the JDBC driver JAR for the
    database has to be on the Spark application classpath.

    Set exactly one of `table` or `query`. For `query`, Spark expects a
    subquery alias, e.g. "(SELECT * FROM t WHERE x > 1) AS q".
    """

    url: str
    properties: Dict[str, str]
    table: Optional[str] = None
    query: Optional[str] = None
    driver: Optional[str] = None

    def __post_init__(self) -> None:
        """Validate url, properties and the table/query choice."""
        # The url must be truthy and a string.
        if not (self.url and isinstance(self.url, str)):
            raise ValidationError("JdbcSource requires a non-empty 'url'.")

        properties_ok = isinstance(self.properties, dict)
        if not properties_ok:
            raise ValidationError("JdbcSource 'properties' must be a dict.")

        _validate_table_or_query(self.table, self.query, "JdbcSource")

@dataclass
class SqlAlchemySource:
    """
    SQL source for pipeline steps that is read through SQLAlchemy.

    Reading goes through SQLAlchemy + pandas (read_sql_table or read_sql)
    followed by spark.createDataFrame(). Requires
    pip install pipeline_builder[sql].

    Set exactly one of `url` or `engine`, and exactly one of `table` or
    `query`. `schema` may be set for table reads (e.g. 'public' for
    PostgreSQL).
    """

    url: Optional[str] = None
    engine: Any = None
    table: Optional[str] = None
    query: Optional[str] = None
    schema: Optional[str] = None

    def __post_init__(self) -> None:
        """Validate the url/engine choice and the table/query choice."""
        # A url only counts when it is a non-blank string; any non-None
        # engine counts.
        url_given = isinstance(self.url, str) and self.url.strip() != ""
        engine_given = self.engine is not None

        if url_given and engine_given:
            raise ValidationError(
                "SqlAlchemySource requires exactly one of 'url' or 'engine'; got both."
            )
        if not (url_given or engine_given):
            raise ValidationError(
                "SqlAlchemySource requires exactly one of 'url' or 'engine'; got neither."
            )

        _validate_table_or_query(self.table, self.query, "SqlAlchemySource")

In [None]:
# Module: pipeline_builder.errors.error_handler (pipeline_builder)
#
# Dependencies: pipeline_builder_base.errors, pipeline_builder_base.logging

"""Error handling utilities for pipeline operations.

This module provides centralized error handling with consistent error wrapping
and context addition. The ErrorHandler ensures all errors are wrapped in
ExecutionError with appropriate context and suggestions.
"""

from __future__ import annotations

from contextlib import contextmanager
from functools import wraps
from typing import Any, Callable, Dict, Generator, List, Optional, TypeVar, Union
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# Type variable bounded to any callable.
# NOTE(review): the notebook's import cell binds the name `F` to
# `pyspark.sql.functions` (`from pyspark.sql import ... functions as F`).
# If all cells share one namespace, this assignment rebinds `F` to a TypeVar
# for every cell - confirm nothing relies on `F.<function>` at runtime
# after this cell has executed.
F = TypeVar("F", bound=Callable[..., Any])
# Either an optional dict, or a callable that returns a dict.
ContextType = Union[Optional[Dict[str, Any]], Callable[..., Dict[str, Any]]]
# Either an optional list of strings, or a callable that returns one.
SuggestionsType = Union[Optional[List[str]], Callable[..., List[str]]]

class ErrorHandler:
    """Centralized error handler for pipeline operations.

    Provides consistent error wrapping and context addition. Any exception
    that is not already an ExecutionError is re-raised as an ExecutionError
    carrying the caller-supplied context and suggestions, chained to the
    original exception.

    Attributes:
        logger: PipelineLogger instance for logging.

    Example:
        Using as context manager:

        >>> from pipeline_builder.errors.error_handler import ErrorHandler
        >>>
        >>> handler = ErrorHandler()
        >>> with handler.handle_errors(
        ...     "table write",
        ...     context={"table": "analytics.events"},
        ...     suggestions=["Check table permissions", "Verify schema"]
        ... ):
        ...     df.write.saveAsTable("analytics.events")

        Using as decorator:

        >>> @handler.wrap_error("data validation")
        ... def validate_data(df):
        ...     # validation logic
        ...     pass
    """

    def __init__(
        self,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the error handler.

        Args:
            logger: Optional PipelineLogger instance. If None, creates a
                default logger.
        """
        self.logger = logger or PipelineLogger()

    @contextmanager
    def handle_errors(
        self,
        operation: str,
        context: Optional[Dict[str, Any]] = None,
        suggestions: Optional[List[str]] = None,
    ) -> Generator[None, None, None]:
        """Context manager for error handling.

        Wraps code in a context manager that catches exceptions and wraps
        them in ExecutionError with context and suggestions. ExecutionError
        exceptions are re-raised as-is.

        Args:
            operation: Description of the operation being performed (used in
                error messages).
            context: Optional dictionary with additional context about the
                operation (e.g., table name, step name).
            suggestions: Optional list of suggestions for fixing errors.

        Yields:
            None (context manager yields control to the wrapped code).

        Raises:
            ExecutionError: Wrapped error with context and suggestions.
                ExecutionError exceptions are re-raised as-is without wrapping.

        Example:
            >>> with handler.handle_errors(
            ...     "table write",
            ...     context={"table": "analytics.events"},
            ...     suggestions=["Check permissions", "Verify schema"]
            ... ):
            ...     df.write.saveAsTable("analytics.events")
        """
        try:
            yield
        except ExecutionError:
            # Re-raise ExecutionError as-is (already has context)
            raise
        except Exception as e:
            # Wrap other exceptions
            raise ExecutionError(
                f"Error during {operation}: {str(e)}",
                context=context or {},
                suggestions=suggestions or [],
            ) from e

    def wrap_error(
        self,
        operation: str,
        context: ContextType = None,
        suggestions: SuggestionsType = None,
    ) -> Callable[[F], F]:
        """Decorator for wrapping function errors.

        Decorator that wraps function exceptions in ExecutionError with context
        and suggestions. Context and suggestions can be callables that receive
        the decorated function's arguments for dynamic error messages.

        Callable context/suggestions are evaluated only after the wrapped
        function has failed, so the success path never pays for them (the
        example below would otherwise trigger ``df.count()`` on every call).
        If such a callable itself raises, a warning is logged and the empty
        default is used, so the original failure is still reported instead
        of being masked.

        Args:
            operation: Description of the operation being performed (used in
                error messages).
            context: Optional dictionary with additional context, or a callable
                that receives function args and returns a context dictionary.
            suggestions: Optional list of suggestions, or a callable that
                receives function args and returns a list of suggestions.

        Returns:
            Decorator function that wraps the target function.

        Raises:
            ExecutionError: Raised by the decorated function when the wrapped
                function raises any other exception. ExecutionError raised by
                the wrapped function propagates unchanged.

        Example:
            >>> @handler.wrap_error(
            ...     "data validation",
            ...     context=lambda df, rules: {"df_rows": df.count(), "rules_count": len(rules)},
            ...     suggestions=["Check data quality", "Review validation rules"]
            ... )
            ... def validate_data(df, rules):
            ...     # validation logic
            ...     pass
        """

        def decorator(func: F) -> F:
            @wraps(func)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                try:
                    return func(*args, **kwargs)
                except ExecutionError:
                    # Re-raise ExecutionError as-is
                    raise
                except Exception as e:
                    # Wrap other exceptions; details are resolved here, lazily.
                    raise ExecutionError(
                        f"Error during {operation}: {str(e)}",
                        context=self._resolve_detail(
                            context, {}, "context", args, kwargs
                        ),
                        suggestions=self._resolve_detail(
                            suggestions, [], "suggestions", args, kwargs
                        ),
                    ) from e

            return wrapper  # type: ignore[return-value]

        return decorator

    def _resolve_detail(
        self,
        spec: Any,
        default: Any,
        label: str,
        args: Any,
        kwargs: Any,
    ) -> Any:
        """Turn a static-or-callable error detail into a concrete value.

        Args:
            spec: The value passed to wrap_error (static value, None, or a
                callable taking the wrapped function's arguments).
            default: Value used when ``spec`` is falsy or its callable fails.
            label: Short name of the detail, used in the warning message.
            args: Positional arguments the wrapped function was called with.
            kwargs: Keyword arguments the wrapped function was called with.

        Returns:
            The resolved detail value.
        """
        if not callable(spec):
            return spec or default
        try:
            return spec(*args, **kwargs)
        except Exception as detail_error:
            # Never let a broken detail builder hide the real failure.
            self.logger.warning(f"Could not build error {label}: {detail_error}")
            return default

In [None]:
# Module: pipeline_builder_base.builder.step_classifier (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.dependencies, pipeline_builder_base.dependencies.graph

"""
Step classifier utility for identifying and grouping steps.

This module provides utilities for classifying steps by type,
extracting dependencies, and building dependency graphs.
"""

from __future__ import annotations

from typing import Any, Dict, List, Set

# from ..dependencies import DependencyGraph  # Removed: defined in notebook cells above

class StepClassifier:
    """
    Utility class for classifying and analyzing pipeline steps.

    All methods are static; the class only serves as a namespace.
    """

    # Layer labels that classify_step_type may return (besides "unknown").
    _KNOWN_TYPES = ("bronze", "silver", "gold")

    @staticmethod
    def _as_name_list(value: Any) -> List[str]:
        """Normalize a dependency declaration into a list of names.

        A single string becomes a one-element list; lists and tuples are
        copied; anything else (including None and empty values) yields an
        empty list.
        """
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    @staticmethod
    def classify_step_type(step: Any) -> str:
        """
        Classify step type from step object.

        The step's ``type`` attribute is consulted first, both as-is and,
        for Enum-like values, through its ``.value``. If that does not yield
        a known layer, the class name is searched for "Bronze", "Silver" or
        "Gold" (in that order).

        Args:
            step: Step object to classify

        Returns:
            Step type: 'bronze', 'silver', 'gold', or 'unknown'
        """
        declared = getattr(step, "type", None)
        if declared:
            # str(SomeEnum.MEMBER) is usually "SomeEnum.MEMBER", so also try
            # the member's value to recognise Enum-typed step types.
            for candidate in (declared, getattr(declared, "value", None)):
                if candidate is None:
                    continue
                label = str(candidate).lower()
                if label in StepClassifier._KNOWN_TYPES:
                    return label

        # Determine type from class name
        class_name = step.__class__.__name__
        for marker in ("Bronze", "Silver", "Gold"):
            if marker in class_name:
                return marker.lower()

        return "unknown"

    @staticmethod
    def extract_step_dependencies(step: Any) -> List[str]:
        """
        Extract dependencies from a step.

        Reads, in order, ``source_bronze`` (silver steps), ``source_silvers``
        (gold steps) and ``source`` (backward compatibility). Each attribute
        may be a single name or a list/tuple of names. A name is reported
        once, at its first occurrence.

        Args:
            step: Step object to analyze

        Returns:
            List of dependency step names, without duplicates
        """
        dependencies: List[str] = []
        for attr in ("source_bronze", "source_silvers", "source"):
            declared = getattr(step, attr, None)
            for dep_name in StepClassifier._as_name_list(declared):
                if dep_name not in dependencies:
                    dependencies.append(dep_name)
        return dependencies

    @staticmethod
    def group_steps_by_type(
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> Dict[str, Dict[str, Any]]:
        """
        Group steps by type into a single dictionary.

        The given dictionaries are referenced, not copied.

        Args:
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            Dictionary mapping step types to step dictionaries
        """
        return {
            "bronze": bronze_steps,
            "silver": silver_steps,
            "gold": gold_steps,
        }

    @staticmethod
    def get_all_step_names(
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> Set[str]:
        """
        Get all step names from all step types.

        Args:
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            Set of all step names
        """
        return set(bronze_steps) | set(silver_steps) | set(gold_steps)

    @staticmethod
    def build_dependency_graph(
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> DependencyGraph:
        """
        Build a dependency graph from pipeline steps.

        Nodes are added layer by layer (bronze, silver, gold) in dictionary
        order so that the node insertion order is the same on every run. A
        name that appears in more than one layer is registered once, with the
        type of the first layer that contains it. Dependencies that point at
        a name not present in the graph are skipped silently.

        Args:
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            DependencyGraph instance
        """
        # from ..dependencies.graph import StepNode, StepType  # Removed: defined in notebook cells above

        graph = DependencyGraph()

        # Add all steps as nodes. Iterating the dicts (instead of a set of
        # names) keeps the order independent of string hash randomization.
        layers = (
            (bronze_steps, StepType.BRONZE),
            (silver_steps, StepType.SILVER),
            (gold_steps, StepType.GOLD),
        )
        registered: Set[str] = set()
        for steps, step_type in layers:
            for step_name in steps:
                if step_name in registered:
                    continue
                registered.add(step_name)
                graph.add_node(StepNode(name=step_name, step_type=step_type))

        # Add dependencies for silver steps (depend on bronze)
        for step_name, step in silver_steps.items():
            source_bronze = getattr(step, "source_bronze", None)
            if source_bronze and source_bronze in graph.nodes:
                # add_dependency(from_step, to_step) means from_step depends on to_step
                # So silver depends on bronze: add_dependency(silver, bronze)
                graph.add_dependency(step_name, source_bronze)

        # Add dependencies for gold steps (depend on silver)
        for step_name, step in gold_steps.items():
            declared = getattr(step, "source_silvers", None)
            for silver_name in StepClassifier._as_name_list(declared):
                if silver_name in graph.nodes:
                    graph.add_dependency(step_name, silver_name)

        return graph

    @staticmethod
    def get_execution_order(
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> List[str]:
        """
        Get execution order for all steps based on dependencies.

        Args:
            bronze_steps: Dictionary of bronze steps
            silver_steps: Dictionary of silver steps
            gold_steps: Dictionary of gold steps

        Returns:
            List of step names in execution order, as produced by the
            graph's topological_sort()
        """
        graph = StepClassifier.build_dependency_graph(
            bronze_steps, silver_steps, gold_steps
        )
        return graph.topological_sort()

In [None]:
# Module: pipeline_builder_base.dependencies.analyzer (pipeline_builder_base)
#
# Dependencies: dependencies.exceptions, pipeline_builder_base.dependencies.exceptions, pipeline_builder_base.dependencies.graph, pipeline_builder_base.logging

"""
Unified dependency analyzer for the framework pipelines.

This module provides a single, consolidated dependency analyzer that works
with any step implementation that follows the step protocols.

"""

from __future__ import annotations

import hashlib
import time
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Optional, Protocol

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from .exceptions import DependencyError  # Removed: defined in notebook cells above
# from .graph import DependencyGraph, StepNode, StepType  # Removed: defined in notebook cells above

class AnalysisStrategy(Enum):
    """Strategies for dependency analysis.

    NOTE(review): in DependencyAnalyzer as defined in this module the selected
    strategy is only written to the start-of-analysis log line and mixed into
    the cache key; none of the graph-building, cycle, or conflict steps branch
    on it. The per-member descriptions below state the intended meaning --
    confirm where (if anywhere) it is enforced.
    """

    CONSERVATIVE = "conservative"  # Assume all dependencies exist
    OPTIMISTIC = "optimistic"  # Assume minimal dependencies
    HYBRID = "hybrid"  # Balance between conservative and optimistic

@dataclass
class DependencyAnalysisResult:
    """Result of dependency analysis.

    Attributes (as populated by DependencyAnalyzer.analyze_dependencies):
        graph: The dependency graph, after any detected cycles were broken.
        execution_order: Step names returned by the graph's topological sort.
        cycles: Cycles reported by the graph's detect_cycles() before they
            were broken; each cycle is a list of step names.
        conflicts: Human-readable conflict messages.
        recommendations: Human-readable recommendation messages.
        stats: Whatever the graph's get_stats() returned.
        analysis_duration: Wall-clock duration of the analysis in seconds.
    """

    graph: DependencyGraph
    execution_order: list[
        str
    ]  # Topologically sorted step names for sequential execution
    cycles: list[list[str]]
    conflicts: list[str]
    recommendations: list[str]
    stats: Dict[str, Any]
    analysis_duration: float

# Protocol for step objects that can be analyzed
class StepProtocol(Protocol):
    """Protocol for steps that can be analyzed for dependencies.

    Structural type only: it is used in annotations and is not declared
    runtime-checkable here.
    """

    # Step name.
    name: str

class BronzeStepProtocol(StepProtocol, Protocol):
    """Protocol for bronze steps.

    Adds ``incremental_col`` to the base step protocol. DependencyAnalyzer in
    this module does not read that attribute; bronze steps only become graph
    nodes there.
    """

    # Name of the incremental column, or None.
    # NOTE(review): exact semantics are defined by the step classes -- confirm.
    incremental_col: Optional[str]

class SilverStepProtocol(StepProtocol, Protocol):
    """Protocol for silver steps.

    DependencyAnalyzer in this module additionally looks up the optional
    attributes ``source_silvers`` and ``depends_on`` on silver steps via
    getattr; they are not part of this protocol.
    """

    # Name of the bronze step this silver step reads from.
    source_bronze: str

class GoldStepProtocol(StepProtocol, Protocol):
    """Protocol for gold steps."""

    # Names of the upstream steps this gold step depends on; may be None.
    source_silvers: Optional[list[str]]

class DependencyAnalyzer:
    """
    Unified dependency analyzer for all pipeline step types.

    This analyzer works with any step implementation that follows the step protocols.
    It analyzes dependencies across bronze, silver, and gold steps.

    Features:
        - Single analyzer for all step types (Bronze, Silver, Gold)
        - Multiple analysis strategies
        - Cycle detection and resolution
        - Topological sort for execution order
        - Performance analysis and recommendations

    Results are cached per analyzer instance. The cache key covers the
    strategy, the step names of each layer, the dependencies each step
    declares, and the creation order, so changing any of these triggers a
    fresh analysis.
    """

    # (attribute name, noun used in the "non-existent ..." warning) pairs,
    # in the order in which a silver step's dependencies are linked.
    _SILVER_DEPENDENCY_ATTRS = (
        ("source_bronze", "bronze step"),
        ("source_silvers", "silver step"),
        ("depends_on", "dependency"),
    )
    # Same, for gold steps.
    _GOLD_DEPENDENCY_ATTRS = (("source_silvers", "silver step"),)

    def __init__(
        self,
        strategy: AnalysisStrategy = AnalysisStrategy.HYBRID,
        logger: Optional[PipelineLogger] = None,
    ):
        """Create an analyzer.

        Args:
            strategy: Analysis strategy; recorded in logs and the cache key.
            logger: Logger to use. A default PipelineLogger is created when
                None is given.
        """
        self.strategy = strategy
        if logger is None:
            self.logger = PipelineLogger()
        else:
            self.logger = logger
        self._analysis_cache: Dict[str, DependencyAnalysisResult] = {}

    def analyze_dependencies(
        self,
        bronze_steps: Optional[Dict[str, BronzeStepProtocol]] = None,
        silver_steps: Optional[Dict[str, SilverStepProtocol]] = None,
        gold_steps: Optional[Dict[str, GoldStepProtocol]] = None,
        force_refresh: bool = False,
        creation_order: Optional[Dict[str, int]] = None,
    ) -> DependencyAnalysisResult:
        """
        Analyze dependencies across all step types.

        Args:
            bronze_steps: Dictionary of bronze steps (any object with name and incremental_col)
            silver_steps: Dictionary of silver steps (any object with name and source_bronze)
            gold_steps: Dictionary of gold steps (any object with name and source_silvers)
            force_refresh: Whether to force refresh of cached results
            creation_order: Optional mapping of step name to creation index.
                It is stored in each node's metadata and passed to the
                graph's topological sort.

        Returns:
            DependencyAnalysisResult containing analysis results. On a cache
            hit the previously stored result object is returned as-is.

        Raises:
            DependencyError: If any analysis step fails; the original
                exception is chained as the cause.
        """
        start_time = time.time()

        # Create cache key
        cache_key = self._create_cache_key(
            bronze_steps, silver_steps, gold_steps, creation_order
        )

        if not force_refresh and cache_key in self._analysis_cache:
            self.logger.info(f"Using cached dependency analysis: {cache_key}")
            return self._analysis_cache[cache_key]

        self.logger.info(
            f"Starting dependency analysis with strategy: {self.strategy.value}"
        )

        try:
            # Step 1: Build dependency graph with creation order
            graph = self._build_dependency_graph(
                bronze_steps, silver_steps, gold_steps, creation_order=creation_order
            )

            # Step 2: Detect cycles
            cycles = graph.detect_cycles()
            if cycles:
                self.logger.warning(f"Detected {len(cycles)} circular dependencies")
                graph = self._resolve_cycles(graph, cycles)

            # Step 3: Detect conflicts
            conflicts = self._detect_conflicts(graph)
            if conflicts:
                self.logger.warning(f"Detected {len(conflicts)} dependency conflicts")

            # Step 4: Generate execution order (topological sort with creation order tie-breaker)
            execution_order = graph.topological_sort(creation_order=creation_order)

            # Step 5: Generate recommendations
            recommendations = self._generate_recommendations(graph, cycles, conflicts)

            # Step 6: Calculate statistics
            stats = graph.get_stats()

            # Create result
            result = DependencyAnalysisResult(
                graph=graph,
                execution_order=execution_order,
                cycles=cycles,
                conflicts=conflicts,
                recommendations=recommendations,
                stats=stats,
                analysis_duration=time.time() - start_time,
            )

            # Cache result
            self._analysis_cache[cache_key] = result

            self.logger.info(
                f"Dependency analysis completed in {result.analysis_duration:.2f}s"
            )
            return result

        except Exception as e:
            self.logger.error(f"Dependency analysis failed: {str(e)}")
            raise DependencyError(f"Dependency analysis failed: {str(e)}") from e

    @staticmethod
    def _dependency_names(value: Any) -> list[str]:
        """Normalize a dependency declaration into a list of names.

        A single string is one name (it is never iterated character by
        character); lists and tuples keep their order; sets are sorted so the
        result is deterministic. None, empty values and any other type yield
        an empty list.
        """
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, (set, frozenset)):
            return sorted(value, key=str)
        return []

    def _add_layer_nodes(
        self,
        graph: DependencyGraph,
        steps: Optional[Dict[str, Any]],
        step_type: Any,
        creation_order: Optional[Dict[str, int]],
    ) -> None:
        """Add one node per step of a layer, recording creation order if known."""
        for name, step in (steps or {}).items():
            metadata: Dict[str, Any] = {"step": step}
            if creation_order and name in creation_order:
                metadata["creation_order"] = creation_order[name]
            graph.add_node(StepNode(name=name, step_type=step_type, metadata=metadata))

    def _link_layer(
        self,
        graph: DependencyGraph,
        steps: Optional[Dict[str, Any]],
        layer_label: str,
        dependency_attrs: Any,
    ) -> None:
        """Add edges for every dependency the steps of one layer declare.

        A dependency on a name that is not (yet) a node in the graph is not
        added; a warning is logged instead.
        """
        for name, step in (steps or {}).items():
            for attr, target_label in dependency_attrs:
                for dep in self._dependency_names(getattr(step, attr, None)):
                    if dep in graph.nodes:
                        # add_dependency(a, b): a depends on b
                        graph.add_dependency(name, dep)
                    else:
                        self.logger.warning(
                            f"{layer_label} step {name} references non-existent {target_label} {dep}"
                        )

    def _build_dependency_graph(
        self,
        bronze_steps: Optional[Dict[str, BronzeStepProtocol]],
        silver_steps: Optional[Dict[str, SilverStepProtocol]],
        gold_steps: Optional[Dict[str, GoldStepProtocol]],
        creation_order: Optional[Dict[str, int]] = None,
    ) -> DependencyGraph:
        """Build the dependency graph from all step types.

        Order matters: a layer's dependencies are linked after that layer's
        nodes exist but before the next layer's nodes are added. Silver steps
        can therefore depend on bronze and silver steps; gold steps can depend
        on any step. Explicit dependencies (source_bronze, source_silvers,
        depends_on) are what create edges; creation order is only metadata.
        """
        graph = DependencyGraph()

        self._add_layer_nodes(graph, bronze_steps, StepType.BRONZE, creation_order)

        self._add_layer_nodes(graph, silver_steps, StepType.SILVER, creation_order)
        self._link_layer(graph, silver_steps, "Silver", self._SILVER_DEPENDENCY_ATTRS)

        self._add_layer_nodes(graph, gold_steps, StepType.GOLD, creation_order)
        self._link_layer(graph, gold_steps, "Gold", self._GOLD_DEPENDENCY_ATTRS)

        return graph

    def _resolve_cycles(
        self, graph: DependencyGraph, cycles: list[list[str]]
    ) -> DependencyGraph:
        """Resolve cycles in the dependency graph.

        For every reported cycle with more than one entry, the edge between
        its last two entries is removed from the graph's adjacency lists and
        from the two nodes' dependency/dependent collections. The graph is
        modified in place and also returned.
        """
        # Simple cycle resolution: break cycles by removing the last dependency
        for cycle in cycles:
            if len(cycle) > 1:
                # Remove the last dependency in the cycle
                from_step = cycle[-2]
                to_step = cycle[-1]

                self.logger.warning(
                    f"Breaking cycle by removing dependency: {from_step} -> {to_step}"
                )

                # Remove from adjacency lists
                if to_step in graph._adjacency_list[from_step]:
                    graph._adjacency_list[from_step].remove(to_step)
                if from_step in graph._reverse_adjacency_list[to_step]:
                    graph._reverse_adjacency_list[to_step].remove(from_step)

                # Update node dependencies
                if to_step in graph.nodes[from_step].dependencies:
                    graph.nodes[from_step].dependencies.remove(to_step)
                if from_step in graph.nodes[to_step].dependents:
                    graph.nodes[to_step].dependents.remove(from_step)

        return graph

    def _detect_conflicts(self, graph: DependencyGraph) -> list[str]:
        """Detect dependency conflicts.

        Reports every dependency that points at a name with no node in the
        graph. Duplicate step names cannot be detected at this level because
        the graph's nodes are keyed by name, so each name appears only once.
        """
        conflicts: list[str] = []

        # Check for missing dependencies
        for node_name, node in graph.nodes.items():
            for dep in node.dependencies:
                if dep not in graph.nodes:
                    conflicts.append(f"Node {node_name} depends on missing node {dep}")

        return conflicts

    def _generate_recommendations(
        self, graph: DependencyGraph, cycles: list[list[str]], conflicts: list[str]
    ) -> list[str]:
        """Generate optimization recommendations.

        Recommendations are added when cycles were found, when conflicts were
        found, when the graph's "average_dependencies" statistic exceeds 3,
        and when the graph has more than 10 nodes.
        """
        recommendations = []

        # Cycle recommendations
        if cycles:
            recommendations.append(
                "Consider refactoring to eliminate circular dependencies"
            )

        # Conflict recommendations
        if conflicts:
            recommendations.append("Resolve dependency conflicts before execution")

        # Performance recommendations
        stats = graph.get_stats()
        if stats["average_dependencies"] > 3:
            recommendations.append(
                "Consider reducing step dependencies for better performance"
            )

        if len(graph.nodes) > 10:
            recommendations.append(
                "Consider breaking large pipelines into smaller, focused pipelines"
            )

        return recommendations

    def _create_cache_key(
        self,
        bronze_steps: Optional[Dict[str, BronzeStepProtocol]],
        silver_steps: Optional[Dict[str, SilverStepProtocol]],
        gold_steps: Optional[Dict[str, GoldStepProtocol]],
        creation_order: Optional[Dict[str, int]] = None,
    ) -> str:
        """Create a cache key for the analysis.

        The key is the SHA-256 hex digest of a canonical description of
        everything that influences the result: the strategy, each layer's
        step names (tagged with the layer so the same name in a different
        layer yields a different key), the dependencies each step declares,
        and the creation order.
        """
        dependency_attrs = tuple(
            attr for attr, _label in self._SILVER_DEPENDENCY_ATTRS
        )

        layers = []
        for label, steps in (
            ("bronze", bronze_steps),
            ("silver", silver_steps),
            ("gold", gold_steps),
        ):
            entries = []
            for name in sorted(steps or {}):
                step = (steps or {})[name]
                declared = tuple(
                    tuple(self._dependency_names(getattr(step, attr, None)))
                    for attr in dependency_attrs
                )
                entries.append((name, declared))
            layers.append((label, tuple(entries)))

        ordering = tuple(sorted((creation_order or {}).items()))

        # repr() of nested tuples is unambiguous even if names contain ':'.
        key_string = repr((self.strategy.value, tuple(layers), ordering))
        return hashlib.sha256(key_string.encode()).hexdigest()

    def clear_cache(self) -> None:
        """Clear the analysis cache."""
        self._analysis_cache.clear()
        self.logger.info("Dependency analysis cache cleared")

In [None]:
# Module: pipeline_builder_base.models.base (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors, pipeline_builder_base.models.enums, pipeline_builder_base.models.types

"""
Base classes and configuration models for the Pipeline Builder.

"""

from __future__ import annotations

import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .enums import PipelinePhase  # Removed: defined in notebook cells above
# from .types import ModelValue  # Removed: defined in notebook cells above

@dataclass
class BaseModel(ABC):
    """
    Base class for all pipeline models with common functionality.

    Provides standard validation, serialization, and representation methods
    for all pipeline data models. All models in the pipeline system inherit
    from this base class to ensure consistent behavior.

    Features:
    - Automatic validation support
    - JSON serialization and deserialization
    - Dictionary conversion for easy data exchange
    - String representation for debugging
    - Type-safe field access

    Example:
        >>> @dataclass
        ... class MyStep(BaseModel):
        ...     name: str
        ...     rules: Dict[str, List[ColumnRule]]
        ...
        ...     def validate(self) -> None:
        ...         if not self.name:
        ...             raise ValueError("Name cannot be empty")
        ...         if not self.rules:
        ...             raise ValueError("Rules cannot be empty")
        >>>
        >>> step = MyStep(name="test", rules={"id": [F.col("id").isNotNull()]})
        >>> step.validate()
        >>> print(step.to_json())
    """

    @abstractmethod
    def validate(self) -> None:
        """Validate the model. Override in subclasses."""
        pass

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert model to dictionary.

        Only real dataclass fields are included; ``InitVar`` and ``ClassVar``
        pseudo-fields are skipped (an ``InitVar`` without a default is not
        stored on the instance, so reading it would raise AttributeError).
        A field value that itself offers ``to_dict()`` is converted through
        it; every other value is stored as-is (containers are not descended
        into).

        Returns:
            Mapping of field name to value.
        """
        # Local import: this cell only imports `dataclass` at module level.
        from dataclasses import fields as dataclass_fields

        result: Dict[str, ModelValue] = {}
        for field_info in dataclass_fields(self):
            value = getattr(self, field_info.name)
            if hasattr(value, "to_dict"):
                result[field_info.name] = value.to_dict()
            else:
                result[field_info.name] = value
        return result

    def to_json(self) -> str:
        """Convert model to JSON string.

        Values that json cannot encode natively are written as ``str(value)``.
        """
        return json.dumps(self.to_dict(), default=str, indent=2)

    def __str__(self) -> str:
        """String representation of the model: ``ClassName(field=value, ...)``."""
        return f"{self.__class__.__name__}({', '.join(f'{k}={v}' for k, v in self.to_dict().items())})"

@dataclass
class ValidationThresholds(BaseModel):
    """
    Per-layer validation thresholds, expressed as percentages.

    Attributes:
        bronze: Threshold for the bronze layer (0-100)
        silver: Threshold for the silver layer (0-100)
        gold: Threshold for the gold layer (0-100)
    """

    bronze: float
    silver: float
    gold: float

    def validate(self) -> None:
        """Ensure every threshold lies within the closed range 0..100.

        Raises:
            PipelineValidationError: For the first layer (bronze, silver,
                gold order) whose threshold is outside the range.
        """
        for phase in ("bronze", "silver", "gold"):
            threshold = getattr(self, phase)
            if 0 <= threshold <= 100:
                continue
            raise PipelineValidationError(
                f"{phase} threshold must be between 0 and 100, got {threshold}"
            )

    def get_threshold(self, phase: PipelinePhase) -> float:
        """Return the threshold configured for ``phase``.

        Raises:
            KeyError: If ``phase`` is not BRONZE, SILVER or GOLD.
        """
        attribute_by_phase = {
            PipelinePhase.BRONZE: "bronze",
            PipelinePhase.SILVER: "silver",
            PipelinePhase.GOLD: "gold",
        }
        return getattr(self, attribute_by_phase[phase])

    @classmethod
    def create_default(cls) -> ValidationThresholds:
        """Build the default preset: 95 / 98 / 99."""
        preset = {"bronze": 95.0, "silver": 98.0, "gold": 99.0}
        return cls(**preset)

    @classmethod
    def create_strict(cls) -> ValidationThresholds:
        """Build the strict preset: 99 / 99.5 / 99.9."""
        preset = {"bronze": 99.0, "silver": 99.5, "gold": 99.9}
        return cls(**preset)

    @classmethod
    def create_loose(cls) -> ValidationThresholds:
        """Build the loose preset: 80 / 85 / 90."""
        preset = {"bronze": 80.0, "silver": 85.0, "gold": 90.0}
        return cls(**preset)

In [None]:
# Module: pipeline_builder_base.writer.base (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.logging, pipeline_builder_base.models, pipeline_builder_base.writer.models

"""
Base LogWriter class for pipeline builders.

This module provides an abstract base class for LogWriter implementations
that can be used by both Spark and SQL pipeline builders.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Optional

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..models import ExecutionResult  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
    # LogRow,
    # WriteMode,
    # WriterConfig,
    # WriterMetrics,
    # create_log_rows_from_execution_result,
# )

class BaseLogWriter(ABC):
    """
    Abstract base class for LogWriter implementations.

    This class defines the interface that all LogWriter implementations
    must follow, while allowing engine-specific implementations for
    storage operations.

    Subclasses must implement:
    - _write_log_rows() - Engine-specific write operation
    - _read_log_table() - Engine-specific read operation
    - _table_exists() - Engine-specific table existence check
    - _create_table() - Engine-specific table creation
    """

    def __init__(
        self,
        schema: str,
        table_name: str,
        config: Optional[WriterConfig] = None,
        logger: Optional[PipelineLogger] = None,
    ) -> None:
        """
        Initialize the base LogWriter.

        Args:
            schema: Database schema name
            table_name: Table name
            config: Writer configuration (optional). When omitted, an
                APPEND-mode config is built from ``schema``/``table_name``.
            logger: Pipeline logger (optional). When omitted (or falsy), a
                new ``PipelineLogger`` is created.

        Raises:
            Whatever ``config.validate()`` raises for an invalid config.
        """
        self.schema = schema
        self.table_name = table_name
        self.logger = logger or PipelineLogger()

        # Create config from schema/table_name if not provided
        if config is None:
            config = WriterConfig(
                table_schema=schema,
                table_name=table_name,
                write_mode=WriteMode.APPEND,
            )
        self.config = config
        self.config.validate()

    @property
    def table_fqn(self) -> str:
        """Get fully qualified table name (``schema.table_name``)."""
        return f"{self.schema}.{self.table_name}"

    def _build_log_rows(self, execution_result: ExecutionResult) -> list[LogRow]:
        """
        Convert an execution result into log rows.

        Shared by create_table(), append() and write() so the three entry
        points derive run_id/run_mode from the execution context identically.

        Args:
            execution_result: The execution result to convert

        Returns:
            Log rows produced by create_log_rows_from_execution_result()
        """
        # Cast run_mode to Literal type for type safety
        from typing import Literal, cast

        RunModeType = Literal[
            "initial", "incremental", "full_refresh", "validation_only"
        ]
        run_mode = cast(RunModeType, execution_result.context.run_mode)
        return create_log_rows_from_execution_result(
            execution_result,
            run_id=execution_result.context.run_id,
            run_mode=run_mode,
        )

    def create_table(self, execution_result: ExecutionResult) -> None:
        """
        Create the log table from the first execution result.

        The rows derived from ``execution_result`` are used to create the
        table and are then written to it. Nothing happens (besides a
        warning) if the table already exists or no rows are produced.

        Args:
            execution_result: The execution result to create table from
        """
        if self._table_exists():
            self.logger.warning(
                f"Table {self.table_fqn} already exists, skipping creation"
            )
            return

        self.logger.info(f"Creating log table {self.table_fqn}")
        log_rows = self._build_log_rows(execution_result)

        if not log_rows:
            self.logger.warning("No log rows to create table from")
            return

        self._create_table(log_rows)
        self._write_log_rows(log_rows, WriteMode.APPEND)

    def append(self, execution_result: ExecutionResult) -> WriterMetrics:
        """
        Append execution result to the log table.

        If the table does not exist it is created via create_table(), which
        also writes the rows, so no second write is issued in that case.

        Args:
            execution_result: The execution result to append

        Returns:
            Writer metrics
        """
        if not self._table_exists():
            self.logger.warning(f"Table {self.table_fqn} does not exist, creating it")
            self.create_table(execution_result)
            return self._get_metrics()

        log_rows = self._build_log_rows(execution_result)

        if not log_rows:
            self.logger.warning("No log rows to append")
            return self._get_metrics()

        self._write_log_rows(log_rows, WriteMode.APPEND)
        return self._get_metrics()

    def write(
        self, execution_result: ExecutionResult, mode: WriteMode = WriteMode.APPEND
    ) -> WriterMetrics:
        """
        Write execution result to the log table.

        Args:
            execution_result: The execution result to write
            mode: Write mode (APPEND or OVERWRITE)

        Returns:
            Writer metrics
        """
        if not self._table_exists():
            self.logger.info(f"Table {self.table_fqn} does not exist, creating it")
            self.create_table(execution_result)
            # create_table() has already written the rows; writing them again
            # here would duplicate every row when mode is APPEND.
            return self._get_metrics()

        if mode == WriteMode.OVERWRITE:
            self.logger.info(f"Overwriting table {self.table_fqn}")
            # For overwrite, we need to clear the table first
            # This is engine-specific, so subclasses should override if needed
            self._write_log_rows([], WriteMode.OVERWRITE)

        log_rows = self._build_log_rows(execution_result)

        if not log_rows:
            self.logger.warning("No log rows to write")
            return self._get_metrics()

        self._write_log_rows(log_rows, mode)
        return self._get_metrics()

    def read(self, limit: Optional[int] = None) -> list[LogRow]:
        """
        Read log rows from the table.

        Args:
            limit: Maximum number of rows to read (None for all)

        Returns:
            List of log rows; empty (with a warning) if the table is missing
        """
        if not self._table_exists():
            self.logger.warning(f"Table {self.table_fqn} does not exist")
            return []

        return self._read_log_table(limit)

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def _write_log_rows(self, log_rows: list[LogRow], mode: WriteMode) -> None:
        """
        Write log rows to the storage system.

        This is an engine-specific operation that must be implemented
        by subclasses. write() calls it with an empty list and OVERWRITE
        to clear an existing table before writing the new rows.

        Args:
            log_rows: List of log rows to write
            mode: Write mode
        """
        pass

    @abstractmethod
    def _read_log_table(self, limit: Optional[int] = None) -> list[LogRow]:
        """
        Read log rows from the storage system.

        This is an engine-specific operation that must be implemented
        by subclasses.

        Args:
            limit: Maximum number of rows to read (None for all)

        Returns:
            List of log rows
        """
        pass

    @abstractmethod
    def _table_exists(self) -> bool:
        """
        Check if the log table exists.

        This is an engine-specific operation that must be implemented
        by subclasses.

        Returns:
            True if table exists, False otherwise
        """
        pass

    @abstractmethod
    def _create_table(self, sample_rows: list[LogRow]) -> None:
        """
        Create the log table with appropriate schema.

        This is an engine-specific operation that must be implemented
        by subclasses.

        Args:
            sample_rows: Sample log rows to infer schema from
        """
        pass

    def _get_metrics(self) -> WriterMetrics:
        """
        Get writer metrics.

        This default implementation returns fixed placeholder values (it
        does not track actual writes); subclasses can override it to
        report real metrics.

        Returns:
            Writer metrics
        """
        return {
            "total_writes": 1,
            "successful_writes": 1,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }

In [None]:
# Module: pipeline_builder.engine_config (pipeline_builder)
#
# Dependencies: pipeline_builder.protocols

"""
Engine injection for pipeline_builder.

Holds injected engine components (functions, types, window, desc,
AnalysisException) that satisfy the protocols in `protocols.py`. Users/tests
must call `configure_engine(...)` after creating their engine (PySpark,
sparkless, etc.). Defaults raise to ensure misconfiguration surfaces early.
"""

from __future__ import annotations

import threading
from dataclasses import dataclass
from typing import Any, Optional, cast

# from .protocols import (  # Removed: defined in notebook cells above
    # AnalysisExceptionProtocol,
    # FunctionsProtocol,
    # TypesProtocol,
    # WindowProtocol,
# )

@dataclass
class EngineConfig:
    """
    Bundle of injected engine components.

    ``functions``, ``types`` and ``analysis_exception`` are required; every
    other field is optional and has a default.
    """

    # Required components.
    functions: FunctionsProtocol
    types: TypesProtocol
    analysis_exception: type[BaseException] | AnalysisExceptionProtocol
    # Optional components.
    window: Optional[WindowProtocol] = None
    desc: Optional[Any] = None
    # Label for the engine; _configure_engine_from_session() sets "pyspark" or "mock".
    engine_name: str = "unknown"
    # Concrete engine classes, when known.
    dataframe_cls: Optional[Any] = None
    spark_session_cls: Optional[Any] = None
    column_cls: Optional[Any] = None

# Global engine state (for backward compatibility).
# Stays None until configure_engine() assigns it; get_engine() falls back to
# this value when no thread-local engine is set.
_engine: Optional[EngineConfig] = None

# Thread-local engine state (for parallel test isolation).
# configure_engine() stores the config on its `engine` attribute and
# get_engine() consults it before the global above.
_thread_local = threading.local()

def _configure_engine_from_session(spark: Any) -> None:
    """
    Detect the engine behind ``spark`` and configure it.

    The module name of ``type(spark)`` decides which engine is used: a name
    containing "pyspark" selects PySpark; otherwise a name containing
    "sparkless" or "mock_spark" selects sparkless. The matching components
    are imported lazily and handed to ``configure_engine()``.

    Raises:
        ValueError: If the session's module matches neither engine.
    """
    module_name = type(spark).__module__
    is_pyspark = "pyspark" in module_name
    is_mock = any(tag in module_name for tag in ("sparkless", "mock_spark"))

    if not (is_pyspark or is_mock):
        raise ValueError(
            f"Unknown Spark session type: {type(spark).__module__}. "
            "Use configure_engine(functions=..., types=..., analysis_exception=...) for custom engines."
        )

    if is_pyspark:
        # PySpark wins when both patterns match, mirroring the check order.
        from pyspark.sql import (
            Column as spark_column,
            DataFrame as spark_dataframe,
            SparkSession as spark_session,
            functions as spark_functions,
            types as spark_types,
        )
        from pyspark.sql.functions import desc as spark_desc
        from pyspark.sql.utils import AnalysisException as spark_analysis_exception
        from pyspark.sql.window import Window as spark_window

        components = dict(
            functions=cast(FunctionsProtocol, spark_functions),
            types=cast(TypesProtocol, spark_types),
            analysis_exception=spark_analysis_exception,
            window=spark_window,
            desc=spark_desc,
            engine_name="pyspark",
            dataframe_cls=spark_dataframe,
            spark_session_cls=spark_session,
            column_cls=spark_column,
        )
    else:
        from sparkless import (
            AnalysisException as mock_analysis_exception,
            Column as mock_column,
            DataFrame as mock_dataframe,
            SparkSession as mock_session,
            Window as mock_window,
            functions as mock_functions,
            spark_types as mock_types,
        )
        from sparkless.functions import desc as mock_desc

        components = dict(
            functions=cast(FunctionsProtocol, mock_functions),
            types=cast(TypesProtocol, mock_types),
            analysis_exception=mock_analysis_exception,
            window=mock_window,
            desc=mock_desc,
            engine_name="mock",
            dataframe_cls=mock_dataframe,
            spark_session_cls=mock_session,
            column_cls=mock_column,
        )

    configure_engine(**components)

def configure_engine(
    *,
    spark: Optional[Any] = None,
    functions: Optional[FunctionsProtocol] = None,
    types: Optional[TypesProtocol] = None,
    analysis_exception: Optional[
        type[BaseException] | AnalysisExceptionProtocol
    ] = None,
    window: Optional[WindowProtocol] = None,
    desc: Optional[Any] = None,
    engine_name: str = "unknown",
    dataframe_cls: Optional[Any] = None,
    spark_session_cls: Optional[Any] = None,
    column_cls: Optional[Any] = None,
) -> None:
    """
    Register the engine components used by pipeline_builder.

    Two calling styles are supported:

    * ``spark=<session>``: components are detected from the session type;
      every other argument is ignored.
    * ``functions=..., types=..., analysis_exception=...`` plus any of the
      optional components.

    The resulting config is stored both globally and on the current
    thread's local state; ``get_engine()`` prefers the thread-local one.

    Raises:
        TypeError: If ``spark`` is None and any of ``functions``, ``types``
            or ``analysis_exception`` is missing.
    """
    global _engine

    if spark is not None:
        _configure_engine_from_session(spark)
        return

    required_components = (functions, types, analysis_exception)
    if any(component is None for component in required_components):
        raise TypeError(
            "configure_engine() requires either spark=<SparkSession> or "
            "functions=..., types=..., and analysis_exception=..."
        )

    # Global first (backward compatibility), then thread-local (parallel
    # test isolation); both refer to the same config object.
    _engine = _thread_local.engine = EngineConfig(
        functions=functions,
        types=types,
        analysis_exception=analysis_exception,
        window=window,
        desc=desc,
        engine_name=engine_name,
        dataframe_cls=dataframe_cls,
        spark_session_cls=spark_session_cls,
        column_cls=column_cls,
    )

def get_engine() -> EngineConfig:
    """
    Return the active engine config.

    Lookup order: the current thread's engine (parallel test isolation),
    then the global engine (backward compatibility).

    Raises:
        RuntimeError: If neither a thread-local nor a global engine is set.
    """
    thread_engine: Optional[EngineConfig] = getattr(_thread_local, "engine", None)
    if thread_engine is not None:
        return thread_engine

    if _engine is not None:
        return _engine

    raise RuntimeError(
        "Engine not configured. Call configure_engine(functions=..., types=..., analysis_exception=..., window=..., desc=...) before using pipeline_builder."
    )

def reset_engine_state() -> None:
    """
    Drop the current thread's engine, if one is set.

    Intended for test isolation: afterwards ``get_engine()`` falls back to
    the global engine or raises when none is configured. The global engine
    itself is left untouched.
    """
    try:
        del _thread_local.engine
    except AttributeError:
        # Nothing was set for this thread; resetting is a no-op.
        pass

# Public names of this module; underscore-prefixed helpers and state are not listed.
__all__ = ["EngineConfig", "configure_engine", "get_engine", "reset_engine_state"]

In [None]:
# Module: pipeline_builder.engine (pipeline_builder)
#
# Dependencies: pipeline_builder.protocols

"""
Engine injection for pipeline_builder.

This module holds injected engine components (functions, types, window, desc,
AnalysisException) that satisfy the protocols in `protocols.py`. Users/tests
must call `configure_engine(...)` after creating their engine (PySpark,
sparkless, etc.). Defaults raise to ensure misconfiguration surfaces early.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Optional

# from .protocols import (  # Removed: defined in notebook cells above
    # AnalysisExceptionProtocol,
    # FunctionsProtocol,
    # TypesProtocol,
    # WindowProtocol,
# )

@dataclass
class EngineConfig:
    """
    Injected engine components used by pipeline_builder.

    When the notebook cells run top to bottom, this definition replaces the
    ``EngineConfig`` from the engine_config cell. It therefore carries the
    same fields, so code written against either definition (for example
    compat's ``dataframe_cls`` lookup) keeps working.

    Attributes:
        functions: Engine functions module/object (required)
        types: Engine types module/object (required)
        analysis_exception: Exception class for analysis errors (required)
        window: Window component, if available
        desc: Descending-sort function, if available
        engine_name: Label for the configured engine
        dataframe_cls: Concrete DataFrame class, if known
        spark_session_cls: Concrete SparkSession class, if known
        column_cls: Concrete Column class, if known
    """

    functions: FunctionsProtocol
    types: TypesProtocol
    analysis_exception: type[BaseException] | AnalysisExceptionProtocol
    window: Optional[WindowProtocol] = None
    desc: Optional[Any] = None
    engine_name: str = "unknown"
    dataframe_cls: Optional[Any] = None
    spark_session_cls: Optional[Any] = None
    column_cls: Optional[Any] = None

# Module-global engine config; None until configure_engine() is called.
_engine: Optional[EngineConfig] = None

def configure_engine(
    *,
    spark: Optional[Any] = None,
    functions: Optional[FunctionsProtocol] = None,
    types: Optional[TypesProtocol] = None,
    analysis_exception: Optional[
        type[BaseException] | AnalysisExceptionProtocol
    ] = None,
    window: Optional[WindowProtocol] = None,
    desc: Optional[Any] = None,
    engine_name: str = "unknown",
    dataframe_cls: Optional[Any] = None,
    spark_session_cls: Optional[Any] = None,
    column_cls: Optional[Any] = None,
) -> None:
    """
    Inject engine components.

    Accepts the same keyword arguments as the ``configure_engine`` defined
    in the engine_config cell, which this definition replaces when the
    cells run in order. Either pass ``spark=<session>`` to auto-detect the
    components, or pass ``functions``, ``types`` and ``analysis_exception``
    explicitly (plus any optional components).

    Only the module-global engine is set here.

    Raises:
        TypeError: If ``spark`` is None and any of ``functions``, ``types``
            or ``analysis_exception`` is missing.
    """
    global _engine

    if spark is not None:
        # Defined in the engine_config cell; it detects the engine and calls
        # back into configure_engine() with explicit components.
        _configure_engine_from_session(spark)
        return

    if functions is None or types is None or analysis_exception is None:
        raise TypeError(
            "configure_engine() requires either spark=<SparkSession> or "
            "functions=..., types=..., and analysis_exception=..."
        )

    _engine = EngineConfig(
        functions=functions,
        types=types,
        analysis_exception=analysis_exception,
        window=window,
        desc=desc,
        engine_name=engine_name,
        dataframe_cls=dataframe_cls,
        spark_session_cls=spark_session_cls,
        column_cls=column_cls,
    )

def get_engine() -> EngineConfig:
    """
    Get the current engine config, raising if not configured.

    Raises:
        RuntimeError: If configure_engine() has not been called yet.
    """

    if _engine is None:
        raise RuntimeError(
            "Engine not configured. Call configure_engine(functions=..., types=..., analysis_exception=..., window=..., desc=...) before using pipeline_builder."
        )
    return _engine

__all__ = ["EngineConfig", "configure_engine", "get_engine"]

In [None]:
# Module: abstracts.step (abstracts)
#
# Dependencies: abstracts.rules, abstracts.transformer

from __future__ import annotations

from typing import Literal, Optional, Protocol
# from .rules import Rules  # Removed: defined in notebook cells above
# from .transformer import Transformer  # Removed: defined in notebook cells above

class Step(Protocol):
    """
    Protocol for pipeline steps that BronzeStep, SilverStep, and GoldStep naturally satisfy.

    This Protocol defines the interface that all step types must implement,
    allowing duck typing compatibility between abstracts and pipeline_builder.

    It declares attributes only (no methods); any object exposing these
    attributes with compatible types satisfies it structurally.
    """

    name: str
    # Layer tag; restricted to the three literal values below.
    type: Literal["bronze", "silver", "gold"]
    rules: Rules
    # The remaining attributes may be None.
    source: Optional[str]
    target: Optional[str]
    transform: Optional[Transformer]
    write_mode: Optional[Literal["overwrite", "append"]]
    write_schema: Optional[str]

In [None]:
# Module: abstracts.reports.write (abstracts)
#
# Dependencies: abstracts.source

from dataclasses import dataclass
from typing import Optional
# from .source import Source  # Removed: defined in notebook cells above

@dataclass
class WriteReport:
    """
    Report record for a write: a source, two row counts and an optional error.

    NOTE(review): how ``written_rows`` / ``failed_rows`` are counted is decided
    by the code that builds this report, not here - confirm against the caller.
    """

    source: Source
    written_rows: int
    failed_rows: int
    # Defaults to None.
    error: Optional[Exception] = None

In [None]:
# Module: abstracts.reports.transform (abstracts)
#
# Dependencies: abstracts.source

from dataclasses import dataclass
from typing import Optional
# from .source import Source  # Removed: defined in notebook cells above

@dataclass
class TransformReport:
    """Report record for a transform: a source and an optional error."""

    source: Source
    # Defaults to None.
    error: Optional[Exception] = None

In [None]:
# Module: abstracts.reports.validation (abstracts)
#
# Dependencies: abstracts.source

from dataclasses import dataclass
from typing import Optional
# from .source import Source  # Removed: defined in notebook cells above

@dataclass
class ValidationReport:
    """
    Report record for a validation: a source, two row counts and an optional error.

    NOTE(review): how ``valid_rows`` / ``invalid_rows`` are counted is decided
    by the code that builds this report, not here - confirm against the caller.
    """

    source: Source
    valid_rows: int
    invalid_rows: int
    # Defaults to None.
    error: Optional[Exception] = None

In [None]:
# Module: pipeline_builder.sql_source.reader (pipeline_builder)
#
# Dependencies: pipeline_builder.sql_source.models

"""
Unified reader for JdbcSource and SqlAlchemySource.

Dispatches on source type and returns a Spark DataFrame.
"""

from __future__ import annotations

from typing import Any, Dict, Union, cast
# from .sql_source.models import JdbcSource, SqlAlchemySource  # Removed: defined in notebook cells above

def read_sql_source(
    source: Union[JdbcSource, SqlAlchemySource],
    spark: Any,
) -> Any:
    """
    Load a SQL source through the reader matching its type.

    Args:
        source: JdbcSource or SqlAlchemySource configuration.
        spark: Session object passed through to the selected reader.

    Returns:
        Whatever the selected reader returns (a DataFrame created via
        ``spark``).

    Raises:
        TypeError: If ``source`` is neither a JdbcSource nor a
            SqlAlchemySource.
        ValueError: Propagated from the JDBC reader when neither table nor
            query is set.
        RuntimeError: Propagated from the SQLAlchemy reader when pandas or
            sqlalchemy cannot be imported.
    """
    # Checked in order: JDBC first, then SQLAlchemy.
    readers = (
        (JdbcSource, _read_jdbc),
        (SqlAlchemySource, _read_sqlalchemy),
    )
    for source_type, reader in readers:
        if isinstance(source, source_type):
            return reader(source, spark)

    raise TypeError(
        f"source must be JdbcSource or SqlAlchemySource, got {type(source).__name__}"
    )

def _read_jdbc(source: JdbcSource, spark: Any) -> Any:
    """
    Read a JdbcSource via ``spark.read.jdbc``.

    ``source.table`` is preferred; ``source.query`` is used only when the
    table is falsy. ``source.driver``, when set, is added to a copy of the
    connection properties under the ``"driver"`` key, so
    ``source.properties`` itself is never modified.

    Raises:
        ValueError: If both ``source.table`` and ``source.query`` are falsy.
    """
    target = source.table or source.query
    if not target:
        raise ValueError("JdbcSource must have table or query set")

    # Work on a copy so the source's own mapping stays untouched.
    jdbc_properties: Dict[str, str] = dict(source.properties)
    if source.driver:
        jdbc_properties["driver"] = source.driver

    return spark.read.jdbc(
        url=source.url,
        table=target,
        properties=jdbc_properties,
    )

def _read_sqlalchemy(source: SqlAlchemySource, spark: Any) -> Any:
    """
    Read a SqlAlchemySource into a DataFrame via pandas.

    The data is loaded with pandas (``read_sql`` for a query, otherwise
    ``read_sql_table``) and handed to ``spark.createDataFrame``.

    If ``source.engine`` is set it is used as-is and left open for the
    caller. Otherwise an engine is created from ``source.url`` and disposed
    once the read finishes, so its connection pool is not leaked.

    Raises:
        RuntimeError: If pandas or sqlalchemy cannot be imported.
        ValueError: If neither ``source.query`` nor ``source.table`` is set.
    """
    try:
        import pandas as pd
    except ImportError as e:
        raise RuntimeError(
            "SqlAlchemySource requires pandas. Install with: pip install pipeline_builder[sql]"
        ) from e
    try:
        from sqlalchemy import create_engine
    except ImportError as e:
        raise RuntimeError(
            "SqlAlchemySource requires sqlalchemy. Install with: pip install pipeline_builder[sql]"
        ) from e

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if source.query is None and source.table is None:
        raise ValueError("SqlAlchemySource.table must be set when query is None")

    owns_engine = source.engine is None
    if owns_engine:
        # The url is expected to be set when no engine is supplied; the cast
        # only informs type checkers.
        engine = create_engine(cast(str, source.url))
    else:
        engine = source.engine

    try:
        if source.query is not None:
            pdf = pd.read_sql(source.query, engine)
        elif source.schema:
            pdf = pd.read_sql_table(source.table, engine, schema=source.schema)
        else:
            pdf = pd.read_sql_table(source.table, engine)
    finally:
        # pandas has fully materialised the result by now, so an engine we
        # created ourselves can be released; a caller-supplied one is not ours.
        if owns_engine:
            engine.dispose()

    return spark.createDataFrame(pdf)

In [None]:
# Module: pipeline_builder_base.models.pipeline (pipeline_builder_base)
#
# Dependencies: models.base, pipeline_builder_base.errors, pipeline_builder_base.errors, pipeline_builder_base.models.base

"""
Pipeline configuration models.

"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel, ValidationThresholds  # Removed: defined in notebook cells above

@dataclass
class PipelineConfig(BaseModel):
    """
    Top-level configuration for a pipeline.

    Attributes:
        schema: Database schema name
        thresholds: Validation thresholds for each phase
        verbose: Whether to enable verbose logging (defaults to True)
    """

    schema: str
    thresholds: ValidationThresholds
    verbose: bool = True

    @property
    def min_bronze_rate(self) -> float:
        """Bronze threshold, read from ``thresholds``."""
        return self.thresholds.bronze

    @property
    def min_silver_rate(self) -> float:
        """Silver threshold, read from ``thresholds``."""
        return self.thresholds.silver

    @property
    def min_gold_rate(self) -> float:
        """Gold threshold, read from ``thresholds``."""
        return self.thresholds.gold

    def validate(self) -> None:
        """
        Validate the schema name, then delegate to the thresholds.

        Raises:
            PipelineValidationError: If ``schema`` is empty or not a string,
                or if ``thresholds.validate()`` rejects a threshold.
        """
        schema_ok = bool(self.schema) and isinstance(self.schema, str)
        if not schema_ok:
            raise PipelineValidationError("Schema name must be a non-empty string")
        self.thresholds.validate()

    @classmethod
    def create_default(cls, schema: str) -> PipelineConfig:
        """Build a verbose config using the default thresholds."""
        default_thresholds = ValidationThresholds.create_default()
        return cls(schema=schema, thresholds=default_thresholds, verbose=True)

    @classmethod
    def create_high_performance(cls, schema: str) -> PipelineConfig:
        """Build a non-verbose config using the strict thresholds."""
        strict_thresholds = ValidationThresholds.create_strict()
        return cls(schema=schema, thresholds=strict_thresholds, verbose=False)

    @classmethod
    def create_conservative(cls, schema: str) -> PipelineConfig:
        """Build a verbose config using the strict thresholds."""
        strict_thresholds = ValidationThresholds.create_strict()
        return cls(schema=schema, thresholds=strict_thresholds, verbose=True)

@dataclass
class PipelineMetrics(BaseModel):
    """
    Aggregate metrics for one pipeline execution.

    Every field defaults to zero.

    Attributes:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        skipped_steps: Number of skipped steps
        total_duration: Total execution duration
        bronze_duration: Bronze layer duration
        silver_duration: Silver layer duration
        gold_duration: Gold layer duration
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate
        cache_hit_rate: Cache hit rate
        error_count: Number of errors
        retry_count: Number of retries
    """

    total_steps: int = 0
    successful_steps: int = 0
    failed_steps: int = 0
    skipped_steps: int = 0
    total_duration: float = 0.0
    bronze_duration: float = 0.0
    silver_duration: float = 0.0
    gold_duration: float = 0.0
    total_rows_processed: int = 0
    total_rows_written: int = 0
    avg_validation_rate: float = 0.0
    cache_hit_rate: float = 0.0
    error_count: int = 0
    retry_count: int = 0

    def validate(self) -> None:
        """
        Check the step counts, total duration and average validation rate.

        Raises:
            ValueError: On the first negative step count or total duration,
                or if the average validation rate is outside 0-100.
        """
        non_negative_checks = (
            (self.total_steps, "Total steps cannot be negative"),
            (self.successful_steps, "Successful steps cannot be negative"),
            (self.failed_steps, "Failed steps cannot be negative"),
            (self.skipped_steps, "Skipped steps cannot be negative"),
            (self.total_duration, "Total duration cannot be negative"),
        )
        for value, message in non_negative_checks:
            if value < 0:
                raise ValueError(message)

        if 0 <= self.avg_validation_rate <= 100:
            return
        raise ValueError("Average validation rate must be between 0 and 100")

    @property
    def success_rate(self) -> float:
        """Percentage of successful steps; 0.0 when there are no steps."""
        if self.total_steps > 0:
            return self.successful_steps / self.total_steps * 100
        return 0.0

    @property
    def failure_rate(self) -> float:
        """
        Complement of ``success_rate`` (100 minus it).

        Because ``success_rate`` is 0.0 with zero steps, this evaluates to
        100.0 in that case.
        """
        return 100.0 - self.success_rate

    @classmethod
    def from_step_results(cls, step_results: list[Any]) -> PipelineMetrics:
        """
        Aggregate per-step results into pipeline metrics.

        Each result is read for ``success``, ``duration_secs``,
        ``rows_processed``, ``rows_written`` and ``validation_rate``. Every
        non-successful step is counted as failed; fields not derived here
        keep their defaults.
        """
        step_count = len(step_results)
        succeeded = len([step for step in step_results if step.success])

        return cls(
            total_steps=step_count,
            successful_steps=succeeded,
            failed_steps=step_count - succeeded,
            total_duration=sum(step.duration_secs for step in step_results),
            total_rows_processed=sum(step.rows_processed for step in step_results),
            total_rows_written=sum(step.rows_written for step in step_results),
            avg_validation_rate=(
                sum(step.validation_rate for step in step_results) / step_count
                if step_count > 0
                else 0.0
            ),
        )

In [None]:
# Module: pipeline_builder.compat (pipeline_builder)
#
# Dependencies: pipeline_builder.engine_config, pipeline_builder.protocols

# mypy: ignore-errors
"""
Protocol-based compatibility layer for engine abstraction.

This module provides a compatibility layer that abstracts over different
Spark/PySpark implementations (real PySpark, mock Spark, sparkless, etc.).
It exposes protocol aliases and injected engine components that are configured
at runtime through the engine configuration system.

**Key Features:**
    - **Engine Detection**: Automatically detects and uses the configured engine
    - **Protocol Aliases**: Provides type-safe aliases for DataFrame, SparkSession, Column
    - **Lazy Loading**: Components are loaded lazily to avoid import-time cycles
    - **Mock Support**: Supports both real PySpark and mock Spark for testing

**Usage:**
    Before using pipeline_builder, you must configure the engine:

    >>> from pipeline_builder.engine_config import configure_engine
    >>> from pyspark.sql import SparkSession
    >>>
    >>> spark = SparkSession.builder.appName("test").getOrCreate()
    >>> configure_engine(spark=spark)

    Then you can import and use the compatibility layer:

    >>> from pipeline_builder.compat import DataFrame, SparkSession, F
    >>> df: DataFrame = spark.createDataFrame([(1, "test")], ["id", "name"])

**Exported Components:**
    - **DataFrame**: Protocol alias for DataFrame type
    - **SparkSession**: Protocol alias for SparkSession type
    - **Column**: Protocol alias for Column type
    - **F**: Functions module (PySpark functions or mock equivalent)
    - **types**: Types module (StructType, StringType, etc.)
    - **AnalysisException**: Exception class for analysis errors
    - **Window**: Window functions
    - **desc**: Descending sort function

**Engine Configuration:**
    The engine must be configured before use. See `engine_config.configure_engine()`
    for details on how to configure the engine with your Spark/PySpark objects.

Example:
    >>> from pipeline_builder.engine_config import configure_engine
    >>> from pyspark.sql import SparkSession
    >>>
    >>> # Configure engine first: resolving F requires a configured engine
    >>> spark = SparkSession.builder.appName("test").getOrCreate()
    >>> configure_engine(spark=spark)
    >>>
    >>> # Use compatibility layer
    >>> from pipeline_builder.compat import DataFrame, F
    >>> df: DataFrame = spark.createDataFrame([(1, "test")], ["id", "name"])
    >>> result = df.filter(F.col("id") > 0)
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, cast

# from .engine_config import get_engine  # Removed: defined in notebook cells above
# from .protocols import (  # Removed: defined in notebook cells above
    # ColumnProtocol,
    # DataFrameProtocol,
    # SparkSessionProtocol,
# )

# Type aliases for typing.
# Static type checkers see the protocol classes; at runtime the three names
# start out as `Any` so the relative import is never executed.
if TYPE_CHECKING:
    from .protocols import ColumnProtocol as Column
    from .protocols import DataFrameProtocol as DataFrame
    from .protocols import SparkSessionProtocol as SparkSession
else:
    DataFrame = Any  # type: ignore[assignment]
    SparkSession = Any  # type: ignore[assignment]
    Column = Any  # type: ignore[assignment]

# Try to bind engine components immediately if an engine is already configured.
# The names are rebound one statement at a time, so a failure part-way through
# leaves the earlier rebindings in place and the rest at their previous values.
try:
    _eng = get_engine()
    # Prefer the engine's concrete classes; fall back to the protocol classes.
    DataFrame = cast(Any, _eng.dataframe_cls or DataFrameProtocol)
    SparkSession = cast(Any, _eng.spark_session_cls or SparkSessionProtocol)
    Column = cast(Any, _eng.column_cls or ColumnProtocol)
    # F = _eng.functions  # In standalone notebook, use global F from pyspark
    # F is available from imports cell (pyspark.sql.functions)
    types = _eng.types
    AnalysisException = _eng.analysis_exception  # type: ignore[assignment]
    Window = _eng.window  # type: ignore[assignment]
    desc = _eng.desc
except Exception:
    # Defer to __getattr__ if not configured yet.
    # NOTE(review): this swallows every Exception, not only the RuntimeError
    # get_engine() raises when unconfigured - confirm that is intended.
    pass

def __getattr__(name: str) -> Any:
    """Resolve an engine-provided attribute by name.

    Python invokes a module-level ``__getattr__`` when normal attribute
    lookup on the module fails, which lets the engine components be looked
    up lazily instead of at import time.

    Args:
        name: Attribute being requested. Recognised names:
            - "F": ``pyspark.sql.functions`` (imported directly)
            - "types": ``pyspark.sql.types`` (imported directly)
            - "AnalysisException": ``engine.analysis_exception``
            - "Window": ``engine.window``
            - "desc": ``engine.desc``
            - "DataFrame": ``engine.dataframe_cls`` or ``DataFrameProtocol``
            - "SparkSession": ``engine.spark_session_cls`` or
              ``SparkSessionProtocol``
            - "Column": ``engine.column_cls`` or ``ColumnProtocol``

    Returns:
        The object bound to ``name`` as listed above.

    Raises:
        AttributeError: If ``name`` is not one of the recognised names.
        Any exception raised by ``get_engine()`` propagates unchanged; note
        that ``get_engine()`` is called for every recognised name, including
        "F" and "types" even though their result does not use the engine.
    """
    if name in {"F", "types", "AnalysisException", "Window", "desc"}:
        # Called up front for all five names, so an unconfigured engine
        # surfaces the same way regardless of which one was requested.
        engine = get_engine()
        if name == "F":
            # Standalone notebook: hand back pyspark's functions module.
            from pyspark.sql import functions as spark_functions

            return spark_functions
        if name == "types":
            # Standalone notebook: hand back pyspark's types module.
            from pyspark.sql import types as spark_types

            return spark_types
        # The remaining three names map one-to-one onto engine attributes.
        engine_attribute = {
            "AnalysisException": "analysis_exception",
            "Window": "window",
            "desc": "desc",
        }[name]
        return getattr(engine, engine_attribute)

    # Class-like names: prefer the engine's class, else the protocol.
    if name == "DataFrame":
        return get_engine().dataframe_cls or DataFrameProtocol
    if name == "SparkSession":
        return get_engine().spark_session_cls or SparkSessionProtocol
    if name == "Column":
        return get_engine().column_cls or ColumnProtocol

    raise AttributeError(f"module {__name__} has no attribute {name}")

def is_mock_spark() -> bool:
    """Report whether the configured engine is named ``"mock"``.

    Useful for branching between test and production behaviour.

    Returns:
        The result of comparing ``get_engine().engine_name`` with ``"mock"``.
        ``False`` when looking up the engine or its name raises any
        ``Exception`` (for example because no engine is configured).

    Example:
        >>> from pipeline_builder.compat import is_mock_spark
        >>> if is_mock_spark():
        ...     print("Running in test mode")
        ... else:
        ...     print("Running with real PySpark")
    """
    try:
        running_on_mock = get_engine().engine_name == "mock"
    except Exception:
        # Best-effort probe: any failure is treated as "not mock".
        running_on_mock = False
    return running_on_mock

def compat_name() -> str:
    """Return the name of the currently configured engine.

    Returns:
        ``get_engine().engine_name`` when it can be read, otherwise the
        literal ``"unknown"`` (used whenever the lookup raises any
        ``Exception``, e.g. when no engine has been configured).

    Example:
        >>> from pipeline_builder.compat import compat_name
        >>> engine_name = compat_name()
        >>> print(f"Using engine: {engine_name}")
    """
    try:
        configured_name = get_engine().engine_name
    except Exception:
        # Best-effort probe: never propagate a configuration error.
        configured_name = "unknown"
    return configured_name

def get_functions_from_session(spark: SparkSession) -> Any:
    """Return the functions module ``F`` for a given session.

    Compatibility shim: the session argument exists only so call sites that
    pass one keep working. The returned object is the module-level ``F``
    and does not depend on ``spark`` in any way.

    Args:
        spark: SparkSession instance; accepted but never read.

    Returns:
        The global ``F`` object visible to this module.

    Example:
        >>> from pipeline_builder.compat import get_functions_from_session
        >>> from pyspark.sql import SparkSession
        >>>
        >>> spark = SparkSession.builder.appName("test").getOrCreate()
        >>> F = get_functions_from_session(spark)
        >>> df.select(F.col("id"), F.lit("test"))
    """
    # `spark` is intentionally unused; `F` is resolved from module globals.
    functions_module = F
    return functions_module

def get_current_timestamp() -> Any:
    """Produce a "current timestamp" value via the functions module ``F``.

    Returns:
        The result of ``F.current_timestamp()`` when ``F`` exposes a callable
        attribute of that name. Otherwise a string: the ISO-8601 form of
        ``datetime.now()`` (naive local time). Callers therefore have to
        cope with two different return types.

    Example:
        >>> from pipeline_builder.compat import get_current_timestamp
        >>> timestamp = get_current_timestamp()
        >>> df.withColumn("created_at", timestamp)
    """
    timestamp_fn = getattr(F, "current_timestamp", None)
    if not callable(timestamp_fn):
        # No usable engine function: fall back to a plain Python timestamp.
        from datetime import datetime

        return datetime.now().isoformat()
    return timestamp_fn()

# Public names of the compatibility layer. `compat_name` is a public,
# documented helper defined alongside `is_mock_spark`, so it is exported too.
__all__ = [
    "DataFrame",
    "SparkSession",
    "Column",
    "F",
    "types",
    "AnalysisException",
    "Window",
    "desc",
    "get_functions_from_session",
    "get_current_timestamp",
    "is_mock_spark",
    "compat_name",
]

In [None]:
# Module: abstracts.engine (abstracts)
#
# Dependencies: abstracts.reports.transform, abstracts.reports.validation, abstracts.reports.write, abstracts.source, abstracts.step

from __future__ import annotations

from abc import ABC, abstractmethod
# from .reports.transform import TransformReport  # Removed: defined in notebook cells above
# from .reports.validation import ValidationReport  # Removed: defined in notebook cells above
# from .reports.write import WriteReport  # Removed: defined in notebook cells above
# from .source import Source  # Removed: defined in notebook cells above
# from .step import Step  # Removed: defined in notebook cells above

class Engine(ABC):
    """Abstract interface with one hook per report type.

    Concrete engines must override all three methods; the class cannot be
    instantiated while any of them is still abstract. Every hook receives
    the same ``(step, source)`` pair.
    """

    @abstractmethod
    def validate_source(self, step: Step, source: Source) -> ValidationReport:
        """Return a ``ValidationReport`` for ``source`` in the context of ``step``."""

    @abstractmethod
    def transform_source(self, step: Step, source: Source) -> TransformReport:
        """Return a ``TransformReport`` for ``source`` in the context of ``step``."""

    @abstractmethod
    def write_target(self, step: Step, source: Source) -> WriteReport:
        """Return a ``WriteReport`` for ``source`` in the context of ``step``."""

In [None]:
# Module: pipeline_builder_base.models.execution (pipeline_builder_base)
#
# Dependencies: models.base, models.exceptions, models.pipeline, pipeline_builder_base.models.base, pipeline_builder_base.models.enums, pipeline_builder_base.models.exceptions, pipeline_builder_base.models.pipeline

"""
Execution models for the Pipeline Builder.

"""

from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, Optional

# from .base import BaseModel  # Removed: defined in notebook cells above
# from .enums import ExecutionMode, PipelinePhase  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError  # Removed: defined in notebook cells above
# from .pipeline import PipelineMetrics  # Removed: defined in notebook cells above

@dataclass
class ExecutionContext(BaseModel):
    """
    Context for pipeline execution.

    Attributes:
        mode: Execution mode (initial/incremental)
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Total execution duration
        run_id: Unique run identifier
        execution_id: Unique identifier for this execution
        pipeline_id: Identifier for the pipeline being executed
        schema: Target schema for data storage
        started_at: When execution started (alias for start_time)
        ended_at: When execution ended (alias for end_time)
        run_mode: Mode of execution (alias for mode)
        config: Pipeline configuration as dictionary
    """

    mode: ExecutionMode
    start_time: datetime
    end_time: Optional[datetime] = None
    duration_secs: Optional[float] = None
    run_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Additional fields for writer compatibility
    execution_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    pipeline_id: str = "unknown"
    schema: str = "default"
    started_at: Optional[datetime] = None
    ended_at: Optional[datetime] = None
    run_mode: str = "initial"
    config: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Fill the alias fields that were left at their defaults.

        ``started_at`` / ``ended_at`` copy ``start_time`` / ``end_time`` when
        not supplied. ``run_mode`` is derived from ``mode`` only while it
        still holds its default value ``"initial"``.
        """
        if self.started_at is None:
            self.started_at = self.start_time
        if self.ended_at is None:
            self.ended_at = self.end_time
        if self.run_mode == "initial":
            # Map mode to run_mode string: prefer the enum value, else the
            # lowercased member name.
            if hasattr(self.mode, "value"):
                self.run_mode = self.mode.value
            elif hasattr(self.mode, "name"):
                self.run_mode = self.mode.name.lower()

    def validate(self) -> None:
        """Validate the execution context.

        Raises:
            ValueError: If ``run_id`` is empty or ``duration_secs`` is
                negative.
        """
        if not self.run_id:
            raise ValueError("Run ID cannot be empty")
        if self.duration_secs is not None and self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")

    def finish(self) -> None:
        """Mark execution as finished and calculate duration.

        ``end_time`` is always set to the current UTC time (timezone-aware).
        The ``ended_at`` alias is refreshed as well, unless the caller had
        set it to a value different from the previous ``end_time``.

        A naive ``start_time`` is interpreted as local time (the behaviour
        of ``datetime.astimezone`` on naive values) before subtracting, so
        the duration is computed instead of raising ``TypeError`` for mixing
        naive and aware datetimes.
        """
        previous_end = self.end_time
        self.end_time = datetime.now(timezone.utc)

        # Keep the alias in sync when it was unset or merely mirroring the
        # old end_time; an explicitly different value is left untouched.
        if self.ended_at is None or self.ended_at == previous_end:
            self.ended_at = self.end_time

        if self.start_time:
            started = self.start_time
            if started.tzinfo is None or started.utcoffset() is None:
                # Naive start: attach the local timezone so it can be
                # subtracted from the aware end_time.
                started = started.astimezone()
            self.duration_secs = (self.end_time - started).total_seconds()

    @property
    def is_finished(self) -> bool:
        """Check if execution is finished (``end_time`` has been set)."""
        return self.end_time is not None

    @property
    def is_running(self) -> bool:
        """Check if execution is currently running (not yet finished)."""
        return not self.is_finished

@dataclass
class StageStats(BaseModel):
    """
    Row-count and timing statistics for one step of a pipeline stage.

    Attributes:
        stage: Stage name (bronze/silver/gold)
        step: Step name
        total_rows: Total number of rows processed
        valid_rows: Number of valid rows
        invalid_rows: Number of invalid rows
        validation_rate: Validation success rate (0-100)
        duration_secs: Processing duration in seconds
        start_time: When processing started
        end_time: When processing ended
    """

    stage: str
    step: str
    total_rows: int
    valid_rows: int
    invalid_rows: int
    validation_rate: float
    duration_secs: float
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None

    def validate(self) -> None:
        """Check the statistics for internal consistency.

        Raises:
            PipelineConfigurationError: If the row counts do not add up, the
                validation rate lies outside 0-100, or the duration is
                negative. Checks run in that order; the first failure wins.
        """
        accounted_rows = self.valid_rows + self.invalid_rows
        if self.total_rows != accounted_rows:
            raise PipelineConfigurationError(
                f"Total rows ({self.total_rows}) must equal valid ({self.valid_rows}) + invalid ({self.invalid_rows})"
            )

        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise PipelineConfigurationError(
                f"Validation rate must be between 0 and 100, got {self.validation_rate}"
            )

        if self.duration_secs < 0:
            raise PipelineConfigurationError(
                f"Duration must be non-negative, got {self.duration_secs}"
            )

    @property
    def is_valid(self) -> bool:
        """Whether the validation rate reaches the fixed 95% threshold."""
        default_threshold = 95.0
        return self.validation_rate >= default_threshold

    @property
    def error_rate(self) -> float:
        """Invalid rows as a percentage of total rows (0.0 when empty)."""
        return (
            0.0
            if self.total_rows == 0
            else (self.invalid_rows / self.total_rows) * 100
        )

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second (0.0 when the duration is zero)."""
        return (
            0.0
            if self.duration_secs == 0
            else self.total_rows / self.duration_secs
        )

@dataclass
class StepResult(BaseModel):
    """
    Outcome record for a single executed pipeline step.

    Attributes:
        step_name: Name of the step
        phase: Pipeline phase
        success: Whether the step succeeded
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Execution duration in seconds
        rows_processed: Number of rows processed
        rows_written: Number of rows written
        validation_rate: Validation success rate
        error_message: Error message if failed
        step_type: Type of step (bronze, silver, gold)
        table_fqn: Fully qualified table name if step writes to table
        write_mode: Write mode used (overwrite, append)
        input_rows: Number of input rows processed
    """

    step_name: str
    phase: PipelinePhase
    success: bool
    start_time: datetime
    end_time: datetime
    duration_secs: float
    rows_processed: int
    rows_written: int
    validation_rate: float
    error_message: Optional[str] = None
    step_type: Optional[str] = None
    table_fqn: Optional[str] = None
    write_mode: Optional[str] = None
    input_rows: Optional[int] = None

    def validate(self) -> None:
        """Check the result for obviously invalid values.

        Raises:
            ValueError: On an empty step name, a negative duration or row
                count, or a validation rate outside 0-100. Checks run in
                that order and the first failure is reported.
        """
        if not self.step_name:
            raise ValueError("Step name cannot be empty")

        # The three numeric fields share the same non-negative rule.
        non_negative_fields = (
            ("duration_secs", "Duration cannot be negative"),
            ("rows_processed", "Rows processed cannot be negative"),
            ("rows_written", "Rows written cannot be negative"),
        )
        for attribute, message in non_negative_fields:
            if getattr(self, attribute) < 0:
                raise ValueError(message)

        if not 0 <= self.validation_rate <= 100:
            raise ValueError("Validation rate must be between 0 and 100")

    @property
    def is_valid(self) -> bool:
        """Succeeded with a validation rate of at least 95%."""
        return self.success and self.validation_rate >= 95.0

    @property
    def is_high_quality(self) -> bool:
        """Succeeded with a validation rate of at least 98%."""
        return self.success and self.validation_rate >= 98.0

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second (0.0 when the duration is zero)."""
        return (
            0.0
            if self.duration_secs == 0
            else self.rows_processed / self.duration_secs
        )

    @property
    def error_rate(self) -> float:
        """Complement of the validation rate (0.0 when no rows were processed)."""
        return 0.0 if self.rows_processed == 0 else 100.0 - self.validation_rate

    @classmethod
    def create_success(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        rows_processed: int,
        rows_written: int,
        validation_rate: float,
        step_type: Optional[str] = None,
        table_fqn: Optional[str] = None,
        write_mode: Optional[str] = None,
        input_rows: Optional[int] = None,
    ) -> StepResult:
        """Build a result with ``success=True`` and no error message.

        ``duration_secs`` is derived from ``end_time - start_time``.
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=True,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=rows_processed,
            rows_written=rows_written,
            validation_rate=validation_rate,
            error_message=None,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

    @classmethod
    def create_failure(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        error_message: str,
        step_type: Optional[str] = None,
        table_fqn: Optional[str] = None,
        write_mode: Optional[str] = None,
        input_rows: Optional[int] = None,
    ) -> StepResult:
        """Build a result with ``success=False``.

        Row counts and the validation rate are zeroed; ``duration_secs`` is
        derived from ``end_time - start_time``.
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=False,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=0,
            rows_written=0,
            validation_rate=0.0,
            error_message=error_message,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

@dataclass
class ExecutionResult(BaseModel):
    """
    Aggregate outcome of one pipeline execution.

    Attributes:
        context: Execution context
        step_results: Results for each step
        metrics: Overall execution metrics
        success: Whether the entire pipeline succeeded
    """

    context: ExecutionContext
    step_results: list[StepResult]
    metrics: PipelineMetrics
    success: bool

    def validate(self) -> None:
        """Type-check every field.

        Raises:
            PipelineConfigurationError: For the first field (in declaration
                order) whose value is not an instance of the expected type.
        """
        expectations = (
            (
                self.context,
                ExecutionContext,
                "Context must be an ExecutionContext instance",
            ),
            (self.step_results, list, "Step results must be a list"),
            (
                self.metrics,
                PipelineMetrics,
                "Metrics must be a PipelineMetrics instance",
            ),
            (self.success, bool, "Success must be a boolean"),
        )
        for value, expected_type, message in expectations:
            if not isinstance(value, expected_type):
                raise PipelineConfigurationError(message)

    @classmethod
    def from_context_and_results(
        cls, context: ExecutionContext, step_results: list[StepResult]
    ) -> ExecutionResult:
        """Assemble a result from a context and its step results.

        Metrics come from ``PipelineMetrics.from_step_results``; ``success``
        is true only when every step result reports success (and therefore
        also true for an empty list).
        """
        aggregated_metrics = PipelineMetrics.from_step_results(step_results)
        every_step_ok = all(step.success for step in step_results)
        return cls(
            context=context,
            step_results=step_results,
            metrics=aggregated_metrics,
            success=every_step_ok,
        )

In [None]:
# Module: pipeline_builder.pipeline.models (pipeline_builder)
#
# Dependencies: models.pipeline, pipeline_builder_base.models

"""
Pipeline models and data structures for the framework.

This module defines the core data structures used throughout the pipeline system,
providing a clean separation of concerns and better type safety.

"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
# from .models import PipelineMetrics  # Removed: defined in notebook cells above

class PipelineMode(Enum):
    """Pipeline execution modes.

    Each member's value is the lowercase form of its name. That string is
    what ``PipelineReport.to_dict`` emits for the ``"mode"`` key.
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"
    FULL_REFRESH = "full_refresh"
    VALIDATION_ONLY = "validation_only"

class PipelineStatus(Enum):
    """Pipeline execution status.

    Each member's value is the lowercase form of its name. ``COMPLETED`` is
    the only status for which ``PipelineReport.success`` can be true.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PAUSED = "paused"

# PipelineMetrics moved to main models.py to avoid duplication

@dataclass
class PipelineReport:
    """Full report of one pipeline execution, including per-layer results."""

    pipeline_id: str
    execution_id: str
    mode: PipelineMode
    status: PipelineStatus  # Stored as an enum; see `status_str` for the plain string
    start_time: datetime
    end_time: Optional[datetime] = None
    duration_seconds: float = 0.0
    metrics: PipelineMetrics = field(default_factory=PipelineMetrics)
    bronze_results: Dict[str, Any] = field(default_factory=dict)
    silver_results: Dict[str, Any] = field(default_factory=dict)
    gold_results: Dict[str, Any] = field(default_factory=dict)
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """True when the status is COMPLETED and no errors were recorded."""
        completed = self.status == PipelineStatus.COMPLETED
        return completed and len(self.errors) == 0

    @property
    def status_str(self) -> str:
        """The status enum's string value."""
        return self.status.value

    @property
    def successful_steps(self) -> int:
        """Shortcut for ``metrics.successful_steps``."""
        return self.metrics.successful_steps

    @property
    def failed_steps(self) -> int:
        """Shortcut for ``metrics.failed_steps``."""
        return self.metrics.failed_steps

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the report into a plain dictionary.

        Enums are emitted as their ``.value``, datetimes as ISO-8601 strings
        (``end_time`` may be ``None``), and metrics as a nested dictionary.
        The result/error/warning/recommendation containers are included by
        reference, not copied.
        """
        # Metric attributes exported under the "metrics" key, in output order.
        exported_metrics = (
            "total_steps",
            "successful_steps",
            "failed_steps",
            "skipped_steps",
            "total_duration",
            "bronze_duration",
            "silver_duration",
            "gold_duration",
            "total_rows_processed",
            "total_rows_written",
            "cache_hit_rate",
            "error_count",
            "retry_count",
        )
        return {
            "pipeline_id": self.pipeline_id,
            "execution_id": self.execution_id,
            "mode": self.mode.value,
            "status": self.status.value,
            "start_time": self.start_time.isoformat(),
            "end_time": None if not self.end_time else self.end_time.isoformat(),
            "duration_seconds": self.duration_seconds,
            "metrics": {
                metric: getattr(self.metrics, metric) for metric in exported_metrics
            },
            "bronze_results": self.bronze_results,
            "silver_results": self.silver_results,
            "gold_results": self.gold_results,
            "errors": self.errors,
            "warnings": self.warnings,
            "recommendations": self.recommendations,
        }

# PipelineConfig moved to main models.py to avoid duplication

@dataclass
class StepExecutionContext:
    """Context for step execution.

    Holds the identity of a step (name, type, mode), the moment it started,
    and optional dependency names and free-form metadata.
    """

    step_name: str
    step_type: str
    mode: PipelineMode
    start_time: datetime
    dependencies: list[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def duration(self) -> float:
        """Seconds elapsed between ``start_time`` and now.

        "Now" is taken in the same timezone as ``start_time``: naive local
        time when ``start_time`` is naive, otherwise an aware datetime in
        ``start_time.tzinfo``. This avoids the ``TypeError`` Python raises
        when subtracting a naive datetime from an aware one.
        """
        # datetime.now(None) yields naive local time, matching a naive start.
        now = datetime.now(self.start_time.tzinfo)
        return (now - self.start_time).total_seconds()

In [None]:
# Module: pipeline_builder.compat_helpers (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

"""
Compatibility helpers for working with protocol-based Spark sessions.
"""

from __future__ import annotations

from typing import Any, Optional

# from .compat import SparkSession  # Removed: defined in notebook cells above

def create_dataframe_compat(
    spark: SparkSession,  # type: ignore[valid-type]
    data: Any,
    schema: Optional[Any] = None,
    original_method: Optional[Any] = None,
    **kwargs: Any,
) -> Any:
    """
    Create a DataFrame, retrying with different ways of passing the schema.

    Call sequence when ``schema`` is given:
        1. ``factory(data, schema, **kwargs)`` (positional schema).
        2. If that fails with a message containing
           ``NOT_LIST_OR_NONE_OR_STRUCT``: ``factory(data, schema)``, then
           ``factory(data, schema=schema, **kwargs)``.
        3. For any other failure: ``factory(data, schema=schema, **kwargs)``,
           then ``factory(data, schema=schema)``.
    The exception from the last attempt is the one that propagates.

    Args:
        spark: SparkSession instance; only used when ``original_method`` is
            not supplied.
        data: Data to create DataFrame from (list of tuples, list of dicts, etc.)
        schema: Schema definition (list of strings, StructType, or None)
        original_method: Original createDataFrame method (to avoid recursion
            when ``spark.createDataFrame`` has been monkey-patched)
        **kwargs: Additional arguments passed to createDataFrame

    Returns:
        Whatever the underlying createDataFrame callable returns.
    """
    # Prefer the explicitly supplied callable so a patched
    # spark.createDataFrame cannot recurse back into this helper.
    factory = spark.createDataFrame if original_method is None else original_method

    if schema is None:
        return factory(data, **kwargs)

    try:
        # Attempt 1: schema as the second positional argument.
        return factory(data, schema, **kwargs)
    except Exception as first_error:
        hit_struct_bug = "NOT_LIST_OR_NONE_OR_STRUCT" in str(first_error)
        if hit_struct_bug:
            # Known schema-type error: retry bare, then with keyword schema.
            try:
                return factory(data, schema)
            except Exception:
                return factory(data, schema=schema, **kwargs)
        # Any other error: retry with keyword schema, finally without kwargs.
        try:
            return factory(data, schema=schema, **kwargs)
        except Exception:
            return factory(data, schema=schema)

def is_dataframe_like(obj: Any) -> bool:
    """
    Duck-type check for a DataFrame-shaped object.

    The object qualifies when it has ``count``, ``columns`` and ``filter``
    attributes and both ``count`` and ``filter`` are callable. ``columns``
    only needs to exist (it is usually a property, not a method).

    Args:
        obj: Object to check

    Returns:
        True if object has DataFrame-like interface, False otherwise
    """
    has_required_attrs = (
        hasattr(obj, "count") and hasattr(obj, "columns") and hasattr(obj, "filter")
    )
    if not has_required_attrs:
        return False
    # Presence alone is not enough: count and filter must be methods.
    return callable(getattr(obj, "count", None)) and callable(
        getattr(obj, "filter", None)
    )

def detect_spark_type(spark: SparkSession) -> str:
    """
    Classify a session object as PySpark, mock, or unknown.

    Three probes are tried in order:
        1. ``spark.sparkContext._jsc`` exists -> ``"pyspark"``.
        2. The module path of ``type(spark)`` contains ``"pyspark"`` ->
           ``"pyspark"``; contains ``"sparkless"`` or ``"mock"`` -> ``"mock"``.
        3. The configured engine name from ``compat_name()``:
           ``"mock"``/``"sparkless"`` -> ``"mock"``, ``"pyspark"`` ->
           ``"pyspark"``.
    Errors raised by probes 2 and 3 are swallowed.

    Args:
        spark: SparkSession instance to check

    Returns:
        'pyspark', 'mock', or 'unknown'
    """
    # Probe 1: a `_jsc` attribute on the spark context.
    has_jvm_bridge = hasattr(spark, "sparkContext") and hasattr(
        spark.sparkContext, "_jsc"
    )
    if has_jvm_bridge:
        return "pyspark"

    # Probe 2: where the session class was defined.
    try:
        module_path = type(spark).__module__
        if "pyspark" in module_path:
            return "pyspark"
        if any(marker in module_path for marker in ("sparkless", "mock")):
            return "mock"
    except Exception:
        pass

    # Probe 3: the engine name. `compat_name` lives in another notebook cell.
    engine_to_type = {"mock": "mock", "sparkless": "mock", "pyspark": "pyspark"}
    try:
        return engine_to_type.get(compat_name(), "unknown")
    except Exception:
        return "unknown"

def create_test_dataframe(
    spark: SparkSession,  # type: ignore[valid-type]
    data: Any,
    schema: Optional[Any] = None,
    **kwargs: Any,
) -> Any:
    """
    Convenience entry point for building DataFrames in tests.

    Thin wrapper: forwards every argument unchanged to
    ``create_dataframe_compat``, which handles the different ways of passing
    the schema.

    Args:
        spark: SparkSession instance (PySpark or mock-spark)
        data: Data to create DataFrame from (list of tuples, list of dicts, etc.)
        schema: Schema definition (list of strings, StructType, or None)
        **kwargs: Additional arguments passed to createDataFrame

    Returns:
        DataFrame instance
    """
    return create_dataframe_compat(spark, data, schema=schema, **kwargs)

In [None]:
# Module: pipeline_builder.functions (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.protocols

"""
Functions interface protocol for the framework.

This module provides a protocol-based interface for PySpark functions that can
be injected into framework components. This design allows for better testability,
flexibility, and engine abstraction.

**Key Features:**
    - **Protocol-Based**: Uses Python Protocol for type safety and duck typing
    - **Engine Abstraction**: Works with both real PySpark and mock implementations
    - **Injection Support**: Functions can be injected via engine configuration
    - **Type Safety**: Provides type hints for all function signatures

**Usage:**
    The functions protocol is typically accessed through the compat module:

    >>> from pipeline_builder.compat import F
    >>> df.select(F.col("id"), F.lit("test"))

    Or via the get_default_functions() helper:

    >>> from pipeline_builder.functions import get_default_functions
    >>> F = get_default_functions()
    >>> df.select(F.col("id"))

**Supported Functions:**
    The protocol defines common PySpark functions including:
    - Column operations: col, expr, lit, when
    - Aggregations: count, countDistinct, sum, max, min, avg
    - String functions: length
    - Date functions: date_trunc, dayofweek, current_timestamp

Dependencies:
    - compat: Compatibility layer for engine detection

Example:
    >>> from pipeline_builder.functions import FunctionsProtocol, get_default_functions
    >>> from pipeline_builder.compat import F
    >>>
    >>> # Get functions from compat module
    >>> functions = get_default_functions()
    >>> # Use functions for DataFrame operations
    >>> df.select(functions.col("id"), functions.lit("value"))
"""

from __future__ import annotations

from typing import Optional, Protocol, Union, cast

# from .protocols import ColumnProtocol  # Removed: defined in notebook cells above

class FunctionsProtocol(Protocol):
    """Protocol for PySpark functions interface.

    This protocol defines the interface that all functions implementations
    must satisfy. It includes common PySpark functions for column operations,
    aggregations, and transformations.

    **Implementation Requirements:**
        Any class or module implementing this protocol must provide all
        the methods defined here with matching signatures. The protocol
        supports both real PySpark functions and mock implementations
        for testing.

    **Common Implementations:**
        - PySpark `pyspark.sql.functions` module
        - Mock functions for testing (see test utilities)
        - Custom function wrappers for specific engines

    Note:
        Literal-value parameters are annotated with a single flat
        ``Optional[Union[...]]``. The earlier spelling combined two
        ``Union[...]`` objects with ``|``, which cannot be evaluated at
        runtime (e.g. by ``typing.get_type_hints``) before Python 3.10.

    Example:
        >>> from pipeline_builder.functions import FunctionsProtocol
        >>> from pipeline_builder.compat import F
        >>>
        >>> # F implements FunctionsProtocol
        >>> def use_functions(f: FunctionsProtocol):
        ...     return f.col("id")
        >>>
        >>> result = use_functions(F)
    """

    def col(self, col_name: str) -> ColumnProtocol:
        """Create a column reference.

        Args:
            col_name: Name of the column to reference.

        Returns:
            Column expression representing the column reference.

        Example:
            >>> F.col("user_id")
        """
        ...

    def expr(self, expr: str) -> ColumnProtocol:
        """Create an expression from a string.

        Args:
            expr: Expression text.

        Returns:
            Column expression for the given text.
        """
        ...

    def lit(self, value: Optional[Union[str, int, float, bool]]) -> ColumnProtocol:
        """Create a literal column.

        Args:
            value: Literal value (string, int, float, bool, or None).

        Returns:
            Column expression holding the literal.
        """
        ...

    def when(
        self,
        condition: ColumnProtocol,
        value: Optional[Union[str, int, float, bool]],
    ) -> ColumnProtocol:
        """Create a conditional expression.

        Args:
            condition: Column expression used as the condition.
            value: Literal value (string, int, float, bool, or None)
                associated with the condition.

        Returns:
            Column expression representing the conditional.
        """
        ...

    def count(self, col: Union[str, ColumnProtocol] = "*") -> ColumnProtocol:
        """Create a count aggregation.

        Args:
            col: Column name or expression; defaults to ``"*"``.
        """
        ...

    def countDistinct(self, *cols: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a count distinct aggregation.

        Args:
            *cols: One or more column names or expressions.
        """
        ...

    def sum(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a sum aggregation over a column name or expression."""
        ...

    def max(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a max aggregation over a column name or expression."""
        ...

    def min(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a min aggregation over a column name or expression."""
        ...

    def avg(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create an average aggregation over a column name or expression."""
        ...

    def length(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a length function over a column name or expression."""
        ...

    def date_trunc(
        self, format: str, col: Union[str, ColumnProtocol]
    ) -> ColumnProtocol:
        """Create a date truncation function.

        Args:
            format: Truncation unit/format string.
            col: Column name or expression to truncate.
        """
        ...

    def dayofweek(self, col: Union[str, ColumnProtocol]) -> ColumnProtocol:
        """Create a day of week function over a column name or expression."""
        ...

    def current_timestamp(self) -> ColumnProtocol:
        """Create a current timestamp function."""
        ...

def get_default_functions() -> FunctionsProtocol:
    """Return the PySpark functions module typed as ``FunctionsProtocol``.

    In this notebook the functions implementation is always
    ``pyspark.sql.functions`` (the same module the import cell binds to
    ``F``). The module is handed back through ``typing.cast`` so callers get
    a typed handle suitable for dependency injection; no wrapping or copying
    takes place.

    Returns:
        The ``pyspark.sql.functions`` module, presented as a
        FunctionsProtocol.

    Example:
        >>> F = get_default_functions()
        >>> df.select(F.col("id"), F.count("*"))
    """
    # Imported lazily, inside the function, exactly where the original
    # resolved it (the compat indirection was removed for the notebook).
    import pyspark.sql.functions as spark_functions

    return cast(FunctionsProtocol, spark_functions)

In [None]:
# Module: pipeline_builder.types (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

"""
Simplified type definitions for the framework.

This module provides essential type definitions and aliases
for better type safety without over-engineering.

"""

from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Protocol, Union

# from .compat import Column, DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

# ============================================================================
# Basic Type Aliases
# ============================================================================

# String types: plain ``str`` aliases that name what a given string stands for
StepName = str
PipelineId = str
ExecutionId = str
TableName = str
SchemaName = str
ErrorCode = str

# Numeric types: aliases of the built-in numeric types
QualityRate = float
Duration = float
RowCount = int

# Dictionary types: common dict/list shapes, including optional variants
StringDict = Dict[str, str]
NumericDict = Dict[str, Union[int, float]]
GenericDict = Dict[str, Any]
OptionalDict = Optional[Dict[str, Any]]
OptionalList = Optional[List[Any]]

# ============================================================================
# Enums
# ============================================================================

class StepType(Enum):
    """Types of pipeline steps.

    Each member's value is the lowercase name of the layer it identifies.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

class StepStatus(Enum):
    """Step execution status.

    Each member's value is the lowercase form of its name.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"

# PipelineMode moved to pipeline/models.py to avoid duplication

# ============================================================================
# Function Types
# ============================================================================

# Transform function types
# Note: in this notebook SparkSession, DataFrame, and Column are the PySpark
# classes imported in the external-imports cell; the compat import that
# originally supplied them is commented out above.
# NOTE(review): the pipeline_builder.models.types cell later in this notebook
# rebinds TransformFunction, SilverTransformFunction and GoldTransformFunction.
# Its TransformFunction takes only a DataFrame, unlike the two-argument form
# here - confirm which definition callers are meant to see.
TransformFunction = Callable[[SparkSession, DataFrame], DataFrame]
BronzeTransformFunction = Callable[[SparkSession, DataFrame], DataFrame]
# Arguments: session, one DataFrame, a dict of DataFrames, and an optional
# second dict of DataFrames.
SilverTransformFunction = Callable[
    [SparkSession, DataFrame, Dict[str, DataFrame], Optional[Dict[str, DataFrame]]],
    DataFrame,
]
# Arguments: session, a dict of DataFrames, and an optional second dict.
GoldTransformFunction = Callable[
    [SparkSession, Dict[str, DataFrame], Optional[Dict[str, DataFrame]]], DataFrame
]

# Filter function type: takes a DataFrame and returns a DataFrame
FilterFunction = Callable[[DataFrame], DataFrame]

# ============================================================================
# Data Types
# ============================================================================

# Column rules type: maps a string key to a list of rules, each rule being
# either a string or a Column
ColumnRules = Dict[str, List[Union[str, Column]]]

# Result types: untyped string-keyed dictionaries
StepResult = Dict[str, Any]
PipelineResult = Dict[str, Any]
ExecutionResultDict = Dict[str, Any]
ValidationResultDict = Dict[str, Any]

# Context types: untyped string-keyed dictionaries
StepContext = Dict[str, Any]
ExecutionContext = Dict[str, Any]

# Configuration types: untyped string-keyed dictionaries
PipelineConfigDict = Dict[str, Any]
ExecutionConfig = Dict[str, Any]
ValidationConfig = Dict[str, Any]
MonitoringConfig = Dict[str, Any]

# Quality types: string keys mapped to float values
QualityThresholds = Dict[str, float]

# Error types
ErrorContext = Dict[str, Any]
ErrorSuggestions = List[str]

# ============================================================================
# Protocols (Simplified)
# ============================================================================

class Validatable(Protocol):
    """Protocol for objects that can be validated.

    NOTE(review): the pipeline_builder.models.types cell later in this
    notebook defines another ``Validatable``; in a shared namespace that
    later definition replaces this one.
    """

    def validate(self) -> None:
        """Validate the object and raise ValidationError if invalid."""
        ...

class Serializable(Protocol):
    """Protocol for objects that can be serialized.

    NOTE(review): the pipeline_builder.models.types cell later in this
    notebook defines another ``Serializable`` that also requires
    ``to_json``; in a shared namespace that later definition replaces this
    one.
    """

    def to_dict(self) -> Dict[str, Any]:
        """Convert object to dictionary."""
        ...

In [None]:
# Module: pipeline_builder.transformation.transform_service (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models, pipeline_builder_base.logging

"""Transform service for applying transformations.

This module provides a service for applying transformation functions to
DataFrames. The TransformService handles transform function execution and
context preparation for Silver and Gold steps.
"""

from __future__ import annotations

from typing import Dict, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..models import GoldStep, SilverStep  # Removed: defined in notebook cells above

class TransformService:
    """Run the user-supplied transform functions of Silver and Gold steps.

    The service holds the Spark session and a logger, and forwards the
    session plus the relevant DataFrames to a step's ``transform`` callable.
    It keeps the "how to call a transform" concern separate from the
    execution flow that decides when to call it.

    Attributes:
        spark: SparkSession handed to every transform function.
        logger: PipelineLogger instance kept for logging.

    Example:
        >>> service = TransformService(spark)
        >>> result = service.apply_silver_transform(
        ...     step=silver_step,
        ...     bronze_df=bronze_data,
        ...     silvers={}
        ... )
        >>> gold_result = service.apply_gold_transform(
        ...     step=gold_step,
        ...     silvers={"clean_events": silver_data}
        ... )
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """Store the session and pick a logger.

        Args:
            spark: Active SparkSession passed on to transform functions.
            logger: PipelineLogger to use. When omitted (or falsy), a
                default PipelineLogger is created.
        """
        self.spark = spark
        self.logger = logger if logger else PipelineLogger()

    def apply_silver_transform(
        self,
        step: SilverStep,
        bronze_df: DataFrame,
        silvers: Dict[str, DataFrame],
    ) -> DataFrame:
        """Run a Silver step's transform function.

        The transform is invoked as
        ``transform(spark, bronze_df, silvers)`` and its return value is
        passed back unchanged.

        Args:
            step: SilverStep whose ``transform`` callable is executed.
            bronze_df: Bronze DataFrame given to the transform as its source.
            silvers: Dictionary of silver DataFrames forwarded to the
                transform (may be empty).

        Returns:
            Whatever the step's transform function returns.

        Raises:
            ValueError: If ``step.transform`` is None.
        """
        silver_fn = step.transform
        if silver_fn is not None:
            return silver_fn(self.spark, bronze_df, silvers)  # type: ignore[call-arg,misc]

        raise ValueError(f"Silver step '{step.name}' requires a transform function")

    def apply_gold_transform(
        self,
        step: GoldStep,
        silvers: Dict[str, DataFrame],
    ) -> DataFrame:
        """Run a Gold step's transform function.

        The transform is invoked as ``transform(spark, silvers)`` and its
        return value is passed back unchanged.

        Args:
            step: GoldStep whose ``transform`` callable is executed.
            silvers: Dictionary mapping silver step names to DataFrames,
                forwarded to the transform as-is.

        Returns:
            Whatever the step's transform function returns.

        Raises:
            ValueError: If ``step.transform`` is None.
        """
        gold_fn = step.transform
        if gold_fn is not None:
            return gold_fn(self.spark, silvers)  # type: ignore[call-arg,misc]

        raise ValueError(f"Gold step '{step.name}' requires a transform function")

In [None]:
# Module: pipeline_builder.models.types (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

"""
Type definitions and protocols for the Pipeline Builder models.

This module provides type aliases, protocols, and type definitions used
throughout the pipeline system. It defines the structure of validation rules,
transform functions, and model values.

Key Components:
    - **Type Aliases**: ColumnRules, TransformFunction, SilverTransformFunction,
      GoldTransformFunction for better code readability
    - **Protocols**: Validatable, Serializable for type checking and duck typing
    - **Model Types**: ModelValue, ColumnRule, ResourceValue for type safety

Dependencies:
    - compat: Compatibility layer for Spark/PySpark types

Example:
    >>> from pipeline_builder.models.types import ColumnRules, SilverTransformFunction
    >>> from pipeline_builder.compat import SparkSession, DataFrame
    >>>
    >>> # Define validation rules
    >>> rules: ColumnRules = {
    ...     "user_id": [F.col("user_id").isNotNull()],
    ...     "email": [F.col("email").contains("@")]
    ... }
    >>>
    >>> # Define transform function
    >>> def clean_data(spark: SparkSession, bronze_df: DataFrame, prior_silvers: dict) -> DataFrame:
    ...     return bronze_df.filter(F.col("user_id").isNotNull())
    >>>
    >>> transform: SilverTransformFunction = clean_data
"""

from typing import Callable, Dict, List, Optional, Protocol, TypeVar, Union

# from ..compat import Column, DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

# Specific types for model values instead of Any
ModelValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
# A single rule is a PySpark Column expression, a string, or a boolean.
# (Previously spelled with DataFrame, which contradicted both this comment
# and ColumnRules below, where rule entries are ``Union[str, Column]``.)
ColumnRule = Union[Column, str, bool]
ResourceValue = Union[str, int, float, bool, List[str], Dict[str, str]]

# Type aliases for better readability
# ColumnRules maps a string key to a list of rules (strings or Columns).
ColumnRules = Dict[str, List[Union[str, Column]]]
# NOTE(review): the pipeline_builder.types cell earlier in this notebook
# declares TransformFunction as Callable[[SparkSession, DataFrame], DataFrame];
# this later binding (DataFrame only) is the one that wins in a shared
# namespace - confirm which shape is intended.
TransformFunction = Callable[[DataFrame], DataFrame]
SilverTransformFunction = Callable[
    [SparkSession, DataFrame, Dict[str, DataFrame], Optional[Dict[str, DataFrame]]],
    DataFrame,
]
GoldTransformFunction = Callable[
    [SparkSession, Dict[str, DataFrame], Optional[Dict[str, DataFrame]]], DataFrame
]

# Generic type for pipeline results
T = TypeVar("T")

class Validatable(Protocol):
    """Protocol for objects that can be validated.

    This protocol defines the interface for objects that support validation.
    Any class implementing this protocol must provide a `validate` method
    that checks the object's state and raises an exception if invalid.
    Matching is structural: a class does not need to inherit from this
    protocol, it only needs a compatible `validate` method.

    Example:
        >>> class MyModel:
        ...     name = "example"
        ...
        ...     def validate(self) -> None:
        ...         if not self.name:
        ...             raise ValueError("Name required")
        >>>
        >>> def check_valid(obj: Validatable) -> None:
        ...     obj.validate()
        >>>
        >>> model = MyModel()
        >>> check_valid(model)  # Type checker accepts this
    """

    def validate(self) -> None:
        """Validate the object and raise ValidationError if invalid.

        Returns:
            None. A normal return means the object is considered valid.

        Raises:
            ValidationError: If the object is invalid. Subclasses should
                raise specific error types (e.g., PipelineValidationError).
        """
        ...

class Serializable(Protocol):
    """Protocol for objects that can be serialized.

    This protocol defines the interface for objects that support serialization
    to dictionaries and JSON strings. Any class implementing this protocol
    must provide `to_dict` and `to_json` methods. Matching is structural: a
    class does not need to inherit from this protocol.

    Example:
        >>> import json
        >>>
        >>> class MyModel:
        ...     name = "example"
        ...
        ...     def to_dict(self) -> Dict[str, ModelValue]:
        ...         return {"name": self.name}
        ...
        ...     def to_json(self) -> str:
        ...         return json.dumps(self.to_dict())
        >>>
        >>> def serialize(obj: Serializable) -> str:
        ...     return obj.to_json()
        >>>
        >>> model = MyModel()
        >>> serialize(model)  # Type checker accepts this
    """

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert object to dictionary.

        Returns:
            Dictionary representation of the object with all fields
            converted to primitive types or dictionaries.
        """
        ...

    def to_json(self) -> str:
        """Convert object to JSON string.

        Returns:
            JSON string representation of the object.
        """
        ...

In [None]:
# Module: pipeline_builder.storage.schema_manager (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.storage.schema_utils, pipeline_builder_base.errors, pipeline_builder_base.logging

"""Schema manager for table schema validation and management.

This module provides schema validation and management functionality. The
SchemaManager handles schema creation, retrieval, and validation operations
for pipeline tables.
"""

from __future__ import annotations

from typing import Any, Optional, Tuple
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import SparkSession  # Removed: defined in notebook cells above
# from .schema_utils import get_existing_schema_safe, schemas_match  # Removed: defined in notebook cells above

class SchemaManager:
    """Manages schema validation and operations.

    Handles schema existence checks, validation, and schema matching. Provides
    centralized schema management for the pipeline execution engine.

    Attributes:
        spark: SparkSession instance for schema operations.
        logger: PipelineLogger instance for logging.

    Example:
        >>> manager = SchemaManager(spark)
        >>> manager.ensure_schema_exists("analytics")
        >>> schema = manager.get_table_schema("analytics.events")
        >>> matches, differences = manager.validate_schema_match(
        ...     "analytics.events", output_schema, ExecutionMode.INCREMENTAL, "clean_events"
        ... )
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the schema manager.

        Args:
            spark: Active SparkSession instance for schema operations.
            logger: Optional PipelineLogger instance. If None, creates a
                default logger.
        """
        self.spark = spark
        self.logger = logger or PipelineLogger()

    def _list_database_names(self) -> list[str]:
        """Return the names of all databases reported by the Spark catalog."""
        return [db.name for db in self.spark.catalog.listDatabases()]

    def ensure_schema_exists(self, schema: str) -> None:
        """Ensure a schema exists, creating it if necessary.

        First looks the schema up in the catalog's database list and returns
        early when it is already there. Otherwise issues
        ``CREATE SCHEMA IF NOT EXISTS`` and re-reads the catalog to confirm
        the schema is now listed.

        Args:
            schema: Schema name to create or verify. It is interpolated
                directly into the SQL statement.

        Raises:
            ExecutionError: If the CREATE statement (or the follow-up catalog
                read) fails, or if the schema is still missing afterwards.
        """
        # Best-effort pre-check: a failure to list databases must not block
        # creation, but it is logged instead of being silently discarded.
        try:
            if schema in self._list_database_names():
                return  # Schema already exists, nothing to do
        except Exception as check_error:
            self.logger.debug(
                f"Could not list databases before creating schema '{schema}': {check_error}"
            )

        try:
            # Use SQL CREATE SCHEMA (works for both PySpark and mock-spark)
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            # Verify it was created
            databases = self._list_database_names()
            if schema not in databases:
                raise ExecutionError(
                    f"Schema '{schema}' creation via SQL failed - schema not in catalog. "
                    f"Available databases: {databases}"
                )
        except ExecutionError:
            raise  # Already the right error type; do not wrap it again
        except Exception as e:
            # Wrap other exceptions
            raise ExecutionError(f"Failed to create schema '{schema}': {str(e)}") from e

    def get_table_schema(
        self,
        table_name: str,
        refresh: bool = False,
    ) -> Optional[Any]:
        """
        Get the schema of an existing table.

        Args:
            table_name: Fully qualified table name
            refresh: Whether to issue ``REFRESH TABLE`` before reading the
                schema. A failing refresh is logged at debug level and the
                schema read still proceeds.

        Returns:
            The result of ``get_existing_schema_safe`` for the table: the
            schema when it can be read, None otherwise.
        """
        if refresh:
            try:
                self.spark.sql(f"REFRESH TABLE {table_name}")
            except Exception as refresh_error:
                # Refresh might fail for some table types - log but continue
                self.logger.debug(
                    f"Could not refresh table {table_name} before schema read: {refresh_error}"
                )

        return get_existing_schema_safe(self.spark, table_name)

    def validate_schema_match(
        self,
        table_name: str,
        output_schema: Any,
        mode: Any,
        step_name: str,
    ) -> Tuple[bool, list[str]]:
        """
        Validate that output schema matches existing table schema.

        The existing schema is read after a metadata refresh. Every failure
        (unreadable schema, empty schema, or a mismatch reported by
        ``schemas_match``) raises, so a normal return always means the
        schemas matched.

        Args:
            table_name: Fully qualified table name
            output_schema: Schema of the output DataFrame
            mode: Execution mode. Its ``value`` attribute is used in error
                messages when present; otherwise the object itself is used.
            step_name: Name of the step being validated

        Returns:
            Tuple of (matches: bool, differences: list[str]) as returned by
            ``schemas_match``.

        Raises:
            ExecutionError: If the schema cannot be read, is empty, or does
                not match the output schema.
        """
        # ``mode`` is typed as Any: accept enum-like objects and plain values
        # alike so that building an error message never raises AttributeError.
        mode_label = getattr(mode, "value", mode)

        existing_schema = self.get_table_schema(table_name, refresh=True)

        if existing_schema is None:
            # Cannot read schema - raise error
            raise ExecutionError(
                f"Cannot read schema for table '{table_name}' in {mode_label} mode. "
                "Schema validation is required for INCREMENTAL and FULL_REFRESH modes.",
                context={
                    "step_name": step_name,
                    "table": table_name,
                    "mode": mode_label,
                },
                suggestions=[
                    "Ensure the table exists and is accessible",
                    "Check that the table schema is readable",
                    "Use INITIAL mode if you need to recreate the table",
                ],
            )

        # If catalog reports empty schema, treat as mismatch with explicit guidance
        if not existing_schema.fields:
            raise ExecutionError(
                f"Schema mismatch for table '{table_name}' in {mode_label} mode. "
                f"Catalog reports empty schema (struct<>), but output schema has {len(output_schema.fields)} fields: {[f.name for f in output_schema.fields]}. "
                f"Use INITIAL mode to recreate the table or provide schema_override explicitly.",
                context={
                    "step_name": step_name,
                    "table": table_name,
                    "mode": mode_label,
                    "existing_schema": "struct<> (empty - catalog sync issue)",
                    "output_schema": str(output_schema),
                },
                suggestions=[
                    "Run initial_load/full_refresh to recreate the table with the desired schema",
                    "Provide schema_override to force the schema in allowed modes",
                ],
            )

        matches, differences = schemas_match(existing_schema, output_schema)

        if not matches:
            # One difference per line in the error message.
            difference_lines = "\n".join(differences)
            raise ExecutionError(
                f"Schema mismatch for table '{table_name}' in {mode_label} mode. "
                f"Schema changes are only allowed in INITIAL mode.\n"
                f"{difference_lines}\n\n"
                f"Existing table schema: {existing_schema}\n"
                f"Output DataFrame schema: {output_schema}",
                context={
                    "step_name": step_name,
                    "table": table_name,
                    "mode": mode_label,
                    "existing_schema": str(existing_schema),
                    "output_schema": str(output_schema),
                },
                suggestions=[
                    "Ensure the output schema matches the existing table schema exactly",
                    "Run with INITIAL mode to recreate the table with the new schema",
                    "Manually update the existing table schema to match the new schema",
                ],
            )

        return matches, differences

In [None]:
# Module: pipeline_builder.writer.query_builder (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

# mypy: ignore-errors
"""
Query builder module for common PySpark DataFrame operations.

This module provides reusable query builders and common aggregations
to reduce code duplication across the writer modules.

"""

from __future__ import annotations

from datetime import datetime, timedelta
from typing import Any, Dict

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

# Import specific functions for convenience
# from ..compat import F as functions  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

class QueryBuilder:
    """Builder class for common PySpark DataFrame operations."""

    @staticmethod
    def filter_by_date_range(
        df: DataFrame,
        days: int = 30,
        date_column: str = "created_at",
    ) -> DataFrame:
        """
        Keep rows whose date column is on or after a cutoff ``days`` ago.

        Only a lower bound is applied; there is no upper bound on the date.

        Args:
            df: Input DataFrame
            days: Number of days to look back from the current local time
            date_column: Name of the column compared against the cutoff

        Returns:
            Filtered DataFrame
        """
        # Use ``F`` (imported at the top of this cell). The ``functions``
        # alias this method used before is no longer bound there, because the
        # compat import that created it was replaced by the ``F`` import.
        cutoff = datetime.now() - timedelta(days=days)
        # The cutoff is handed to Spark as a "YYYY-MM-DD" string literal, so
        # its time-of-day component is dropped.
        cutoff_literal = F.lit(cutoff.strftime("%Y-%m-%d"))
        return df.filter(F.col(date_column) >= cutoff_literal)

    @staticmethod
    def add_date_column(
        df: DataFrame,
        date_column: str = "created_at",
        output_column: str = "date",
        format: str = "yyyy-MM-dd",
    ) -> DataFrame:
        """
        Add formatted date column to DataFrame.

        Args:
            df: Input DataFrame
            date_column: Source date column name
            output_column: Output column name
            format: Date format pattern passed to ``date_format``

        Returns:
            DataFrame with the formatted column added under ``output_column``
        """
        # Use ``F`` (imported at the top of this cell); the ``functions``
        # alias used previously is not bound in this notebook cell.
        formatted = F.date_format(F.col(date_column), format)
        return df.withColumn(output_column, formatted)

    @staticmethod
    def get_common_aggregations() -> Dict[str, Any]:
        """
        Get common aggregation functions.

        The expressions reference the columns ``validation_rate``,
        ``execution_time``, ``rows_written`` and ``success``. Each expression
        carries its own alias, which is not always the same as its dictionary
        key (for example ``count_all`` is aliased ``total_executions``).

        Returns:
            Dictionary mapping a lookup key to an aliased aggregation column
        """
        # Use ``F`` (imported at the top of this cell); the ``functions``
        # alias used previously is not bound in this notebook cell.
        validation_rate = F.col("validation_rate")
        succeeded = F.col("success")

        def count_where(condition: Any, alias: str) -> Any:
            """Count rows matching ``condition`` by summing a 1/0 flag."""
            return F.sum(F.when(condition, 1).otherwise(0)).alias(alias)

        return {
            "count_all": F.count("*").alias("total_executions"),
            "count_rows": F.count("*").alias("execution_count"),
            "avg_validation_rate": F.avg("validation_rate").alias(
                "avg_validation_rate"
            ),
            "min_validation_rate": F.min("validation_rate").alias(
                "min_validation_rate"
            ),
            "max_validation_rate": F.max("validation_rate").alias(
                "max_validation_rate"
            ),
            "stddev_validation_rate": F.stddev("validation_rate").alias(
                "stddev_validation_rate"
            ),
            "avg_execution_time": F.avg("execution_time").alias("avg_execution_time"),
            "min_execution_time": F.min("execution_time").alias("min_execution_time"),
            "max_execution_time": F.max("execution_time").alias("max_execution_time"),
            "stddev_execution_time": F.stddev("execution_time").alias(
                "stddev_execution_time"
            ),
            "sum_rows_written": F.sum("rows_written").alias("total_rows_written"),
            "successful_executions": count_where(succeeded, "successful_executions"),
            "failed_executions": count_where(~succeeded, "failed_executions"),
            # Quality buckets: >= 95.0 counts as high, < 80.0 counts as low.
            "high_quality_executions": count_where(
                validation_rate >= 95.0, "high_quality_executions"
            ),
            "low_quality_executions": count_where(
                validation_rate < 80.0, "low_quality_executions"
            ),
        }

    @staticmethod
    def get_quality_aggregations() -> Dict[str, Any]:
        """
        Select the quality-related subset of the common aggregations.

        Returns:
            Dictionary of quality aggregations, keyed by output name
        """
        common = QueryBuilder.get_common_aggregations()
        # (output key, key in the common aggregation pool), in output order.
        selection = (
            ("total_executions", "count_all"),
            ("avg_validation_rate", "avg_validation_rate"),
            ("min_validation_rate", "min_validation_rate"),
            ("max_validation_rate", "max_validation_rate"),
            ("stddev_validation_rate", "stddev_validation_rate"),
            ("high_quality_executions", "high_quality_executions"),
            ("low_quality_executions", "low_quality_executions"),
        )
        return {output_key: common[source_key] for output_key, source_key in selection}

    @staticmethod
    def get_performance_aggregations() -> Dict[str, Any]:
        """
        Select the performance-related subset of the common aggregations.

        Returns:
            Dictionary of performance aggregations, keyed by output name
        """
        common = QueryBuilder.get_common_aggregations()
        # (output key, key in the common aggregation pool), in output order.
        selection = (
            ("execution_count", "count_rows"),
            ("avg_execution_time", "avg_execution_time"),
            ("min_execution_time", "min_execution_time"),
            ("max_execution_time", "max_execution_time"),
            ("stddev_execution_time", "stddev_execution_time"),
            ("avg_validation_rate", "avg_validation_rate"),
            ("total_rows_written", "sum_rows_written"),
            ("successful_executions", "successful_executions"),
        )
        return {output_key: common[source_key] for output_key, source_key in selection}

    @staticmethod
    def get_trend_aggregations() -> Dict[str, Any]:
        """
        Select the trend-related subset of the common aggregations.

        Returns:
            Dictionary of trend aggregations, keyed by output name
        """
        common = QueryBuilder.get_common_aggregations()
        # (output key, key in the common aggregation pool), in output order.
        selection = (
            ("daily_executions", "count_all"),
            ("successful_executions", "successful_executions"),
            ("failed_executions", "failed_executions"),
            ("avg_execution_time", "avg_execution_time"),
            ("total_rows_written", "sum_rows_written"),
        )
        return {output_key: common[source_key] for output_key, source_key in selection}

    @staticmethod
    def build_daily_trends_query(
        df: DataFrame,
        days: int = 30,
    ) -> DataFrame:
        """
        Build daily trends query with common aggregations.

        Rows are limited to the look-back window, a ``date`` column is added
        with the default settings of ``add_date_column``, and the trend
        aggregations are computed per date. Output columns are named after
        the keys of ``get_trend_aggregations()``.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with daily trends, ordered by ``date``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        dated_df = QueryBuilder.add_date_column(filtered_df)
        trend_aggs = QueryBuilder.get_trend_aggregations()

        # agg() accepts column expressions positionally and has no keyword
        # form, so alias each expression with its dictionary key and unpack.
        named_aggs = [expr.alias(name) for name, expr in trend_aggs.items()]

        return dated_df.groupBy("date").agg(*named_aggs).orderBy("date")

    @staticmethod
    def build_phase_trends_query(
        df: DataFrame,
        days: int = 30,
    ) -> DataFrame:
        """
        Build phase trends query with common aggregations.

        Rows are limited to the look-back window and the performance
        aggregations are computed per ``phase``. Output columns are named
        after the keys of ``get_performance_aggregations()``.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with phase trends, ordered by ``phase``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        performance_aggs = QueryBuilder.get_performance_aggregations()

        # agg() accepts column expressions positionally and has no keyword
        # form, so alias each expression with its dictionary key and unpack.
        named_aggs = [expr.alias(name) for name, expr in performance_aggs.items()]

        return filtered_df.groupBy("phase").agg(*named_aggs).orderBy("phase")

    @staticmethod
    def build_step_trends_query(
        df: DataFrame,
        days: int = 30,
    ) -> DataFrame:
        """
        Build step trends query with common aggregations.

        Rows are limited to the look-back window and the performance
        aggregations are computed per ``step``. Output columns are named
        after the keys of ``get_performance_aggregations()``.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with step trends, ordered by ``avg_execution_time``
            descending
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        performance_aggs = QueryBuilder.get_performance_aggregations()

        # agg() accepts column expressions positionally and has no keyword
        # form, so alias each expression with its dictionary key and unpack.
        named_aggs = [expr.alias(name) for name, expr in performance_aggs.items()]

        # Use ``F`` (imported at the top of this cell); the ``functions``
        # alias used previously is not bound in this notebook cell.
        return (
            filtered_df.groupBy("step")
            .agg(*named_aggs)
            .orderBy(F.desc("avg_execution_time"))
        )

    @staticmethod
    def build_quality_trends_query(
        df: DataFrame,
        days: int = 30,
    ) -> DataFrame:
        """
        Build quality trends query with common aggregations.

        Rows are limited to the look-back window, a ``date`` column is added
        with the default settings of ``add_date_column``, and the quality
        aggregations are computed per date. Output columns are named after
        the keys of ``get_quality_aggregations()``.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with quality trends, ordered by ``date``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        dated_df = QueryBuilder.add_date_column(filtered_df)
        quality_aggs = QueryBuilder.get_quality_aggregations()

        # agg() accepts column expressions positionally and has no keyword
        # form, so alias each expression with its dictionary key and unpack.
        named_aggs = [expr.alias(name) for name, expr in quality_aggs.items()]

        return dated_df.groupBy("date").agg(*named_aggs).orderBy("date")

    @staticmethod
    def build_overall_metrics_query(
        df: DataFrame,
        days: int = 30,
    ) -> DataFrame:
        """
        Build overall metrics query.

        Rows are limited to the look-back window and the quality
        aggregations are computed over the whole filtered DataFrame (no
        grouping). Output columns are named after the keys of
        ``get_quality_aggregations()``.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with overall metrics
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        quality_aggs = QueryBuilder.get_quality_aggregations()

        # agg() accepts column expressions positionally and has no keyword
        # form, so alias each expression with its dictionary key and unpack.
        named_aggs = [expr.alias(name) for name, expr in quality_aggs.items()]

        return filtered_df.agg(*named_aggs)

    @staticmethod
    def build_anomaly_detection_query(
        df: DataFrame,
        threshold_column: str,
        threshold_value: float,
    ) -> DataFrame:
        """
        Select the rows whose value in one column lies below a threshold.

        Args:
            df: Input DataFrame
            threshold_column: Name of the column compared against the threshold
            threshold_value: Rows are kept when the column value is strictly
                less than this value

        Returns:
            DataFrame containing only the rows that satisfy the comparison
        """
        below_threshold = functions.col(threshold_column) < threshold_value
        return df.filter(below_threshold)

    @staticmethod
    def build_performance_anomaly_query(
        df: DataFrame,
        performance_threshold: float,
        min_validation_rate: float = 80.0,
    ) -> DataFrame:
        """
        Build performance anomaly detection query.

        A row is kept when at least one of these conditions holds:
        ``execution_time`` is greater than ``performance_threshold``,
        ``validation_rate`` is less than ``min_validation_rate``, or
        ``success`` is false.

        Args:
            df: Input DataFrame
            performance_threshold: Execution-time value above which a row is
                flagged
            min_validation_rate: Validation rate below which a row is flagged.
                Defaults to 80.0, the value that used to be hard-coded, so
                existing callers see no change.

        Returns:
            DataFrame with performance anomalies
        """
        too_slow = functions.col("execution_time") > performance_threshold
        low_validation_rate = functions.col("validation_rate") < min_validation_rate
        failed = ~functions.col("success")

        return df.filter(too_slow | low_validation_rate | failed)

    @staticmethod
    def build_quality_anomaly_query(
        df: DataFrame,
        quality_threshold: float = 90.0,
    ) -> DataFrame:
        """
        Select the rows whose ``validation_rate`` is below a quality threshold.

        Args:
            df: Input DataFrame
            quality_threshold: Rows are kept when ``validation_rate`` is
                strictly less than this value (default 90.0)

        Returns:
            DataFrame containing only the rows below the threshold
        """
        below_quality_bar = functions.col("validation_rate") < quality_threshold
        return df.filter(below_quality_bar)

    @staticmethod
    def build_temporal_anomaly_query(
        df: DataFrame,
        change_threshold: float = -10.0,
    ) -> DataFrame:
        """
        Find dates whose average validation rate dropped versus the prior date.

        The average ``validation_rate`` is computed per ``date``; each date is
        then compared with the preceding one (ordered by ``date``) and only the
        dates whose change is below ``change_threshold`` are returned.

        Args:
            df: Input DataFrame
            change_threshold: Upper bound (exclusive) for ``quality_change``;
                defaults to -10.0

        Returns:
            DataFrame with columns ``date``, ``daily_avg_validation_rate``,
            ``prev_avg_validation_rate`` and ``quality_change``, ordered by
            ``quality_change``
        """
        # Step 1: one row per date carrying that date's mean validation rate.
        per_day = df.transform(QueryBuilder.add_date_column).groupBy("date")
        daily_rates = per_day.agg(
            functions.avg("validation_rate").alias("daily_avg_validation_rate")
        ).orderBy("date")

        # Step 2: attach the previous date's rate using a lag over a
        # date-ordered window, then derive the day-over-day difference.
        by_date = Window.orderBy("date")
        previous_rate = functions.lag("daily_avg_validation_rate", 1).over(by_date)
        with_previous = daily_rates.withColumn(
            "prev_avg_validation_rate", previous_rate
        )
        with_change = with_previous.withColumn(
            "quality_change",
            functions.col("daily_avg_validation_rate")
            - functions.col("prev_avg_validation_rate"),
        )

        # Step 3: keep only the changes below the threshold.
        flagged = with_change.filter(
            functions.col("quality_change") < change_threshold
        )
        return flagged.orderBy("quality_change")

    @staticmethod
    def calculate_statistics(
        df: DataFrame,
        column: str,
    ) -> Dict[str, float]:
        """
        Compute average, standard deviation, minimum and maximum of a column.

        The four aggregates are evaluated in a single ``agg`` call and the
        resulting row is collected to the driver.

        Args:
            df: Input DataFrame
            column: Name of the column to summarize

        Returns:
            Dictionary with the keys ``avg``, ``stddev``, ``min`` and ``max``
        """
        # Label -> aggregate function, in the order the result keys are emitted.
        aggregators = (
            ("avg", functions.avg),
            ("stddev", functions.stddev),
            ("min", functions.min),
            ("max", functions.max),
        )

        summary_df = df.agg(
            *(aggregate(column).alias(label) for label, aggregate in aggregators)
        )
        summary_row = summary_df.collect()[0]

        return {label: summary_row[label] for label, _ in aggregators}

    @staticmethod
    def build_recent_performance_query(
        df: DataFrame,
        days: int = 7,
    ) -> DataFrame:
        """
        Build recent performance query.

        The input is narrowed with ``QueryBuilder.filter_by_date_range``, a
        ``date`` column is added through ``QueryBuilder.add_date_column``, and
        for every ``date`` the query computes ``daily_executions`` (row count),
        ``avg_execution_time`` and ``avg_validation_rate``. The result is
        ordered by ``date``.

        Args:
            df: Input DataFrame
            days: Number of recent days to analyze

        Returns:
            DataFrame with recent performance
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)

        # GroupedData.agg() only accepts aggregate columns positionally; the
        # previous ``agg(**aggs)`` call raises TypeError on PySpark. Each
        # expression already carries its output name through ``alias``.
        daily_aggs = [
            functions.count("*").alias("daily_executions"),
            functions.avg("execution_time").alias("avg_execution_time"),
            functions.avg("validation_rate").alias("avg_validation_rate"),
        ]

        return (
            filtered_df.transform(QueryBuilder.add_date_column)
            .groupBy("date")
            .agg(*daily_aggs)
            .orderBy("date")
        )

In [None]:
# Module: pipeline_builder.validation.utils (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder_base.validation

"""
Utility functions for the framework validation.

This module provides utility functions for data analysis and validation operations.

"""

from __future__ import annotations

from typing import Any, Dict

# Re-export safe_divide from base for backward compatibility
# from .validation import safe_divide  # noqa: F401  # Removed: defined in notebook cells above

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

def get_dataframe_info(df: DataFrame) -> Dict[str, Any]:
    """
    Summarize the size and structure of a DataFrame.

    This is a best-effort helper: any exception raised while inspecting the
    DataFrame is caught and reported through an ``error`` entry instead of
    being propagated.

    Args:
        df: DataFrame to inspect

    Returns:
        Dictionary with ``row_count``, ``column_count``, ``columns``,
        ``schema`` (string form) and ``is_empty``. On failure the same keys
        are present with empty/zero placeholder values plus an ``error`` key
        holding the exception message.
    """
    try:
        total_rows = df.count()
        column_names = df.columns
        info: Dict[str, Any] = {
            "row_count": total_rows,
            "column_count": len(column_names),
            "columns": column_names,
            "schema": str(df.schema),
            "is_empty": total_rows == 0,
        }
        return info
    except Exception as exc:
        # Report the failure in-band with neutral placeholder values.
        fallback: Dict[str, Any] = {
            "error": str(exc),
            "row_count": 0,
            "column_count": 0,
            "columns": [],
            "schema": "unknown",
            "is_empty": True,
        }
        return fallback

In [None]:
# Module: abstracts.runner (abstracts)
#
# Dependencies: abstracts.engine, abstracts.reports.run, abstracts.reports.transform, abstracts.reports.validation, abstracts.reports.write, abstracts.source, abstracts.step

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Union
# from .engine import Engine  # Removed: defined in notebook cells above
# from .reports.run import Report  # Removed: defined in notebook cells above
# from .reports.transform import TransformReport  # Removed: defined in notebook cells above
# from .reports.validation import ValidationReport  # Removed: defined in notebook cells above
# from .reports.write import WriteReport  # Removed: defined in notebook cells above
# from .source import Source  # Removed: defined in notebook cells above
# from .step import Step  # Removed: defined in notebook cells above

class StepRunner:
    """
    Iterator that runs pipeline steps one at a time through an engine.

    Callers populate ``bronze_sources`` (and, where needed, ``prior_silvers``)
    before running steps. Reports recorded during execution are kept in
    ``step_reports``, keyed by step name.

    Attributes:
        steps: Steps to execute, in order.
        engine: Engine used to validate, transform and write step data.
        bronze_sources: Sources for bronze steps, keyed by step name.
        prior_silvers: Sources produced by silver steps, keyed by step name.
        step_reports: Report recorded for each executed step.
    """

    def __init__(self, steps: List[Step], engine: Engine) -> None:
        self.steps = steps
        self.engine = engine
        self.bronze_sources: Dict[str, Source] = {}
        self.prior_silvers: Dict[str, Source] = {}
        self.step_reports: Dict[
            str, Union[ValidationReport, TransformReport, WriteReport]
        ] = {}
        # Initialised here as well as in __iter__ so that run_next_step() and
        # next() work on a runner that was never passed to iter(); previously
        # that raised AttributeError.
        self._current_step = 0

    def __iter__(self) -> StepRunner:
        """Restart iteration from the first step and return the runner itself."""
        self._current_step = 0
        return self

    def __next__(self) -> Union[ValidationReport, TransformReport, WriteReport]:
        """Run the next pending step, or raise StopIteration when none remain."""
        if self._current_step < len(self.steps):
            step = self.steps[self._current_step]
            # Advance before running so a failing step is not retried by the
            # following call.
            self._current_step += 1
            return self.run_step(step)
        raise StopIteration

    def run_next_step(self) -> Union[ValidationReport, TransformReport, WriteReport]:
        """Run the next pending step; raises StopIteration when all steps ran."""
        return next(self)

    def run_step(
        self, step: Step
    ) -> Union[ValidationReport, TransformReport, WriteReport]:
        """
        Execute a single step according to its ``type``.

        - ``bronze``: validates the source registered under the step name and
          returns (and records) the validation report.
        - ``silver``: transforms the source named by ``step.source``, validates
          the transformed data, registers it in ``prior_silvers`` under the step
          name, records the validation report, writes the data and returns the
          write report.
        - ``gold``: transforms the source named by ``step.source``, writes the
          result and returns (and records) the write report.

        Args:
            step: Step to execute.

        Returns:
            The report produced for the step.

        Raises:
            ValueError: If a required source is missing or the step type is
                unknown.
        """
        if step.type == "bronze":
            if step.name not in self.bronze_sources:
                raise ValueError(
                    f"Bronze source '{step.name}' not found in bronze_sources"
                )
            validation_report = self.engine.validate_source(
                step, self.bronze_sources[step.name]
            )
            self.step_reports[step.name] = validation_report
            return validation_report
        elif step.type == "silver":
            if step.source is None:
                raise ValueError(f"Silver step '{step.name}' requires a source")
            # NOTE(review): silver inputs are looked up in prior_silvers, not in
            # bronze_sources - confirm this is intended for bronze-fed silvers.
            if step.source not in self.prior_silvers:
                raise ValueError(f"Source '{step.source}' not found in prior_silvers")
            transform_report = self.engine.transform_source(
                step, self.prior_silvers[step.source]
            )
            validation_report = self.engine.validate_source(
                step, transform_report.source
            )
            self.prior_silvers[step.name] = transform_report.source
            self.step_reports[step.name] = validation_report
            write_report = self.engine.write_target(step, transform_report.source)
            return write_report
        elif step.type == "gold":
            if step.source is None:
                raise ValueError(f"Gold step '{step.name}' requires a source")
            if step.source not in self.prior_silvers:
                raise ValueError(f"Source '{step.source}' not found in prior_silvers")
            transform_report = self.engine.transform_source(
                step, self.prior_silvers[step.source]
            )
            write_report = self.engine.write_target(step, transform_report.source)
            self.step_reports[step.name] = write_report
            return write_report
        else:
            raise ValueError(f"Unknown step type: {step.type}")

class Runner(ABC):
    """
    Contract that every pipeline runner has to fulfil.

    Subclasses must implement ``run_initial_load`` and ``run_incremental``.
    They are free to offer further entry points (for example a full refresh
    or a validation-only run) on top of this abstract interface.
    """

    def __init__(self, steps: List[Step], engine: Engine) -> None:
        """Store the steps to run and the engine that executes them."""
        self.engine = engine
        self.steps = steps

    @abstractmethod
    def run_initial_load(
        self, bronze_sources: Optional[Dict[str, Source]] = None
    ) -> Report:
        """
        Execute the pipeline as an initial load.

        Args:
            bronze_sources: Optional mapping of bronze step name to its source
                data

        Returns:
            Report describing the execution
        """

    @abstractmethod
    def run_incremental(
        self, bronze_sources: Optional[Dict[str, Source]] = None
    ) -> Report:
        """
        Execute the pipeline incrementally.

        Args:
            bronze_sources: Optional mapping of bronze step name to its source
                data

        Returns:
            Report describing the execution
        """

In [None]:
# Module: pipeline_builder.pipeline.monitor (pipeline_builder)
#
# Dependencies: models.pipeline, pipeline.models, pipeline_builder.pipeline.models, pipeline_builder_base.logging, pipeline_builder_base.models

"""
Simplified pipeline monitoring for the framework.

This module provides basic monitoring and reporting for pipeline execution.

"""

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import PipelineMetrics  # Removed: defined in notebook cells above

# from .models import PipelineMode, PipelineReport, PipelineStatus  # Removed: defined in notebook cells above

class SimplePipelineMonitor:
    """
    Lightweight tracker for a single pipeline execution.

    The monitor keeps one in-progress ``PipelineReport``, counts successful
    and failed steps, collects error messages and stamps the final status and
    duration when the execution finishes.
    """

    def __init__(self, logger: Optional[PipelineLogger] = None):
        """Create a monitor, falling back to a default logger when none is given."""
        self.logger = logger or PipelineLogger()
        self._current_report: Optional[PipelineReport] = None

    def start_execution(
        self,
        pipeline_id: str,
        mode: PipelineMode,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> PipelineReport:
        """
        Begin tracking a pipeline run and return its fresh report.

        The report starts in the RUNNING state with zeroed counters; its total
        step count is the combined size of the three step mappings.
        """
        started_at = datetime.now()
        step_total = len(bronze_steps) + len(silver_steps) + len(gold_steps)

        report = PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=f"exec_{pipeline_id}",
            status=PipelineStatus.RUNNING,
            mode=mode,
            start_time=started_at,
            end_time=None,
            duration_seconds=0.0,
            metrics=PipelineMetrics(
                total_steps=step_total,
                successful_steps=0,
                failed_steps=0,
                total_duration=0.0,
            ),
            errors=[],
            warnings=[],
        )
        self._current_report = report

        self.logger.info(f"Started monitoring pipeline: {pipeline_id}")
        return self._current_report

    def update_step_execution(
        self,
        step_name: str,
        step_type: str,
        success: bool,
        duration: float,
        error_message: Optional[str] = None,
        rows_processed: int = 0,
        rows_written: int = 0,
    ) -> None:
        """
        Record the outcome of one step on the active report.

        Does nothing when no execution is being tracked. A failed step with a
        non-empty ``error_message`` also adds ``"<step_name>: <message>"`` to
        the report's errors. ``step_type``, ``rows_processed`` and
        ``rows_written`` are accepted but not used here.
        """
        report = self._current_report
        if not report:
            return

        metrics = report.metrics
        if not success:
            metrics.failed_steps += 1
            if error_message:
                report.errors.append(f"{step_name}: {error_message}")
        else:
            metrics.successful_steps += 1

        self.logger.debug(
            f"Updated step {step_name}: success={success}, duration={duration:.2f}s"
        )

    def finish_execution(self, success: bool) -> PipelineReport:
        """
        Close the active execution and return its completed report.

        Sets the end time, the elapsed seconds (on both the report and its
        metrics) and the final status.

        Raises:
            RuntimeError: If no execution is currently being tracked.
        """
        report = self._current_report
        if not report:
            raise RuntimeError("No active execution to finish")

        finished_at = datetime.now()
        elapsed_seconds = (finished_at - report.start_time).total_seconds()

        # Stamp the final state onto the report.
        report.end_time = finished_at
        report.duration_seconds = elapsed_seconds
        if success:
            report.status = PipelineStatus.COMPLETED
        else:
            report.status = PipelineStatus.FAILED
        report.metrics.total_duration = elapsed_seconds

        self.logger.info(f"Finished monitoring pipeline: {report.pipeline_id}")
        return report

# Backward-compatibility alias: PipelineMonitor is the same class object as
# SimplePipelineMonitor, so code written against the older name keeps working.
PipelineMonitor = SimplePipelineMonitor

In [None]:
# Module: pipeline_builder.validation.execution_validator (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.validation, pipeline_builder_base.logging

"""Execution validator service.

This module provides validation services that can be used during pipeline
execution to validate data according to step rules. The ExecutionValidator
separates validation logic from execution flow, making it composable and
testable.
"""

from __future__ import annotations

from typing import Any, Dict, Optional, Tuple
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..validation import apply_column_rules  # Removed: defined in notebook cells above

class ExecutionValidator:
    """Validation service used while a pipeline is executing.

    Keeps the validation concerns (materializing data, applying column rules,
    reading metrics off the resulting statistics) apart from the execution
    flow so they can be composed and tested on their own.

    Attributes:
        logger: PipelineLogger instance for logging.
        functions: FunctionsProtocol instance for PySpark operations; passed
            through to ``apply_column_rules``.

    Example:
        >>> from pipeline_builder.validation.execution_validator import ExecutionValidator
        >>> from pipeline_builder.functions import get_default_functions
        >>>
        >>> validator = ExecutionValidator(functions=get_default_functions())
        >>> valid_df, invalid_df, stats = validator.validate_step_output(
        ...     df=output_df,
        ...     step_name="clean_events",
        ...     rules={"status": [F.col("status").isNotNull()]}
        ... )
        >>> rate, invalid_count = validator.get_validation_metrics(stats)
    """

    def __init__(
        self,
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
    ):
        """Set up the validator.

        Args:
            logger: Logger to use; a default ``PipelineLogger`` is created
                when omitted.
            functions: Functions object handed to ``apply_column_rules``. May
                be None, in which case None is forwarded.
        """
        self.functions = functions
        self.logger = logger or PipelineLogger()

    def ensure_materialized_for_validation(
        self,
        df: DataFrame,
        rules: Dict[str, Any],
    ) -> DataFrame:
        """
        Materialize a DataFrame ahead of validation (best effort).

        Mock-spark's CTE optimization can fail when validation rules reference
        columns created by transforms (via withColumn). Caching and counting
        the DataFrame first makes those columns available to the rules.

        Args:
            df: DataFrame to materialize
            rules: Validation rules dictionary; when empty nothing is done

        Returns:
            The cached DataFrame when caching succeeded, otherwise the
            DataFrame that was passed in
        """
        materialized = df
        if rules:
            try:
                if hasattr(materialized, "cache"):
                    materialized = materialized.cache()
                # count() is an action, which forces evaluation.
                materialized.count()
            except Exception as exc:
                # Best effort only: the failure is logged at debug level and
                # validation continues with whatever DataFrame is at hand.
                self.logger.debug(
                    f"Could not materialize DataFrame before validation: {exc}"
                )
        return materialized

    def validate_step_output(
        self,
        df: DataFrame,
        step_name: str,
        rules: Dict[str, Any],
        stage: str = "pipeline",
    ) -> Tuple[DataFrame, DataFrame, Any]:
        """Split a step's output into valid and invalid rows.

        Args:
            df: DataFrame to validate.
            step_name: Name of the step being validated; forwarded to
                ``apply_column_rules``.
            rules: Dictionary mapping column names to lists of validation rules.
            stage: Stage name for validation context. Defaults to "pipeline".

        Returns:
            Tuple of (valid_df, invalid_df, validation_stats). When ``rules``
            is empty this is ``(df, df.limit(0), None)``; otherwise it is the
            three-part result of ``apply_column_rules``.

        Note:
            The DataFrame is materialized through
            ``ensure_materialized_for_validation`` before the rules are applied.
        """
        if rules:
            prepared_df = self.ensure_materialized_for_validation(df, rules)
            passed_df, failed_df, stats = apply_column_rules(
                prepared_df,
                rules,
                stage,
                step_name,
                functions=self.functions,
            )
            return passed_df, failed_df, stats

        # Nothing to check: every row is valid and there are no statistics.
        return df, df.limit(0), None

    def get_validation_metrics(
        self,
        validation_stats: Any,
    ) -> Tuple[float, int]:
        """
        Read the validation rate and invalid-row count from a stats object.

        Args:
            validation_stats: Validation statistics object, or None

        Returns:
            Tuple of (validation_rate, invalid_rows). Missing attributes, and
            a None stats object, yield the defaults 100.0 and 0.
        """
        if validation_stats is not None:
            return (
                getattr(validation_stats, "validation_rate", 100.0),
                getattr(validation_stats, "invalid_rows", 0),
            )
        return 100.0, 0

In [None]:
# Module: pipeline_builder.pipeline.step_factory (pipeline_builder)
#
# Dependencies: pipeline_builder.models, pipeline_builder.types, pipeline_builder_base.logging

"""Step factory for creating pipeline steps.

This module provides a factory for creating step instances, separating
step creation from pipeline building logic. The StepFactory centralizes
step creation, making it easier to modify step creation logic and test
pipeline building.
"""

from __future__ import annotations

from typing import Any, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from ..types import (  # Removed: defined in notebook cells above
    # ColumnRules,
    # GoldTransformFunction,
    # SilverTransformFunction,
    # StepName,
    # TableName,
# )

class StepFactory:
    """Single construction point for pipeline step objects.

    Keeps the creation of ``BronzeStep``, ``SilverStep`` and ``GoldStep``
    instances apart from pipeline-building logic. Each ``create_*`` method
    forwards its arguments, by keyword, to the matching step class.

    Attributes:
        logger: PipelineLogger instance for logging.

    Example:
        >>> from pipeline_builder.pipeline.step_factory import StepFactory
        >>> from pipeline_builder.functions import get_default_functions
        >>> F = get_default_functions()
        >>>
        >>> factory = StepFactory()
        >>> bronze = factory.create_bronze_step(
        ...     name="events",
        ...     rules={"id": [F.col("id").isNotNull()]},
        ...     incremental_col="timestamp"
        ... )
        >>> silver = factory.create_silver_step(
        ...     name="clean_events",
        ...     source_bronze="events",
        ...     transform=lambda spark, df, silvers: df.filter(F.col("status") == "active"),
        ...     rules={"status": [F.col("status").isNotNull()]},
        ...     table_name="clean_events"
        ... )
    """

    def __init__(self, logger: Optional[PipelineLogger] = None):
        """Set up the factory.

        Args:
            logger: Logger to use; a default ``PipelineLogger`` is created
                when omitted.
        """
        self.logger = logger or PipelineLogger()

    def create_bronze_step(
        self,
        name: StepName,
        rules: ColumnRules,
        incremental_col: Optional[str] = None,
        schema: Optional[str] = None,
    ) -> BronzeStep:
        """
        Construct a ``BronzeStep`` from the given settings.

        Args:
            name: Step name
            rules: Validation rules
            incremental_col: Optional incremental column name
            schema: Optional schema name

        Returns:
            The newly created BronzeStep
        """
        bronze_step = BronzeStep(
            name=name,
            rules=rules,
            incremental_col=incremental_col,
            schema=schema,
        )
        return bronze_step

    def create_silver_step(
        self,
        name: StepName,
        source_bronze: StepName,
        transform: SilverTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        schema: Optional[str] = None,
        source_incremental_col: Optional[str] = None,
        watermark_col: Optional[str] = None,
        schema_override: Optional[Any] = None,
    ) -> SilverStep:
        """
        Construct a ``SilverStep`` from the given settings.

        Args:
            name: Step name
            source_bronze: Name of the bronze step this silver step reads from
            transform: Transform function
            rules: Validation rules
            table_name: Target table name
            schema: Optional schema name
            source_incremental_col: Optional source incremental column
            watermark_col: Optional watermark column
            schema_override: Optional schema override

        Returns:
            The newly created SilverStep
        """
        silver_step = SilverStep(
            name=name,
            source_bronze=source_bronze,
            transform=transform,
            rules=rules,
            table_name=table_name,
            schema=schema,
            source_incremental_col=source_incremental_col,
            watermark_col=watermark_col,
            schema_override=schema_override,
        )
        return silver_step

    def create_gold_step(
        self,
        name: StepName,
        transform: GoldTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        source_silvers: Optional[list[StepName]] = None,
        schema: Optional[str] = None,
        schema_override: Optional[Any] = None,
    ) -> GoldStep:
        """
        Construct a ``GoldStep`` from the given settings.

        Args:
            name: Step name
            transform: Transform function
            rules: Validation rules
            table_name: Target table name
            source_silvers: Optional list of source silver step names
            schema: Optional schema name
            schema_override: Optional schema override

        Returns:
            The newly created GoldStep
        """
        gold_step = GoldStep(
            name=name,
            transform=transform,
            rules=rules,
            table_name=table_name,
            source_silvers=source_silvers,
            schema=schema,
            schema_override=schema_override,
        )
        return gold_step

In [None]:
# Module: pipeline_builder.models.base (pipeline_builder)
#
# Dependencies: pipeline_builder.errors, pipeline_builder.models.enums, pipeline_builder.models.types

"""
Base classes and configuration models for the Pipeline Builder.

This module provides the foundational model classes that all pipeline components
inherit from, including base validation, serialization, and configuration models.

Key Components:
    - **BaseModel**: Abstract base class for all pipeline models with common
      functionality for validation, serialization, and representation
    - **ValidationThresholds**: Configuration for validation thresholds across
      pipeline phases (Bronze, Silver, Gold)

Dependencies:
    - errors: Pipeline validation and error handling
    - models.enums: Pipeline phase enumerations
    - models.types: Type definitions and protocols

Example:
    >>> from pipeline_builder.models.base import BaseModel, ValidationThresholds
    >>> from dataclasses import dataclass
    >>>
    >>> @dataclass
    ... class MyStep(BaseModel):
    ...     name: str
    ...     value: int
    ...
    ...     def validate(self) -> None:
    ...         if not self.name:
    ...             raise ValueError("Name required")
    ...         if self.value < 0:
    ...             raise ValueError("Value must be non-negative")
    >>>
    >>> step = MyStep(name="test", value=42)
    >>> step.validate()
    >>> print(step.to_json())
"""

from __future__ import annotations

import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .enums import PipelinePhase  # Removed: defined in notebook cells above
# from .types import ModelValue  # Removed: defined in notebook cells above

@dataclass
class BaseModel(ABC):
    """
    Base class for all pipeline models with common functionality.

    Subclasses are expected to be dataclasses and must implement
    ``validate``. In return they get dictionary conversion, JSON
    serialization and a readable string form.

    Features:
    - Abstract ``validate`` hook that every model has to implement
    - Dictionary conversion via ``to_dict``
    - JSON serialization via ``to_json``
    - String representation for debugging

    Example:
        >>> @dataclass
        ... class MyStep(BaseModel):
        ...     name: str
        ...     rules: Dict[str, List[ColumnRule]]
        ...
        ...     def validate(self) -> None:
        ...         if not self.name:
        ...             raise ValueError("Name cannot be empty")
        ...         if not self.rules:
        ...             raise ValueError("Rules cannot be empty")
        >>>
        >>> step = MyStep(name="test", rules={"id": [F.col("id").isNotNull()]})
        >>> step.validate()
        >>> print(step.to_json())
    """

    @abstractmethod
    def validate(self) -> None:
        """Validate the model.

        This method must be implemented by all subclasses to ensure model
        integrity. It should raise appropriate exceptions if validation fails.

        Raises:
            ValidationError: If the model is invalid. Subclasses should raise
                specific error types (e.g., PipelineValidationError).

        Example:
            >>> @dataclass
            ... class MyModel(BaseModel):
            ...     name: str
            ...
            ...     def validate(self) -> None:
            ...         if not self.name:
            ...             raise ValueError("Name cannot be empty")
            >>>
            >>> model = MyModel(name="test")
            >>> model.validate()  # Passes
        """
        pass

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert model to dictionary.

        Every dataclass field becomes one entry. A field value that itself
        exposes a ``to_dict`` method is replaced by the result of calling it;
        any other value (including lists or dicts that contain models) is
        stored unchanged.

        Returns:
            Dictionary mapping field names to field values.

        Example:
            >>> step = BronzeStep(name="test", rules={"id": [F.col("id").isNotNull()]})
            >>> step_dict = step.to_dict()
            >>> print(step_dict["name"])  # "test"
        """
        # Imported locally so this block does not depend on the file-level
        # import list providing ``fields``.
        from dataclasses import fields as dataclass_fields

        result: Dict[str, ModelValue] = {}
        # dataclasses.fields() yields real instance fields only. The private
        # __dataclass_fields__ mapping also lists InitVar/ClassVar
        # pseudo-fields; an InitVar has no instance attribute, so reading it
        # raised AttributeError.
        for field_info in dataclass_fields(self):
            value = getattr(self, field_info.name)
            if hasattr(value, "to_dict"):
                result[field_info.name] = value.to_dict()
            else:
                result[field_info.name] = value
        return result

    def to_json(self) -> str:
        """Convert model to JSON string.

        Serializes the output of ``to_dict``. Values that the JSON encoder
        cannot handle natively are converted with ``str``.

        Returns:
            JSON string representation of the model, formatted with 2-space
            indentation.

        Example:
            >>> step = BronzeStep(name="test", rules={"id": [F.col("id").isNotNull()]})
            >>> json_str = step.to_json()
            >>> print(json_str)
            {
              "name": "test",
              "rules": {...}
            }
        """
        return json.dumps(self.to_dict(), default=str, indent=2)

    def __str__(self) -> str:
        """String representation of the model.

        Returns:
            The class name followed by comma-separated ``field=value`` pairs
            taken from ``to_dict``.

        Example:
            >>> step = BronzeStep(name="test", rules={"id": [F.col("id").isNotNull()]})
            >>> print(str(step))
            BronzeStep(name=test, rules={'id': [...]})
        """
        rendered_fields = ", ".join(f"{k}={v}" for k, v in self.to_dict().items())
        return f"{self.__class__.__name__}({rendered_fields})"

@dataclass
class ValidationThresholds(BaseModel):
    """Per-layer minimum validation pass rates for a Medallion pipeline.

    Holds one percentage (0-100) for each of the Bronze, Silver and Gold
    phases. The fields themselves have no defaults; use one of the
    ``create_*`` factories to obtain a ready-made preset, and call
    ``validate`` to enforce the allowed range.

    Attributes:
        bronze: Minimum pass rate for the Bronze layer, as a percentage.
        silver: Minimum pass rate for the Silver layer, as a percentage.
        gold: Minimum pass rate for the Gold layer, as a percentage.

    Raises:
        PipelineValidationError: From ``validate``, when any threshold is
            not within 0-100 (inclusive).

    Example:
        >>> # Preset thresholds
        >>> thresholds = ValidationThresholds.create_default()
        >>> print(f"Bronze: {thresholds.bronze}%")  # Bronze: 95.0%
        >>>
        >>> # Hand-picked thresholds
        >>> thresholds = ValidationThresholds(
        ...     bronze=90.0,
        ...     silver=95.0,
        ...     gold=99.0
        ... )
        >>> thresholds.validate()
        >>>
        >>> # Look up the threshold of one phase
        >>> from pipeline_builder.models.enums import PipelinePhase
        >>> bronze_threshold = thresholds.get_threshold(PipelinePhase.BRONZE)
    """

    bronze: float
    silver: float
    gold: float

    def validate(self) -> None:
        """Check that every threshold is a percentage between 0 and 100.

        The layers are checked in the order bronze, silver, gold and the
        first offending one is reported.

        Raises:
            PipelineValidationError: If a threshold does not satisfy
                ``0 <= threshold <= 100``.

        Example:
            >>> ValidationThresholds(bronze=95.0, silver=98.0, gold=99.0).validate()
            >>>
            >>> bad = ValidationThresholds(bronze=150.0, silver=98.0, gold=99.0)
            >>> bad.validate()  # Raises PipelineValidationError
        """
        for layer in ("bronze", "silver", "gold"):
            rate = getattr(self, layer)
            if 0 <= rate <= 100:
                continue
            raise PipelineValidationError(
                f"{layer} threshold must be between 0 and 100, got {rate}"
            )

    def get_threshold(self, phase: PipelinePhase) -> float:
        """Return the threshold that belongs to ``phase``.

        Args:
            phase: One of ``PipelinePhase.BRONZE``, ``PipelinePhase.SILVER``
                or ``PipelinePhase.GOLD``.

        Returns:
            The value of the matching field.

        Raises:
            KeyError: If ``phase`` is not one of the three phases above.

        Example:
            >>> thresholds = ValidationThresholds(bronze=95.0, silver=98.0, gold=99.0)
            >>> from pipeline_builder.models.enums import PipelinePhase
            >>> print(thresholds.get_threshold(PipelinePhase.BRONZE))  # 95.0
        """
        # Resolve the phase to a field name first, then read that field.
        field_by_phase = {
            PipelinePhase.BRONZE: "bronze",
            PipelinePhase.SILVER: "silver",
            PipelinePhase.GOLD: "gold",
        }
        return getattr(self, field_by_phase[phase])

    @classmethod
    def create_default(cls) -> ValidationThresholds:
        """Build the standard preset: bronze 95.0, silver 98.0, gold 99.0.

        Returns:
            A new instance carrying the standard values.

        Example:
            >>> thresholds = ValidationThresholds.create_default()
            >>> print(f"Bronze: {thresholds.bronze}%")  # Bronze: 95.0%
        """
        preset = {"bronze": 95.0, "silver": 98.0, "gold": 99.0}
        return cls(**preset)

    @classmethod
    def create_strict(cls) -> ValidationThresholds:
        """Build the strict preset: bronze 99.0, silver 99.5, gold 99.9.

        Intended for pipelines where data quality is critical and rejecting
        more rows is acceptable.

        Returns:
            A new instance carrying the strict values.

        Example:
            >>> thresholds = ValidationThresholds.create_strict()
            >>> print(f"Gold: {thresholds.gold}%")  # Gold: 99.9%
        """
        preset = {"bronze": 99.0, "silver": 99.5, "gold": 99.9}
        return cls(**preset)

    @classmethod
    def create_loose(cls) -> ValidationThresholds:
        """Build the permissive preset: bronze 80.0, silver 85.0, gold 90.0.

        Intended for development, testing, or noisy data sources.

        Returns:
            A new instance carrying the loose values.

        Example:
            >>> thresholds = ValidationThresholds.create_loose()
            >>> print(f"Bronze: {thresholds.bronze}%")  # Bronze: 80.0%
        """
        preset = {"bronze": 80.0, "silver": 85.0, "gold": 90.0}
        return cls(**preset)

In [None]:
# Module: pipeline_builder.writer.analytics (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.logging, pipeline_builder.writer.exceptions, pipeline_builder.writer.query_builder, pipeline_builder_base.logging

"""
Writer analytics module for data quality and trend analysis.

This module provides comprehensive analytics capabilities for analyzing
pipeline execution data, detecting trends, and generating insights.

"""

from __future__ import annotations

from datetime import datetime, timedelta
from typing import Dict, Literal, Optional, TypedDict, Union, cast

# from ..compat import DataFrame, F, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from .exceptions import WriterError  # Removed: defined in notebook cells above
# from .query_builder import QueryBuilder  # Removed: defined in notebook cells above

# Short alias for F.col, used when building column expressions in this module
col = F.col

# ============================================================================
# TypedDict Definitions
# ============================================================================

class AnalysisPeriod(TypedDict):
    """Date window covered by a trend analysis.

    The analyzers in this module derive it from the current time and the
    requested ``days`` look-back.
    """

    # First day of the window, formatted "%Y-%m-%d" (end date minus ``days``)
    start_date: str
    # Last day of the window, formatted "%Y-%m-%d" (when the analysis ran)
    end_date: str
    # The ``days`` argument the analysis was called with
    days_analyzed: int

class DailyQualityTrend(TypedDict):
    """Validation-rate summary for one day (one entry per quality-trends query row)."""

    # Day the entry describes, formatted "%Y-%m-%d"
    date: str
    # Copied unchanged from the query row
    total_executions: int
    # Validation-rate statistics for the day, each rounded to 2 decimals
    avg_validation_rate: float
    min_validation_rate: float
    max_validation_rate: float
    stddev_validation_rate: float
    # Copied unchanged from the query row
    high_quality_executions: int
    low_quality_executions: int
    # Letter grade "A"-"D" derived from avg_validation_rate
    quality_score: str

class OverallQualityMetrics(TypedDict):
    """Validation-rate statistics over the whole analysis window."""

    # Copied unchanged from the overall-metrics query row
    total_executions: int
    # Taken from the query's ``overall_*_validation_rate`` columns,
    # each rounded to 2 decimals
    avg_validation_rate: float
    min_validation_rate: float
    max_validation_rate: float
    stddev_validation_rate: float

class DegradationAlert(TypedDict):
    """Alert raised when recent quality falls below the historical level."""

    # Alert category; the quality analyzer in this module emits "quality_degradation"
    type: str
    # Human-readable description of the alert
    message: str
    # The quality analyzer emits "high" when the latest rate is more than 10
    # points under the historical average and "medium" otherwise
    severity: Literal["high", "medium", "low"]

class QualityTrends(TypedDict):
    """Result of ``DataQualityAnalyzer.analyze_quality_trends``."""

    # Date window the analysis covers
    analysis_period: AnalysisPeriod
    # One entry per row returned by the quality-trends query
    daily_trends: list[DailyQualityTrend]
    # Statistics over the whole window
    overall_metrics: OverallQualityMetrics
    # Empty when no degradation was detected
    degradation_alerts: list[DegradationAlert]
    # Letter grade "A"-"D" derived from the overall average validation rate
    quality_grade: str

class ValidationAnomaly(TypedDict):
    """Single log record flagged by the validation-rate anomaly query."""

    # Copied from the record's ``step`` and ``phase`` columns
    step: str
    phase: str
    # The record's validation rate, rounded to 2 decimals
    validation_rate: float
    # Copied unchanged from the record
    valid_rows: int
    invalid_rows: int
    # The record's ``created_at`` value, formatted "%Y-%m-%d %H:%M:%S"
    timestamp: str

class StepAnomaly(TypedDict):
    """Per-step aggregate flagged as anomalous.

    A step is flagged when its average validation rate is below 90.0 or the
    standard deviation of its validation rate is above 10.0.
    """

    # Step name (the grouping key)
    step: str
    # Copied unchanged from the aggregated row
    execution_count: int
    # Aggregated validation-rate statistics, each rounded to 2 decimals
    avg_validation_rate: float
    min_validation_rate: float
    stddev_validation_rate: float
    # (100 - average rate) + 2 * stddev, capped at 100 and rounded to 2 decimals
    anomaly_score: float

class TemporalAnomaly(TypedDict):
    """Day flagged by the temporal anomaly query."""

    # Day the entry describes, formatted "%Y-%m-%d"
    date: str
    # Taken from the query row's columns of the same name, rounded to 2 decimals
    daily_avg_validation_rate: float
    prev_avg_validation_rate: float
    quality_change: float

class AnomalySummary(TypedDict):
    """Counts of detected anomalies plus a combined score."""

    # Number of entries in the corresponding anomaly lists
    total_validation_anomalies: int
    total_step_anomalies: int
    total_temporal_anomalies: int
    # Weighted sum of the three counts (weights 1.0 / 0.8 / 1.2), capped at 100
    overall_anomaly_score: float

class QualityAnomalies(TypedDict):
    """Result of ``DataQualityAnalyzer.detect_quality_anomalies``."""

    # Individual records whose validation rate fell below the anomaly threshold
    validation_anomalies: list[ValidationAnomaly]
    # Steps whose aggregated validation statistics look anomalous
    step_anomalies: list[StepAnomaly]
    # Days returned by the temporal anomaly query
    temporal_anomalies: list[TemporalAnomaly]
    # Counts of the three lists above and a combined score
    anomaly_summary: AnomalySummary

class VolumeTrendPoint(TypedDict):
    """Execution volume for one day (one entry per daily-trends query row)."""

    # Day the entry describes, formatted "%Y-%m-%d"
    date: str
    # Copied unchanged from the query row
    daily_executions: int
    successful_executions: int
    failed_executions: int
    # successful_executions / daily_executions as a percentage, rounded to
    # 2 decimals; zero when the day has no executions
    success_rate: float
    # Rounded to 2 decimals
    avg_execution_time: float
    # Copied unchanged from the query row
    total_rows_written: int

class PhaseTrendPoint(TypedDict):
    """Aggregated execution figures for one phase (one entry per phase-trends query row)."""

    # Copied unchanged from the query row
    phase: str
    execution_count: int
    # Rounded to 2 decimals
    avg_execution_time: float
    avg_validation_rate: float
    # Copied unchanged from the query row
    total_rows_written: int
    # Row's successful_executions / execution_count as a percentage,
    # rounded to 2 decimals
    success_rate: float

class StepTrendPoint(TypedDict):
    """Aggregated execution figures for one step (one entry per step-trends query row)."""

    # Copied unchanged from the query row
    step: str
    execution_count: int
    # Execution-time and validation-rate statistics, each rounded to 2 decimals
    avg_execution_time: float
    avg_validation_rate: float
    stddev_execution_time: float
    min_execution_time: float
    max_execution_time: float
    # Letter grade "A"-"D" derived from the average and the standard
    # deviation of the execution time
    performance_grade: str

class TrendIndicators(TypedDict):
    """Comparison of the latest day against the average of the earlier days.

    With fewer than two days of data both trend fields are
    "insufficient_data" and all numeric fields are zero.
    """

    # "increasing" / "decreasing" when the latest day's executions are more
    # than 10% above / below the historical average, otherwise "stable"
    execution_volume_trend: str
    # "improving" / "declining" when the latest success rate is more than
    # 2 points above / below the historical rate, otherwise "stable"
    success_rate_trend: str
    # Executions on the latest day
    recent_executions: int
    # Mean daily executions over the earlier days, rounded to 2 decimals
    historical_avg_executions: float
    # Success rate (percentage) of the latest day, rounded to 2 decimals
    recent_success_rate: float
    # Mean daily success rate (percentage) over the earlier days that had
    # executions, rounded to 2 decimals
    historical_success_rate: float

class ExecutionTrends(TypedDict):
    """Result of ``TrendAnalyzer.analyze_execution_trends``."""

    # Date window the analysis covers
    analysis_period: AnalysisPeriod
    # One entry per row of the daily-trends query
    volume_trends: list[VolumeTrendPoint]
    # One entry per row of the phase-trends query
    phase_trends: list[PhaseTrendPoint]
    # One entry per row of the step-trends query
    step_trends: list[StepTrendPoint]
    # Latest day compared against the earlier days
    trend_indicators: TrendIndicators

class DataQualityAnalyzer:
    """Analyzes data quality metrics and trends.

    Query construction is delegated to ``QueryBuilder``; this class collects
    the query results and shapes them into plain dictionaries
    (``QualityTrends`` / ``QualityAnomalies``).
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the data quality analyzer.

        Args:
            spark: Spark session; stored on the instance.
            logger: Logger to use. When omitted, a ``PipelineLogger`` named
                ``"DataQualityAnalyzer"`` is created.
        """
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("DataQualityAnalyzer")
        else:
            self.logger = logger

    @staticmethod
    def _stddev_or_zero(value: Optional[float]) -> float:
        """Return ``value``, or 0.0 when the standard deviation is undefined.

        Spark's sample standard deviation is NULL for a group with a single
        row, which reaches Python as ``None``. Treating it as "no spread"
        keeps one-execution days/steps from aborting the whole analysis.
        NOTE(review): assumes QueryBuilder does not already coalesce NULLs.
        """
        return 0.0 if value is None else value

    def analyze_quality_trends(
        self,
        df: DataFrame,
        days: int = 30,
    ) -> QualityTrends:
        """
        Analyze data quality trends over time.

        Args:
            df: DataFrame containing log data
            days: Number of days to analyze

        Returns:
            Dictionary containing the analysis period, per-day trends,
            overall metrics, degradation alerts and an overall quality grade

        Raises:
            WriterError: If any step of the analysis fails; the original
                exception is chained as the cause.
        """
        try:
            self.logger.info(f"Analyzing data quality trends for last {days} days")

            # Use query builder for quality trends
            quality_trends_df = QueryBuilder.build_quality_trends_query(df, days)
            quality_trends = quality_trends_df.collect()

            # Use query builder for overall metrics
            overall_metrics_df = QueryBuilder.build_overall_metrics_query(df, days)
            overall_metrics = overall_metrics_df.collect()[0]

            # Detect quality degradation: compare the last row against the
            # mean of all earlier rows.
            # NOTE(review): assumes the query returns rows in ascending date
            # order so that the last row is the most recent day - confirm.
            degradation_alerts = []
            if len(quality_trends) > 1:
                recent_avg = quality_trends[-1]["avg_validation_rate"]
                earlier_rows = quality_trends[:-1]
                historical_avg = sum(
                    row["avg_validation_rate"] for row in earlier_rows
                ) / len(earlier_rows)

                # Alert once the drop exceeds 5 points; more than 10 is "high"
                if recent_avg < historical_avg - 5.0:
                    degradation_alerts.append(
                        {
                            "type": "quality_degradation",
                            "message": f"Recent validation rate ({recent_avg:.1f}%) is significantly lower than historical average ({historical_avg:.1f}%)",
                            "severity": (
                                "high"
                                if recent_avg < historical_avg - 10.0
                                else "medium"
                            ),
                        }
                    )

            # Get date range for analysis period
            end_date = datetime.now()
            start_date = end_date - timedelta(days=days)

            analysis_result = {
                "analysis_period": {
                    "start_date": start_date.strftime("%Y-%m-%d"),
                    "end_date": end_date.strftime("%Y-%m-%d"),
                    "days_analyzed": days,
                },
                "daily_trends": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "total_executions": row["total_executions"],
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "min_validation_rate": round(row["min_validation_rate"], 2),
                        "max_validation_rate": round(row["max_validation_rate"], 2),
                        "stddev_validation_rate": round(
                            self._stddev_or_zero(row["stddev_validation_rate"]), 2
                        ),
                        "high_quality_executions": row["high_quality_executions"],
                        "low_quality_executions": row["low_quality_executions"],
                        "quality_score": self._calculate_quality_score(row.asDict()),
                    }
                    for row in quality_trends
                ],
                "overall_metrics": {
                    "total_executions": overall_metrics["total_executions"],
                    "avg_validation_rate": round(
                        overall_metrics["overall_avg_validation_rate"], 2
                    ),
                    "min_validation_rate": round(
                        overall_metrics["overall_min_validation_rate"], 2
                    ),
                    "max_validation_rate": round(
                        overall_metrics["overall_max_validation_rate"], 2
                    ),
                    "stddev_validation_rate": round(
                        self._stddev_or_zero(
                            overall_metrics["overall_stddev_validation_rate"]
                        ),
                        2,
                    ),
                },
                "degradation_alerts": degradation_alerts,
                "quality_grade": self._calculate_quality_grade(
                    overall_metrics["overall_avg_validation_rate"]
                ),
            }

            self.logger.info("Data quality trends analysis completed")
            return cast(QualityTrends, analysis_result)

        except Exception as e:
            self.logger.error(f"Failed to analyze quality trends: {e}")
            raise WriterError(f"Failed to analyze quality trends: {e}") from e

    def detect_quality_anomalies(self, df: DataFrame) -> QualityAnomalies:
        """
        Detect data quality anomalies.

        Three kinds of anomaly are reported: individual records whose
        validation rate is below ``avg - 2 * stddev``, steps whose average
        rate is below 90.0 or whose rate stddev is above 10.0, and the days
        returned by the temporal anomaly query.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing anomaly detection results

        Raises:
            WriterError: If any step of the detection fails; the original
                exception is chained as the cause.
        """
        try:
            self.logger.info("Detecting data quality anomalies")

            # Calculate overall statistics for anomaly detection
            overall_stats = QueryBuilder.calculate_statistics(df, "validation_rate")
            threshold = overall_stats["avg"] - (
                2 * self._stddev_or_zero(overall_stats["stddev"])
            )

            # Detect validation rate anomalies using query builder
            validation_anomalies_df = (
                QueryBuilder.build_anomaly_detection_query(
                    df, "validation_rate", threshold
                )
                .select(
                    "step",
                    "phase",
                    "validation_rate",
                    "valid_rows",
                    "invalid_rows",
                    "created_at",
                )
                .orderBy("validation_rate")
            )

            validation_anomalies = validation_anomalies_df.collect()

            # Detect step-specific anomalies using query builder
            # NOTE(review): PySpark's GroupedData.agg is documented as taking
            # positional expressions only; confirm the engine in use accepts
            # this keyword expansion.
            step_anomalies_df = (
                df.groupBy("step")
                .agg(**QueryBuilder.get_performance_aggregations())
                .filter(
                    (col("avg_validation_rate") < 90.0)
                    | (col("stddev_validation_rate") > 10.0)
                )
                .orderBy("avg_validation_rate")
            )

            step_anomalies = step_anomalies_df.collect()

            # Detect temporal anomalies using query builder
            temporal_anomalies_df = QueryBuilder.build_temporal_anomaly_query(df)
            temporal_anomalies = temporal_anomalies_df.collect()

            anomaly_result = {
                "validation_anomalies": [
                    {
                        "step": row["step"],
                        "phase": row["phase"],
                        "validation_rate": round(row["validation_rate"], 2),
                        "valid_rows": row["valid_rows"],
                        "invalid_rows": row["invalid_rows"],
                        "timestamp": row["created_at"].strftime("%Y-%m-%d %H:%M:%S"),
                    }
                    for row in validation_anomalies
                ],
                "step_anomalies": [
                    {
                        "step": row["step"],
                        "execution_count": row["execution_count"],
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "min_validation_rate": round(row["min_validation_rate"], 2),
                        "stddev_validation_rate": round(
                            self._stddev_or_zero(row["stddev_validation_rate"]), 2
                        ),
                        "anomaly_score": self._calculate_anomaly_score(row.asDict()),
                    }
                    for row in step_anomalies
                ],
                "temporal_anomalies": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_avg_validation_rate": round(
                            row["daily_avg_validation_rate"], 2
                        ),
                        "prev_avg_validation_rate": round(
                            row["prev_avg_validation_rate"], 2
                        ),
                        "quality_change": round(row["quality_change"], 2),
                    }
                    for row in temporal_anomalies
                ],
                "anomaly_summary": {
                    "total_validation_anomalies": len(validation_anomalies),
                    "total_step_anomalies": len(step_anomalies),
                    "total_temporal_anomalies": len(temporal_anomalies),
                    "overall_anomaly_score": self._calculate_overall_anomaly_score(
                        len(validation_anomalies),
                        len(step_anomalies),
                        len(temporal_anomalies),
                    ),
                },
            }

            self.logger.info(
                f"Quality anomaly detection completed: {len(validation_anomalies)} validation anomalies found"
            )
            return cast(QualityAnomalies, anomaly_result)

        except Exception as e:
            self.logger.error(f"Failed to detect quality anomalies: {e}")
            raise WriterError(f"Failed to detect quality anomalies: {e}") from e

    def _calculate_quality_score(self, row: Dict[str, Union[int, float]]) -> str:
        """Return the letter grade for a row's ``avg_validation_rate``.

        Uses the same grade ladder as ``_calculate_quality_grade`` so daily
        scores and the overall grade cannot drift apart.
        """
        return self._calculate_quality_grade(row["avg_validation_rate"])

    def _calculate_quality_grade(self, avg_validation_rate: float) -> str:
        """Map an average validation rate to a letter grade.

        Returns:
            "A" for >= 95.0, "B" for >= 90.0, "C" for >= 80.0, otherwise "D".
        """
        if avg_validation_rate >= 95.0:
            return "A"
        elif avg_validation_rate >= 90.0:
            return "B"
        elif avg_validation_rate >= 80.0:
            return "C"
        else:
            return "D"

    def _calculate_anomaly_score(self, row: Dict[str, Union[int, float]]) -> float:
        """Calculate the anomaly score for a step.

        Args:
            row: Mapping with ``avg_validation_rate`` and
                ``stddev_validation_rate``; a ``None`` stddev counts as 0.0.

        Returns:
            ``(100 - avg) + 2 * stddev``, capped at 100.0 and rounded to
            2 decimals.
        """
        avg_rate = row["avg_validation_rate"]
        stddev_rate = self._stddev_or_zero(row["stddev_validation_rate"])

        # Lower average rate and higher standard deviation = higher anomaly score
        anomaly_score = (100 - avg_rate) + (stddev_rate * 2)
        return float(round(min(anomaly_score, 100.0), 2))

    def _calculate_overall_anomaly_score(
        self, validation_anomalies: int, step_anomalies: int, temporal_anomalies: int
    ) -> float:
        """Combine the three anomaly counts into one score.

        Returns:
            0.0 when there are no anomalies; otherwise the weighted sum of
            the counts (1.0 / 0.8 / 1.2), capped at 100.0 and rounded to
            2 decimals.
        """
        total_anomalies = validation_anomalies + step_anomalies + temporal_anomalies

        if total_anomalies == 0:
            return 0.0

        # Weight different types of anomalies
        weighted_score = (
            (validation_anomalies * 1.0)
            + (step_anomalies * 0.8)
            + (temporal_anomalies * 1.2)
        )
        return round(min(weighted_score, 100.0), 2)

class TrendAnalyzer:
    """Analyzes execution trends and patterns.

    Query construction is delegated to ``QueryBuilder``; this class collects
    the query results and shapes them into an ``ExecutionTrends`` dictionary.
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the trend analyzer.

        Args:
            spark: Spark session; stored on the instance.
            logger: Logger to use. When omitted, a ``PipelineLogger`` named
                ``"TrendAnalyzer"`` is created.
        """
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("TrendAnalyzer")
        else:
            self.logger = logger

    @staticmethod
    def _stddev_or_zero(value: Optional[float]) -> float:
        """Return ``value``, or 0.0 when the standard deviation is undefined.

        Spark's sample standard deviation is NULL for a group with a single
        row, which reaches Python as ``None``. Treating it as "no spread"
        keeps steps that ran only once from aborting the whole analysis.
        NOTE(review): assumes QueryBuilder does not already coalesce NULLs.
        """
        return 0.0 if value is None else value

    @staticmethod
    def _success_rate(
        successful: Union[int, float], total: Union[int, float]
    ) -> float:
        """Return ``successful / total`` as a percentage, or 0.0 when ``total`` is not positive."""
        if total > 0:
            return (successful / total) * 100
        return 0.0

    def analyze_execution_trends(
        self,
        df: DataFrame,
        days: int = 30,
    ) -> ExecutionTrends:
        """
        Analyze execution trends over time.

        Args:
            df: DataFrame containing log data
            days: Number of days to analyze

        Returns:
            Dictionary containing the analysis period, daily volume trends,
            per-phase trends, per-step trends and trend indicators

        Raises:
            WriterError: If any step of the analysis fails; the original
                exception is chained as the cause.
        """
        try:
            self.logger.info(f"Analyzing execution trends for last {days} days")

            # Use query builder for all trend analyses
            volume_trends_df = QueryBuilder.build_daily_trends_query(df, days)
            volume_trends = volume_trends_df.collect()

            phase_trends_df = QueryBuilder.build_phase_trends_query(df, days)
            phase_trends = phase_trends_df.collect()

            step_trends_df = QueryBuilder.build_step_trends_query(df, days)
            step_trends = step_trends_df.collect()

            # Calculate trend indicators
            trend_indicators = self._calculate_trend_indicators(
                [row.asDict() for row in volume_trends]
            )

            # Get date range for analysis period
            end_date = datetime.now()
            start_date = end_date - timedelta(days=days)

            analysis_result = {
                "analysis_period": {
                    "start_date": start_date.strftime("%Y-%m-%d"),
                    "end_date": end_date.strftime("%Y-%m-%d"),
                    "days_analyzed": days,
                },
                "volume_trends": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_executions": row["daily_executions"],
                        "successful_executions": row["successful_executions"],
                        "failed_executions": row["failed_executions"],
                        "success_rate": round(
                            self._success_rate(
                                row["successful_executions"],
                                row["daily_executions"],
                            ),
                            2,
                        ),
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "total_rows_written": row["total_rows_written"],
                    }
                    for row in volume_trends
                ],
                "phase_trends": [
                    {
                        "phase": row["phase"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "total_rows_written": row["total_rows_written"],
                        "success_rate": round(
                            self._success_rate(
                                row["successful_executions"],
                                row["execution_count"],
                            ),
                            2,
                        ),
                    }
                    for row in phase_trends
                ],
                "step_trends": [
                    {
                        "step": row["step"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "stddev_execution_time": round(
                            self._stddev_or_zero(row["stddev_execution_time"]), 2
                        ),
                        "min_execution_time": round(row["min_execution_time"], 2),
                        "max_execution_time": round(row["max_execution_time"], 2),
                        "performance_grade": self._calculate_performance_grade(
                            row.asDict()
                        ),
                    }
                    for row in step_trends
                ],
                "trend_indicators": trend_indicators,
            }

            self.logger.info("Execution trends analysis completed")
            return cast(ExecutionTrends, analysis_result)

        except Exception as e:
            self.logger.error(f"Failed to analyze execution trends: {e}")
            raise WriterError(f"Failed to analyze execution trends: {e}") from e

    def _calculate_trend_indicators(
        self, volume_trends: list[Dict[str, Union[int, float]]]
    ) -> TrendIndicators:
        """Compare the last entry of ``volume_trends`` with the earlier ones.

        Args:
            volume_trends: Daily rows with ``daily_executions`` and
                ``successful_executions``. The last entry is treated as the
                most recent day; all earlier entries form the history.

        Returns:
            Trend labels and the figures they were derived from. With fewer
            than two entries both labels are "insufficient_data" and the
            figures are zero. When no historical day has executions, the
            historical success rate is 0.0.
        """
        if len(volume_trends) < 2:
            return {
                "execution_volume_trend": "insufficient_data",
                "success_rate_trend": "insufficient_data",
                "recent_executions": 0,
                "historical_avg_executions": 0.0,
                "recent_success_rate": 0.0,
                "historical_success_rate": 0.0,
            }

        latest = volume_trends[-1]
        history = volume_trends[:-1]

        # Execution volume: the latest day against the historical mean,
        # with a +/-10% band counted as "stable"
        recent_executions = latest["daily_executions"]
        historical_avg = sum(row["daily_executions"] for row in history) / len(
            history
        )

        if recent_executions > historical_avg * 1.1:
            execution_trend = "increasing"
        elif recent_executions < historical_avg * 0.9:
            execution_trend = "decreasing"
        else:
            execution_trend = "stable"

        # Success rate: the latest day against the mean daily rate of the
        # historical days that actually had executions
        recent_success_rate = self._success_rate(
            latest["successful_executions"], latest["daily_executions"]
        )
        historical_rates = [
            self._success_rate(row["successful_executions"], row["daily_executions"])
            for row in history
            if row["daily_executions"] > 0
        ]
        # Guard the mean: every historical day may have had zero executions
        historical_success_rate = (
            sum(historical_rates) / len(historical_rates)
            if historical_rates
            else 0.0
        )

        # A +/-2 point band counts as "stable"
        if recent_success_rate > historical_success_rate + 2:
            success_trend = "improving"
        elif recent_success_rate < historical_success_rate - 2:
            success_trend = "declining"
        else:
            success_trend = "stable"

        return {
            "execution_volume_trend": execution_trend,
            "success_rate_trend": success_trend,
            "recent_executions": int(recent_executions),
            "historical_avg_executions": round(historical_avg, 2),
            "recent_success_rate": round(recent_success_rate, 2),
            "historical_success_rate": round(historical_success_rate, 2),
        }

    def _calculate_performance_grade(self, row: Dict[str, Union[int, float]]) -> str:
        """Calculate the performance grade for a step.

        Args:
            row: Mapping with ``avg_execution_time`` and
                ``stddev_execution_time``; a ``None`` stddev counts as 0.0.

        Returns:
            "A" when avg < 60 and stddev < 30, "B" when avg < 120 and
            stddev < 60, "C" when avg < 300, otherwise "D".
        """
        avg_time = row["avg_execution_time"]
        stddev_time = self._stddev_or_zero(row["stddev_execution_time"])

        # Consider both average time and consistency (low stddev)
        if avg_time < 60 and stddev_time < 30:  # Fast and consistent
            return "A"
        elif avg_time < 120 and stddev_time < 60:  # Reasonable and somewhat consistent
            return "B"
        elif avg_time < 300:  # Acceptable
            return "C"
        else:  # Slow
            return "D"

In [None]:
# Module: abstracts.builder (abstracts)
#
# Dependencies: abstracts.engine, abstracts.rules, abstracts.runner, abstracts.step, abstracts.transformer

from __future__ import annotations

from typing import List, Literal, Optional, Type
# from .engine import Engine  # Removed: defined in notebook cells above
# from .rules import Rules  # Removed: defined in notebook cells above
# from .runner import Runner  # Removed: defined in notebook cells above
# from .step import Step  # Removed: defined in notebook cells above
# from .transformer import Transformer  # Removed: defined in notebook cells above

class PipelineBuilder:
    """
    Abstract pipeline builder that uses dependency injection for engine and runner.

    This builder provides a fluent API for constructing pipelines with engine injection,
    allowing different engine implementations (SparkEngine, SqlEngine, etc.) to be used.

    The step-adding methods (``with_bronze_rules``, ``add_silver_transform``,
    ``add_gold_transform``) raise ``NotImplementedError`` in this class;
    concrete subclasses are expected to implement them.
    """

    def __init__(self, runner_cls: Type[Runner], engine: Engine) -> None:
        """
        Initialize the pipeline builder.

        Args:
            runner_cls: Runner class to use for pipeline execution
            engine: Engine instance to use for step execution
        """
        self.runner_cls = runner_cls
        self.engine = engine
        self.steps: List[Step] = []

    @staticmethod
    def _resolve_step_type(step: Step) -> object:
        """
        Return the layer a step belongs to.

        A truthy ``type`` attribute on the step wins and is returned as-is.
        Otherwise the layer is inferred from the step's class name, checking
        for "Bronze", then "Silver", then "Gold".

        Raises:
            ValueError: If the step has no truthy ``type`` and its class name
                contains none of the three layer names
        """
        declared = getattr(step, "type", None)
        if declared:
            return declared

        class_name = step.__class__.__name__
        # Order matters: the first matching marker decides the layer.
        for marker, layer in (
            ("Bronze", "bronze"),
            ("Silver", "silver"),
            ("Gold", "gold"),
        ):
            if marker in class_name:
                return layer
        raise ValueError(f"Cannot determine step type for {class_name}")

    def validate_steps(self, steps: List[Step]) -> bool:
        """
        Validate pipeline steps configuration.

        Checks, in order: step names are unique; every step's layer can be
        determined; every silver step's source names a bronze step in
        ``steps``; every gold step's sources name silver steps in ``steps``.
        An empty ``steps`` list passes validation.

        Args:
            steps: List of steps to validate

        Returns:
            True when every check passes. Failures are reported by raising,
            never by returning False.

        Raises:
            ValueError: If validation fails with details
        """
        # Empty pipelines are accepted here (some tests create them);
        # to_pipeline() is the place that refuses to build from no steps.

        # Check for duplicate step names in a single pass.
        seen_names = set()
        duplicate_names = set()
        for step in steps:
            if step.name in seen_names:
                duplicate_names.add(step.name)
            else:
                seen_names.add(step.name)
        if duplicate_names:
            raise ValueError(f"Duplicate step names found: {duplicate_names}")

        # Resolve each step's layer once. Concrete steps (BronzeStep,
        # SilverStep, GoldStep) don't have a type attribute but satisfy the
        # Step Protocol, so their layer comes from the class name.
        typed_steps = [(step, self._resolve_step_type(step)) for step in steps]

        bronze_names = set()
        silver_names = set()
        gold_names = set()
        for step, step_type in typed_steps:
            if step_type == "bronze":
                bronze_names.add(step.name)
            elif step_type == "silver":
                silver_names.add(step.name)
            elif step_type == "gold":
                gold_names.add(step.name)

        # Validate step dependencies against the names collected above.
        for step, step_type in typed_steps:
            if step_type == "silver":
                # Silver steps use source_bronze attribute, not source
                source = getattr(step, "source_bronze", None) or getattr(
                    step, "source", None
                )
                if source and source not in bronze_names:
                    raise ValueError(
                        f"Silver step '{step.name}' references unknown bronze source '{source}'"
                    )
            elif step_type == "gold":
                # Gold steps use source_silvers attribute (list), not source
                source_silvers = getattr(step, "source_silvers", None)
                if source_silvers:
                    for silver_name in source_silvers:
                        if silver_name not in silver_names:
                            raise ValueError(
                                f"Gold step '{step.name}' references unknown silver source '{silver_name}'"
                            )
                # Also check source attribute for backward compatibility
                source = getattr(step, "source", None)
                if source and source not in silver_names:
                    raise ValueError(
                        f"Gold step '{step.name}' references unknown silver source '{source}'"
                    )

        return True

    def to_pipeline(
        self, steps: Optional[List[Step]] = None, engine: Optional[Engine] = None
    ) -> Runner:
        """
        Build and return a Runner for executing the pipeline.

        Args:
            steps: Optional list of steps (uses self.steps if not provided)
            engine: Optional engine instance (uses self.engine if not provided)

        Returns:
            Runner instance ready for execution

        Raises:
            ValueError: If there are no steps to build from, or if step
                validation fails
        """
        steps_to_use = steps if steps is not None else self.steps
        engine_to_use = engine if engine is not None else self.engine

        if not steps_to_use:
            raise ValueError("No steps provided to build pipeline")

        if self.validate_steps(steps_to_use):
            return self.runner_cls(steps_to_use, engine_to_use)
        # Only reachable when an overriding validate_steps returns a falsy
        # value; the implementation in this class raises instead.
        raise ValueError("Invalid steps configuration")

    def with_bronze_rules(self, name: str, rules: Rules) -> PipelineBuilder:
        """
        Add a bronze step with validation rules.

        Args:
            name: Unique name for the bronze step
            rules: Validation rules for the bronze step

        Returns:
            Self for method chaining

        Raises:
            NotImplementedError: Always, in this abstract class
        """
        # Note: Step is a Protocol, so we can't instantiate it directly.
        # Concrete implementations will create appropriate step objects.
        # This method is meant to be overridden or used with concrete step types.
        raise NotImplementedError(
            "with_bronze_rules must be implemented by concrete PipelineBuilder subclasses"
        )

    def add_silver_transform(
        self,
        name: str,
        source: str,
        transform: Transformer,
        rules: Rules,
        write_target: str,
        write_mode: Literal["overwrite", "append"],
        write_schema: Optional[str] = None,
    ) -> PipelineBuilder:
        """
        Add a silver transformation step.

        Args:
            name: Unique name for the silver step
            source: Name of the bronze step this depends on
            transform: Transformation function
            rules: Validation rules
            write_target: Target table name
            write_mode: Write mode (overwrite or append)
            write_schema: Optional schema name

        Returns:
            Self for method chaining

        Raises:
            NotImplementedError: Always, in this abstract class
        """
        # Note: Step is a Protocol, so we can't instantiate it directly.
        # Concrete implementations will create appropriate step objects.
        raise NotImplementedError(
            "add_silver_transform must be implemented by concrete PipelineBuilder subclasses"
        )

    def add_gold_transform(
        self,
        name: str,
        source: str,
        transform: Transformer,
        rules: Rules,
        write_target: str,
        write_schema: Optional[str] = None,
    ) -> PipelineBuilder:
        """
        Add a gold transformation step.

        Args:
            name: Unique name for the gold step
            source: Name of the silver step this depends on
            transform: Transformation function
            rules: Validation rules
            write_target: Target table name
            write_schema: Optional schema name

        Returns:
            Self for method chaining

        Raises:
            NotImplementedError: Always, in this abstract class
        """
        # Note: Step is a Protocol, so we can't instantiate it directly.
        # Concrete implementations will create appropriate step objects.
        raise NotImplementedError(
            "add_gold_transform must be implemented by concrete PipelineBuilder subclasses"
        )

# Keep a reference to the abstract builder defined above under a distinct
# global name. pipeline_builder.pipeline.builder.PipelineBuilder shares the
# class name `PipelineBuilder`, so the plain name alone would be ambiguous
# (presumably a later cell rebinds it - verify against the rest of the notebook).
_AbstractsPipelineBuilderClass = PipelineBuilder

In [None]:
# Module: pipeline_builder.models.steps (pipeline_builder)
#
# Dependencies: models.base, models.types, pipeline_builder.compat, pipeline_builder.models.base, pipeline_builder.models.enums, pipeline_builder.models.types, pipeline_builder.sql_source.models, pipeline_builder_base.errors

"""
Step models for the Pipeline Builder.

"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional, Union

# TypeAlias is available in Python 3.10+, use typing_extensions for 3.8/3.9
# Mypy prefers typing_extensions even for Python 3.11
from typing_extensions import TypeAlias
# from .sql_source.models import JdbcSource, SqlAlchemySource  # Removed: defined in notebook cells above
# from .errors import PipelineValidationError, ValidationError  # Removed: defined in notebook cells above

# from .base import BaseModel  # Removed: defined in notebook cells above
# from .enums import PipelinePhase  # Removed: defined in notebook cells above
# from .types import ColumnRules, GoldTransformFunction, SilverTransformFunction  # Removed: defined in notebook cells above

# Either of the two SQL source models; used below as the type of the optional
# `sql_source` field on BronzeStep, SilverStep and GoldStep.
SqlSourceType: TypeAlias = Union[JdbcSource, SqlAlchemySource]

if TYPE_CHECKING:
    # Engine-specific StructType should satisfy the TypesProtocol.StructType
    # Import the actual type from pyspark.sql.types for type checking
    try:
        from pyspark.sql.types import StructType as _StructTypeBase
    except ImportError:
        # Fallback if PySpark not available during type checking
        from typing import Any as _StructTypeBase  # type: ignore[assignment]

    StructType: TypeAlias = _StructTypeBase
else:
    try:
        # from ..compat import types as compat_types  # Removed: defined in notebook cells above
        from pyspark.sql import types  # types from pyspark (not from compat)

        StructType: TypeAlias = compat_types.StructType  # type: ignore[assignment]
    except Exception:
        # Use object instead of Any for Python 3.8 compatibility
        # Any cannot be used with isinstance() in Python 3.8
        # For runtime, we use object, but mypy will use the TypeAlias from TYPE_CHECKING
        StructType: TypeAlias = object  # type: ignore[assignment, misc]

@dataclass
class BronzeStep(BaseModel):
    """
    Configuration for a Bronze (raw data) step of the Medallion Architecture.

    A Bronze step names a raw input, the column rules used to validate it and,
    optionally, the column that drives incremental processing. It is the
    foundation that downstream Silver and Gold steps build on.

    **Checked at construction (``__post_init__``):**
        - `name`: non-empty string
        - `rules`: non-empty dictionary
        - `incremental_col`: a string when provided

    ``validate()`` repeats these checks but raises ``PipelineValidationError``
    and only requires `rules` to be a dictionary (an empty one is accepted).

    Attributes:
        name: Unique identifier for this Bronze step
        rules: Dictionary mapping column names to validation rule lists.
               Each rule should be a PySpark Column expression.
        incremental_col: Column name for incremental processing (e.g., "timestamp").
                        If provided, enables watermarking for efficient updates.
                        If None, forces full refresh mode for downstream steps.
        schema: Optional schema name for reading bronze data
        sql_source: Optional JdbcSource or SqlAlchemySource; stored as given
                    and not inspected by this class.

    Raises:
        ValidationError: If the construction-time checks above fail

    Example:
        >>> from pipeline_builder.functions import get_default_functions
        >>> F = get_default_functions()
        >>>
        >>> # Valid Bronze step with PySpark expressions
        >>> bronze_step = BronzeStep(
        ...     name="user_events",
        ...     rules={
        ...         "user_id": [F.col("user_id").isNotNull()],
        ...         "event_type": [F.col("event_type").isin(["click", "view", "purchase"])],
        ...         "timestamp": [F.col("timestamp").isNotNull(), F.col("timestamp") > "2020-01-01"]
        ...     },
        ...     incremental_col="timestamp"
        ... )
        >>>
        >>> # Validate configuration
        >>> bronze_step.validate()
        >>> print(f"Supports incremental: {bronze_step.has_incremental_capability}")

        >>> # Invalid Bronze step (will raise ValidationError)
        >>> try:
        ...     BronzeStep(name="", rules={})  # Empty name and rules
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Step name must be a non-empty string"
    """

    name: str
    rules: ColumnRules
    incremental_col: Optional[str] = None
    schema: Optional[str] = None
    sql_source: Optional[SqlSourceType] = None

    def __post_init__(self) -> None:
        """Reject an unusable name, rules mapping or incremental column."""
        step_name = self.name
        if not (step_name and isinstance(step_name, str)):
            raise ValidationError("Step name must be a non-empty string")

        rule_map = self.rules
        if not (isinstance(rule_map, dict) and rule_map):
            raise ValidationError("Rules must be a non-empty dictionary")

        # None means "no incremental column" and is always acceptable.
        if not isinstance(self.incremental_col, (str, type(None))):
            raise ValidationError("Incremental column must be a string")

    def validate(self) -> None:
        """Re-check the step, raising PipelineValidationError on failure."""
        step_name = self.name
        if not (step_name and isinstance(step_name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        # Unlike construction, an empty dictionary passes here.
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")

        if not isinstance(self.incremental_col, (str, type(None))):
            raise PipelineValidationError("Incremental column must be a string")

    @property
    def has_incremental_capability(self) -> bool:
        """True when an incremental column has been configured."""
        return self.incremental_col is not None

    @property
    def step_type(self) -> PipelinePhase:
        """Pipeline phase of this step: always ``PipelinePhase.BRONZE``."""
        return PipelinePhase.BRONZE

@dataclass
class SilverStep(BaseModel):
    """
    Silver layer step configuration for data cleaning and enrichment.

    Silver steps represent the second layer of the Medallion Architecture,
    transforming raw Bronze data into clean, business-ready datasets.
    They apply data quality rules, business logic, and data transformations.

    **Validation Requirements:**
        - `name`: Must be a non-empty string
        - `source_bronze`: Must be a non-empty string, unless the step is
          `existing` or has a `sql_source`
        - `transform`: Must be callable when provided; required unless the
          step is `existing` or has a `sql_source`
        - `rules`: Must be a dictionary (checked by ``validate()`` only; the
          constructor does not inspect it)
        - `table_name`: Must be a non-empty string
        - `source_incremental_col`: Must be a string when provided

    Attributes:
        name: Unique identifier for this Silver step
        source_bronze: Name of the Bronze step providing input data
        rules: Dictionary mapping column names to validation rule lists.
               Each rule should be a PySpark Column expression.
        table_name: Target Delta table name where results will be stored
        transform: Transformation function with signature:
                 (spark: SparkSession, bronze_df: DataFrame,
                  prior_silvers: Dict[str, DataFrame]) -> DataFrame
        watermark_col: Column name for watermarking (e.g., "timestamp", "updated_at").
                      If provided, enables incremental processing with append mode.
                      If None, uses overwrite mode for full refresh.
        existing: Whether this represents an existing table (for validation-only steps)
        optional: Boolean flag stored on the step; not interpreted by this class.
        schema: Optional schema name for writing silver data
        source_incremental_col: Optional column name (string); not
                        interpreted by this class beyond the type check.
        schema_override: Optional PySpark StructType schema to override DataFrame schema
                        when creating tables. Uses Delta Lake's overwriteSchema option.
                        Applied during initial runs and when table doesn't exist.
        source_silvers: Optional list of step names; stored as given and not
                        validated by this class.
        sql_source: Optional JdbcSource or SqlAlchemySource. When set, the
                    `source_bronze` and `transform` requirements are waived.

    Raises:
        ValidationError: If validation requirements are not met during construction

    Example:
        >>> def clean_user_events(spark, bronze_df, prior_silvers):
        ...     return (bronze_df
        ...         .filter(F.col("user_id").isNotNull())
        ...         .withColumn("event_date", F.date_trunc("day", "timestamp"))
        ...         .withColumn("is_weekend", F.dayofweek("timestamp").isin([1, 7]))
        ...     )
        >>>
        >>> # Valid Silver step
        >>> silver_step = SilverStep(
        ...     name="clean_events",
        ...     source_bronze="user_events",
        ...     transform=clean_user_events,
        ...     rules={
        ...         "user_id": [F.col("user_id").isNotNull()],
        ...         "event_date": [F.col("event_date").isNotNull()]
        ...     },
        ...     table_name="clean_user_events",
        ...     watermark_col="timestamp"
        ... )

        >>> # Invalid Silver step (will raise ValidationError)
        >>> try:
        ...     SilverStep(name="clean_events", source_bronze="", transform=None, rules={}, table_name="")
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Source bronze step name must be a non-empty string"
    """

    name: str
    source_bronze: str
    rules: ColumnRules
    table_name: str
    transform: Optional[SilverTransformFunction] = None
    watermark_col: Optional[str] = None
    existing: bool = False
    optional: bool = False
    schema: Optional[str] = None
    source_incremental_col: Optional[str] = None
    schema_override: Optional[StructType] = None
    source_silvers: Optional[list[str]] = None
    sql_source: Optional[SqlSourceType] = None

    def __post_init__(self) -> None:
        """Validate required fields after initialization.

        Raises:
            ValidationError: On the first requirement that is not met
        """
        if not self.name or not isinstance(self.name, str):
            raise ValidationError("Step name must be a non-empty string")
        if (
            not self.existing
            and not self.sql_source
            and (not self.source_bronze or not isinstance(self.source_bronze, str))
        ):
            raise ValidationError("Source bronze step name must be a non-empty string")
        if self.transform is not None and not callable(self.transform):
            raise ValidationError("Transform function must be callable if provided")
        if not self.existing and not self.sql_source and self.transform is None:
            raise ValidationError(
                "Transform function is required for non-existing silver steps"
            )
        if not self.table_name or not isinstance(self.table_name, str):
            raise ValidationError("Table name must be a non-empty string")
        if self.source_incremental_col is not None and not isinstance(
            self.source_incremental_col, str
        ):
            raise ValidationError("source_incremental_col must be a string")
        # schema_override: any StructType-like object is accepted; the engine
        # enforces correctness at write time.

    def validate(self) -> None:
        """Validate silver step configuration.

        Applies the same exemption as the constructor: a step that is
        `existing` or has a `sql_source` needs neither a bronze source nor a
        transform, so a step that could be constructed is not rejected here
        for lacking them.

        Raises:
            PipelineValidationError: On the first requirement that is not met
        """
        if not self.name or not isinstance(self.name, str):
            raise PipelineValidationError("Step name must be a non-empty string")

        # Only steps that read from a bronze step and transform it need both.
        needs_bronze_input = not self.existing and not self.sql_source
        if needs_bronze_input and (
            not self.source_bronze or not isinstance(self.source_bronze, str)
        ):
            raise PipelineValidationError(
                "Source bronze step name must be a non-empty string"
            )
        if needs_bronze_input and not callable(self.transform):
            raise PipelineValidationError("Transform must be a callable function")
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")
        if not self.table_name or not isinstance(self.table_name, str):
            raise PipelineValidationError("Table name must be a non-empty string")
        if self.source_incremental_col is not None and not isinstance(
            self.source_incremental_col, str
        ):
            raise PipelineValidationError(
                "source_incremental_col must be a string when provided"
            )
        # schema_override: any StructType-like object is accepted; the engine
        # enforces correctness at write time.

    @property
    def step_type(self) -> PipelinePhase:
        """Return the pipeline phase for this step."""
        return PipelinePhase.SILVER

@dataclass
class GoldStep(BaseModel):
    """
    Configuration for a Gold (analytics) step of the Medallion Architecture.

    Gold steps are the third layer: they aggregate and reshape Silver data
    into business-ready tables for analytics, reporting and dashboards.

    **Checked at construction (``__post_init__``):**
        - `name`: non-empty string
        - `transform`: callable when provided; required unless the step is
          `existing` or has a `sql_source`
        - `table_name`: non-empty string
        - `rules`: non-empty dictionary
        - `source_silvers`: a list when provided, and non-empty unless a
          `sql_source` is set

    ``validate()`` raises ``PipelineValidationError`` instead and is looser on
    two points: `rules` may be an empty dictionary and `source_silvers` may be
    an empty list.

    Attributes:
        name: Unique identifier for this Gold step
        rules: Dictionary mapping column names to validation rule lists.
               Each rule should be a PySpark Column expression.
        table_name: Target Delta table name where results will be stored
        transform: Transformation function with signature:
                 (spark: SparkSession, silvers: Dict[str, DataFrame]) -> DataFrame
                 - spark: Active SparkSession for operations
                 - silvers: Dictionary of all Silver DataFrames by step name
        existing: Whether this represents an existing table; waives the
                  transform requirement.
        optional: Boolean flag stored on the step; not interpreted by this class.
        source_silvers: List of Silver step names to use as input sources.
                       If None, uses all available Silver steps.
                       Allows selective consumption of Silver data.
        schema: Optional schema name for writing gold data
        schema_override: Optional PySpark StructType schema to override DataFrame schema
                        when writing to gold tables. Uses Delta Lake's overwriteSchema option.
                        Always applied for gold table writes.
        sql_source: Optional JdbcSource or SqlAlchemySource; when set, the
                    transform requirement is waived and an empty
                    `source_silvers` list is allowed.

    Raises:
        ValidationError: If the construction-time checks above fail

    Example:
        >>> def user_daily_metrics(spark, silvers):
        ...     events_df = silvers["clean_events"]
        ...     return (events_df
        ...         .groupBy("user_id", "event_date")
        ...         .agg(
        ...             F.count("*").alias("total_events"),
        ...             F.countDistinct("event_type").alias("unique_event_types"),
        ...             F.max("timestamp").alias("last_activity"),
        ...             F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchases")
        ...         )
        ...         .withColumn("is_active_user", F.col("total_events") > 5)
        ...     )
        >>>
        >>> # Valid Gold step
        >>> gold_step = GoldStep(
        ...     name="user_metrics",
        ...     transform=user_daily_metrics,
        ...     rules={
        ...         "user_id": [F.col("user_id").isNotNull()],
        ...         "total_events": [F.col("total_events") > 0]
        ...     },
        ...     table_name="user_daily_metrics",
        ...     source_silvers=["clean_events"]
        ... )

        >>> # Invalid Gold step (will raise ValidationError)
        >>> try:
        ...     GoldStep(name="", transform=None, rules={}, table_name="", source_silvers=[])
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Step name must be a non-empty string"
    """

    name: str
    rules: ColumnRules
    table_name: str
    transform: Optional[GoldTransformFunction] = None
    existing: bool = False
    optional: bool = False
    source_silvers: Optional[list[str]] = None
    schema: Optional[str] = None
    schema_override: Optional[StructType] = None
    sql_source: Optional[SqlSourceType] = None

    def __post_init__(self) -> None:
        """Reject an unusable configuration as soon as the step is built."""
        if not (self.name and isinstance(self.name, str)):
            raise ValidationError("Step name must be a non-empty string")

        transform_fn = self.transform
        if transform_fn is None:
            # Only existing tables and SQL-sourced steps may omit a transform.
            if not (self.existing or self.sql_source):
                raise ValidationError(
                    "Transform function is required for non-existing gold steps"
                )
        elif not callable(transform_fn):
            raise ValidationError("Transform function must be callable if provided")

        if not (self.table_name and isinstance(self.table_name, str)):
            raise ValidationError("Table name must be a non-empty string")

        if not (isinstance(self.rules, dict) and self.rules):
            raise ValidationError("Rules must be a non-empty dictionary")

        silver_names = self.source_silvers
        if silver_names is not None:
            # An empty list is tolerated only when a SQL source feeds the step.
            if not isinstance(silver_names, list) or not (
                silver_names or self.sql_source
            ):
                raise ValidationError("Source silvers must be a non-empty list")

        # schema_override: any StructType-like object is accepted; the engine
        # enforces correctness.

    def validate(self) -> None:
        """Re-check the step, raising PipelineValidationError on failure."""
        if not (self.name and isinstance(self.name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        transform_fn = self.transform
        if transform_fn is None:
            if not (self.existing or self.sql_source):
                raise PipelineValidationError(
                    "Transform function is required for non-existing gold steps"
                )
        elif not callable(transform_fn):
            raise PipelineValidationError(
                "Transform must be a callable function if provided"
            )

        # Unlike construction, an empty dictionary passes here.
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")

        if not (self.table_name and isinstance(self.table_name, str)):
            raise PipelineValidationError("Table name must be a non-empty string")

        if not isinstance(self.source_silvers, (list, type(None))):
            raise PipelineValidationError("Source silvers must be a list or None")

        # schema_override: any StructType-like object is accepted; the engine
        # enforces correctness.

    @property
    def step_type(self) -> PipelinePhase:
        """Pipeline phase of this step: always ``PipelinePhase.GOLD``."""
        return PipelinePhase.GOLD

In [None]:
# Module: pipeline_builder.models.pipeline (pipeline_builder)
#
# Dependencies: pipeline_builder.errors, pipeline_builder.models.base

"""
Pipeline configuration models.

This module provides configuration and metrics models for pipeline execution,
including the main PipelineConfig and PipelineMetrics classes.

Key Components:
    - **PipelineConfig**: Main configuration for pipeline execution, including
      schema, validation thresholds, and logging settings
    - **PipelineMetrics**: Aggregated metrics from pipeline execution, including
      step counts, durations, row counts, and validation rates

Dependencies:
    - errors: Pipeline validation and error handling
    - models.base: Base model classes and ValidationThresholds

Example:
    >>> from pipeline_builder.models.pipeline import PipelineConfig, PipelineMetrics
    >>> from pipeline_builder.models.base import ValidationThresholds
    >>>
    >>> # Create pipeline configuration
    >>> config = PipelineConfig.create_default(schema="my_schema")
    >>> config.validate()
    >>>
    >>> # Create metrics from step results
    >>> metrics = PipelineMetrics.from_step_results(step_results)
    >>> print(f"Success rate: {metrics.success_rate}%")
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel, ValidationThresholds  # Removed: defined in notebook cells above

@dataclass
class PipelineConfig(BaseModel):
    """Top-level settings shared by every step of a pipeline run.

    Bundles the target schema name, the per-layer validation thresholds and
    the logging verbosity flag into one object. Three factory classmethods
    (``create_default``, ``create_high_performance``, ``create_conservative``)
    build commonly used combinations.

    Attributes:
        schema: Name of the database schema targeted by the pipeline.
            ``validate()`` requires a non-empty string.
        thresholds: ``ValidationThresholds`` instance holding the minimum
            validation rates for the bronze, silver and gold layers.
        verbose: Verbosity flag for pipeline logging. Defaults to True.

    Raises:
        PipelineValidationError: From ``validate()`` when ``schema`` is empty
            or not a string. Errors raised by ``thresholds.validate()``
            propagate unchanged.

    Example:
        >>> config = PipelineConfig.create_default(schema="analytics")
        >>> config.validate()
        >>> config.schema
        'analytics'
        >>>
        >>> custom = PipelineConfig(
        ...     schema="production",
        ...     thresholds=ValidationThresholds(bronze=90.0, silver=95.0, gold=99.0),
        ...     verbose=False,
        ... )
        >>> custom.min_bronze_rate
        90.0
    """

    schema: str
    thresholds: ValidationThresholds
    verbose: bool = True

    @property
    def min_bronze_rate(self) -> float:
        """Bronze-layer validation threshold.

        Returns:
            The value of ``thresholds.bronze``.
        """
        layer_thresholds = self.thresholds
        return layer_thresholds.bronze

    @property
    def min_silver_rate(self) -> float:
        """Silver-layer validation threshold.

        Returns:
            The value of ``thresholds.silver``.
        """
        layer_thresholds = self.thresholds
        return layer_thresholds.silver

    @property
    def min_gold_rate(self) -> float:
        """Gold-layer validation threshold.

        Returns:
            The value of ``thresholds.gold``.
        """
        layer_thresholds = self.thresholds
        return layer_thresholds.gold

    def validate(self) -> None:
        """Check the schema name, then delegate to the thresholds' own check.

        Raises:
            PipelineValidationError: If ``schema`` is falsy or is not a
                ``str``. Anything raised by ``thresholds.validate()`` is
                propagated as-is.

        Example:
            >>> PipelineConfig.create_default(schema="test").validate()  # passes
            >>> bad = PipelineConfig(
            ...     schema="", thresholds=ValidationThresholds.create_default()
            ... )
            >>> bad.validate()  # raises PipelineValidationError
        """
        schema_name = self.schema
        # Truthiness is tested before the type, exactly as a plain
        # "not x or not isinstance(x, str)" check would do.
        if not (schema_name and isinstance(schema_name, str)):
            raise PipelineValidationError("Schema name must be a non-empty string")
        self.thresholds.validate()

    @classmethod
    def create_default(cls, schema: str) -> PipelineConfig:
        """Build a configuration with default thresholds and verbose logging.

        Args:
            schema: Database schema name for pipeline tables.

        Returns:
            A new instance using ``ValidationThresholds.create_default()``
            and ``verbose=True``.

        Example:
            >>> config = PipelineConfig.create_default(schema="analytics")
            >>> config.verbose
            True
        """
        default_thresholds = ValidationThresholds.create_default()
        return cls(schema=schema, thresholds=default_thresholds, verbose=True)

    @classmethod
    def create_high_performance(cls, schema: str) -> PipelineConfig:
        """Build a configuration with strict thresholds and quiet logging.

        Args:
            schema: Database schema name for pipeline tables.

        Returns:
            A new instance using ``ValidationThresholds.create_strict()``
            and ``verbose=False``.

        Example:
            >>> config = PipelineConfig.create_high_performance(schema="production")
            >>> config.verbose
            False
        """
        strict_thresholds = ValidationThresholds.create_strict()
        return cls(schema=schema, thresholds=strict_thresholds, verbose=False)

    @classmethod
    def create_conservative(cls, schema: str) -> PipelineConfig:
        """Build a configuration with strict thresholds and verbose logging.

        Differs from ``create_high_performance`` only in keeping
        ``verbose=True``.

        Args:
            schema: Database schema name for pipeline tables.

        Returns:
            A new instance using ``ValidationThresholds.create_strict()``
            and ``verbose=True``.

        Example:
            >>> config = PipelineConfig.create_conservative(schema="critical")
            >>> config.verbose
            True
        """
        strict_thresholds = ValidationThresholds.create_strict()
        return cls(schema=schema, thresholds=strict_thresholds, verbose=True)

@dataclass
class PipelineMetrics(BaseModel):
    """Overall pipeline execution metrics.

    Aggregates metrics from all pipeline steps to provide a single view of
    pipeline execution: step counts, durations, row counts and validation
    rate.

    **Validation Rules (enforced by ``validate()``):**
        - ``total_steps``, ``successful_steps``, ``failed_steps`` and
          ``skipped_steps`` must be non-negative
        - ``total_duration`` must be non-negative
        - ``avg_validation_rate`` must be between 0 and 100

    ``validate()`` does not check that ``total_steps`` equals
    ``successful_steps + failed_steps + skipped_steps``; partially populated
    metrics are accepted.

    Attributes:
        total_steps: Total number of steps in the pipeline. Defaults to 0.
        successful_steps: Number of steps that completed successfully.
            Defaults to 0.
        failed_steps: Number of steps that failed during execution.
            Defaults to 0.
        skipped_steps: Number of steps that were skipped. Defaults to 0.
        total_duration: Total execution duration in seconds. Defaults to 0.0.
        bronze_duration: Total duration for Bronze layer steps in seconds.
            Defaults to 0.0.
        silver_duration: Total duration for Silver layer steps in seconds.
            Defaults to 0.0.
        gold_duration: Total duration for Gold layer steps in seconds.
            Defaults to 0.0.
        total_rows_processed: Total number of rows processed across all steps.
            Defaults to 0.
        total_rows_written: Total number of rows written across all steps.
            Defaults to 0.
        avg_validation_rate: Average validation success rate across all steps
            (0-100). Defaults to 0.0.
        cache_hit_rate: Cache hit rate (0-100). Defaults to 0.0.
        error_count: Total number of errors encountered. Defaults to 0.
        retry_count: Total number of retries attempted. Defaults to 0.

    Example:
        >>> metrics = PipelineMetrics(
        ...     total_steps=5,
        ...     successful_steps=4,
        ...     failed_steps=1,
        ...     total_duration=120.5,
        ...     total_rows_processed=1000000,
        ...     avg_validation_rate=98.5
        ... )
        >>> metrics.validate()
        >>> print(f"Success rate: {metrics.success_rate}%")  # 80.0%
    """

    total_steps: int = 0
    successful_steps: int = 0
    failed_steps: int = 0
    skipped_steps: int = 0
    total_duration: float = 0.0
    bronze_duration: float = 0.0
    silver_duration: float = 0.0
    gold_duration: float = 0.0
    total_rows_processed: int = 0
    total_rows_written: int = 0
    avg_validation_rate: float = 0.0
    cache_hit_rate: float = 0.0
    error_count: int = 0
    retry_count: int = 0

    def validate(self) -> None:
        """Validate the pipeline metrics.

        Checks that the four step counters and ``total_duration`` are
        non-negative and that ``avg_validation_rate`` lies in [0, 100].
        Other fields are not inspected.

        Raises:
            ValueError: If any of the checked values is out of range.

        Example:
            >>> metrics = PipelineMetrics(total_steps=5, successful_steps=4)
            >>> metrics.validate()  # Passes
            >>>
            >>> invalid = PipelineMetrics(total_steps=-1)
            >>> invalid.validate()  # Raises ValueError
        """
        if self.total_steps < 0:
            raise ValueError("Total steps cannot be negative")
        if self.successful_steps < 0:
            raise ValueError("Successful steps cannot be negative")
        if self.failed_steps < 0:
            raise ValueError("Failed steps cannot be negative")
        if self.skipped_steps < 0:
            raise ValueError("Skipped steps cannot be negative")
        if self.total_duration < 0:
            raise ValueError("Total duration cannot be negative")
        if not 0 <= self.avg_validation_rate <= 100:
            raise ValueError("Average validation rate must be between 0 and 100")

    @property
    def success_rate(self) -> float:
        """Calculate success rate.

        Returns:
            Percentage of successful steps (0-100). Returns 0.0 if there
            are no steps.

        Example:
            >>> metrics = PipelineMetrics(total_steps=10, successful_steps=8)
            >>> print(f"Success rate: {metrics.success_rate}%")  # 80.0%
        """
        return (
            (self.successful_steps / self.total_steps * 100)
            if self.total_steps > 0
            else 0.0
        )

    @property
    def failure_rate(self) -> float:
        """Calculate failure rate.

        The rate is the complement of ``success_rate``, so every step that
        is not counted as successful (failed or skipped) contributes to it.

        Returns:
            Percentage of non-successful steps (0-100). Returns 0.0 if there
            are no steps.

        Example:
            >>> metrics = PipelineMetrics(total_steps=10, successful_steps=8)
            >>> print(f"Failure rate: {metrics.failure_rate}%")  # 20.0%
            >>> PipelineMetrics().failure_rate
            0.0
        """
        # With no steps success_rate is 0.0; taking the complement would
        # report a 100% failure rate for a pipeline that never ran anything.
        if self.total_steps <= 0:
            return 0.0
        return 100.0 - self.success_rate

    @classmethod
    def from_step_results(cls, step_results: list[Any]) -> PipelineMetrics:
        """Create metrics from step results.

        Each result must expose ``success``, ``duration_secs``,
        ``rows_processed``, ``rows_written`` and ``validation_rate``.
        Fields that are not derived here (``skipped_steps``, the per-layer
        durations, ``cache_hit_rate``, ``error_count``, ``retry_count``)
        keep their defaults.

        Args:
            step_results: List of step result objects from pipeline
                execution. Must support ``len()``.

        Returns:
            PipelineMetrics instance with aggregated metrics from all steps.
            Every non-successful result is counted as failed. The average
            validation rate is 0.0 when the list is empty.

        Example:
            >>> from datetime import datetime, timezone
            >>> results = [
            ...     StepResult.create_success(
            ...         step_name="bronze_step",
            ...         phase=PipelinePhase.BRONZE,
            ...         start_time=datetime.now(timezone.utc),
            ...         end_time=datetime.now(timezone.utc),
            ...         rows_processed=1000,
            ...         rows_written=950,
            ...         validation_rate=95.0
            ...     )
            ... ]
            >>> metrics = PipelineMetrics.from_step_results(results)
            >>> print(f"Total steps: {metrics.total_steps}")  # 1
            >>> print(f"Success rate: {metrics.success_rate}%")  # 100.0%
        """
        total_steps = len(step_results)
        successful_steps = sum(1 for result in step_results if result.success)
        failed_steps = total_steps - successful_steps
        total_duration_secs = sum(result.duration_secs for result in step_results)
        total_rows_processed = sum(result.rows_processed for result in step_results)
        total_rows_written = sum(result.rows_written for result in step_results)
        # Guard the division so an empty result list yields 0.0.
        avg_validation_rate = (
            sum(result.validation_rate for result in step_results) / total_steps
            if total_steps > 0
            else 0.0
        )

        return cls(
            total_steps=total_steps,
            successful_steps=successful_steps,
            failed_steps=failed_steps,
            total_duration=total_duration_secs,
            total_rows_processed=total_rows_processed,
            total_rows_written=total_rows_written,
            avg_validation_rate=avg_validation_rate,
        )

In [None]:
# Module: pipeline_builder.models.dependencies (pipeline_builder)
#
# Dependencies: pipeline_builder.models.base, pipeline_builder_base.errors

"""
Dependency models for the Pipeline Builder.

"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict
# from .errors import PipelineValidationError  # Removed: defined in notebook cells above

# from .base import BaseModel  # Removed: defined in notebook cells above

@dataclass
class SilverDependencyInfo(BaseModel):
    """
    Records what a single Silver step depends on.

    Attributes:
        step_name: Name of the silver step
        source_bronze: Name of the bronze step feeding this silver step
        depends_on_silvers: Names of other silver steps this step depends on
        execution_group: (Deprecated) Legacy field, no longer used for
            ordering, which is determined by topological sort. It is still
            required and still checked for non-negativity by ``validate()``.
    """

    step_name: str
    source_bronze: str
    depends_on_silvers: set[str]
    execution_group: int

    def validate(self) -> None:
        """Check field types and ranges.

        Raises:
            PipelineValidationError: If ``step_name`` or ``source_bronze`` is
                empty or not a string, if ``depends_on_silvers`` is not a
                ``set``, or if ``execution_group`` is negative.
        """
        silver_name = self.step_name
        if not (silver_name and isinstance(silver_name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        bronze_name = self.source_bronze
        if not (bronze_name and isinstance(bronze_name, str)):
            raise PipelineValidationError(
                "Source bronze step name must be a non-empty string"
            )

        upstream_silvers = self.depends_on_silvers
        if not isinstance(upstream_silvers, set):
            raise PipelineValidationError("Depends on silvers must be a set")

        group_index = self.execution_group
        if group_index < 0:
            raise PipelineValidationError("Execution group must be non-negative")

@dataclass
class CrossLayerDependency(BaseModel):
    """
    A directed dependency edge between two steps in different layers.

    Attributes:
        source_step: Name of the step being depended upon
        target_step: Name of the step that has the dependency
        dependency_type: Kind of dependency (data, validation, etc.).
            Defaults to "data".
        is_required: Whether the dependency is required for execution.
            Defaults to True.
    """

    source_step: str
    target_step: str
    dependency_type: str = "data"
    is_required: bool = True

    def validate(self) -> None:
        """Check both endpoints are non-empty strings and are distinct.

        Raises:
            PipelineValidationError: If either endpoint is empty or not a
                string, or if source and target name the same step.
        """
        # Source is checked before target so the first failing endpoint
        # determines the error message.
        endpoint_checks = (
            (self.source_step, "Source step must be a non-empty string"),
            (self.target_step, "Target step must be a non-empty string"),
        )
        for endpoint, message in endpoint_checks:
            if not endpoint or not isinstance(endpoint, str):
                raise PipelineValidationError(message)

        if self.source_step == self.target_step:
            raise PipelineValidationError("Source and target steps cannot be the same")

@dataclass
class UnifiedStepConfig(BaseModel):
    """
    Layer-agnostic description of one pipeline step.

    Attributes:
        step_name: Name of the step
        step_type: Layer of the step; one of "bronze", "silver", "gold"
        dependencies: Names of the steps this step depends on
        config: Step-specific configuration dictionary
    """

    step_name: str
    step_type: str
    dependencies: list[str]
    config: Dict[str, Any]

    def validate(self) -> None:
        """Check the name, the layer and the container types.

        Raises:
            PipelineValidationError: If ``step_name`` is empty or not a
                string, ``step_type`` is not bronze/silver/gold,
                ``dependencies`` is not a list, or ``config`` is not a dict.
        """
        name = self.step_name
        if not (name and isinstance(name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        allowed_layers = ("bronze", "silver", "gold")
        if self.step_type not in allowed_layers:
            raise PipelineValidationError("Step type must be bronze, silver, or gold")

        # Container type checks, in the order they are reported.
        container_checks = (
            (self.dependencies, list, "Dependencies must be a list"),
            (self.config, dict, "Config must be a dictionary"),
        )
        for value, expected_type, message in container_checks:
            if not isinstance(value, expected_type):
                raise PipelineValidationError(message)

@dataclass
class UnifiedExecutionPlan(BaseModel):
    """
    A set of step configurations plus the order in which to run them.

    Attributes:
        steps: Unified step configurations that make up the plan
        execution_order: Step names in the order they should execute
    """

    steps: list[UnifiedStepConfig]
    execution_order: list[str]

    def validate(self) -> None:
        """Check container types and that the order only names known steps.

        Raises:
            PipelineValidationError: If ``steps`` or ``execution_order`` is
                not a list, or if ``execution_order`` mentions a name that
                no entry in ``steps`` carries. Only the first unknown name
                is reported.
        """
        if not isinstance(self.steps, list):
            raise PipelineValidationError("Steps must be a list")
        if not isinstance(self.execution_order, list):
            raise PipelineValidationError("Execution order must be a list")

        known_names = {config.step_name for config in self.steps}

        # A private sentinel distinguishes "nothing missing" from an
        # execution-order entry that happens to be None or otherwise falsy.
        nothing_missing = object()
        step_name = next(
            (name for name in self.execution_order if name not in known_names),
            nothing_missing,
        )
        if step_name is not nothing_missing:
            raise PipelineValidationError(f"Step {step_name} not found in steps")

In [None]:
# Module: pipeline_builder.models.execution (pipeline_builder)
#
# Dependencies: pipeline_builder.models.base, pipeline_builder.models.enums, pipeline_builder.models.exceptions, pipeline_builder.models.pipeline

"""
Execution models for the Pipeline Builder.

This module provides models for tracking pipeline execution state and results,
including execution contexts, step results, stage statistics, and overall
execution results.

Key Components:
    - **ExecutionContext**: Tracks execution state, timing, and metadata
    - **StageStats**: Statistics for individual pipeline stages
    - **StepResult**: Results from individual step execution
    - **ExecutionResult**: Aggregated results from entire pipeline execution

Dependencies:
    - models.base: BaseModel
    - models.enums: ExecutionMode, PipelinePhase
    - models.exceptions: PipelineConfigurationError
    - models.pipeline: PipelineMetrics

Example:
    >>> from pipeline_builder.models.execution import (
    ...     ExecutionContext,
    ...     StepResult,
    ...     ExecutionResult
    ... )
    >>> from pipeline_builder.models.enums import ExecutionMode, PipelinePhase
    >>> from datetime import datetime, timezone
    >>>
    >>> # Create execution context
    >>> context = ExecutionContext(
    ...     mode=ExecutionMode.INITIAL,
    ...     start_time=datetime.now(timezone.utc)
    ... )
    >>>
    >>> # Create step result
    >>> result = StepResult.create_success(
    ...     step_name="bronze_step",
    ...     phase=PipelinePhase.BRONZE,
    ...     start_time=datetime.now(timezone.utc),
    ...     end_time=datetime.now(timezone.utc),
    ...     rows_processed=1000,
    ...     rows_written=950,
    ...     validation_rate=95.0
    ... )
"""

from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, Optional

# from .base import BaseModel  # Removed: defined in notebook cells above
# from .enums import ExecutionMode, PipelinePhase  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError  # Removed: defined in notebook cells above
# from .pipeline import PipelineMetrics  # Removed: defined in notebook cells above

@dataclass
class ExecutionContext(BaseModel):
    """Context for pipeline execution.

    Tracks the state and metadata of a pipeline execution run, including
    timing information, execution mode, and identifiers. Provides both
    primary fields and alias fields (``started_at``, ``ended_at``,
    ``run_mode``) for compatibility with the writer.

    **Validation Rules (enforced by ``validate()``):**
        - `run_id`: Must be non-empty
        - `duration_secs`: Must be non-negative if set

    Attributes:
        mode: Execution mode. Its ``value`` (or lower-cased ``name``) is
            mirrored into ``run_mode``.
        start_time: When execution started. Required field.
        end_time: When execution ended. None while execution is running.
        duration_secs: Total execution duration in seconds. None until
            ``finish()`` is called (unless supplied explicitly).
        run_id: Unique run identifier (UUID4 string). Generated if not
            provided.
        execution_id: Unique identifier for this execution (UUID4 string).
            Generated if not provided.
        pipeline_id: Identifier for the pipeline being executed. Defaults
            to "unknown".
        schema: Target schema for data storage. Defaults to "default".
        started_at: Alias for ``start_time``; copied from it at construction
            if not provided.
        ended_at: Alias for ``end_time``; copied from it at construction if
            not provided and refreshed by ``finish()``.
        run_mode: Execution mode as a string. When left at its default
            ("initial") it is derived from ``mode``.
        config: Pipeline configuration as dictionary. Defaults to empty dict.

    Example:
        >>> from datetime import datetime, timezone
        >>> context = ExecutionContext(
        ...     mode=ExecutionMode.INITIAL,
        ...     start_time=datetime.now(timezone.utc)
        ... )
        >>> print(context.run_id)  # Unique UUID
        >>> context.finish()
        >>> print(context.duration_secs)  # Execution duration
    """

    mode: ExecutionMode
    start_time: datetime
    end_time: Optional[datetime] = None
    duration_secs: Optional[float] = None
    run_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Additional fields for writer compatibility
    execution_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    pipeline_id: str = "unknown"
    schema: str = "default"
    started_at: Optional[datetime] = None
    ended_at: Optional[datetime] = None
    run_mode: str = "initial"
    config: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Initialize aliases and defaults.

        Fills ``started_at`` / ``ended_at`` from ``start_time`` /
        ``end_time`` when they were not supplied, and derives ``run_mode``
        from ``mode`` when ``run_mode`` still holds its default value.
        """
        if self.started_at is None:
            self.started_at = self.start_time
        if self.ended_at is None:
            self.ended_at = self.end_time
        # "initial" doubles as the "not provided" marker, so an explicit
        # run_mode="initial" is indistinguishable from the default and is
        # also replaced by the value derived from mode.
        if self.run_mode == "initial":
            if hasattr(self.mode, "value"):
                self.run_mode = self.mode.value
            elif hasattr(self.mode, "name"):
                self.run_mode = self.mode.name.lower()

    def validate(self) -> None:
        """Validate the execution context.

        Raises:
            ValueError: If run_id is empty or duration_secs is negative.

        Example:
            >>> context = ExecutionContext(
            ...     mode=ExecutionMode.INITIAL,
            ...     start_time=datetime.now(timezone.utc)
            ... )
            >>> context.validate()  # Passes
        """
        if not self.run_id:
            raise ValueError("Run ID cannot be empty")
        if self.duration_secs is not None and self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")

    def finish(self) -> None:
        """Mark execution as finished and calculate duration.

        Sets ``end_time`` and the ``ended_at`` alias to the current UTC
        timestamp, then computes ``duration_secs`` from ``start_time``.
        A timezone-naive ``start_time`` is interpreted as local time (the
        convention ``datetime.astimezone`` applies to naive values), so the
        subtraction against the aware end timestamp cannot raise
        ``TypeError``.

        Example:
            >>> context = ExecutionContext(
            ...     mode=ExecutionMode.INITIAL,
            ...     start_time=datetime.now(timezone.utc)
            ... )
            >>> # ... execution happens ...
            >>> context.finish()
            >>> print(context.duration_secs)  # Execution duration in seconds
            >>> context.ended_at == context.end_time
            True
        """
        finished_at = datetime.now(timezone.utc)
        self.end_time = finished_at
        # Keep the writer-compatibility alias in step with end_time;
        # __post_init__ only copies it once, at construction time.
        self.ended_at = finished_at
        if self.start_time:
            started = self.start_time
            if started.utcoffset() is None:
                # Naive and aware datetimes cannot be subtracted; attach the
                # local timezone so elapsed time can still be computed.
                started = started.astimezone()
            self.duration_secs = (finished_at - started).total_seconds()

    @property
    def is_finished(self) -> bool:
        """Check if execution is finished.

        Returns:
            True if end_time is set, False otherwise.

        Example:
            >>> context = ExecutionContext(...)
            >>> print(context.is_finished)  # False
            >>> context.finish()
            >>> print(context.is_finished)  # True
        """
        return self.end_time is not None

    @property
    def is_running(self) -> bool:
        """Check if execution is currently running.

        Returns:
            True while end_time is None, False once it has been set.

        Example:
            >>> context = ExecutionContext(...)
            >>> print(context.is_running)  # True
            >>> context.finish()
            >>> print(context.is_running)  # False
        """
        return not self.is_finished

@dataclass
class StageStats(BaseModel):
    """Row counts, validation rate and timing for one step of one stage.

    **Validation Rules:**
        - `total_rows` must equal `valid_rows + invalid_rows`
        - `validation_rate` must be between 0 and 100
        - `duration_secs` must be non-negative

    Attributes:
        stage: Stage name (bronze, silver, or gold).
        step: Name of the step these statistics describe.
        total_rows: Total number of rows processed.
        valid_rows: Number of rows that passed validation.
        invalid_rows: Number of rows that failed validation.
        validation_rate: Validation success rate (0-100).
        duration_secs: Processing duration in seconds.
        start_time: When processing started. Optional.
        end_time: When processing ended. Optional.

    Example:
        >>> stats = StageStats(
        ...     stage="bronze",
        ...     step="user_events",
        ...     total_rows=1000,
        ...     valid_rows=950,
        ...     invalid_rows=50,
        ...     validation_rate=95.0,
        ...     duration_secs=10.5
        ... )
        >>> stats.validate()
        >>> stats.error_rate
        5.0
    """

    stage: str
    step: str
    total_rows: int
    valid_rows: int
    invalid_rows: int
    validation_rate: float
    duration_secs: float
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None

    def validate(self) -> None:
        """Check that the statistics are internally consistent.

        Raises:
            PipelineConfigurationError: If the row counts do not add up, the
                validation rate is outside [0, 100], or the duration is
                negative. Checks run in that order and the first failure
                is reported.
        """
        counted_rows = self.valid_rows + self.invalid_rows
        if self.total_rows != counted_rows:
            raise PipelineConfigurationError(
                f"Total rows ({self.total_rows}) must equal valid ({self.valid_rows}) + invalid ({self.invalid_rows})"
            )

        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise PipelineConfigurationError(
                f"Validation rate must be between 0 and 100, got {self.validation_rate}"
            )

        if self.duration_secs < 0:
            raise PipelineConfigurationError(
                f"Duration must be non-negative, got {self.duration_secs}"
            )

    @property
    def is_valid(self) -> bool:
        """Whether the stage meets the built-in validation threshold.

        Returns:
            True if validation_rate is at least 95.0, False otherwise. The
            threshold is fixed here and is not read from any configuration.
        """
        default_threshold = 95.0
        return self.validation_rate >= default_threshold

    @property
    def error_rate(self) -> float:
        """Share of rows that failed validation, as a percentage.

        Returns:
            ``invalid_rows / total_rows * 100``, or 0.0 when total_rows is 0.

        Example:
            >>> stats = StageStats(..., total_rows=1000, invalid_rows=50, ...)
            >>> stats.error_rate
            5.0
        """
        return (
            0.0
            if self.total_rows == 0
            else (self.invalid_rows / self.total_rows) * 100
        )

    @property
    def throughput_rows_per_sec(self) -> float:
        """Processing speed in rows per second.

        Returns:
            ``total_rows / duration_secs``, or 0.0 when duration_secs is 0.

        Example:
            >>> stats = StageStats(..., total_rows=10000, duration_secs=10.0, ...)
            >>> stats.throughput_rows_per_sec
            1000.0
        """
        return (
            0.0
            if self.duration_secs == 0
            else self.total_rows / self.duration_secs
        )

@dataclass
class StepResult(BaseModel):
    """Outcome record for a single executed pipeline step.

    Bundles the success flag, the start/end timestamps with the derived
    duration, the row counters and the validation rate of one step run.
    Instances are normally built through :meth:`create_success` or
    :meth:`create_failure`, both of which derive ``duration_secs`` from
    the two timestamps.

    **Checks performed by ``validate()``:**
        - `step_name`: must be non-empty (truthy)
        - `duration_secs`: must not be negative
        - `rows_processed`: must not be negative
        - `rows_written`: must not be negative
        - `validation_rate`: must lie within 0..100 inclusive

    Attributes:
        step_name: Name of the executed step.
        phase: PipelinePhase member of the step (for example
            ``PipelinePhase.BRONZE``).
        success: True when the step completed, False when it failed.
        start_time: Timestamp at which execution began.
        end_time: Timestamp at which execution finished.
        duration_secs: Elapsed seconds; the factory methods compute it as
            ``(end_time - start_time).total_seconds()``.
        rows_processed: Number of rows the step processed.
        rows_written: Number of rows written to the target table.
        validation_rate: Validation success rate as a percentage (0-100).
        error_message: Failure description; None by default and always
            None for results built by ``create_success``.
        step_type: Optional step type as a plain string.
        table_fqn: Optional fully qualified target table name
            (e.g., "schema.table_name").
        write_mode: Optional write mode string (e.g., "overwrite",
            "append").
        input_rows: Optional count of input rows.

    Example:
        >>> from pipeline_builder.models.execution import StepResult
        >>> from pipeline_builder.models.enums import PipelinePhase
        >>> from datetime import datetime, timezone
        >>>
        >>> result = StepResult.create_success(
        ...     step_name="bronze_step",
        ...     phase=PipelinePhase.BRONZE,
        ...     start_time=datetime.now(timezone.utc),
        ...     end_time=datetime.now(timezone.utc),
        ...     rows_processed=1000,
        ...     rows_written=950,
        ...     validation_rate=95.0
        ... )
        >>> print(f"Success: {result.success}")  # True
        >>> print(f"Throughput: {result.throughput_rows_per_sec} rows/sec")
    """

    # Required fields (no defaults) -- order defines the positional signature.
    step_name: str
    phase: PipelinePhase
    success: bool
    start_time: datetime
    end_time: datetime
    duration_secs: float
    rows_processed: int
    rows_written: int
    validation_rate: float
    # Optional fields.
    error_message: Optional[str] = None
    step_type: Optional[str] = None
    table_fqn: Optional[str] = None
    write_mode: Optional[str] = None
    input_rows: Optional[int] = None

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def validate(self) -> None:
        """Check the numeric ranges and the step name of this result.

        The rules are evaluated one after another and evaluation stops at
        the first violated rule, so at most one problem is reported.

        Raises:
            ValueError: If the step name is empty, a duration or row
                counter is negative, or the validation rate falls outside
                0..100.

        Example:
            >>> result = StepResult.create_success(...)
            >>> result.validate()  # Passes
        """
        # (predicate, message) pairs; the predicates are lambdas so that a
        # later rule is never evaluated once an earlier one has failed.
        rules = (
            (lambda: not self.step_name, "Step name cannot be empty"),
            (lambda: self.duration_secs < 0, "Duration cannot be negative"),
            (
                lambda: self.rows_processed < 0,
                "Rows processed cannot be negative",
            ),
            (lambda: self.rows_written < 0, "Rows written cannot be negative"),
            (
                lambda: not 0 <= self.validation_rate <= 100,
                "Validation rate must be between 0 and 100",
            ),
        )
        for violated, message in rules:
            if violated():
                raise ValueError(message)

    # ------------------------------------------------------------------
    # Factories
    # ------------------------------------------------------------------

    @classmethod
    def create_success(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        rows_processed: int,
        rows_written: int,
        validation_rate: float,
        step_type: Optional[str] = None,
        table_fqn: Optional[str] = None,
        write_mode: Optional[str] = None,
        input_rows: Optional[int] = None,
    ) -> StepResult:
        """Build the result of a step that completed successfully.

        Sets ``success=True`` and ``error_message=None`` and derives
        ``duration_secs`` from the two timestamps. No range checking is
        done here; ``end_time`` earlier than ``start_time`` produces a
        negative duration.

        Args:
            step_name: Name of the executed step.
            phase: Pipeline phase of the step.
            start_time: When execution began.
            end_time: When execution finished.
            rows_processed: Number of rows processed.
            rows_written: Number of rows written to the table.
            validation_rate: Validation success rate (0-100).
            step_type: Optional step type string.
            table_fqn: Optional fully qualified table name.
            write_mode: Optional write mode string.
            input_rows: Optional number of input rows.

        Returns:
            A StepResult flagged as successful.

        Example:
            >>> from datetime import datetime, timezone
            >>> result = StepResult.create_success(
            ...     step_name="bronze_step",
            ...     phase=PipelinePhase.BRONZE,
            ...     start_time=datetime.now(timezone.utc),
            ...     end_time=datetime.now(timezone.utc),
            ...     rows_processed=1000,
            ...     rows_written=950,
            ...     validation_rate=95.0
            ... )
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=True,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=rows_processed,
            rows_written=rows_written,
            validation_rate=validation_rate,
            error_message=None,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

    @classmethod
    def create_failure(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        error_message: str,
        step_type: Optional[str] = None,
        table_fqn: Optional[str] = None,
        write_mode: Optional[str] = None,
        input_rows: Optional[int] = None,
    ) -> StepResult:
        """Build the result of a step that failed.

        Sets ``success=False``, zeroes ``rows_processed``,
        ``rows_written`` and ``validation_rate``, stores the error message
        and derives ``duration_secs`` from the two timestamps.

        Args:
            step_name: Name of the executed step.
            phase: Pipeline phase of the step.
            start_time: When execution began.
            end_time: When execution finished.
            error_message: Description of the failure.
            step_type: Optional step type string.
            table_fqn: Optional fully qualified table name.
            write_mode: Optional write mode string.
            input_rows: Optional number of input rows.

        Returns:
            A StepResult flagged as failed, with zeroed metrics.

        Example:
            >>> from datetime import datetime, timezone
            >>> result = StepResult.create_failure(
            ...     step_name="bronze_step",
            ...     phase=PipelinePhase.BRONZE,
            ...     start_time=datetime.now(timezone.utc),
            ...     end_time=datetime.now(timezone.utc),
            ...     error_message="Validation failed: threshold not met"
            ... )
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=False,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=0,
            rows_written=0,
            validation_rate=0.0,
            error_message=error_message,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

    # ------------------------------------------------------------------
    # Derived values
    # ------------------------------------------------------------------

    @property
    def is_valid(self) -> bool:
        """Whether the step succeeded with a validation rate of 95.0+.

        Returns:
            The value of ``success and validation_rate >= 95.0``.

        Example:
            >>> result = StepResult(..., success=True, validation_rate=96.0)
            >>> print(result.is_valid)  # True
        """
        minimum_rate = 95.0
        return self.success and self.validation_rate >= minimum_rate

    @property
    def is_high_quality(self) -> bool:
        """Whether the step succeeded with a validation rate of 98.0+.

        Returns:
            The value of ``success and validation_rate >= 98.0``.

        Example:
            >>> result = StepResult(..., success=True, validation_rate=99.0)
            >>> print(result.is_high_quality)  # True
        """
        minimum_rate = 98.0
        return self.success and self.validation_rate >= minimum_rate

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second of step duration.

        Returns:
            ``rows_processed / duration_secs``, or 0.0 when
            ``duration_secs`` is 0.

        Example:
            >>> result = StepResult(
            ...     rows_processed=10000,
            ...     duration_secs=10.0,
            ...     ...
            ... )
            >>> print(f"Throughput: {result.throughput_rows_per_sec} rows/sec")  # 1000.0
        """
        elapsed = self.duration_secs
        # A zero duration would divide by zero; report no throughput.
        return 0.0 if elapsed == 0 else self.rows_processed / elapsed

    @property
    def error_rate(self) -> float:
        """Complement of the validation rate, as a percentage.

        Returns:
            ``100.0 - validation_rate``, or 0.0 when ``rows_processed``
            is 0.

        Example:
            >>> result = StepResult(..., rows_processed=1000, validation_rate=95.0)
            >>> print(f"Error rate: {result.error_rate}%")  # 5.0%
        """
        # With nothing processed there is nothing that could have failed.
        return 0.0 if self.rows_processed == 0 else 100.0 - self.validation_rate

@dataclass
class ExecutionResult(BaseModel):
    """Aggregated outcome of one pipeline execution.

    Combines the execution context, the per-step results, the metrics
    derived from those results and an overall success flag.

    **Checks performed by ``validate()``:**
        - `context`: must be an ExecutionContext instance
        - `step_results`: must be a list
        - `metrics`: must be a PipelineMetrics instance
        - `success`: must be a bool

    Attributes:
        context: ExecutionContext of the run.
        step_results: StepResult objects for the executed steps.
        metrics: PipelineMetrics for the run;
            ``from_context_and_results`` builds it with
            ``PipelineMetrics.from_step_results``.
        success: Overall success flag; ``from_context_and_results`` sets
            it to True only if every step result reports success.

    Example:
        >>> from pipeline_builder.models.execution import ExecutionResult
        >>> from pipeline_builder.models.enums import ExecutionMode
        >>>
        >>> context = ExecutionContext(mode=ExecutionMode.INITIAL, ...)
        >>> step_results = [step_result1, step_result2, ...]
        >>> result = ExecutionResult.from_context_and_results(context, step_results)
        >>> print(f"Pipeline success: {result.success}")
        >>> print(f"Total rows: {result.metrics.total_rows_processed}")
    """

    context: ExecutionContext
    step_results: list[StepResult]
    metrics: PipelineMetrics
    success: bool

    def validate(self) -> None:
        """Check that every field holds a value of the expected type.

        Fields are checked in declaration order and the first mismatch
        is reported.

        Raises:
            PipelineConfigurationError: If a field has the wrong type.

        Example:
            >>> result = ExecutionResult(...)
            >>> result.validate()  # Passes
        """
        expectations = (
            (
                self.context,
                ExecutionContext,
                "Context must be an ExecutionContext instance",
            ),
            (self.step_results, list, "Step results must be a list"),
            (
                self.metrics,
                PipelineMetrics,
                "Metrics must be a PipelineMetrics instance",
            ),
            (self.success, bool, "Success must be a boolean"),
        )
        for value, expected_type, message in expectations:
            if not isinstance(value, expected_type):
                raise PipelineConfigurationError(message)

    @classmethod
    def from_context_and_results(
        cls, context: ExecutionContext, step_results: list[StepResult]
    ) -> ExecutionResult:
        """Assemble an ExecutionResult from a context and step results.

        Metrics are produced by ``PipelineMetrics.from_step_results`` and
        the overall success flag is True only when every step result has
        a truthy ``success`` (an empty list therefore counts as success).

        Args:
            context: ExecutionContext of the pipeline run.
            step_results: StepResult objects of the executed steps.

        Returns:
            ExecutionResult carrying the derived metrics and success flag.

        Example:
            >>> context = ExecutionContext(mode=ExecutionMode.INITIAL, ...)
            >>> step_results = [
            ...     StepResult.create_success(...),
            ...     StepResult.create_success(...)
            ... ]
            >>> result = ExecutionResult.from_context_and_results(context, step_results)
            >>> print(f"Success: {result.success}")  # True
            >>> print(f"Total steps: {result.metrics.total_steps}")  # 2
        """
        # Metrics are derived before the success scan, as in the
        # aggregation order callers have always observed.
        aggregated = PipelineMetrics.from_step_results(step_results)

        every_step_ok = True
        for step in step_results:
            if not step.success:
                every_step_ok = False
                break

        return cls(
            context=context,
            step_results=step_results,
            metrics=aggregated,
            success=every_step_ok,
        )

In [None]:
# Module: pipeline_builder.models.factory (pipeline_builder)
#
# Dependencies: pipeline_builder.models.base, pipeline_builder.models.enums, pipeline_builder.models.exceptions, pipeline_builder.models.execution, pipeline_builder.models.pipeline, pipeline_builder.models.steps

"""
Factory functions for creating and managing pipeline models.

This module provides factory functions for creating and validating pipeline
configuration objects, execution contexts, and step configurations. These
functions simplify object creation and ensure consistent initialization.

Key Functions:
    - **create_pipeline_config**: Create PipelineConfig with custom thresholds
    - **create_execution_context**: Create ExecutionContext for pipeline runs
    - **validate_pipeline_config**: Validate PipelineConfig instances
    - **validate_step_config**: Validate step configurations
    - **serialize_pipeline_config**: Serialize config to JSON
    - **deserialize_pipeline_config**: Deserialize config from JSON

Dependencies:
    - models.base: ValidationThresholds
    - models.enums: ExecutionMode
    - models.exceptions: PipelineConfigurationError, PipelineExecutionError
    - models.execution: ExecutionContext
    - models.pipeline: PipelineConfig
    - models.steps: BronzeStep, SilverStep, GoldStep

Example:
    >>> from pipeline_builder.models.factory import (
    ...     create_pipeline_config,
    ...     create_execution_context,
    ...     validate_pipeline_config
    ... )
    >>> from pipeline_builder.models.enums import ExecutionMode
    >>>
    >>> # Create pipeline configuration
    >>> config = create_pipeline_config(
    ...     schema="analytics",
    ...     bronze_threshold=95.0,
    ...     silver_threshold=98.0,
    ...     gold_threshold=99.0
    ... )
    >>> validate_pipeline_config(config)
    >>>
    >>> # Create execution context
    >>> context = create_execution_context(ExecutionMode.INITIAL)
"""

from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import Union

# from .base import ValidationThresholds  # Removed: defined in notebook cells above
# from .enums import ExecutionMode  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError, PipelineExecutionError  # Removed: defined in notebook cells above
# from .execution import ExecutionContext  # Removed: defined in notebook cells above
# from .pipeline import PipelineConfig  # Removed: defined in notebook cells above
# from .steps import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above

def create_pipeline_config(
    schema: str,
    bronze_threshold: float = 95.0,
    silver_threshold: float = 98.0,
    gold_threshold: float = 99.0,
    verbose: bool = True,
) -> PipelineConfig:
    """Build a PipelineConfig from a schema name and per-layer thresholds.

    Convenience wrapper: packs the three thresholds into a
    ValidationThresholds object and hands it, together with ``schema``
    and ``verbose``, to the PipelineConfig constructor.

    Args:
        schema: Database schema name for the pipeline tables.
        bronze_threshold: Validation threshold for the Bronze layer.
            Defaults to 95.0.
        silver_threshold: Validation threshold for the Silver layer.
            Defaults to 98.0.
        gold_threshold: Validation threshold for the Gold layer.
            Defaults to 99.0.
        verbose: Verbosity flag forwarded to PipelineConfig. Defaults to
            True.

    Returns:
        The newly constructed PipelineConfig.

    Raises:
        Exception: Anything raised by the ValidationThresholds or
            PipelineConfig constructors propagates unchanged; this
            function performs no validation of its own.

    Example:
        >>> config = create_pipeline_config(
        ...     schema="analytics",
        ...     bronze_threshold=90.0,
        ...     silver_threshold=95.0,
        ...     gold_threshold=99.0,
        ...     verbose=False
        ... )
        >>> print(config.schema)  # "analytics"
    """
    layer_thresholds = ValidationThresholds(
        bronze=bronze_threshold,
        silver=silver_threshold,
        gold=gold_threshold,
    )
    pipeline_config = PipelineConfig(
        schema=schema,
        thresholds=layer_thresholds,
        verbose=verbose,
    )
    return pipeline_config

def create_execution_context(mode: ExecutionMode) -> ExecutionContext:
    """Build an ExecutionContext stamped with the current UTC time.

    Only ``mode`` and ``start_time`` are passed to the ExecutionContext
    constructor; every other attribute of the context is whatever that
    constructor supplies by default.

    Args:
        mode: Execution mode for the run (an ExecutionMode member such
            as ``ExecutionMode.INITIAL``).

    Returns:
        ExecutionContext whose ``start_time`` is a timezone-aware UTC
        timestamp taken at call time.

    Example:
        >>> from pipeline_builder.models.enums import ExecutionMode
        >>> context = create_execution_context(ExecutionMode.INITIAL)
        >>> print(context.mode)  # ExecutionMode.INITIAL
    """
    started_at = datetime.now(timezone.utc)
    return ExecutionContext(mode=mode, start_time=started_at)

def validate_pipeline_config(config: PipelineConfig) -> None:
    """Run ``config.validate()`` and normalise the error type.

    A PipelineExecutionError raised by the config's own validation is
    re-raised as PipelineConfigurationError (chained to the original),
    with the message prefixed by "Invalid pipeline configuration: ".
    Other exception types are not intercepted.

    Args:
        config: PipelineConfig instance to validate.

    Raises:
        PipelineConfigurationError: If ``config.validate()`` raised a
            PipelineExecutionError.

    Example:
        >>> config = create_pipeline_config(schema="test")
        >>> validate_pipeline_config(config)  # Passes
    """
    try:
        config.validate()
    except PipelineExecutionError as exc:
        message = f"Invalid pipeline configuration: {exc}"
        raise PipelineConfigurationError(message) from exc

def validate_step_config(step: Union[BronzeStep, SilverStep, GoldStep]) -> None:
    """Run ``step.validate()`` and normalise the error type.

    A PipelineExecutionError raised by the step's own validation is
    re-raised as PipelineConfigurationError (chained to the original),
    with the message prefixed by "Invalid step configuration: ". Other
    exception types are not intercepted.

    Args:
        step: BronzeStep, SilverStep or GoldStep instance to validate.

    Raises:
        PipelineConfigurationError: If ``step.validate()`` raised a
            PipelineExecutionError.

    Example:
        >>> from pipeline_builder.models.steps import BronzeStep
        >>> step = BronzeStep(name="test", rules={"id": [F.col("id").isNotNull()]})
        >>> validate_step_config(step)  # Passes
    """
    try:
        step.validate()
    except PipelineExecutionError as exc:
        message = f"Invalid step configuration: {exc}"
        raise PipelineConfigurationError(message) from exc

def serialize_pipeline_config(config: PipelineConfig) -> str:
    """Return the JSON text for a pipeline configuration.

    Thin wrapper that delegates to the config's own ``to_json`` method
    and returns its result untouched.

    Args:
        config: PipelineConfig instance to serialize.

    Returns:
        Whatever ``config.to_json()`` returns (a JSON string).

    Example:
        >>> config = create_pipeline_config(schema="analytics")
        >>> json_str = serialize_pipeline_config(config)
        >>> print(json_str)  # {"schema": "analytics", ...}
    """
    serialized = config.to_json()
    return serialized

def deserialize_pipeline_config(json_str: str) -> PipelineConfig:
    """Rebuild a PipelineConfig from its JSON text.

    The JSON document must be an object with a ``schema`` entry and a
    ``thresholds`` object holding ``bronze``, ``silver`` and ``gold``.
    ``verbose`` is optional and falls back to True when absent.

    Args:
        json_str: JSON string representation of a PipelineConfig.

    Returns:
        PipelineConfig constructed from the decoded values.

    Raises:
        json.JSONDecodeError: If ``json_str`` is not valid JSON.
        KeyError: If ``schema``, ``thresholds`` or one of the three
            threshold entries is missing.

    Example:
        >>> config = create_pipeline_config(schema="analytics")
        >>> json_str = serialize_pipeline_config(config)
        >>> restored = deserialize_pipeline_config(json_str)
        >>> print(restored.schema)  # "analytics"
    """
    payload = json.loads(json_str)

    # Lookups stay in the order schema -> thresholds -> verbose so the
    # first missing key reported is the same one as before.
    schema_name = payload["schema"]
    restored_thresholds = ValidationThresholds(
        bronze=payload["thresholds"]["bronze"],
        silver=payload["thresholds"]["silver"],
        gold=payload["thresholds"]["gold"],
    )
    verbose_flag = payload.get("verbose", True)

    return PipelineConfig(
        schema=schema_name,
        thresholds=restored_thresholds,
        verbose=verbose_flag,
    )

In [None]:
# Module: pipeline_builder.writer.models (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models.execution, pipeline_builder.pipeline.models, pipeline_builder_base.models

"""
Writer-specific models and type definitions.

This module contains all the dataclasses, TypedDict definitions, and type aliases
used by the writer module. It integrates with existing framework models while
providing writer-specific functionality.

"""

from __future__ import annotations

import os
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Literal, Optional, TypedDict, cast
# from .models import ExecutionContext, ExecutionResult, StepResult  # Removed: defined in notebook cells above

# from ..compat import types  # Removed: defined in notebook cells above
from pyspark.sql import types  # types from pyspark (not from compat)
# from ..pipeline.models import PipelineReport  # Removed: defined in notebook cells above

# Module-level aliases for the pyspark.sql type classes used by this module.
BooleanType = types.BooleanType
FloatType = types.FloatType
IntegerType = types.IntegerType
StringType = types.StringType
# The former SPARKFORGE_ENGINE ("mock" vs. real engine) switch assigned the
# very same class in both branches, so the alias is bound unconditionally.
StructField = types.StructField
StructType = types.StructType
TimestampType = types.TimestampType

# ============================================================================
# Enums
# ============================================================================

class WriteMode(Enum):
    """Write mode for log operations.

    Each member's value is the lowercase string form of its name.
    WriterConfig uses ``WriteMode.APPEND`` as its default ``write_mode``.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"
    MERGE = "merge"
    IGNORE = "ignore"

class LogLevel(Enum):
    """Log level for writer operations.

    Each member's value is identical to its name. The names match the
    standard ``logging`` level names.
    NOTE(review): presumably mapped onto ``logging`` levels by the
    writer -- confirm against the code that consumes it.
    """

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

# ============================================================================
# TypedDict Definitions
# ============================================================================

class LogRow(TypedDict):
    """
    Typed dictionary describing one row of the pipeline log table.

    All keys are required (the TypedDict is total); keys annotated as
    ``Optional`` must still be present but may hold ``None``.

    This replaces the previous MinimalLogRow with proper integration
    with framework models and enhanced type safety.
    """

    # Run-level information
    run_id: str
    run_mode: Literal["initial", "incremental", "full_refresh", "validation_only"]
    run_started_at: Optional[datetime]
    run_ended_at: Optional[datetime]

    # Execution context
    execution_id: str
    pipeline_id: str
    schema: str

    # Step-level information
    # "pipeline" is accepted as a phase in addition to the three layers.
    phase: Literal["bronze", "silver", "gold", "pipeline"]
    step_name: str
    step_type: str

    # Timing information
    start_time: Optional[datetime]
    end_time: Optional[datetime]
    duration_secs: float

    # Table information
    table_fqn: Optional[str]
    # NOTE(review): only "overwrite"/"append" are admitted here although the
    # WriteMode enum also defines "merge" and "ignore" -- confirm intended.
    write_mode: Optional[Literal["overwrite", "append"]]

    # Data metrics
    input_rows: Optional[int]
    output_rows: Optional[int]
    rows_written: Optional[int]
    rows_processed: int
    table_total_rows: Optional[int]  # Total rows in table after this write

    # Validation metrics
    valid_rows: int
    invalid_rows: int
    validation_rate: float

    # Execution status
    success: bool
    error_message: Optional[str]

    # Performance metrics
    memory_usage_mb: Optional[float]
    cpu_usage_percent: Optional[float]

    # Metadata
    # Free-form key/value payload; no schema is imposed at this level.
    metadata: Dict[str, Any]

class WriterMetrics(TypedDict):
    """Metrics for writer operations.

    All keys are required. Counters are integers; durations are in
    seconds and the memory peak is in megabytes, as the key suffixes
    (``_secs``, ``_mb``) indicate.
    """

    # Write counters
    total_writes: int
    successful_writes: int
    failed_writes: int
    # Timing
    total_duration_secs: float
    avg_write_duration_secs: float
    # Volume and resource usage
    total_rows_written: int
    memory_usage_peak_mb: float

# ============================================================================
# Configuration Models
# ============================================================================

@dataclass
class WriterConfig:
    """
    Settings bundle for the LogWriter.

    Groups the target table identity, optional table-naming templates,
    partitioning, schema handling, performance tuning, feature flags,
    logging, retry behaviour and data-quality thresholds. Nothing in this
    class calls ``validate()`` on construction; call it explicitly.
    """

    # --- Target table ---
    table_schema: str
    table_name: str
    write_mode: WriteMode = WriteMode.APPEND

    # --- Naming templates (str.format style) ---
    table_name_pattern: Optional[str] = None  # e.g., "{schema}.{pipeline_id}_{timestamp}"
    table_suffix_pattern: Optional[str] = None  # e.g., "_{run_mode}_{date}"

    # --- Partitioning and optimization ---
    partition_columns: Optional[list[str]] = None
    partition_count: Optional[int] = None
    compression: str = "snappy"

    # --- Schema handling ---
    enable_schema_evolution: bool = True
    schema_validation_mode: str = "strict"  # strict, lenient, ignore
    auto_optimize_schema: bool = True

    # --- Performance tuning ---
    batch_size: int = 1000
    max_file_size_mb: int = 128
    enable_optimization: bool = True
    memory_fraction: float = 0.6

    # --- Feature flags ---
    enable_performance_monitoring: bool = True
    enable_data_quality_checks: bool = True
    enable_validation: bool = True
    enable_metrics_collection: bool = True
    enable_audit_trail: bool = True
    enable_backup_before_write: bool = False

    # --- Logging ---
    log_level: LogLevel = LogLevel.INFO
    enable_detailed_logging: bool = False
    log_performance_metrics: bool = True
    log_data_quality_results: bool = True

    # --- Error handling / retries ---
    max_retries: int = 3
    retry_delay_secs: float = 1.0
    fail_fast: bool = False
    retry_exponential_backoff: bool = True

    # --- Data quality thresholds ---
    min_validation_rate: float = 95.0
    max_invalid_rows_percent: float = 5.0
    enable_anomaly_detection: bool = False

    def validate(self) -> None:
        """
        Check the configuration values against their allowed ranges.

        Rules are evaluated in a fixed order and evaluation stops at the
        first violated rule.

        Raises:
            ValueError: If a table identifier is empty or whitespace-only,
                a size/count/delay is out of range, the memory fraction is
                not in (0, 1], the schema validation mode is unknown, or a
                percentage lies outside 0..100.
        """
        # (predicate, message) pairs; lambdas keep later rules unevaluated
        # once an earlier rule has already failed.
        rules = (
            (
                lambda: not self.table_schema or not self.table_schema.strip(),
                "Table schema cannot be empty or whitespace-only",
            ),
            (
                lambda: not self.table_name or not self.table_name.strip(),
                "Table name cannot be empty or whitespace-only",
            ),
            (lambda: self.batch_size <= 0, "Batch size must be positive"),
            (lambda: self.max_file_size_mb <= 0, "Max file size must be positive"),
            (lambda: self.max_retries < 0, "Max retries cannot be negative"),
            (lambda: self.retry_delay_secs < 0, "Retry delay cannot be negative"),
            (
                lambda: not 0 < self.memory_fraction <= 1,
                "Memory fraction must be between 0 and 1",
            ),
            (
                lambda: self.schema_validation_mode
                not in ("strict", "lenient", "ignore"),
                "Schema validation mode must be 'strict', 'lenient', or 'ignore'",
            ),
            (
                lambda: not 0 <= self.min_validation_rate <= 100,
                "Min validation rate must be between 0 and 100",
            ),
            (
                lambda: not 0 <= self.max_invalid_rows_percent <= 100,
                "Max invalid rows percent must be between 0 and 100",
            ),
        )
        for violated, message in rules:
            if violated():
                raise ValueError(message)

    def generate_table_name(
        self,
        pipeline_id: Optional[str] = None,
        run_mode: Optional[str] = None,
        timestamp: Optional[str] = None,
    ) -> str:
        """
        Derive the table name from the configured naming templates.

        With no template configured the plain ``table_name`` is returned.
        A ``table_suffix_pattern`` is formatted with ``run_mode``, ``date``
        and ``timestamp`` (the latter two both carry the ``timestamp``
        argument) and appended to the table name. A ``table_name_pattern``
        is then formatted with ``schema``, ``table_name`` (already
        including any suffix), ``pipeline_id``, ``run_mode``, ``date`` and
        ``timestamp``, and its result is returned instead.

        Args:
            pipeline_id: Pipeline identifier
            run_mode: Run mode (initial, incremental, etc.)
            timestamp: Timestamp for naming

        Returns:
            Generated table name

        Raises:
            ValueError: If a configured template is in use and one of the
                arguments it requires is None. The check applies to every
                required argument, whether or not the template actually
                references it.
        """
        resolved_name = self.table_name

        suffix_template = self.table_suffix_pattern
        if suffix_template:
            # None is rejected explicitly rather than silently formatted.
            for supplied, complaint in (
                (run_mode, "run_mode cannot be None when using table_suffix_pattern"),
                (timestamp, "timestamp cannot be None when using table_suffix_pattern"),
            ):
                if supplied is None:
                    raise ValueError(complaint)

            rendered_suffix = suffix_template.format(
                run_mode=run_mode,
                date=timestamp,
                timestamp=timestamp,
            )
            resolved_name = f"{resolved_name}{rendered_suffix}"

        name_template = self.table_name_pattern
        if not name_template:
            return resolved_name

        # None is rejected explicitly rather than silently formatted.
        for supplied, complaint in (
            (pipeline_id, "pipeline_id cannot be None when using table_name_pattern"),
            (run_mode, "run_mode cannot be None when using table_name_pattern"),
            (timestamp, "timestamp cannot be None when using table_name_pattern"),
        ):
            if supplied is None:
                raise ValueError(complaint)

        return name_template.format(
            schema=self.table_schema,
            table_name=resolved_name,
            pipeline_id=pipeline_id,
            run_mode=run_mode,
            date=timestamp,
            timestamp=timestamp,
        )

# ============================================================================
# Spark Schema Definitions
# ============================================================================

# from ..compat import types  # noqa: E402  # Removed: defined in notebook cells above
from pyspark.sql import types  # types from pyspark (not from compat)

def create_log_schema() -> Any:
    """
    Build the Spark schema used by log tables.

    The columns are declared once in a (name, type, nullable) table and then
    turned into ``StructField`` objects in that same order.

    Returns:
        StructType: Spark schema for log tables with proper types
    """
    # (column name, Spark data type class, nullable)
    column_layout = [
        # Run-level fields
        ("run_id", types.StringType, False),
        ("run_mode", types.StringType, False),
        ("run_started_at", types.TimestampType, True),
        ("run_ended_at", types.TimestampType, True),
        # Execution context
        ("execution_id", types.StringType, False),
        ("pipeline_id", types.StringType, False),
        ("schema", types.StringType, False),
        # Step-level fields
        ("phase", types.StringType, False),
        ("step_name", types.StringType, False),
        ("step_type", types.StringType, False),
        # Timing fields
        ("start_time", types.TimestampType, True),
        ("end_time", types.TimestampType, True),
        ("duration_secs", types.FloatType, False),
        # Table fields
        ("table_fqn", types.StringType, True),
        ("write_mode", types.StringType, True),
        # Data metrics
        ("input_rows", types.IntegerType, True),
        ("output_rows", types.IntegerType, True),
        ("rows_written", types.IntegerType, True),
        ("rows_processed", types.IntegerType, False),
        ("table_total_rows", types.IntegerType, True),
        # Validation metrics
        ("valid_rows", types.IntegerType, False),
        ("invalid_rows", types.IntegerType, False),
        ("validation_rate", types.FloatType, False),
        # Execution status
        ("success", types.BooleanType, False),
        ("error_message", types.StringType, True),
        # Performance metrics
        ("memory_usage_mb", types.FloatType, True),
        ("cpu_usage_percent", types.FloatType, True),
        # Metadata (stored as JSON string)
        ("metadata", types.StringType, True),
        # Timestamp fields for tracking
        ("created_at", types.StringType, True),
        ("updated_at", types.StringType, True),
    ]
    return types.StructType(
        [
            types.StructField(column, data_type(), nullable)
            for column, data_type, nullable in column_layout
        ]
    )

# ============================================================================
# Factory Functions
# ============================================================================

def create_log_row_from_step_result(
    step_result: StepResult,
    execution_context: ExecutionContext,
    run_id: str,
    run_mode: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> LogRow:
    """
    Create a LogRow from a StepResult and ExecutionContext.

    The valid/invalid row split is reconstructed from
    ``step_result.rows_processed`` and ``step_result.validation_rate``:
    ``valid_rows`` is the nearest whole number of rows for that percentage and
    ``invalid_rows`` is the remainder, so the two always add up to
    ``rows_processed`` (the consistency rule checked by ``validate_log_row``).

    Args:
        step_result: The step result to convert
        execution_context: The execution context
        run_id: Unique run identifier
        run_mode: Mode of the run (initial, incremental, etc.)
        metadata: Additional metadata; an empty dict is stored when omitted

    Returns:
        LogRow: Log row with all fields populated (``table_total_rows`` is
        always ``None`` and ``output_rows`` mirrors ``rows_processed``)
    """
    rows_processed = step_result.rows_processed
    # round() rather than int(): the rate is a float percentage, so the product
    # can land just below a whole number and truncation would lose a row.
    valid_rows = round(rows_processed * step_result.validation_rate / 100)
    # Derive the complement instead of truncating a second product; two
    # independent truncations do not reliably sum to rows_processed.
    invalid_rows = rows_processed - valid_rows

    return LogRow(
        # Run-level information
        run_id=run_id,
        run_mode=run_mode,  # type: ignore[typeddict-item]
        run_started_at=execution_context.started_at,
        run_ended_at=execution_context.ended_at,
        # Execution context
        execution_id=execution_context.execution_id,
        pipeline_id=execution_context.pipeline_id,
        schema=execution_context.schema,
        # Step-level information
        phase=step_result.phase.value,
        step_name=step_result.step_name,
        step_type=(
            step_result.step_type if step_result.step_type is not None else "unknown"
        ),
        # Timing information
        start_time=step_result.start_time,
        end_time=step_result.end_time,
        duration_secs=step_result.duration_secs,
        # Table information
        table_fqn=step_result.table_fqn,
        write_mode=cast(
            Optional[Literal["overwrite", "append"]], step_result.write_mode
        ),
        # Data metrics
        input_rows=step_result.input_rows,
        output_rows=rows_processed,
        rows_written=step_result.rows_written,
        rows_processed=rows_processed,
        table_total_rows=None,
        # Validation metrics
        valid_rows=valid_rows,
        invalid_rows=invalid_rows,
        validation_rate=step_result.validation_rate,
        # Execution status
        success=step_result.success,
        error_message=step_result.error_message,
        # Performance metrics (optional attributes on the step result)
        memory_usage_mb=getattr(step_result, "memory_usage_mb", None),
        cpu_usage_percent=getattr(step_result, "cpu_usage_percent", None),
        # Metadata
        metadata=metadata or {},
    )

def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> list[LogRow]:
    """
    Convert every step result of an ExecutionResult into a LogRow.

    Args:
        execution_result: The execution result to convert; its ``context`` is
            used as the execution context of every row
        run_id: Unique run identifier
        run_mode: Mode of the run
        metadata: Additional metadata, passed unchanged to each row

    Returns:
        List[LogRow]: One log row per entry of ``execution_result.step_results``,
        in the same order
    """
    return [
        create_log_row_from_step_result(
            step_result=single_step,
            execution_context=execution_result.context,
            run_id=run_id,
            run_mode=run_mode,
            metadata=metadata,
        )
        for single_step in execution_result.step_results
    ]

def create_log_rows_from_pipeline_report(
    pipeline_report: PipelineReport,
    run_id: str,
    run_mode: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> list[LogRow]:
    """
    Turn a PipelineReport into log rows.

    The first row summarises the whole pipeline execution; it is followed by
    one simplified row per step name found in the report's bronze, silver and
    gold results.

    Args:
        pipeline_report: The pipeline report to convert
        run_id: Unique run identifier
        run_mode: Mode of the run
        metadata: Additional metadata

    Returns:
        List[LogRow]: The pipeline-level row followed by one row per step
    """

    def _report_row(phase, step_name, step_type, duration_secs, success, error_message):
        # Run/pipeline identifiers and timestamps come from the report; the
        # row-count metrics are fixed placeholders because the report does not
        # expose per-row numbers here.
        entry: LogRow = {
            "run_id": run_id,
            "run_mode": run_mode,  # type: ignore[typeddict-item]
            "run_started_at": pipeline_report.start_time,
            "run_ended_at": pipeline_report.end_time,
            "execution_id": pipeline_report.execution_id,
            "pipeline_id": pipeline_report.pipeline_id,
            "schema": "default",  # PipelineReport doesn't have schema
            "phase": phase,
            "step_name": step_name,
            "step_type": step_type,
            "start_time": pipeline_report.start_time,
            "end_time": pipeline_report.end_time,
            "duration_secs": duration_secs,
            "table_fqn": None,
            "write_mode": None,
            "input_rows": 0,
            "output_rows": 0,
            "rows_written": 0,
            "rows_processed": 0,
            "table_total_rows": None,
            "valid_rows": 0,
            "invalid_rows": 0,
            "validation_rate": 100.0,
            "success": success,
            "error_message": error_message,
            "memory_usage_mb": None,
            "cpu_usage_percent": None,
            "metadata": metadata or {},
        }
        return entry

    # Pipeline-level summary row; only the first reported error is recorded.
    log_rows = [
        _report_row(
            "pipeline",
            "pipeline_execution",
            "pipeline",
            pipeline_report.duration_seconds,
            pipeline_report.success,
            pipeline_report.errors[0] if pipeline_report.errors else None,
        )
    ]

    # Merge the three layers; a step name present in several layers yields a
    # single row.
    merged_results = {}
    merged_results.update(pipeline_report.bronze_results)
    merged_results.update(pipeline_report.silver_results)
    merged_results.update(pipeline_report.gold_results)

    for step_name in merged_results:
        if step_name in pipeline_report.bronze_results:
            layer = "bronze"
        elif step_name in pipeline_report.silver_results:
            layer = "silver"
        else:
            layer = "gold"
        # Simplified step row: duration is not available in PipelineReport and
        # success is assumed for anything that appears in the results.
        log_rows.append(_report_row(layer, step_name, "transform", 0.0, True, None))

    return log_rows

# ============================================================================
# Validation Functions
# ============================================================================

def validate_log_row(row: LogRow) -> None:
    """
    Check a single log row for data-quality problems.

    Checks run in a fixed order and the first failure is reported: required
    identifiers, then non-negative numeric fields, then the validation-rate
    range, then the valid/invalid/processed consistency rule.

    Args:
        row: The log row to validate

    Raises:
        ValueError: If the log row is invalid
    """
    # Identifiers that must be present and non-empty
    required_fields = (
        ("run_id", "Run ID cannot be empty"),
        ("execution_id", "Execution ID cannot be empty"),
        ("pipeline_id", "Pipeline ID cannot be empty"),
        ("step_name", "Step name cannot be empty"),
    )
    for key, message in required_fields:
        if not row[key]:
            raise ValueError(message)

    # Numeric fields that may never go below zero
    non_negative_fields = (
        ("duration_secs", "Duration cannot be negative"),
        ("rows_processed", "Rows processed cannot be negative"),
        ("valid_rows", "Valid rows cannot be negative"),
        ("invalid_rows", "Invalid rows cannot be negative"),
    )
    for key, message in non_negative_fields:
        if row[key] < 0:
            raise ValueError(message)

    # Written as a negated chained comparison so that NaN is rejected too
    rate = row["validation_rate"]
    if not (0 <= rate <= 100):
        raise ValueError("Validation rate must be between 0 and 100")

    # Logical consistency between the three row counters
    if row["valid_rows"] + row["invalid_rows"] != row["rows_processed"]:
        raise ValueError("Valid + invalid rows must equal rows processed")

def validate_log_data(rows: list[LogRow]) -> Dict[str, Any]:
    """
    Run ``validate_log_row`` over every row and collect the failures.

    Args:
        rows: List of log rows to validate

    Returns:
        Dictionary with ``is_valid`` (True when no row failed) and ``errors``
        (one message per failing row, including the row's index)
    """
    problems = []
    for position, candidate in enumerate(rows):
        try:
            validate_log_row(candidate)
        except ValueError as failure:
            problems.append(f"Invalid log row at index {position}: {failure}")

    return {"is_valid": not problems, "errors": problems}

In [None]:
# Module: pipeline_builder.validation.data_validation (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.compat, pipeline_builder.compat_helpers, pipeline_builder.functions, pipeline_builder.functions, pipeline_builder.models, pipeline_builder.models.execution, pipeline_builder.models.types, pipeline_builder_base.errors, pipeline_builder_base.errors, pipeline_builder_base.logging, pipeline_builder_base.logging, pipeline_builder_base.models

# mypy: ignore-errors
"""
Data validation functions for the framework.

This module provides functions for validating data using PySpark expressions,
including string rule conversion, column validation, and data quality assessment.

"""

from __future__ import annotations

import time
from typing import Any, Dict, Optional, Union, cast
# from .errors import ValidationError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import StageStats  # Removed: defined in notebook cells above

# from ..compat import Column, DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..compat_helpers import detect_spark_type  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..models import ColumnRules  # Removed: defined in notebook cells above

# Module-level logger for the data-validation helpers in this cell;
# apply_column_rules reports filtered-out rules and validation rates through it.
logger = PipelineLogger("DataValidation")

def _convert_rule_to_expression(
    rule: Union[str, list],
    column_name: str,
    functions: Optional[FunctionsProtocol] = None,
) -> Column:
    """Translate one validation rule into a Column expression for ``column_name``.

    Supported forms:
        * named string rules: ``"not_null"``, ``"positive"``, ``"non_negative"``,
          ``"non_zero"``; any other string is passed to ``functions.expr``
        * ``[]`` (always true) and ``[rule]`` (unwrapped and converted)
        * ``[op, value]`` with ``gt``/``gte``/``lt``/``lte``/``eq``/``ne``/
          ``in``/``not_in``/``like``; other operators are formatted into an
          expression string
        * ``["between", low, high]``; other three-element rules are formatted
          into an expression string
        * longer lists are stringified and passed to ``functions.expr``

    Raises:
        ValidationError: If an ``in``/``not_in`` rule is not given a
            list, tuple or set of values
    """
    if functions is None:
        functions = get_default_functions()

    if not isinstance(rule, list):
        # Named shorthand rules; anything unrecognised is assumed to be a valid
        # expression string.
        if rule == "not_null":
            return functions.col(column_name).isNotNull()
        if rule == "positive":
            return functions.col(column_name) > 0
        if rule == "non_negative":
            return functions.col(column_name) >= 0
        if rule == "non_zero":
            return functions.col(column_name) != 0
        return functions.expr(rule)

    arity = len(rule)
    if arity == 0:
        # Empty rule means no validation
        return functions.lit(True)
    if arity == 1:
        return _convert_rule_to_expression(rule[0], column_name, functions)
    if arity == 3:
        operator_name, lower, upper = rule
        if operator_name == "between":
            return functions.col(column_name).between(lower, upper)
        # Unknown operator: assume the pieces form a valid expression
        return functions.expr(f"{column_name} {operator_name} {lower} {upper}")
    if arity != 2:
        # Complex rule: assume its string form is a valid expression
        return functions.expr(str(rule))

    operator_name, operand = rule

    # Plain comparisons, tried in this order with ``==`` like an if/elif chain
    comparison_builders = (
        ("gt", lambda column, other: column > other),
        ("gte", lambda column, other: column >= other),
        ("lt", lambda column, other: column < other),
        ("lte", lambda column, other: column <= other),
        ("eq", lambda column, other: column == other),
        ("ne", lambda column, other: column != other),
    )
    for known_operator, build in comparison_builders:
        if operator_name == known_operator:
            return cast(Column, build(functions.col(column_name), operand))

    if operator_name == "in":
        if not isinstance(operand, (list, tuple, set)):
            raise ValidationError(
                f"'in' rule for column '{column_name}' requires list/tuple/set values"
            )
        membership = functions.col(column_name).isin(list(operand))  # type: ignore[attr-defined]
        return cast(Column, membership)

    if operator_name == "not_in":
        if not isinstance(operand, (list, tuple, set)):
            raise ValidationError(
                f"'not_in' rule for column '{column_name}' requires list/tuple/set values"
            )
        exclusion = ~functions.col(column_name).isin(list(operand))  # type: ignore[attr-defined]
        return cast(Column, exclusion)

    if operator_name == "like":
        return cast(Column, functions.col(column_name).like(operand))

    # Unknown operator: assume the pieces form a valid expression
    return functions.expr(f"{column_name} {operator_name} {operand}")

def _convert_rules_to_expressions(
    rules: ColumnRules,
    functions: Optional[FunctionsProtocol] = None,
) -> Dict[str, list[Union[str, Column]]]:
    """Convert string/list rules to Column expressions, column by column.

    String and list rules are converted with ``_convert_rule_to_expression``;
    any other rule object (for example an already-built Column expression) is
    passed through unchanged. A doc-style ``"in"`` rule written as two
    consecutive entries (``"in"`` followed by a list/tuple/set of values) is
    coalesced into a single ``["in", values]`` rule.

    Args:
        rules: Mapping of column name to that column's list of rules
        functions: Functions object used to build expressions; the default
            functions are used when omitted

    Returns:
        Mapping of column name to the list of converted rules, in input order
    """
    if functions is None:
        functions = get_default_functions()

    converted_rules: Dict[str, list[Union[str, Column]]] = {}
    for column_name, rule_list in rules.items():
        converted_rule_list: list[Union[str, Column]] = []
        i = 0
        while i < len(rule_list):
            rule = rule_list[i]
            # Doc-style "in" rule: ["in", ["a", "b"]] is often written as
            # rule_list = ["in", ["a", "b"]] (two elements). Coalesce into one rule.
            # The isinstance check must come first: comparing a Column with
            # ``== "in"`` builds another Column, and taking its truth value
            # raises, which broke lists holding several Column rules.
            if (
                isinstance(rule, str)
                and rule == "in"
                and i + 1 < len(rule_list)
                and isinstance(rule_list[i + 1], (list, tuple, set))
            ):
                converted_rule_list.append(
                    _convert_rule_to_expression(
                        ["in", rule_list[i + 1]], column_name, functions
                    )
                )
                i += 2
            elif isinstance(rule, (str, list)):
                converted_rule_list.append(
                    _convert_rule_to_expression(rule, column_name, functions)
                )
                i += 1
            else:
                # Already an expression object: keep as-is
                converted_rule_list.append(rule)
                i += 1
        converted_rules[column_name] = converted_rule_list
    return converted_rules

def and_all_rules(
    rules: ColumnRules,
    functions: Optional[FunctionsProtocol] = None,
) -> Union[Column, bool]:
    """Fold every validation rule of every column into one AND-ed predicate.

    Args:
        rules: Mapping of column name to that column's list of rules
        functions: Functions object used to build expressions; the default
            functions are used when omitted

    Returns:
        ``True`` when there is nothing to check (no rules, or no usable
        expressions), otherwise the conjunction of all rule expressions
    """
    if not rules:
        return True

    if functions is None:
        functions = get_default_functions()

    predicates = []
    for rule_expressions in _convert_rules_to_expressions(rules, functions).values():
        for candidate in rule_expressions:
            if isinstance(candidate, str):
                # Leftover strings are turned into expressions here
                predicates.append(functions.expr(candidate))
            elif hasattr(candidate, "__and__") and hasattr(candidate, "__invert__"):
                # Column-like object (use duck typing for Python 3.8 compatibility)
                predicates.append(cast(Column, candidate))
            # Anything else is not a usable expression and is left out

    if not predicates:
        return True

    # Note: sparkless 3.17.1+ fixes the bug where combined ColumnOperation expressions
    # were treated as column names, so the combined expression is returned directly
    remaining = iter(predicates)
    combined = next(remaining)
    for predicate in remaining:
        combined = combined & predicate
    return combined

def apply_column_rules(
    df: DataFrame,
    rules: ColumnRules,
    stage: str,
    step: str,
    filter_columns_by_rules: bool = True,
    functions: Optional[FunctionsProtocol] = None,
) -> tuple[DataFrame, DataFrame, StageStats]:
    """
    Apply validation rules to a DataFrame and return valid/invalid DataFrames with statistics.

    Rules that reference columns missing from ``df`` are dropped with a warning;
    if no rule is left after that, a ValidationError is raised instead.

    Args:
        df: DataFrame to validate
        rules: Dictionary mapping column names to validation rules
        stage: Pipeline stage name
        step: Step name within the stage
        filter_columns_by_rules: If True, output DataFrames only contain columns
            with rules (plus ``_failed_rules`` on the invalid DataFrame when that
            column exists). Not applied when ``rules`` is empty.
        functions: Functions object forwarded to ``and_all_rules`` to build the
            validation predicate

    Returns:
        Tuple of (valid_df, invalid_df, stats)

    Raises:
        ValidationError: If ``rules`` is None, or if every column referenced by
            the rules is missing from ``df``
    """
    if rules is None:
        raise ValidationError("Validation rules cannot be None")

    # Handle empty rules - return all rows as valid
    if not rules:
        total_rows = df.count()
        stats = StageStats(
            stage=stage,
            step=step,
            total_rows=total_rows,
            valid_rows=total_rows,
            invalid_rows=0,
            validation_rate=100.0,
            duration_secs=0.0,
        )
        # Return original df as valid, empty df as invalid
        return df, df.limit(0), stats

    # Validate that all columns referenced in rules exist in the DataFrame
    df_columns = set(df.columns)
    missing_columns = set(rules.keys()) - df_columns

    if missing_columns:
        available_columns = sorted(df_columns)
        missing_columns_list = sorted(missing_columns)

        # Filter out rules for non-existent columns with a warning
        # This handles cases where transforms drop columns that were in the input
        filtered_rules = {col: rules[col] for col in rules.keys() if col in df_columns}

        if not filtered_rules:
            # All rules reference missing columns - this is an error
            raise ValidationError(
                f"All columns referenced in validation rules do not exist in DataFrame. "
                f"Missing columns: {missing_columns_list}. "
                f"Available columns: {available_columns}. "
                f"Stage: {stage}, Step: {step}. "
                f"This may indicate that the transform function dropped columns that are referenced in validation rules. "
                f"Please update validation rules to only reference columns that exist after the transform."
            )

        # Log warning about filtered rules
        logger.warning(
            f"Validation rules reference columns that do not exist in DataFrame after transform. "
            f"Filtered out rules for missing columns: {missing_columns_list}. "
            f"Available columns: {available_columns}. "
            f"Stage: {stage}, Step: {step}. "
            f"This may indicate that the transform function dropped columns. "
            f"Continuing validation with remaining rules for existing columns."
        )

        # Use filtered rules for validation
        rules = filtered_rules

    start_time = time.time()

    # Create validation predicate
    validation_predicate = and_all_rules(rules, functions)

    # Apply validation
    if validation_predicate is True:
        # No validation rules, return all data as valid
        valid_df = df
        invalid_df = df.limit(0)  # Empty DataFrame with same schema
        total_rows = df.count()
        valid_rows = total_rows
        invalid_rows = 0
    elif (
        hasattr(validation_predicate, "__and__")
        and hasattr(validation_predicate, "__invert__")
        and not isinstance(validation_predicate, bool)
    ):
        # Column-like predicate (PySpark Column or sparkless equivalent), detected
        # by duck typing. The combined predicate is used directly for both engines
        # (sparkless 3.17.1+ handles combined ColumnOperation expressions).
        # NOTE(review): Spark's filter keeps only rows where the condition is true,
        # so rows for which the predicate evaluates to NULL presumably end up in
        # neither valid_df nor invalid_df and valid_rows + invalid_rows can be
        # smaller than total_rows - confirm whether that is intended.
        valid_df = df.filter(validation_predicate)
        invalid_df = df.filter(~validation_predicate)
        total_rows = df.count()
        valid_rows = valid_df.count()
        invalid_rows = invalid_df.count()
    else:
        # Handle boolean False case (shouldn't happen with current logic)
        valid_df = df.limit(0)
        invalid_df = df
        total_rows = df.count()
        valid_rows = 0
        invalid_rows = total_rows

    # Apply column filtering if requested
    if filter_columns_by_rules:
        # Only keep columns that have validation rules
        rule_columns_list: list[str] = list(rules.keys())
        valid_df = valid_df.select(*rule_columns_list)
        # For invalid_df, also include the _failed_rules column if it exists
        invalid_columns: list[str] = rule_columns_list.copy()
        if "_failed_rules" in invalid_df.columns:
            invalid_columns.append("_failed_rules")
        invalid_df = invalid_df.select(*invalid_columns)

    # Calculate validation rate (an empty DataFrame counts as fully valid)
    validation_rate = (valid_rows / total_rows * 100) if total_rows > 0 else 100.0

    # Create statistics
    duration = time.time() - start_time
    stats = StageStats(
        stage=stage,
        step=step,
        total_rows=total_rows,
        valid_rows=valid_rows,
        invalid_rows=invalid_rows,
        validation_rate=validation_rate,
        duration_secs=duration,
    )

    logger.info(
        f"Validation completed for {stage}.{step}: {validation_rate:.1f}% valid"
    )

    return valid_df, invalid_df, stats

def validate_dataframe_schema(
    df: DataFrame,
    expected_columns: list[str],
) -> bool:
    """Return True when every name in ``expected_columns`` is a column of ``df``.

    Extra columns in ``df`` are ignored; an empty ``expected_columns`` always
    passes.
    """
    present = frozenset(df.columns)
    required = frozenset(expected_columns)
    return required.issubset(present)

def assess_data_quality(
    df: DataFrame,
    rules: Optional[ColumnRules] = None,
    functions: Optional[FunctionsProtocol] = None,
) -> Dict[str, Any]:
    """
    Summarise the data quality of a DataFrame.

    Without rules (or for an empty DataFrame) every row counts as valid; with
    rules the numbers come from ``apply_column_rules``.

    Args:
        df: DataFrame to assess
        rules: Optional validation rules
        functions: Optional functions object forwarded to ``apply_column_rules``

    Returns:
        Dictionary with ``total_rows``, ``valid_rows``, ``invalid_rows``,
        ``quality_rate`` and ``is_empty``

    Raises:
        ValidationError: Propagated from validation, or wrapping any other
            error raised during the assessment
    """

    def _metrics(total, valid, invalid, rate, empty):
        # Single place that fixes the shape of the returned report
        return {
            "total_rows": total,
            "valid_rows": valid,
            "invalid_rows": invalid,
            "quality_rate": rate,
            "is_empty": empty,
        }

    try:
        row_count = df.count()

        if row_count == 0:
            return _metrics(0, 0, 0, 100.0, True)

        if not rules:
            return _metrics(row_count, row_count, 0, 100.0, False)

        # Only the statistics are needed; the split DataFrames are discarded
        _, _, stats = apply_column_rules(
            df, rules, "test", "test", functions=functions
        )
        return _metrics(
            stats.total_rows,
            stats.valid_rows,
            stats.invalid_rows,
            stats.validation_rate,
            False,
        )
    except ValidationError as e:
        # Re-raise validation errors as they are specific and actionable
        raise e
    except Exception as e:
        # Log the unexpected error and re-raise with context
        import logging

        fallback_logger = logging.getLogger(__name__)
        fallback_logger.error(f"Unexpected error in assess_data_quality: {e}")
        raise ValidationError(
            f"Data quality assessment failed: {e}",
            context={"function": "assess_data_quality", "original_error": str(e)},
        ) from e

In [None]:
# Module: pipeline_builder.validation.pipeline_validation (pipeline_builder)
#
# Dependencies: pipeline_builder.models, pipeline_builder.models.execution, pipeline_builder.models.pipeline, pipeline_builder.models.steps, pipeline_builder_base.logging, pipeline_builder_base.logging, pipeline_builder_base.models

"""
Pipeline validation functions for the framework.

This module provides functions and classes for validating pipeline configurations,
step dependencies, and overall pipeline structure.

"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionContext, PipelineConfig  # Removed: defined in notebook cells above

# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above

# Type alias for step names; used as the key type of the bronze/silver/gold
# step dictionaries accepted by UnifiedValidator.validate_pipeline.
StepName = str

class StepValidatorProtocol:
    """Interface for custom step validators.

    This is an ordinary class (not a ``typing.Protocol``) whose default
    ``validate`` reports no problems.
    """

    def validate(self, step: Any, context: ExecutionContext) -> list[str]:
        """Check ``step`` and return the validation errors found.

        The default implementation ignores both arguments and returns a new
        empty list.
        """
        no_errors: list[str] = []
        return no_errors

@dataclass
class ValidationResult:
    """Result of validation.

    The truth value of an instance is its ``is_valid`` flag, so a result can
    be used directly in a condition (``if result: ...``).
    """

    # Overall verdict of the validation
    is_valid: bool
    # Validation error messages
    errors: list[str]
    # Warning messages reported by the validation
    warnings: list[str]
    # Recommendation messages reported by the validation
    recommendations: list[str]

    def __bool__(self) -> bool:
        """Return whether validation passed."""
        return self.is_valid

class UnifiedValidator:
    """
    Unified validation system for both data and pipeline validation.

    This class provides a single interface for all validation needs:
    schema-name checks, whole-pipeline structural checks (config, bronze,
    silver and gold steps, dependencies) and per-step checks delegated to
    user-registered custom validators.
    """

    def __init__(self, logger: Optional[PipelineLogger] = None):
        """
        Initialize the unified validator.

        Args:
            logger: Logger to use. A default ``PipelineLogger()`` is created
                when omitted.
        """
        if logger is None:
            self.logger = PipelineLogger()
        else:
            self.logger = logger
        # NOTE(review): ``StepValidator`` is not defined in this module; the
        # ``StepValidatorProtocol`` class defined alongside looks like the
        # intended type. The annotation is never evaluated at runtime.
        self.custom_validators: list[StepValidator] = []

    def add_validator(self, validator: StepValidator) -> None:
        """
        Register a custom step validator.

        Args:
            validator: Object exposing ``validate(step, context)``; it is
                invoked by ``validate_step`` in registration order.
        """
        self.custom_validators.append(validator)
        self.logger.info(f"Added custom validator: {validator.__class__.__name__}")

    def validate_schema(self, schema: Any) -> list[str]:
        """
        Validate schema name format.

        The checks run in order (empty, non-string, whitespace-only, longer
        than 128 characters) and only the first failing check is reported.

        Args:
            schema: Schema name to validate.

        Returns:
            List of validation errors (empty if valid).
        """
        errors: list[str] = []
        if not schema:
            errors.append("Schema name cannot be empty")
        elif not isinstance(schema, str):
            errors.append("Schema name must be a string")
        elif not schema.strip():
            errors.append("Schema name cannot be whitespace only")
        elif len(schema) > 128:
            errors.append("Schema name is too long (max 128 characters)")
        return errors

    def validate_pipeline(
        self,
        config: PipelineConfig,
        bronze_steps: Dict[StepName, BronzeStep],
        silver_steps: Dict[StepName, SilverStep],
        gold_steps: Dict[StepName, GoldStep],
    ) -> ValidationResult:
        """
        Validate the entire pipeline configuration.

        Args:
            config: Pipeline configuration; only ``config.schema`` is checked.
            bronze_steps: Bronze steps keyed by step name.
            silver_steps: Silver steps keyed by step name.
            gold_steps: Gold steps keyed by step name.

        Returns:
            ValidationResult object containing:
            - errors: List[str] of validation error messages
            - warnings: List[str] of validation warnings
            - recommendations: List[str] of recommendations (always empty here)
            - is_valid: bool indicating if validation passed (no errors)

        Note:
            Per the original author's note, another validator variant returns
            a plain List[str] from validate_pipeline(), and
            PipelineBuilder.validate_pipeline() is said to handle both return
            types. Neither is verifiable from this module.
        """
        errors: list[str] = []
        warnings: list[str] = []
        recommendations: list[str] = []

        # Validate configuration
        errors.extend(self._validate_config(config))

        # Validate steps layer by layer
        bronze_errors, bronze_warnings = self._validate_bronze_steps(bronze_steps)
        errors.extend(bronze_errors)
        warnings.extend(bronze_warnings)

        silver_errors, silver_warnings = self._validate_silver_steps(
            silver_steps, bronze_steps
        )
        errors.extend(silver_errors)
        warnings.extend(silver_warnings)

        gold_errors, gold_warnings = self._validate_gold_steps(gold_steps, silver_steps)
        errors.extend(gold_errors)
        warnings.extend(gold_warnings)

        # Validate dependencies
        dep_errors, dep_warnings = self._validate_dependencies(
            bronze_steps, silver_steps, gold_steps
        )
        errors.extend(dep_errors)
        warnings.extend(dep_warnings)

        # Logging is handled by the builder to avoid duplicate messages
        return ValidationResult(
            is_valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
            recommendations=recommendations,
        )

    def validate_step(
        self, step: Any, step_type: str, context: ExecutionContext
    ) -> ValidationResult:
        """
        Validate a single step with every registered custom validator.

        A validator that raises does not abort validation; the failure is
        recorded as an error message instead.

        Args:
            step: Step object to validate.
            step_type: Step type label; accepted but not used by this method.
            context: Execution context forwarded to each validator.

        Returns:
            ValidationResult whose errors are the concatenated validator
            outputs; warnings and recommendations are always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []

        # Run custom validators
        for validator in self.custom_validators:
            try:
                errors.extend(validator.validate(step, context))
            except Exception as e:
                errors.append(
                    f"Custom validator {validator.__class__.__name__} failed: {e}"
                )

        return ValidationResult(
            is_valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
            recommendations=[],
        )

    def _validate_config(self, config: PipelineConfig) -> list[str]:
        """
        Validate pipeline configuration.

        Only the presence of ``config.schema`` is required; the table prefix
        is optional in the simplified config.
        """
        errors: list[str] = []

        if not config.schema:
            errors.append("Pipeline schema is required")

        return errors

    def _validate_bronze_steps(
        self, bronze_steps: Dict[StepName, BronzeStep]
    ) -> tuple[list[str], list[str]]:
        """
        Validate bronze steps.

        Each step must have a non-empty ``name`` and non-empty ``rules``.

        Returns:
            ``(errors, warnings)``; warnings are always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []

        for step_name, step in bronze_steps.items():
            # Simplified validation - just check the required basic attributes
            if not step.name:
                errors.append(f"Bronze step {step_name} missing name")

            if not step.rules:
                errors.append(f"Bronze step {step_name} missing validation rules")

        return errors, warnings

    @staticmethod
    def _require_rules_and_table(
        layer: str, step_name: StepName, step: Any
    ) -> list[str]:
        """Return errors for a step that only needs ``rules`` and ``table_name``."""
        problems: list[str] = []
        if not step.rules:
            problems.append(f"{layer} step '{step_name}' missing validation rules")
        if not step.table_name:
            problems.append(f"{layer} step '{step_name}' missing table_name")
        return problems

    def _validate_silver_steps(
        self,
        silver_steps: Dict[StepName, SilverStep],
        bronze_steps: Dict[StepName, BronzeStep],
    ) -> tuple[list[str], list[str]]:
        """
        Validate silver steps.

        Validation-only steps (``existing`` set and ``transform`` is None) and
        SQL-source steps (``sql_source`` set) need only rules and a table
        name. Every other step must name a ``source_bronze`` that exists in
        ``bronze_steps``.

        Returns:
            ``(errors, warnings)``; warnings are always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []

        for step_name, step in silver_steps.items():
            validation_only = step.existing and step.transform is None
            # Neither kind of step reads from a bronze step, so the
            # source_bronze checks below are skipped for them.
            if validation_only or getattr(step, "sql_source", None) is not None:
                errors.extend(self._require_rules_and_table("Silver", step_name, step))
                continue

            source = step.source_bronze
            if not source:
                errors.append(f"Silver step {step_name} missing source_bronze")
            elif source not in bronze_steps:
                errors.append(
                    f"Silver step {step_name} depends on non-existent bronze step {source}"
                )

        return errors, warnings

    def _validate_gold_steps(
        self,
        gold_steps: Dict[StepName, GoldStep],
        silver_steps: Dict[StepName, SilverStep],
    ) -> tuple[list[str], list[str]]:
        """
        Validate gold steps.

        Validation-only steps (``existing`` set and ``transform`` is None) and
        SQL-source steps (``sql_source`` set) need only rules and a table
        name. For every other step, each name in ``source_silvers`` (when
        given) must exist in ``silver_steps``.

        Returns:
            ``(errors, warnings)``; warnings are always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []

        for step_name, step in gold_steps.items():
            validation_only = step.existing and step.transform is None
            # Neither kind of step reads from silver steps, so the
            # source_silvers check below is skipped for them.
            if validation_only or getattr(step, "sql_source", None) is not None:
                errors.extend(self._require_rules_and_table("Gold", step_name, step))
                continue

            # Check source_silvers exist (if specified)
            if step.source_silvers:
                for silver_name in step.source_silvers:
                    if silver_name not in silver_steps:
                        errors.append(
                            f"Gold step {step_name} depends on non-existent silver step {silver_name}"
                        )

        return errors, warnings

    def _validate_dependencies(
        self,
        bronze_steps: Dict[StepName, BronzeStep],
        silver_steps: Dict[StepName, SilverStep],
        gold_steps: Dict[StepName, GoldStep],
    ) -> tuple[list[str], list[str]]:
        """
        Validate step dependencies.

        Only direct self-dependencies are detected: a step whose optional
        ``dependencies`` collection (list, tuple or set) contains an object
        with a ``step_name`` equal to the step's own name.

        Returns:
            ``(errors, warnings)``; warnings are always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []

        # Walk each layer separately rather than merging the dictionaries:
        # a merged dict would silently drop a step whose name is reused in a
        # later layer, and that step would never be checked.
        for layer_steps in (bronze_steps, silver_steps, gold_steps):
            for step_name, step in layer_steps.items():
                # ``dependencies`` is a non-standard attribute that only
                # custom step types might carry.
                dependencies = getattr(step, "dependencies", None)
                if not dependencies or not isinstance(
                    dependencies, (list, tuple, set)
                ):
                    continue
                for dep in dependencies:
                    if hasattr(dep, "step_name") and dep.step_name == step_name:
                        errors.append(
                            f"Step {step_name} has circular dependency on itself"
                        )

        return errors, warnings

In [None]:
# Module: pipeline_builder.writer.monitoring (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.compat, pipeline_builder.writer.exceptions, pipeline_builder.writer.models, pipeline_builder.writer.models, pipeline_builder.writer.query_builder, pipeline_builder.writer.query_builder, pipeline_builder_base.logging, pipeline_builder_base.logging, writer.exceptions

# mypy: ignore-errors
"""
Writer monitoring module for performance tracking and metrics collection.

This module handles performance monitoring, metrics collection, and
analytics for the writer operations.

"""

from __future__ import annotations

import time
from datetime import datetime
from typing import Dict, Optional, TypedDict, cast

try:
    import psutil

    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False
    psutil = None  # type: ignore[assignment, unused-ignore]
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from .exceptions import WriterError  # Removed: defined in notebook cells above
# from .models import WriterMetrics  # Removed: defined in notebook cells above
# from .query_builder import QueryBuilder  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================

class OperationMetrics(TypedDict):
    """Metrics for a single operation, as returned by PerformanceMonitor.end_operation()."""

    # Identifier the operation was started and ended with.
    operation_id: str
    # Whether the caller reported the operation as successful.
    success: bool
    # Elapsed time between start_operation() and end_operation(), in seconds.
    duration_secs: float
    # Number of rows the caller reported as written.
    rows_written: int
    # Memory in use ("used_mb") sampled when the operation ended.
    memory_usage_mb: float
    # Error message supplied by the caller, or None.
    error_message: Optional[str]
    # ISO-8601 timestamp (datetime.now().isoformat()) taken when the metrics were built.
    timestamp: str

class SparkMemoryInfo(TypedDict, total=False):
    """
    Spark memory configuration.

    Declared with ``total=False``: both keys are optional, and the dictionary
    is empty when the Spark configuration could not be read.
    """

    # Value of the "spark.executor.memory" setting ("N/A" when unset).
    executor_memory: str
    # Value of the "spark.driver.memory" setting ("N/A" when unset).
    driver_memory: str

class MemoryUsageInfo(TypedDict):
    """
    Memory usage information structure.

    Returned by PerformanceMonitor.get_memory_usage(). All numeric fields are
    0.0 when psutil is not available.
    """

    # Total system memory in megabytes (rounded to 2 decimals).
    total_mb: float
    # Available system memory in megabytes (rounded to 2 decimals).
    available_mb: float
    # Used system memory in megabytes (rounded to 2 decimals).
    used_mb: float
    # Memory utilisation percentage as reported by psutil.
    percentage: float
    # Spark memory settings; may be an empty dict.
    spark_memory: SparkMemoryInfo
    # False when the values are placeholders because psutil is missing.
    psutil_available: bool

class SuccessRateTrend(TypedDict):
    """Success rate trend data point (one per row of the daily-trends query)."""

    # Value of the row's "date" column.
    # NOTE(review): the producer passes the value through unformatted, so it
    # may not actually be a str — confirm against the query's output type.
    date: str
    # successful_executions / daily_executions, expressed as a percentage.
    success_rate: float
    # Average validation rate for the day.
    avg_validation_rate: float
    # Average execution time for the day.
    avg_execution_time: float

class PerformanceByPhase(TypedDict):
    """Performance metrics by phase (one per row of the phase-trends query)."""

    # Phase the row aggregates.
    phase: str
    # Average execution time for the phase.
    avg_execution_time: float
    # Average validation rate for the phase.
    avg_validation_rate: float
    # Number of executions aggregated for the phase.
    execution_count: int

class DataQualityTrend(TypedDict):
    """Data quality trend data point (one per row of the quality-trends query)."""

    # Value of the row's "date" column.
    date: str
    # Average validation rate for that date.
    avg_validation_rate: float
    # Lowest validation rate for that date.
    min_validation_rate: float
    # Highest validation rate for that date.
    max_validation_rate: float

class PerformanceTrends(TypedDict):
    """Execution trends analysis structure returned by AnalyticsEngine.analyze_execution_trends()."""

    # Daily success-rate data points.
    success_rate_trend: list[SuccessRateTrend]
    # Aggregated performance per phase.
    performance_by_phase: list[PerformanceByPhase]
    # Daily validation-rate data points.
    data_quality_trend: list[DataQualityTrend]

class PerformanceAnomaly(TypedDict):
    """
    Performance anomaly data point.

    One entry per row returned by the performance-anomaly query, which is
    invoked with a threshold of average + 2 * stddev of ``execution_time``.
    """

    # Step the log row belongs to.
    step: str
    # Execution time recorded for the row.
    execution_time: float
    # Validation rate recorded for the row.
    validation_rate: float
    # Success flag recorded for the row.
    success: bool

class QualityAnomaly(TypedDict):
    """
    Quality anomaly data point.

    One entry per row returned by the quality-anomaly query (invoked with a
    90.0 threshold), ordered by ``validation_rate``.
    """

    # Step the log row belongs to.
    step: str
    # Validation rate recorded for the row.
    validation_rate: float
    # Count of valid rows recorded for the row.
    valid_rows: int
    # Count of invalid rows recorded for the row.
    invalid_rows: int

class AnomalyReport(TypedDict):
    """Anomaly detection results structure returned by AnalyticsEngine.detect_anomalies()."""

    # Rows flagged by the performance-anomaly query.
    performance_anomalies: list[PerformanceAnomaly]
    # Rows flagged by the quality-anomaly query.
    quality_anomalies: list[QualityAnomaly]
    # total_anomalies / total_executions as a percentage, rounded to 2 decimals
    # (0 when there are no executions).
    anomaly_score: float
    # Number of performance anomalies plus number of quality anomalies.
    total_anomalies: int
    # Row count of the analysed DataFrame.
    total_executions: int

class OverallStatistics(TypedDict):
    """Overall performance statistics aggregated over the whole log DataFrame."""

    # Total number of executions.
    total_executions: int
    # Number of successful executions.
    successful_executions: int
    # successful_executions / total_executions as a percentage (0 when there
    # are no executions).
    success_rate: float
    # Average execution time across all executions.
    avg_execution_time: float
    # Average validation rate across all executions.
    avg_validation_rate: float
    # Total number of rows written.
    total_rows_written: int

class PhaseStatistics(TypedDict):
    """Phase-wise performance statistics (one per row of the phase-trends query)."""

    # Phase the row aggregates.
    phase: str
    # Number of executions aggregated for the phase.
    execution_count: int
    # Average execution time for the phase.
    avg_execution_time: float
    # Average validation rate for the phase.
    avg_validation_rate: float
    # Total rows written for the phase.
    total_rows_written: int

class RecentPerformance(TypedDict):
    """Recent performance data point (one per row of the recent-performance query)."""

    # Date of the data point, formatted as "%Y-%m-%d".
    date: str
    # Number of executions on that date.
    daily_executions: int
    # Average execution time on that date.
    avg_execution_time: float
    # Average validation rate on that date.
    avg_validation_rate: float

class PerformanceReport(TypedDict):
    """Comprehensive performance report structure returned by AnalyticsEngine.generate_performance_report()."""

    # Aggregates over the whole log DataFrame.
    overall_statistics: OverallStatistics
    # Aggregates per phase.
    phase_statistics: list[PhaseStatistics]
    # Per-day data points for the recent window.
    recent_performance: list[RecentPerformance]
    # ISO-8601 timestamp (datetime.now().isoformat()) of report creation.
    generated_at: str

class PerformanceMonitor:
    """
    Handles performance monitoring and metrics collection.

    Operations are bracketed by ``start_operation`` / ``end_operation``;
    running totals are accumulated in ``self.metrics``.
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """
        Initialize the performance monitor.

        Args:
            spark: Spark session; read only for its memory configuration.
            logger: Logger to use. A ``PipelineLogger("PerformanceMonitor")``
                is created when omitted.
        """
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("PerformanceMonitor")
        else:
            self.logger = logger
        self.metrics: WriterMetrics = self._initial_metrics()
        # operation_id -> time.time() captured by start_operation().
        self.operation_start_times: Dict[str, float] = {}

    @staticmethod
    def _initial_metrics() -> WriterMetrics:
        """Return a fresh, zeroed metrics dictionary (shared by init and reset)."""
        return {
            "total_writes": 0,
            "successful_writes": 0,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }

    def start_operation(self, operation_id: str, operation_type: str) -> None:
        """
        Start monitoring an operation.

        Args:
            operation_id: Unique identifier for the operation
            operation_type: Type of operation being monitored (used in the
                log message only)

        Raises:
            WriterError: If recording the start time or logging fails.
        """
        try:
            self.operation_start_times[operation_id] = time.time()
            self.logger.info(
                f"Started monitoring {operation_type} operation: {operation_id}"
            )

        except Exception as e:
            self.logger.error(
                f"Failed to start monitoring operation {operation_id}: {e}"
            )
            raise WriterError(
                f"Failed to start monitoring operation {operation_id}: {e}"
            ) from e

    def end_operation(
        self,
        operation_id: str,
        success: bool,
        rows_written: int = 0,
        error_message: Optional[str] = None,
    ) -> OperationMetrics:
        """
        End monitoring an operation and update metrics.

        Args:
            operation_id: Unique identifier for the operation
            success: Whether the operation was successful
            rows_written: Number of rows written
            error_message: Error message if operation failed

        Returns:
            Dictionary containing operation metrics. When ``operation_id`` was
            never started, a placeholder with ``success=False`` and zeroed
            values is returned and the running totals are left untouched.

        Raises:
            WriterError: If updating the metrics fails for any reason.
        """
        try:
            if operation_id not in self.operation_start_times:
                self.logger.warning(f"Operation {operation_id} was not being monitored")
                # Return empty metrics matching the TypedDict
                return {
                    "operation_id": operation_id,
                    "success": False,
                    "duration_secs": 0.0,
                    "rows_written": 0,
                    "memory_usage_mb": 0.0,
                    "error_message": "Operation was not being monitored",
                    "timestamp": datetime.now().isoformat(),
                }

            # Calculate duration
            start_time = self.operation_start_times[operation_id]
            duration = time.time() - start_time

            # Update running totals
            self.metrics["total_writes"] += 1
            if success:
                self.metrics["successful_writes"] += 1
            else:
                self.metrics["failed_writes"] += 1

            self.metrics["total_duration_secs"] += duration
            self.metrics["total_rows_written"] += rows_written

            # Calculate average duration
            if self.metrics["total_writes"] > 0:
                self.metrics["avg_write_duration_secs"] = (
                    self.metrics["total_duration_secs"] / self.metrics["total_writes"]
                )

            # Update peak memory usage
            current_memory = self.get_memory_usage()["used_mb"]
            if current_memory > self.metrics["memory_usage_peak_mb"]:
                self.metrics["memory_usage_peak_mb"] = current_memory

            # Create operation metrics
            operation_metrics = {
                "operation_id": operation_id,
                "success": success,
                "duration_secs": duration,
                "rows_written": rows_written,
                "memory_usage_mb": current_memory,
                "error_message": error_message,
                "timestamp": datetime.now().isoformat(),
            }

            # Clean up: the operation is no longer in flight
            del self.operation_start_times[operation_id]

            self.logger.info(
                f"Completed monitoring {operation_id}: {duration:.2f}s, {rows_written} rows"
            )
            return cast(OperationMetrics, operation_metrics)

        except Exception as e:
            self.logger.error(f"Failed to end monitoring operation {operation_id}: {e}")
            raise WriterError(
                f"Failed to end monitoring operation {operation_id}: {e}"
            ) from e

    def get_metrics(self) -> WriterMetrics:
        """Get a shallow copy of the current performance metrics."""
        return self.metrics.copy()

    def reset_metrics(self) -> None:
        """Reset performance metrics to their initial zeroed state."""
        self.metrics = self._initial_metrics()
        self.logger.info("Performance metrics reset")

    def get_memory_usage(self) -> MemoryUsageInfo:
        """
        Get current memory usage information.

        Returns:
            Dictionary containing memory usage details. When psutil is not
            available every numeric field is 0.0 and ``psutil_available`` is
            False.

        Raises:
            WriterError: If reading system memory fails.
        """
        # Check if psutil is available at all
        if not HAS_PSUTIL or psutil is None:
            self.logger.warning("psutil not available, returning basic memory info")
            return {
                "total_mb": 0.0,
                "available_mb": 0.0,
                "used_mb": 0.0,
                "percentage": 0.0,
                "spark_memory": {},
                "psutil_available": False,
            }

        try:
            # Get system memory info
            memory = psutil.virtual_memory()

            # Get Spark memory info if available
            spark_memory = {}
            try:
                spark_context = self.spark.sparkContext
                spark_memory = {
                    "executor_memory": spark_context.getConf().get(
                        "spark.executor.memory", "N/A"
                    ),
                    "driver_memory": spark_context.getConf().get(
                        "spark.driver.memory", "N/A"
                    ),
                }
            except Exception:
                # Best effort: the Spark settings are informational only, so
                # a session without an accessible context yields {}.
                pass

            bytes_per_mb = 1024 * 1024
            memory_info = {
                "total_mb": round(memory.total / bytes_per_mb, 2),
                "available_mb": round(memory.available / bytes_per_mb, 2),
                "used_mb": round(memory.used / bytes_per_mb, 2),
                "percentage": memory.percent,
                "spark_memory": spark_memory,
                "psutil_available": True,
            }

            return cast(MemoryUsageInfo, memory_info)

        except Exception as e:
            self.logger.error(f"Failed to get memory usage: {e}")
            raise WriterError(f"Failed to get memory usage: {e}") from e

    def check_performance_thresholds(
        self,
        operation_metrics: OperationMetrics,
        *,
        max_duration_secs: float = 300,
        max_memory_mb: float = 8192,
        min_success_rate: float = 95.0,
    ) -> list[str]:
        """
        Check if performance thresholds are exceeded.

        Args:
            operation_metrics: Metrics for the operation
            max_duration_secs: Longest acceptable operation duration in
                seconds (default 300, i.e. 5 minutes).
            max_memory_mb: Highest acceptable memory usage in megabytes
                (default 8192, i.e. 8GB).
            min_success_rate: Lowest acceptable cumulative success rate in
                percent (default 95.0). Only checked once at least one write
                has been recorded.

        Returns:
            List of threshold violations (empty when all thresholds hold)

        Raises:
            WriterError: If the metrics cannot be evaluated.
        """
        violations: list[str] = []

        try:
            # Duration and memory are judged per operation.
            if operation_metrics.get("duration_secs", 0) > max_duration_secs:
                violations.append(
                    f"Operation duration exceeded {max_duration_secs / 60:g} minutes"
                )

            if operation_metrics.get("memory_usage_mb", 0) > max_memory_mb:
                violations.append(
                    f"Memory usage exceeded {max_memory_mb / 1024:g}GB"
                )

            # Success rate is judged over all writes recorded so far.
            if self.metrics["total_writes"] > 0:
                success_rate = (
                    self.metrics["successful_writes"] / self.metrics["total_writes"]
                ) * 100
                if success_rate < min_success_rate:
                    violations.append(
                        f"Success rate below {min_success_rate:g}%: {success_rate:.1f}%"
                    )

            return violations

        except Exception as e:
            self.logger.error(f"Failed to check performance thresholds: {e}")
            raise WriterError(f"Failed to check performance thresholds: {e}") from e

class AnalyticsEngine:
    """
    Handles analytics and trend analysis for writer operations.

    Every public method takes a DataFrame of log data, runs queries built by
    ``QueryBuilder`` against it, collects the results to the driver and
    returns plain dictionaries.
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """
        Initialize the analytics engine.

        Args:
            spark: Spark session (stored on the instance).
            logger: Logger to use. A ``PipelineLogger("AnalyticsEngine")`` is
                created when omitted.
        """
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("AnalyticsEngine")
        else:
            self.logger = logger

    @staticmethod
    def _row_value(row, key, default=None):
        """
        Return ``row[key]``, or ``default`` when the row has no such field.

        PySpark ``Row`` objects are tuples without a ``.get`` method, and a
        missing field surfaces as ``ValueError`` (or ``KeyError`` /
        ``IndexError`` for dict-like or positional rows), so the lookup is
        done by indexing and the miss is translated here.
        """
        try:
            return row[key]
        except (KeyError, ValueError, IndexError):
            return default

    def analyze_execution_trends(self, df: DataFrame) -> PerformanceTrends:
        """
        Analyze execution trends from log data.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing trend analysis: daily success-rate trend,
            per-phase performance and daily data-quality trend, each computed
            with a 30-day window argument.

        Raises:
            WriterError: If any query or result conversion fails.
        """
        try:
            self.logger.info("Analyzing execution trends")

            # Use query builder for all trend analyses
            trends = {}

            # Success rate trend using query builder
            success_trend_df = QueryBuilder.build_daily_trends_query(df, 30)
            success_trend = success_trend_df.collect()

            success_points = []
            for row in success_trend:
                daily_executions = row["daily_executions"]
                # Guard the division so an empty day cannot abort the whole
                # analysis with ZeroDivisionError.
                success_rate = (
                    (row["successful_executions"] / daily_executions) * 100
                    if daily_executions
                    else 0.0
                )
                success_points.append(
                    {
                        # NOTE(review): passed through unformatted, unlike
                        # generate_performance_report() which formats dates
                        # with strftime — confirm the intended type.
                        "date": row["date"],
                        "success_rate": success_rate,
                        # The column is optional in the query result.
                        "avg_validation_rate": self._row_value(
                            row, "avg_validation_rate", 0
                        ),
                        "avg_execution_time": row["avg_execution_time"],
                    }
                )
            trends["success_rate_trend"] = success_points

            # Performance trends using query builder
            performance_trend_df = QueryBuilder.build_phase_trends_query(df, 30)
            performance_trend = performance_trend_df.collect()

            trends["performance_by_phase"] = [
                {
                    "phase": row["phase"],
                    "avg_execution_time": row["avg_execution_time"],
                    "avg_validation_rate": row["avg_validation_rate"],
                    "execution_count": row["execution_count"],
                }
                for row in performance_trend
            ]

            # Data quality trends using query builder
            quality_trend_df = QueryBuilder.build_quality_trends_query(df, 30)
            quality_trend = quality_trend_df.collect()

            trends["data_quality_trend"] = [
                {
                    "date": row["date"],
                    "avg_validation_rate": row["avg_validation_rate"],
                    "min_validation_rate": row["min_validation_rate"],
                    "max_validation_rate": row["max_validation_rate"],
                }
                for row in quality_trend
            ]

            self.logger.info("Execution trends analysis completed")
            return cast(PerformanceTrends, trends)

        except Exception as e:
            self.logger.error(f"Failed to analyze execution trends: {e}")
            raise WriterError(f"Failed to analyze execution trends: {e}") from e

    def detect_anomalies(self, df: DataFrame) -> AnomalyReport:
        """
        Detect anomalies in execution data.

        Performance anomalies are selected with a threshold of
        ``avg + 2 * stddev`` of ``execution_time``; quality anomalies are
        selected with a 90.0 threshold. The anomaly score is the number of
        anomalies divided by the number of rows in ``df``, as a percentage.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing anomaly detection results

        Raises:
            WriterError: If any query or result conversion fails.
        """
        try:
            self.logger.info("Detecting anomalies in execution data")

            anomalies: AnomalyReport = {
                "performance_anomalies": [],
                "quality_anomalies": [],
                "anomaly_score": 0.0,
                "total_anomalies": 0,
                "total_executions": 0,
            }

            # Calculate performance thresholds using query builder
            # NOTE(review): if calculate_statistics can return None for "avg"
            # or "stddev" (e.g. empty input), the sum below raises and is
            # reported as a WriterError — confirm against QueryBuilder.
            performance_stats = QueryBuilder.calculate_statistics(df, "execution_time")
            performance_threshold = performance_stats["avg"] + (
                2 * performance_stats["stddev"]
            )

            # Detect performance anomalies using query builder
            performance_anomalies_df = QueryBuilder.build_performance_anomaly_query(
                df, performance_threshold
            ).select("step", "execution_time", "validation_rate", "success")

            performance_anomalies = performance_anomalies_df.collect()

            anomalies["performance_anomalies"] = [
                {
                    "step": row["step"],
                    "execution_time": row["execution_time"],
                    "validation_rate": row["validation_rate"],
                    "success": row["success"],
                }
                for row in performance_anomalies
            ]

            # Detect data quality anomalies using query builder
            quality_anomalies_df = (
                QueryBuilder.build_quality_anomaly_query(df, 90.0)
                .select("step", "validation_rate", "valid_rows", "invalid_rows")
                .orderBy("validation_rate")
            )

            quality_anomalies = quality_anomalies_df.collect()

            anomalies["quality_anomalies"] = [
                {
                    "step": row["step"],
                    "validation_rate": row["validation_rate"],
                    "valid_rows": row["valid_rows"],
                    "invalid_rows": row["invalid_rows"],
                }
                for row in quality_anomalies
            ]

            # Calculate anomaly score
            total_executions = df.count()
            anomaly_count = len(performance_anomalies) + len(quality_anomalies)
            anomaly_score = (
                (anomaly_count / total_executions) * 100 if total_executions > 0 else 0
            )

            anomalies["anomaly_score"] = float(round(anomaly_score, 2))
            anomalies["total_anomalies"] = int(anomaly_count)
            anomalies["total_executions"] = int(total_executions)

            self.logger.info(
                f"Anomaly detection completed: {anomaly_count} anomalies found"
            )
            return anomalies

        except Exception as e:
            self.logger.error(f"Failed to detect anomalies: {e}")
            raise WriterError(f"Failed to detect anomalies: {e}") from e

    def generate_performance_report(self, df: DataFrame) -> PerformanceReport:
        """
        Generate comprehensive performance report.

        The report combines overall aggregates, per-phase statistics (30-day
        window argument) and recent daily performance (7-day window argument).

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing performance report

        Raises:
            WriterError: If any query or result conversion fails.
        """
        try:
            self.logger.info("Generating performance report")

            # Overall statistics using query builder
            # NOTE(review): PySpark's DataFrame.agg is declared as
            # agg(*exprs); passing the aggregations as keyword arguments
            # presumably relies on the compat DataFrame — confirm.
            overall_stats_df = df.agg(**QueryBuilder.get_common_aggregations())
            overall_stats = overall_stats_df.collect()[0]

            # Phase-wise statistics using query builder
            phase_stats_df = QueryBuilder.build_phase_trends_query(df, 30)
            phase_stats = phase_stats_df.collect()

            # Recent performance using query builder
            recent_performance_df = QueryBuilder.build_recent_performance_query(df, 7)
            recent_performance = recent_performance_df.collect()

            report = {
                "overall_statistics": {
                    "total_executions": overall_stats["total_executions"],
                    "successful_executions": overall_stats["successful_executions"],
                    "success_rate": (
                        (
                            overall_stats["successful_executions"]
                            / overall_stats["total_executions"]
                        )
                        * 100
                        if overall_stats["total_executions"] > 0
                        else 0
                    ),
                    "avg_execution_time": overall_stats["avg_execution_time"],
                    "avg_validation_rate": overall_stats["avg_validation_rate"],
                    "total_rows_written": overall_stats["total_rows_written"],
                },
                "phase_statistics": [
                    {
                        "phase": row["phase"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": row["avg_execution_time"],
                        "avg_validation_rate": row["avg_validation_rate"],
                        "total_rows_written": row["total_rows_written"],
                    }
                    for row in phase_stats
                ],
                "recent_performance": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_executions": row["daily_executions"],
                        "avg_execution_time": row["avg_execution_time"],
                        "avg_validation_rate": row["avg_validation_rate"],
                    }
                    for row in recent_performance
                ],
                "generated_at": datetime.now().isoformat(),
            }

            self.logger.info("Performance report generated successfully")
            return cast(PerformanceReport, report)

        except Exception as e:
            self.logger.error(f"Failed to generate performance report: {e}")
            raise WriterError(f"Failed to generate performance report: {e}") from e

In [None]:
# Module: pipeline_builder.writer.operations (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.functions, pipeline_builder.models.execution, pipeline_builder.validation, pipeline_builder.writer.exceptions, pipeline_builder.writer.models, pipeline_builder.writer.models, pipeline_builder_base.logging, pipeline_builder_base.logging, pipeline_builder_base.models, validation.utils, writer.exceptions

# mypy: ignore-errors
"""
Writer operations module for data processing and transformations.

This module contains the core data processing operations for the writer,
including data transformation, validation, and quality checks.

"""

from __future__ import annotations

import json
from datetime import datetime
from typing import Callable, Dict, Optional, TypedDict, Union, cast
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionResult, StepResult  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..validation import get_dataframe_info  # Removed: defined in notebook cells above
# from .exceptions import WriterValidationError  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
    # LogRow,
    # create_log_rows_from_execution_result,
    # create_log_schema,
    # validate_log_data,
# )

# ============================================================================
# TypedDict Definitions
# ============================================================================

class DataQualityReport(TypedDict):
    """Data quality validation report.

    Shape of the mapping returned by ``DataProcessor.validate_data_quality``.
    """

    # True when no validation issues were recorded and no execution failed.
    is_valid: bool
    # Row count reported by ``get_dataframe_info`` for the validated DataFrame.
    total_rows: int
    # Null count per critical column; only columns present in the DataFrame appear.
    null_counts: Dict[str, int]
    # Human-readable descriptions of detected problems (e.g. low validation rates).
    validation_issues: list[str]
    # Number of rows whose ``success`` column is false.
    failed_executions: int
    # Score between 0 and 100 (higher is better), rounded to two decimals.
    data_quality_score: float

class DataProcessor:
    """Handles data processing and transformation operations."""

    def __init__(
        self,
        spark: SparkSession,
        functions: Optional[FunctionsProtocol] = None,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the data processor."""
        self.spark = spark
        self.functions = functions if functions is not None else get_default_functions()
        self.logger = logger or PipelineLogger("DataProcessor")

    def process_execution_result(
        self,
        execution_result: ExecutionResult,
        run_id: str,
        run_mode: str = "initial",
        metadata: Union[Dict[str, Union[str, int, float, bool]], None] = None,
        table_total_rows_provider: Optional[
            Callable[[Optional[str]], Optional[int]]
        ] = None,
    ) -> list[LogRow]:
        """
        Turn an execution result into validated log rows.

        The rows are built by ``create_log_rows_from_execution_result`` and
        checked with ``validate_log_data``. When a provider callback is given,
        rows whose ``table_total_rows`` is still None are filled in by calling
        it with the row's ``table_fqn``.

        Args:
            execution_result: The execution result to process
            run_id: Unique run identifier
            run_mode: Mode of the run
            metadata: Additional metadata
            table_total_rows_provider: Optional callback to supply table row counts

        Returns:
            List of processed log rows

        Raises:
            WriterValidationError: If validation fails
        """
        try:
            self.logger.info(f"Processing execution result for run {run_id}")

            rows = create_log_rows_from_execution_result(
                execution_result, run_id, run_mode, metadata
            )

            # Reject the whole batch when the generated rows do not validate.
            check = validate_log_data(rows)
            if not check["is_valid"]:
                problems = check["errors"]
                raise WriterValidationError(
                    f"Log data validation failed: {problems}",
                    validation_errors=problems,
                    context={"run_id": run_id, "log_rows_count": len(rows)},
                    suggestions=[
                        "Check data quality in source execution result",
                        "Verify all required fields are present",
                        "Ensure data types are correct",
                    ],
                )

            # Only rows without a row count consult the provider.
            if table_total_rows_provider is not None:
                pending = (r for r in rows if r.get("table_total_rows") is None)
                for entry in pending:
                    table_name = entry.get("table_fqn")
                    entry["table_total_rows"] = table_total_rows_provider(table_name)

            self.logger.info(f"Successfully processed {len(rows)} log rows")
            return rows

        except Exception as e:
            # Log for diagnostics, then let the caller see the original error.
            self.logger.error(f"Failed to process execution result: {e}")
            raise

    def process_step_results(
        self,
        step_results: Dict[str, StepResult],
        run_id: str,
        run_mode: str = "initial",
        metadata: Union[Dict[str, Union[str, int, float, bool]], None] = None,
    ) -> list[LogRow]:
        """
        Process step results into log rows.

        One ``LogRow`` is produced per entry of ``step_results``. The run id is
        reused for ``execution_id`` and ``pipeline_id``, the schema is fixed to
        ``"default"`` and the write mode to ``"append"``.

        Args:
            step_results: Dictionary of step results keyed by step name
            run_id: Unique run identifier
            run_mode: Mode of the run
            metadata: Additional metadata; each row receives its own copy

        Returns:
            List of processed log rows

        Raises:
            Exception: Any error raised while building the rows is logged and
                re-raised unchanged.
        """
        try:
            self.logger.info(
                f"Processing {len(step_results)} step results for run {run_id}"
            )

            # One timestamp for the whole batch so run_started_at/run_ended_at
            # agree with each other and across rows.
            processed_at = datetime.now()

            log_rows = []
            for step_name, step_result in step_results.items():
                phase = step_result.phase.value
                rows_processed = step_result.rows_processed
                validation_rate = step_result.validation_rate

                # Derive invalid_rows from valid_rows: truncating both
                # independently can lose a row (e.g. 3 rows at 50% -> 1 + 1).
                valid_rows = int(rows_processed * validation_rate / 100)
                valid_rows = max(0, min(valid_rows, rows_processed))
                invalid_rows = rows_processed - valid_rows

                log_row = LogRow(
                    run_id=run_id,
                    run_mode=run_mode,  # type: ignore[typeddict-item]
                    run_started_at=processed_at,
                    run_ended_at=processed_at,
                    execution_id=run_id,
                    pipeline_id=run_id,
                    schema="default",
                    phase=phase,
                    step_name=step_name,
                    step_type=phase,
                    start_time=step_result.start_time,
                    end_time=step_result.end_time,
                    duration_secs=step_result.duration_secs,
                    table_fqn=f"{phase}_{step_name}",
                    write_mode="append",
                    input_rows=rows_processed,
                    output_rows=step_result.rows_written,
                    rows_written=step_result.rows_written,
                    valid_rows=valid_rows,
                    invalid_rows=invalid_rows,
                    validation_rate=validation_rate,
                    success=step_result.success,
                    error_message=step_result.error_message,
                    # Copy so rows never alias each other or the caller's dict.
                    metadata=dict(metadata) if metadata else {},
                    rows_processed=rows_processed,
                    table_total_rows=None,
                    memory_usage_mb=0.0,
                    cpu_usage_percent=0.0,
                )
                log_rows.append(log_row)

            self.logger.info(f"Successfully processed {len(log_rows)} step log rows")
            return log_rows

        except Exception as e:
            self.logger.error(f"Failed to process step results: {e}")
            raise

    def create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> DataFrame:
        """
        Build a Spark DataFrame from log rows.

        Each row is flattened into a plain dict: most fields are copied as-is,
        ``metadata`` is serialized to a JSON string (None when empty) and a
        ``created_at`` ISO-8601 string is stamped at conversion time. The
        DataFrame is created with the schema from ``create_log_schema()``.

        Args:
            log_rows: List of log rows to convert

        Returns:
            DataFrame containing the log rows
        """
        # Fields copied verbatim, in output order.
        # NOTE(review): ``table_total_rows`` is not copied here; confirm against
        # ``create_log_schema`` whether that omission is intended.
        passthrough_fields = (
            "run_id",
            "run_mode",
            "run_started_at",
            "run_ended_at",
            "execution_id",
            "pipeline_id",
            "schema",
            "phase",
            "step_name",
            "step_type",
            "start_time",
            "end_time",
            "duration_secs",
            "table_fqn",
            "write_mode",
            "input_rows",
            "output_rows",
            "rows_written",
            "rows_processed",
            "valid_rows",
            "invalid_rows",
            "validation_rate",
            "success",
            "error_message",
            "memory_usage_mb",
            "cpu_usage_percent",
        )

        try:
            self.logger.info(f"Creating DataFrame from {len(log_rows)} log rows")

            records = []
            for entry in log_rows:
                record = {name: entry[name] for name in passthrough_fields}
                meta = entry["metadata"]
                record["metadata"] = json.dumps(meta) if meta else None
                # Timestamp is stored directly as a string.
                record["created_at"] = datetime.now().isoformat()
                records.append(record)

            # Explicit schema keeps column types stable even when values are None.
            log_schema = create_log_schema()
            frame = self.spark.createDataFrame(records, log_schema)  # type: ignore[type-var]

            self.logger.info("Successfully created DataFrame from log rows")
            return frame

        except Exception as e:
            self.logger.error(f"Failed to create DataFrame from log rows: {e}")
            raise

    def validate_data_quality(self, df: DataFrame) -> DataQualityReport:
        """
        Validate data quality of the DataFrame.

        Counts nulls in the critical columns that are present, flags rows whose
        ``validation_rate`` is below 95, counts rows whose ``success`` flag is
        false, and folds those figures into a 0-100 quality score.

        Args:
            df: DataFrame to validate

        Returns:
            Dictionary containing validation results. ``is_valid`` is True only
            when there are no validation issues and no failed executions.

        Raises:
            Exception: Any error raised while inspecting the DataFrame is
                logged and re-raised unchanged.
        """
        try:
            self.logger.info("Validating data quality")

            # Get DataFrame info
            df_info = get_dataframe_info(df)

            # "step_name" is the column written by create_dataframe_from_log_rows;
            # the previous list only named "step", so that null check was
            # silently skipped for log frames. "step" is kept so frames that do
            # carry a column of that name are still checked.
            critical_columns = ["run_id", "phase", "step_name", "step", "success"]
            present_columns = set(df.columns)

            null_counts = {}
            for col_name in critical_columns:
                if col_name in present_columns:
                    null_counts[col_name] = df.filter(
                        self.functions.col(col_name).isNull()
                    ).count()

            # Check validation rates
            validation_issues = []
            if "validation_rate" in present_columns:
                low_validation = df.filter(
                    self.functions.col("validation_rate") < 95.0
                ).count()
                if low_validation > 0:
                    validation_issues.append(
                        f"{low_validation} records with validation rate < 95%"
                    )

            # Check for failed executions
            failed_executions = 0
            if "success" in present_columns:
                failed_executions = df.filter(~self.functions.col("success")).count()

            validation_result = {
                "is_valid": len(validation_issues) == 0 and failed_executions == 0,
                "total_rows": df_info["row_count"],
                "null_counts": null_counts,
                "validation_issues": validation_issues,
                "failed_executions": failed_executions,
                "data_quality_score": self._calculate_quality_score(
                    df_info, null_counts, validation_issues, failed_executions
                ),
            }

            self.logger.info(
                f"Data quality validation completed: {validation_result['is_valid']}"
            )
            return cast(DataQualityReport, validation_result)

        except Exception as e:
            self.logger.error(f"Failed to validate data quality: {e}")
            raise

    def _calculate_quality_score(
        self,
        df_info: Dict[str, Union[int, str]],
        null_counts: Dict[str, int],
        validation_issues: list[str],
        failed_executions: int,
    ) -> float:
        """Calculate data quality score."""
        try:
            total_rows = df_info["row_count"]
            if total_rows == 0:
                return 0.0

            # Ensure total_rows is an integer for division
            if not isinstance(total_rows, int):
                total_rows = int(total_rows) if total_rows else 0
            if total_rows == 0:
                return 0.0

            # Calculate null penalty
            null_penalty = sum(null_counts.values()) / total_rows

            # Calculate validation penalty
            validation_penalty = len(validation_issues) * 0.1

            # Calculate failure penalty
            failure_penalty = failed_executions / total_rows

            # Calculate quality score (0-100)
            quality_score = max(
                0.0, 100.0 - (null_penalty + validation_penalty + failure_penalty) * 100
            )

            return float(round(quality_score, 2))

        except Exception:
            return 0.0

    def apply_data_transformations(self, df: DataFrame) -> DataFrame:
        """
        Apply data transformations to the DataFrame.

        Args:
            df: DataFrame to transform

        Returns:
            Transformed DataFrame
        """
        try:
            self.logger.info("Applying data transformations")

            # Add computed columns
            df_transformed = df.withColumn(
                "processing_efficiency",
                self.functions.when(
                    self.functions.col("input_rows") > 0,
                    self.functions.col("output_rows")  # type: ignore[arg-type]
                    / self.functions.col("input_rows")
                    * 100,
                ).otherwise(0),
            ).withColumn(
                "data_quality_score",
                self.functions.when(
                    self.functions.col("validation_rate") >= 95.0, "High"
                )
                .when(self.functions.col("validation_rate") >= 80.0, "Medium")
                .otherwise("Low"),
            )

            self.logger.info("Data transformations applied successfully")
            return df_transformed

        except Exception as e:
            self.logger.error(f"Failed to apply data transformations: {e}")
            raise

In [None]:
# Module: pipeline_builder.reporting.execution_reporter (pipeline_builder)
#
# Dependencies: pipeline_builder.execution, pipeline_builder_base.logging

"""Execution reporter for creating execution reports.

This module provides a service for creating reports from execution results.
The ExecutionReporter separates reporting logic from execution, making it
easy to generate summaries and reports from pipeline execution results.
"""

from __future__ import annotations

from typing import Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..execution import ExecutionResult, StepExecutionResult  # Removed: defined in notebook cells above

class ExecutionReporter:
    """Service for creating execution reports.

    Creates reports from execution results, separating reporting from execution.
    Provides methods to generate summary dictionaries from ExecutionResult and
    StepExecutionResult objects.

    Attributes:
        logger: PipelineLogger instance for logging.

    Example:
        >>> from pipeline_builder.reporting.execution_reporter import ExecutionReporter
        >>>
        >>> reporter = ExecutionReporter()
        >>> summary = reporter.create_execution_summary(execution_result)
        >>> step_summary = reporter.create_step_summary(step_result)
        >>> print(f"Pipeline status: {summary['status']}")
        >>> print(f"Total rows processed: {summary['total_rows_processed']}")
    """

    def __init__(
        self,
        logger: Optional[PipelineLogger] = None,
    ):
        """Initialize the execution reporter.

        Args:
            logger: Optional PipelineLogger instance. If None, creates a
                default logger.
        """
        self.logger = logger or PipelineLogger()

    def create_execution_summary(
        self,
        result: ExecutionResult,
    ) -> dict:
        """Create a summary of execution results.

        Generates a dictionary summary of pipeline execution results including
        status, timing, step counts, and row metrics.

        Args:
            result: ExecutionResult from pipeline execution.

        Returns:
            Dictionary containing:
            - execution_id: Unique execution identifier
            - mode: Execution mode used
            - status: Overall pipeline status
            - duration: Total execution duration in seconds
            - steps_count: Total number of steps
            - completed_steps: Number of successfully completed steps
            - failed_steps: Number of failed steps
            - total_rows_processed: Sum of rows processed across all steps
            - total_rows_written: Sum of rows written across all steps
            - error: Error message if pipeline failed (optional)
        """
        summary = {
            "execution_id": result.execution_id,
            "mode": result.mode.value
            if hasattr(result.mode, "value")
            else str(result.mode),
            "status": result.status,
            "duration": result.duration,
            "steps_count": len(result.steps) if result.steps else 0,
        }

        if result.steps:
            completed_steps = [s for s in result.steps if s.status.value == "completed"]
            failed_steps = [s for s in result.steps if s.status.value == "failed"]

            summary["completed_steps"] = len(completed_steps)
            summary["failed_steps"] = len(failed_steps)
            summary["total_rows_processed"] = sum(
                s.rows_processed or 0 for s in completed_steps
            )
            summary["total_rows_written"] = sum(
                s.rows_written or 0 for s in completed_steps
            )

        if result.error:
            summary["error"] = result.error

        return summary

    def create_step_summary(
        self,
        result: StepExecutionResult,
    ) -> dict:
        """Create a summary of a step execution result.

        Generates a dictionary summary of individual step execution results
        including status, timing, row counts, and validation metrics.

        Args:
            result: StepExecutionResult from step execution.

        Returns:
            Dictionary containing:
            - step_name: Name of the step
            - step_type: Type of step (BRONZE, SILVER, GOLD)
            - status: Step execution status
            - duration: Step execution duration in seconds
            - rows_processed: Number of rows processed
            - rows_written: Number of rows written (None for Bronze steps)
            - validation_rate: Percentage of rows that passed validation
            - output_table: Fully qualified output table name (None for Bronze steps)
            - error: Error message if step failed (optional)
        """
        return {
            "step_name": result.step_name,
            "step_type": result.step_type.value
            if hasattr(result.step_type, "value")
            else str(result.step_type),
            "status": result.status.value
            if hasattr(result.status, "value")
            else str(result.status),
            "duration": result.duration,
            "rows_processed": result.rows_processed,
            "rows_written": result.rows_written,
            "validation_rate": result.validation_rate,
            "output_table": result.output_table,
            "error": result.error,
        }

In [None]:
# Module: pipeline_builder.table_operations (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.errors, pipeline_builder.performance

"""
Table operations utilities for the pipeline framework.

This module provides comprehensive utilities for reading, writing, and managing
tables in the data lake. It includes Delta Lake support, table existence checks,
schema validation, and standardized write patterns.

**Key Features:**
    - **Delta Lake Integration**: Automatic detection and support for Delta tables
    - **Standardized Write Patterns**: Consistent write operations across the framework
    - **Table Management**: Existence checks, schema validation, and table operations
    - **Error Handling**: Comprehensive error handling with custom exceptions
    - **Performance Monitoring**: Built-in timing and performance tracking

**Common Patterns:**
    - Check if table exists before writing
    - Prepare Delta tables for overwrite operations
    - Create fully qualified table names (FQN)
    - Write DataFrames with standardized patterns

Dependencies:
    - compat: Compatibility layer for Spark/PySpark types
    - errors: Custom exception classes
    - performance: Performance monitoring decorators

Example:
    >>> from pipeline_builder.table_operations import (
    ...     table_exists,
    ...     write_overwrite_table,
    ...     fqn
    ... )
    >>> from pipeline_builder.compat import SparkSession
    >>>
    >>> # Create fully qualified name
    >>> table_name = fqn("analytics", "user_events")
    >>>
    >>> # Check if table exists
    >>> if table_exists(spark, table_name):
    ...     print(f"Table {table_name} exists")
    >>>
    >>> # Write DataFrame
    >>> rows = write_overwrite_table(df, table_name)
    >>> print(f"Wrote {rows} rows")
"""

from __future__ import annotations

import logging
import os
import tempfile
from typing import Any, Dict, Optional, Union

# from .compat import AnalysisException, DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from .errors import TableOperationError  # Removed: defined in notebook cells above
# from .performance import time_operation  # Removed: defined in notebook cells above
# Fallback: no-op time_operation decorator (will be replaced when performance module loads)
# NOTE(review): this definition is unconditional, so it also rebinds any
# time_operation an earlier cell may have provided - confirm the cell order.
def time_operation(name):
    """Return a decorator that hands back the decorated function unchanged.

    Args:
        name: Operation label; accepted for call compatibility and ignored.
    """

    def _identity(func):
        return func

    return _identity

# Handle optional Delta Lake dependency: HAS_DELTA records whether the
# ``delta`` Python package could be imported; DeltaTable is None when it
# could not, so callers must check HAS_DELTA before using it.
try:
    from delta.tables import DeltaTable

    HAS_DELTA = True
except (ImportError, AttributeError, RuntimeError):
    DeltaTable = None  # type: ignore[misc, assignment]
    HAS_DELTA = False

# Module-level logger used by the table operation helpers below.
logger = logging.getLogger(__name__)

# Cache for Delta Lake availability checks, keyed by ``str(id(spark))``
# (filled in by is_delta_lake_available).
_delta_availability_cache: Dict[str, bool] = {}

def is_delta_lake_available(spark: SparkSession) -> bool:  # type: ignore[valid-type]
    """Check if Delta Lake is available in the Spark session.

    Delta is considered available when both the Delta session extension and
    the Delta catalog are configured. When only the extension is configured,
    a small DataFrame is written in Delta format to a temporary directory as
    a probe. The outcome is cached per session object.

    Args:
        spark: SparkSession instance to check.

    Returns:
        True if Delta Lake is available (extension and catalog configured, or
        the write probe succeeded), False otherwise. Errors during detection
        are logged at debug level and treated as "not available".

    Example:
        >>> from pipeline_builder.table_operations import is_delta_lake_available
        >>> if is_delta_lake_available(spark):
        ...     print("Delta Lake is available")
        ...     # Use Delta-specific operations
        ... else:
        ...     print("Delta Lake is not available")
    """
    # The cache key is the Python object id of the session.
    # NOTE(review): ids can be reused once a session object is garbage
    # collected, so a long-lived process could see a stale entry.
    spark_id = str(id(spark))
    if spark_id in _delta_availability_cache:
        return _delta_availability_cache[spark_id]

    # Read the configuration once; either lookup may fail on its own.
    has_extension = False
    has_catalog = False
    try:
        extensions = spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
        has_extension = bool(extensions) and "DeltaSparkSessionExtension" in extensions
        catalog = spark.conf.get("spark.sql.catalog.spark_catalog", "")  # type: ignore[attr-defined]
        has_catalog = bool(catalog) and "DeltaCatalog" in catalog
    except Exception as exc:
        logger.debug(f"Delta Lake configuration check failed: {exc}")

    available = has_extension and has_catalog

    # Only the extension is configured: probe with a minimal Delta write.
    if has_extension and not available:
        try:
            test_df = spark.createDataFrame([(1, "test")], ["id", "name"])
            # Use a unique temp directory to avoid conflicts
            with tempfile.TemporaryDirectory() as temp_dir:
                test_path = os.path.join(temp_dir, "delta_test")
                test_df.write.format("delta").mode("overwrite").save(test_path)
                # Record success before the directory is cleaned up: a cleanup
                # error must not turn a working Delta setup into "unavailable".
                available = True
        except Exception as exc:
            logger.debug(f"Delta Lake write probe did not complete cleanly: {exc}")

    _delta_availability_cache[spark_id] = available
    return available

def create_dataframe_writer(
    df: DataFrame,
    spark: SparkSession,  # type: ignore[valid-type]
    mode: str,
    table_name: Optional[str] = None,
    **options: Any,
) -> Any:
    """Create a DataFrameWriter using the standardized Delta overwrite pattern.

    Creates a DataFrameWriter configured with Delta format and appropriate
    options. For overwrite mode, uses `overwriteSchema=true` to allow schema
    evolution. Always uses Delta format - failures will propagate if Delta
    is not available.

    Args:
        df: DataFrame to write.
        spark: SparkSession instance (used for Delta table preparation if needed).
        mode: Write mode string. Common values:
            - "overwrite": Replace existing data
            - "append": Add to existing data
            - "ignore": Skip if table exists
            - "error": Fail if table exists
        table_name: Optional fully qualified table name. If provided and mode
            is "overwrite", prepares the Delta table for overwrite.
        **options: Additional write options. ``partitionBy`` (a column name or
            a sequence of column names) is applied through the writer's
            ``partitionBy`` method; every other entry is passed to
            ``writer.option`` (e.g. mergeSchema).

    Returns:
        DataFrameWriter instance configured with Delta format and options.

    Example:
        >>> from pipeline_builder.table_operations import create_dataframe_writer
        >>> writer = create_dataframe_writer(
        ...     df,
        ...     spark,
        ...     mode="overwrite",
        ...     table_name="analytics.events",
        ...     partitionBy="date"
        ... )
        >>> writer.saveAsTable("analytics.events")
    """
    # Use standardized overwrite pattern: overwrite + overwriteSchema
    if mode == "overwrite":
        # Prepare Delta table for overwrite by dropping it if it exists
        # This avoids "Table does not support truncate in batch mode" errors
        if table_name is not None:
            prepare_delta_overwrite(spark, table_name)
        writer = (
            df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
        )
    else:
        # Append or other modes - always use Delta
        writer = df.write.format("delta").mode(mode)

    # Partitioning is a writer method, not a data source option: forwarding it
    # through .option() would be silently ignored.
    partition_by = options.pop("partitionBy", None)
    if partition_by:
        if isinstance(partition_by, str):
            partition_columns = [partition_by]
        else:
            partition_columns = list(partition_by)
        writer = writer.partitionBy(*partition_columns)

    for key, value in options.items():
        writer = writer.option(key, value)

    return writer

def prepare_delta_overwrite(
    spark: SparkSession,  # type: ignore[valid-type]
    table_name: str,
) -> None:
    """Prepare for Delta table overwrite by dropping existing table if it exists.

    Delta tables don't support truncate in batch mode, so we must drop the table
    before overwriting it. This function safely handles this preparation by
    dropping the table if it exists, avoiding "Table does not support truncate
    in batch mode" errors.

    **Important:** This function should be called before any Delta overwrite
    operation to ensure compatibility with Delta Lake's limitations.

    Args:
        spark: SparkSession instance for executing SQL commands.
        table_name: Fully qualified table name (e.g., "schema.table") or
            table path. A value that contains a dot and no path separator
            (``/`` or ``\\``) is treated as a table name and dropped via SQL.
            Anything else is treated as a path: the function only checks
            whether a Delta table exists there and leaves the replacement to
            the overwrite itself.

    Example:
        >>> from pipeline_builder.table_operations import prepare_delta_overwrite
        >>> prepare_delta_overwrite(spark, "analytics.user_events")
        >>> df.write.format("delta").mode("overwrite").saveAsTable("analytics.user_events")

    Note:
        This function does nothing when the ``delta`` package is not
        importable. It is safe to call even if the table doesn't exist: it uses
        `DROP TABLE IF EXISTS` to avoid errors. If the drop fails for any reason,
        a warning is logged but execution continues (the write operation may
        still succeed or fail with a more specific error).
    """
    if not HAS_DELTA:
        return

    # A path separator can never be part of a plain table identifier, so
    # values such as "s3://bucket/events.delta" or "data/events.delta" must
    # not be fed to DROP TABLE even though they contain a dot.
    looks_like_path = "/" in table_name or "\\" in table_name
    is_table_name = "." in table_name and not looks_like_path

    if not is_table_name:
        # It's a path - check if Delta table exists at that path
        try:
            if DeltaTable.isDeltaTable(spark, table_name):  # type: ignore[attr-defined,arg-type]
                # Path-based Delta tables can't be dropped via SQL; the
                # overwrite itself replaces them.
                logger.debug(
                    f"Delta table exists at path {table_name}, overwrite will replace it"
                )
        except Exception as e:
            # Best effort only: proceed with the overwrite regardless.
            logger.debug(f"Could not check for Delta table at {table_name}: {e}")
        return

    # Always try to drop the table, since we're about to overwrite it.
    # DROP TABLE IF EXISTS avoids errors if the table doesn't exist.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")  # type: ignore[attr-defined]
        logger.debug(f"Dropped table {table_name} before overwrite (if it existed)")
    except Exception as e:
        # If drop fails, log warning but continue
        # The write might still work if table doesn't exist
        logger.warning(f"Could not drop table {table_name} before overwrite: {e}")
        return

    # Fallback for a table that is still visible after the drop: clear it
    # through the DeltaTable API. DESCRIBE raising here is the normal case
    # (the table is gone), so failures are deliberately ignored.
    try:
        spark.sql(f"DESCRIBE TABLE EXTENDED {table_name}").collect()  # type: ignore[attr-defined]
        delta_table = DeltaTable.forName(spark, table_name)  # type: ignore[attr-defined,assignment,arg-type]
        delta_table.delete()  # type: ignore[attr-defined]
        logger.debug(f"Deleted Delta table {table_name} using DeltaTable API")
    except Exception:
        pass

# Keep the old function name for backward compatibility, but it now calls the public function
def _prepare_delta_overwrite_table_ops(
    spark: SparkSession,  # type: ignore[valid-type]
    table_name: str,
) -> None:
    """Backward-compatible alias for ``prepare_delta_overwrite()``.

    Kept so that callers still using the old private name keep working.
    New code should call ``prepare_delta_overwrite()`` directly.

    Args:
        spark: SparkSession forwarded unchanged to
            ``prepare_delta_overwrite()``.
        table_name: Table name (or path) forwarded unchanged to
            ``prepare_delta_overwrite()``.

    Returns:
        None. Whatever ``prepare_delta_overwrite()`` returns is discarded.
    """
    # Pure delegation: no extra validation or error handling happens here.
    prepare_delta_overwrite(spark, table_name)

def fqn(schema: str, table: str) -> str:
    """Build the ``schema.table`` identifier for a table.

    Args:
        schema: Database schema name. Must be non-empty (truthy).
        table: Table name. Must be non-empty (truthy).

    Returns:
        The two names joined by a single dot, e.g.
        ``"analytics.user_events"``.

    Raises:
        ValueError: If ``schema`` or ``table`` is empty.

    Example:
        >>> fqn("analytics", "user_events")
        'analytics.user_events'
    """
    # Happy path first; anything falsy on either side falls through to the error.
    if schema and table:
        return "{}.{}".format(schema, table)
    raise ValueError("Schema and table names cannot be empty")

@time_operation("table write (overwrite)")
def write_overwrite_table(
    df: DataFrame,
    fqn: str,
    **options: Union[str, int] | Union[float, bool],  # type: ignore[valid-type]
) -> int:
    """Write a DataFrame to a table, replacing whatever the table held before.

    The DataFrame is cached and counted, ``prepare_delta_overwrite()`` is
    called for the target, and the data is then written with
    ``format("delta")``, ``mode("overwrite")`` and ``overwriteSchema=true``
    via ``saveAsTable(fqn)``.

    Args:
        df: DataFrame to write. It is cached (and counted) before writing;
            this function does not unpersist it afterwards.
        fqn: Fully qualified table name (e.g., "schema.table").
        **options: Additional write options.
            - partitionBy: Column name, or list/tuple of column names, to
              partition the table by. Applied with
              ``DataFrameWriter.partitionBy()`` (a plain ``option()`` call
              would not set the partition spec).
            - Any other key is forwarded verbatim with
              ``DataFrameWriter.option(key, value)``.

    Returns:
        Number of rows in ``df`` (counted before the write).

    Raises:
        TableOperationError: If any step fails (caching/counting, preparing
            the overwrite, or the write itself). The original exception is
            chained as the cause.

    Example:
        >>> from pipeline_builder.table_operations import write_overwrite_table
        >>> rows = write_overwrite_table(
        ...     df,
        ...     "analytics.user_events",
        ...     partitionBy="date"
        ... )
        >>> print(f"Wrote {rows} rows")
    """
    try:
        df.cache()  # type: ignore[attr-defined]
        cnt: int = df.count()  # type: ignore[attr-defined]

        # The session is taken from the DataFrame so callers need not pass it.
        spark = df.sql_ctx.sparkSession  # type: ignore[attr-defined]

        # Let the shared helper clear out any existing table before writing.
        prepare_delta_overwrite(spark, fqn)

        # This function always writes in overwrite mode.
        write_mode = "overwrite"
        writer = (
            df.write.format("delta").mode(write_mode).option("overwriteSchema", "true")
        )  # type: ignore[attr-defined]

        # partitionBy is a writer *method*, not a data source option: passing it
        # through option() would leave the table unpartitioned without any error.
        partition_cols = options.pop("partitionBy", None)
        if partition_cols is not None:
            if isinstance(partition_cols, str):
                partition_cols = [partition_cols]
            writer = writer.partitionBy(*partition_cols)

        for key, value in options.items():
            writer = writer.option(key, value)

        writer.saveAsTable(fqn)
        logger.info(f"Successfully wrote {cnt} rows to {fqn} in {write_mode} mode")
        return cnt

    except Exception as e:
        raise TableOperationError(f"Failed to write table {fqn}: {e}") from e

@time_operation("table write (append)")
def write_append_table(
    df: DataFrame,
    fqn: str,
    **options: Union[str, int] | Union[float, bool],  # type: ignore[valid-type]
) -> int:
    """Append a DataFrame's rows to a table.

    The DataFrame is cached and counted, then written with
    ``format("parquet")`` and ``mode("append")`` via ``saveAsTable(fqn)``.

    Args:
        df: DataFrame to write. It is cached (and counted) before writing;
            this function does not unpersist it afterwards.
        fqn: Fully qualified table name (e.g., "schema.table").
        **options: Additional write options. Every key/value pair is
            forwarded with ``DataFrameWriter.option(key, value)``, e.g.
            - compression: Compression codec (e.g., "snappy", "gzip")
            NOTE(review): ``partitionBy`` passed here also goes through
            ``option()``; presumably that does not partition the table --
            confirm against the Spark DataFrameWriter documentation.

    Returns:
        Number of rows in ``df`` (counted before the write).

    Raises:
        TableOperationError: If caching, counting or writing fails. The
            original exception is chained as the cause.

    Example:
        >>> from pipeline_builder.table_operations import write_append_table
        >>> rows = write_append_table(df, "analytics.user_events")
        >>> print(f"Appended {rows} rows")
    """
    try:
        # Cache first so the count and the write can share the computed data.
        df.cache()  # type: ignore[attr-defined]
        cnt: int = df.count()  # type: ignore[attr-defined]

        append_writer = df.write.format("parquet").mode("append")  # type: ignore[attr-defined]
        for option_name in options:
            append_writer = append_writer.option(option_name, options[option_name])

        append_writer.saveAsTable(fqn)
        logger.info(f"Successfully wrote {cnt} rows to {fqn} in append mode")
        return cnt

    except Exception as e:
        raise TableOperationError(f"Failed to write table {fqn}: {e}") from e

def read_table(
    spark: SparkSession,
    fqn: str,  # type: ignore[valid-type]
) -> DataFrame:  # type: ignore[valid-type]
    """Load a table as a DataFrame via ``spark.table()``.

    Args:
        spark: SparkSession used to resolve the table.
        fqn: Fully qualified table name (e.g., "schema.table").

    Returns:
        The DataFrame returned by ``spark.table(fqn)``.

    Raises:
        TableOperationError: On any failure. When the underlying exception's
            class name contains "AnalysisException" the message says the
            table does not exist; otherwise it is a generic read failure.
            The original exception is chained as the cause.

    Example:
        >>> from pipeline_builder.table_operations import read_table
        >>> df = read_table(spark, "analytics.user_events")
        >>> print(f"Read {df.count()} rows")
    """
    try:
        df = spark.table(fqn)  # type: ignore[attr-defined]
        logger.info(f"Successfully read table {fqn}")
        return df
    except Exception as e:
        # Match on the class *name* rather than with isinstance/except so the
        # check does not depend on which AnalysisException class is importable.
        if "AnalysisException" in type(e).__name__:
            message = f"Table {fqn} does not exist: {e}"
        else:
            message = f"Failed to read table {fqn}: {e}"
        raise TableOperationError(message) from e

def table_exists(
    spark: SparkSession,
    fqn: str,  # type: ignore[valid-type]
) -> bool:  # type: ignore[valid-type]
    """Report whether a table exists and can be read.

    Two probes are used. If the session has a ``catalog`` and
    ``catalog.tableExists(fqn)`` is true, the table is counted once and the
    answer is True. If that probe says no, or raises anything, the function
    falls back to ``spark.table(fqn).count()`` and treats success as
    existence.

    Args:
        spark: SparkSession used for the catalog lookup and the read.
        fqn: Fully qualified table name (e.g., "schema.table").

    Returns:
        True if one of the probes succeeded. False if the fallback read
        raised ``AnalysisException`` (logged at debug level) or any other
        exception (logged as a warning). This function never raises for a
        failed probe.

    Example:
        >>> from pipeline_builder.table_operations import table_exists
        >>> if table_exists(spark, "analytics.user_events"):
        ...     print("Table exists")
        ... else:
        ...     print("Table does not exist")
    """
    try:
        # Probe 1: catalog lookup. Any failure here is deliberately ignored
        # because probe 2 below gives the definitive answer.
        try:
            catalog_hit = hasattr(spark, "catalog") and spark.catalog.tableExists(fqn)  # type: ignore[attr-defined]
            if catalog_hit:
                # The count is kept on purpose to mirror legacy behavior.
                spark.table(fqn).count()  # type: ignore[attr-defined]
                return True
        except Exception:
            pass

        # Probe 2: actually read the table; raises if it cannot be resolved.
        spark.table(fqn).count()  # type: ignore[attr-defined]
    except AnalysisException:  # type: ignore[misc]
        logger.debug(f"Table {fqn} does not exist (AnalysisException)")
        return False
    except Exception as e:
        logger.warning(f"Error checking if table {fqn} exists: {e}")
        return False
    else:
        return True

def table_schema_is_empty(spark: SparkSession, fqn: str) -> bool:
    """Tell whether an existing table reports a schema with zero fields.

    Such a table is visible to ``table_exists()`` but ``spark.table(fqn)``
    yields a schema whose ``fields`` list is empty. Callers can use this to
    decide to drop and recreate the table.

    Args:
        spark: SparkSession used to look the table up.
        fqn: Fully qualified table name (e.g., "schema.table").

    Returns:
        True only when the table exists and its schema has a ``fields``
        attribute of length 0. False when the table does not exist, the
        schema has no ``fields`` attribute, the schema has at least one
        field, or any exception occurs (logged at debug level).

    Example:
        >>> from pipeline_builder.table_operations import table_schema_is_empty
        >>> if table_schema_is_empty(spark, "analytics.user_events"):
        ...     print("Table has empty schema - needs recreation")
        ...     drop_table(spark, "analytics.user_events")
    """
    try:
        if table_exists(spark, fqn):
            schema = spark.table(fqn).schema  # type: ignore[attr-defined]
            # Objects without .fields are treated as "not empty".
            return hasattr(schema, "fields") and len(schema.fields) == 0
        return False
    except Exception as e:
        logger.debug(f"Could not inspect schema for {fqn}: {e}")
        return False

def drop_table(
    spark: SparkSession,
    fqn: str,  # type: ignore[valid-type]
) -> bool:  # type: ignore[valid-type]
    """Drop a table through the JVM external catalog, if the table exists.

    Existence is checked with ``table_exists()`` first. The drop itself goes
    through ``spark._jsparkSession.sharedState().externalCatalog()``.

    Args:
        spark: SparkSession whose underlying Java session is used.
        fqn: Table name. Text before the first dot is the database and the
            rest is the table; a name without a dot is looked up in the
            "default" database.

    Returns:
        True if the drop call completed. False if the table did not exist,
        or if anything raised (the error is logged as a warning and
        swallowed).

    Example:
        >>> from pipeline_builder.table_operations import drop_table
        >>> if drop_table(spark, "analytics.user_events"):
        ...     print("Table dropped successfully")
        ... else:
        ...     print("Table did not exist or could not be dropped")
    """
    try:
        if not table_exists(spark, fqn):
            return False

        # Reach the external catalog through the Java-side session.
        java_session = spark._jsparkSession  # type: ignore[attr-defined]
        external_catalog = java_session.sharedState().externalCatalog()

        # Split on the FIRST dot only, so "db.a.b" -> ("db", "a.b").
        head, dot, tail = fqn.partition(".")
        database_name, table_name = (head, tail) if dot else ("default", fqn)

        # Positional flags: ignoreIfNotExists=True, purge=True.
        external_catalog.dropTable(database_name, table_name, True, True)
        logger.info(f"Dropped table {fqn}")
        return True
    except Exception as e:
        logger.warning(f"Failed to drop table {fqn}: {e}")
        return False

In [None]:
# Module: pipeline_builder.performance (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.table_operations, table_operations

"""
Performance monitoring utilities for the pipeline framework.

This module contains functions for timing operations, monitoring performance,
and managing execution metrics.

"""

from __future__ import annotations

import logging
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from functools import wraps
from typing import Any, Callable, Generator, Optional

# from .compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)

logger = logging.getLogger(__name__)

def now_dt() -> datetime:
    """Return the current time as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)

def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    The value is shown with two decimals in the largest fitting unit:
    seconds ("12.34s"), minutes ("1.50m") or hours ("2.00h").

    The unit is chosen from the value *as it will be displayed* (rounded to
    two decimals), so a duration just under a boundary is promoted to the
    next unit instead of rendering as "60.00s" or "60.00m".

    Args:
        seconds: Duration in seconds.

    Returns:
        Formatted duration string ending in "s", "m" or "h".

    Example:
        >>> format_duration(90)
        '1.50m'
        >>> format_duration(59.999)
        '1.00m'
    """
    # Compare the rounded value: 59.999 prints as 60.00, which belongs in minutes.
    if round(seconds, 2) < 60:
        return f"{seconds:.2f}s"

    minutes = seconds / 60
    # Same idea at the minutes/hours boundary (e.g. 3599.9s -> 60.00m -> 1.00h).
    if round(minutes, 2) < 60:
        return f"{minutes:.2f}m"

    hours = seconds / 3600
    return f"{hours:.2f}h"

def time_operation(operation_name: str = "operation") -> Callable[[Callable], Callable]:
    """Build a decorator that logs how long each call of a function takes.

    Every call logs "Starting ..." before the wrapped function runs and then
    either "Completed ..." (info) or "Failed ..." (error) with the elapsed
    wall-clock time from ``time.time()``. Exceptions are re-raised unchanged
    after being logged.

    Args:
        operation_name: Label used in the log messages.

    Returns:
        A decorator. The wrapper it produces returns whatever the wrapped
        function returns and keeps its metadata (``functools.wraps``).
    """

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            began = time.time()
            logger.info(f"Starting {operation_name}...")

            try:
                outcome = func(*args, **kwargs)
                duration = time.time() - began
                logger.info(f"Completed {operation_name} in {duration:.3f}s")
            except Exception as e:
                duration = time.time() - began
                logger.error(f"Failed {operation_name} after {duration:.3f}s: {e}")
                raise

            return outcome

        return wrapper

    return decorator

@contextmanager
def performance_monitor(
    operation_name: str, max_duration: Optional[float] = None
) -> Generator[None, None, None]:
    """Context manager that logs the duration of the enclosed block.

    Logs "Starting ..." on entry. On normal exit it logs "Completed ..." with
    the elapsed wall-clock time and, if a threshold was given and exceeded,
    an additional warning. If the block raises, it logs "Failed ..." as an
    error and re-raises the exception unchanged.

    Args:
        operation_name: Label used in the log messages.
        max_duration: Optional threshold in seconds. ``None`` disables the
            threshold check. Any number, including ``0``, enables it.

    Yields:
        None.
    """
    start_time = time.time()
    logger.info(f"Starting {operation_name}...")

    try:
        yield
        duration = time.time() - start_time
        logger.info(f"Completed {operation_name} in {duration:.3f}s")

        # Test against None explicitly: a plain truthiness check would treat a
        # threshold of 0 as "no threshold" and silently skip the warning.
        if max_duration is not None and duration > max_duration:
            logger.warning(
                f"{operation_name} took {duration:.3f}s, exceeding threshold of {max_duration}s"
            )

    except Exception as e:
        duration = time.time() - start_time
        logger.error(f"Failed {operation_name} after {duration:.3f}s: {e}")
        raise

@time_operation("write operation")
def time_write_operation(
    mode: str,
    df: DataFrame,
    fqn: str,
    **options: Any,
) -> tuple[int, float, datetime, datetime]:
    """Run a table write and report how long it took.

    Dispatches to ``write_overwrite_table`` or ``write_append_table``
    depending on ``mode`` and measures the call with ``time.time()``.

    Args:
        mode: Write mode, either "overwrite" or "append".
        df: DataFrame to write.
        fqn: Fully qualified table name.
        **options: Additional write options, forwarded to the chosen writer.

    Returns:
        Tuple of (rows_written, duration_secs, start_time, end_time), where
        the duration is rounded to three decimals and the two timestamps
        come from ``now_dt()``.

    Raises:
        ValueError: If ``mode`` is neither "overwrite" nor "append".
        Exception: Anything raised by the chosen writer is logged and
            re-raised unchanged.
    """
    start = now_dt()
    t0 = time.time()

    try:
        # Reject unknown modes inside the try so the failure is logged too.
        if mode not in ("overwrite", "append"):
            raise ValueError(
                f"Unknown write mode '{mode}'. Supported modes: overwrite, append"
            )

        write_fn = write_overwrite_table if mode == "overwrite" else write_append_table
        rows = write_fn(df, fqn, **options)

        t1 = time.time()
        end = now_dt()
        duration = round(t1 - t0, 3)

        logger.info(f"Write operation completed: {rows} rows in {duration}s to {fqn}")
        return rows, duration, start, end

    except Exception as e:
        duration = round(time.time() - t0, 3)
        logger.error(f"Write operation failed after {duration}s: {e}")
        raise

def monitor_performance(
    operation_name: str, max_duration: Optional[float] = None
) -> Callable:
    """Decorator form of ``performance_monitor``.

    Each call of the decorated function runs inside
    ``performance_monitor(operation_name, max_duration)``.

    Args:
        operation_name: Name of the operation, passed to
            ``performance_monitor``.
        max_duration: Maximum allowed duration in seconds, passed to
            ``performance_monitor``.

    Returns:
        A decorator whose wrapper returns the wrapped function's result and
        keeps its metadata (``functools.wraps``).
    """

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # A fresh monitor is created per call, so every invocation is timed.
            with performance_monitor(operation_name, max_duration):
                outcome = func(*args, **kwargs)
            return outcome

        return wrapper

    return decorator

In [None]:
# Module: pipeline_builder.step_executors.base (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.table_operations, pipeline_builder_base.errors, pipeline_builder_base.logging

"""Base step executor with common functionality.

This module provides the base class for all step executors with shared logic
and a common interface. Step executors handle the execution of specific step
types (Bronze, Silver, Gold) in the pipeline execution engine.

The base class provides:
    - Common initialization with SparkSession, logger, and functions
    - Abstract execute() method that must be implemented by subclasses
    - Shared infrastructure for all step executors
"""

from __future__ import annotations

import inspect
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..table_operations import fqn, table_exists  # Removed: defined in notebook cells above

# Empty DataFrame for optional validation-only steps (mock Spark needs explicit schema)
try:
    from pyspark.sql.types import StructType as _EmptyStructType
except ImportError:
    _EmptyStructType = None  # type: ignore[misc, assignment]

class BaseStepExecutor(ABC):
    """Abstract parent of the step executors.

    Holds the state every executor needs and a few shared helpers.
    Subclasses supply the actual work by implementing ``execute()``.

    Attributes:
        spark: SparkSession instance for DataFrame operations.
        logger: PipelineLogger instance for logging.
        functions: Optional FunctionsProtocol instance for PySpark operations.

    Example:
        >>> class MyStepExecutor(BaseStepExecutor):
        ...     def execute(self, step, context, mode=None):
        ...         # Implementation here
        ...         return output_df
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
    ):
        """Store the collaborators shared by all executors.

        Args:
            spark: SparkSession used for catalog and DataFrame operations.
            logger: Logger to use. A new ``PipelineLogger()`` is created
                when this is not supplied (or is falsy).
            functions: Optional functions object, stored as given.
        """
        self.spark = spark
        self.logger = logger if logger else PipelineLogger()
        self.functions = functions

    @staticmethod
    def _accepts_params(func: Any) -> bool:
        """Tell whether ``func`` can be called with a ``params`` keyword.

        Args:
            func: The callable to inspect.

        Returns:
            True if the signature has a parameter named ``params`` or a
            ``**kwargs`` catch-all. False otherwise, including when the
            signature cannot be inspected.
        """
        try:
            parameters = inspect.signature(func).parameters
        except (ValueError, TypeError):
            # No inspectable signature: assume params are not accepted.
            return False

        return any(
            name == "params" or spec.kind == inspect.Parameter.VAR_KEYWORD
            for name, spec in parameters.items()
        )

    def _handle_validation_only_step(
        self, step: Any, step_type: str
    ) -> Optional[DataFrame]:
        """Load the existing table for a validation-only step.

        A step is validation-only when it has ``transform`` set to None and
        a truthy ``existing`` attribute. For any other step this method does
        nothing and returns None.

        Args:
            step: Step instance to examine.
            step_type: Step type label used in messages (e.g. "silver").

        Returns:
            None if the step is not validation-only. Otherwise the DataFrame
            from ``spark.table()`` for the step's table, or an empty
            DataFrame when the table is missing and the step has a truthy
            ``optional`` attribute.

        Raises:
            ExecutionError: If the step has no schema, the schema is not
                among the catalog's databases, listing the databases fails,
                or the table is missing and the step is not optional.
        """
        validation_only = (
            hasattr(step, "transform")
            and step.transform is None
            and hasattr(step, "existing")
            and step.existing
        )
        if not validation_only:
            return None

        # The table name defaults to the step name when not set explicitly.
        table_name = getattr(step, "table_name", step.name)
        schema = getattr(step, "schema", None)

        if schema is None:
            raise ExecutionError(
                f"Validation-only {step_type} step '{step.name}' requires schema to read from table"
            )

        # Confirm the schema exists before looking for the table in it.
        try:
            databases = [db.name for db in self.spark.catalog.listDatabases()]
            if schema not in databases:
                raise ExecutionError(
                    f"Validation-only {step_type} step '{step.name}' requires schema '{schema}', but schema does not exist. "
                    f"Available schemas: {databases}"
                )
        except ExecutionError:
            # Our own error from just above: pass it through untouched.
            raise
        except Exception as e:
            raise ExecutionError(
                f"Failed to check if schema '{schema}' exists for validation-only {step_type} step '{step.name}': {e}"
            ) from e

        table_fqn = fqn(schema, table_name)
        if table_exists(self.spark, table_fqn):
            return self.spark.table(table_fqn)

        if not getattr(step, "optional", False):
            raise ExecutionError(
                f"Validation-only {step_type} step '{step.name}' requires existing table '{table_fqn}', but table does not exist"
            )

        self.logger.info(
            f"Validation-only {step_type} step '{step.name}': table '{table_fqn}' does not exist (optional=True), using empty DataFrame"
        )
        return self._empty_dataframe()

    def _empty_dataframe(self) -> DataFrame:
        """Create a zero-row DataFrame, trying progressively simpler recipes.

        Used for optional validation-only steps whose table is missing.
        """
        session = self.spark

        # First choice: explicit empty schema (works with mock Spark as well).
        if _EmptyStructType is not None:
            try:
                return session.createDataFrame([], _EmptyStructType())
            except (TypeError, ValueError):
                pass

        # Fallbacks built on range(), for sessions that reject the call above.
        try:
            return session.range(0, 0).toDF()  # type: ignore[no-any-return,attr-defined]
        except (TypeError, ValueError):
            return session.range(0).limit(0).toDF()  # type: ignore[no-any-return,attr-defined]

    @abstractmethod
    def execute(
        self,
        step: Any,
        context: Dict[str, DataFrame],
        mode: Any = None,
    ) -> DataFrame:
        """Execute a pipeline step (to be implemented by subclasses).

        Args:
            step: The step to execute.
            context: Dictionary mapping step names to DataFrames, holding
                the source data the step needs.
            mode: Optional execution mode; whether it is used is up to the
                subclass.

        Returns:
            Output DataFrame after step execution.

        Raises:
            ExecutionError: If step execution fails or required data is
                missing.
        """
        ...

In [None]:
# Module: pipeline_builder.step_executors.bronze (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models, pipeline_builder.step_executors.base, pipeline_builder_base.errors

"""Bronze step executor.

This module provides the executor for bronze steps, which validate existing
data without transformation or writing. Bronze steps are the first layer in
the Medallion architecture and serve as data quality gates.
"""

from __future__ import annotations

from typing import Any, Dict
# from .errors import ExecutionError  # Removed: defined in notebook cells above

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..models import BronzeStep  # Removed: defined in notebook cells above
# from .base import BaseStepExecutor  # Removed: defined in notebook cells above

class BronzeStepExecutor(BaseStepExecutor):
    """Executor for bronze steps.

    A bronze step does no transformation and no writing here: the executor
    looks up the step's DataFrame in the context and hands it back as-is,
    logging a warning when it has no rows.

    Example:
        >>> executor = BronzeStepExecutor(spark)
        >>> result = executor.execute(
        ...     step=BronzeStep(name="events", rules={"id": [F.col("id").isNotNull()]}),
        ...     context={"events": source_df}
        ... )
        >>> # result is the same DataFrame that was passed in
    """

    def execute(
        self,
        step: BronzeStep,
        context: Dict[str, DataFrame],
        mode: Any = None,  # Accepted for interface compatibility; unused here
    ) -> DataFrame:
        """Return the DataFrame registered in ``context`` under the step name.

        Args:
            step: BronzeStep instance to execute; only ``step.name`` is read.
            context: Dictionary mapping step names to DataFrames. Must have
                an entry for ``step.name``.
            mode: Ignored by this executor.

        Returns:
            The very DataFrame stored in ``context[step.name]``.

        Raises:
            ExecutionError: If ``step.name`` is not a key of ``context``.

        Note:
            The DataFrame is counted once; an empty result only produces a
            warning, it is not an error.
        """
        if step.name in context:
            df: DataFrame = context[step.name]
        else:
            # Bronze data has to be supplied by the caller, never created here.
            raise ExecutionError(
                f"Bronze step '{step.name}' requires data to be provided in context. "
                f"Bronze steps are for validating existing data, not creating it. "
                f"Please provide data using bronze_sources parameter or context dictionary. "
                f"Available context keys: {list(context.keys())}"
            )

        is_empty = df.count() == 0
        if is_empty:
            self.logger.warning(
                f"Bronze step '{step.name}' received empty DataFrame. "
                f"This may indicate missing or invalid data source."
            )

        return df

In [None]:
# Module: pipeline_builder.step_executors.silver (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models, pipeline_builder.sql_source, pipeline_builder.step_executors.base, pipeline_builder.table_operations, pipeline_builder_base.errors, pipeline_builder_base.models

"""Silver step executor.

This module provides the executor for silver steps, which transform bronze
data into cleaned and enriched data. Silver steps can handle incremental
processing to only process new data since the last run.
"""

from __future__ import annotations

import inspect
from typing import Any, Dict, Optional, cast
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .models import ExecutionMode  # Removed: defined in notebook cells above

# from ..compat import DataFrame, F  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..models import SilverStep  # Removed: defined in notebook cells above
# from ..table_operations import fqn  # Removed: defined in notebook cells above
# from .base import BaseStepExecutor  # Removed: defined in notebook cells above

class SilverStepExecutor(BaseStepExecutor):
    """Executor for silver steps in the pipeline.

    A silver step turns the DataFrame of its source bronze step into a
    cleaned/enriched DataFrame by calling ``step.transform``. This executor
    only produces that DataFrame; as the SQL-source comment in ``execute``
    notes, validating and writing the result is left to the calling engine.

    Responsibilities visible in this class:
        - Short-circuit SQL-source steps via ``read_sql_source``
        - Delegate validation-only steps (no transform, ``existing`` set) to
          ``_handle_validation_only_step``
        - In INCREMENTAL mode, drop bronze rows at or below the highest
          watermark already present in the step's output table
        - Call the transform with whichever optional arguments
          (``prior_golds``, ``params`` / keyword params) its signature accepts

    Example:
        >>> from pipeline_builder.step_executors.silver import SilverStepExecutor
        >>> from pipeline_builder_base.models import ExecutionMode
        >>> from pipeline_builder.models import SilverStep
        >>>
        >>> executor = SilverStepExecutor(spark)
        >>> result = executor.execute(
        ...     step=SilverStep(
        ...         name="clean_events",
        ...         source_bronze="events",
        ...         transform=lambda spark, df, silvers: df.filter(F.col("status") == "active"),
        ...         rules={"status": [F.col("status").isNotNull()]},
        ...         table_name="clean_events"
        ...     ),
        ...     context={"events": bronze_df},
        ...     mode=ExecutionMode.INITIAL
        ... )
    """

    def execute(  # type: ignore[override]
        self,
        step: SilverStep,
        context: Dict[str, DataFrame],
        mode: ExecutionMode,
        step_params: Optional[Dict[str, Any]] = None,
        step_types: Optional[Dict[str, str]] = None,
    ) -> DataFrame:
        """Execute a silver step and return the transformed DataFrame.

        Args:
            step: SilverStep instance to execute.
            context: Dictionary mapping step names to DataFrames. Must contain
                ``step.source_bronze`` unless the step is SQL-sourced or is
                handled as a validation-only step.
            mode: Execution mode. ``ExecutionMode.INCREMENTAL`` triggers
                incremental filtering of the bronze input.
            step_params: Optional parameters for the transform. Only used when
                ``self._accepts_params(transform)`` is true: passed as
                ``params=step_params`` if the transform declares a ``params``
                parameter, otherwise expanded as keyword arguments.
            step_types: Optional mapping of step name to step type. Context
                entries typed ``"gold"`` are kept out of ``prior_silvers``
                (when ``source_silvers`` is not declared) and collected into
                ``prior_golds``. When None, ``prior_golds`` stays empty.

        Returns:
            The DataFrame produced by ``read_sql_source``, by the
            validation-only handler, or by the step's transform.

        Raises:
            ExecutionError: If the step has no transform and is not marked as
                existing, if the source bronze step is missing from context,
                if no transform is available when one must be called, or if
                incremental filtering fails.

        Note:
            - ``prior_golds`` is passed (positionally, after ``prior_silvers``)
              only when the transform's signature names a ``prior_golds``
              parameter.
            - If calling the transform with params raises, the failure is
              logged as a warning and the transform is called again without
              params (backward-compatible best effort).
        """
        # SQL-source step: load from JDBC/SQLAlchemy and return (engine validates/writes)
        sql_src = getattr(step, "sql_source", None)
        if sql_src is not None:
            # read_sql_source returns a Spark DataFrame; cast for type checkers.
            return cast(DataFrame, read_sql_source(sql_src, self.spark))

        # Handle validation-only steps (no transform function) - check this first
        if step.transform is None:
            if not step.existing:
                raise ExecutionError(
                    f"Silver step '{step.name}' has no transform function and is not marked as existing"
                )
            # Use base class method for validation-only step handling
            result = self._handle_validation_only_step(step, "silver")
            if result is not None:
                return result

        # Get source bronze data (only needed for non-validation-only steps)
        if step.source_bronze not in context:
            raise ExecutionError(
                f"Source bronze step {step.source_bronze} not found in context"
            )

        bronze_df: DataFrame = context[step.source_bronze]
        if mode == ExecutionMode.INCREMENTAL:
            bronze_df = self._filter_incremental_bronze_input(step, bronze_df)

        # prior_silvers: the declared source_silvers when given, otherwise every
        # other context entry except the source bronze and known gold steps
        # (kept for steps that read prior_silvers without declaring them).
        source_silvers = getattr(step, "source_silvers", None)
        prior_silvers: Dict[str, DataFrame]
        if source_silvers:
            prior_silvers = {
                silver_name: context[silver_name]
                for silver_name in source_silvers
                if silver_name in context and silver_name != step.name
            }
        else:
            prior_silvers = {
                key: value
                for key, value in context.items()
                if key != step.name
                and key != step.source_bronze
                and (step_types is None or step_types.get(key) != "gold")
            }

        # prior_golds: every context entry that step_types marks as gold.
        prior_golds: Dict[str, DataFrame] = {}
        if step_types is not None:
            prior_golds = {
                key: value
                for key, value in context.items()
                if key != step.name and step_types.get(key) == "gold"
            }

        # Explicit check instead of ``assert`` so it survives ``python -O``.
        transform = step.transform
        if transform is None:
            raise ExecutionError(
                f"Silver step '{step.name}' has no transform function to execute"
            )

        # Inspect the signature once; some callables do not expose one.
        try:
            parameters = inspect.signature(transform).parameters
        except (ValueError, TypeError):
            parameters = None

        has_prior_golds = parameters is not None and "prior_golds" in parameters
        call_args = [self.spark, bronze_df, prior_silvers]
        if has_prior_golds:
            call_args.append(prior_golds)

        # Support backward-compatible params passing (signatures vary: call-arg ignored)
        if (
            step_params is not None
            and self._accepts_params(transform)
            and parameters is not None
        ):
            try:
                if "params" in parameters:
                    return transform(*call_args, params=step_params)  # type: ignore[call-arg]
                return transform(*call_args, **step_params)  # type: ignore[call-arg]
            except Exception as exc:
                # Deliberately broad for backward compatibility. An error raised
                # inside the transform itself also lands here, so surface it in
                # the log instead of discarding it before the retry.
                self.logger.warning(
                    f"Silver step {step.name}: transform failed when called with "
                    f"step_params ({exc!r}); retrying without params"
                )

        return transform(*call_args)  # type: ignore[call-arg]

    def _filter_incremental_bronze_input(
        self,
        step: SilverStep,
        bronze_df: DataFrame,
    ) -> DataFrame:
        """Filter bronze input rows already processed in previous incremental runs.

        Looks up the highest ``step.watermark_col`` value in the step's existing
        output table and keeps only bronze rows whose
        ``step.source_incremental_col`` is strictly greater than it.

        Args:
            step: SilverStep instance with incremental configuration.
            bronze_df: Bronze DataFrame to filter.

        Returns:
            The filtered DataFrame, or ``bronze_df`` unchanged when filtering is
            skipped: incremental/watermark column or schema not configured,
            incremental column absent from ``bronze_df``, output table
            unreadable, watermark column absent from the table, the watermark
            lookup failing, or no non-null watermark value found.

        Raises:
            ExecutionError: If building the filter on ``bronze_df`` fails.

        Note:
            The maximum is computed with a single ``F.max`` aggregate so only
            one row is brought to the driver, rather than the whole column.
        """
        incremental_col = getattr(step, "source_incremental_col", None)
        watermark_col = getattr(step, "watermark_col", None)
        schema = getattr(step, "schema", None)
        table_name = getattr(step, "table_name", step.name)

        if not incremental_col or not watermark_col or schema is None:
            return bronze_df

        if incremental_col not in getattr(bronze_df, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: incremental column '{incremental_col}' "
                f"not present in bronze DataFrame; skipping incremental filter"
            )
            return bronze_df

        # Advisory only: warn when the column's type name suggests it is not
        # meaningfully comparable. Filtering is still attempted afterwards.
        try:
            col_type_name = str(bronze_df.schema[incremental_col].dataType)  # type: ignore[index]
            non_comparable_types = ("boolean", "array", "map", "struct", "binary")
            lowered_type_name = col_type_name.lower()
            if any(marker in lowered_type_name for marker in non_comparable_types):
                self.logger.warning(
                    f"Silver step {step.name}: incremental column '{incremental_col}' "
                    f"has type '{col_type_name}' which may not be suitable for comparison operations. "
                    f"Filtering may fail or produce unexpected results. "
                    f"Consider using a numeric, date, timestamp, or string column for incremental processing."
                )
        except Exception as e:
            # Schema could not be inspected: note it at debug level and continue.
            self.logger.debug(
                f"Silver step {step.name}: could not validate incremental column type: {e}"
            )

        output_table = fqn(schema, table_name)

        try:
            existing_table = self.spark.table(output_table)
        except Exception as exc:
            self.logger.debug(
                f"Silver step {step.name}: unable to read existing table {output_table} "
                f"for incremental filter: {exc}"
            )
            return bronze_df

        if watermark_col not in getattr(existing_table, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: watermark column '{watermark_col}' "
                f"not present in existing table {output_table}; skipping incremental filter"
            )
            return bronze_df

        # Aggregate in Spark: max() ignores nulls and yields a single row, whose
        # value is None when the table is empty or the column is all null.
        try:
            watermark_rows = existing_table.agg(F.max(watermark_col)).collect()
        except Exception as exc:
            self.logger.warning(
                f"Silver step {step.name}: failed to collect watermark values "
                f"from {output_table}: {exc}"
            )
            return bronze_df

        cutoff_value = watermark_rows[0][0] if watermark_rows else None
        if cutoff_value is None:
            return bronze_df

        try:
            filtered_df = bronze_df.filter(F.col(incremental_col) > F.lit(cutoff_value))
        except Exception as exc:
            raise ExecutionError(
                self._incremental_filter_error_message(
                    step, bronze_df, incremental_col, cutoff_value, exc
                )
            ) from exc

        # The message names the rows being filtered OUT (<= cutoff); rows with
        # incremental_col > cutoff are the ones kept.
        self.logger.info(
            f"Silver step {step.name}: filtering bronze rows where "
            f"{incremental_col} <= {cutoff_value}"
        )
        return filtered_df  # type: ignore[no-any-return]

    def _incremental_filter_error_message(
        self,
        step: SilverStep,
        bronze_df: DataFrame,
        incremental_col: str,
        cutoff_value: Any,
        exc: Exception,
    ) -> str:
        """Build the ExecutionError message for a failed incremental filter.

        Classifies the failure by keywords in ``str(exc)`` (column-related,
        type-related, or other) and adds whatever schema context is available.

        Args:
            step: Silver step being executed (used for its name).
            bronze_df: Bronze DataFrame the filter was applied to.
            incremental_col: Name of the bronze column used in the comparison.
            cutoff_value: Watermark value the column was compared against.
            exc: Exception raised while building the filter.

        Returns:
            Human-readable error message.
        """
        failure_prefix = (
            f"Silver step {step.name}: failed to filter bronze rows using incremental column '{incremental_col}'. "
            f"Error: {exc!r}. "
        )
        error_msg = str(exc).lower()

        if "cannot resolve" in error_msg or "column" in error_msg:
            # Column-related error - provide schema context
            available_cols = sorted(getattr(bronze_df, "columns", []))
            return (
                f"{failure_prefix}"
                f"Available columns in bronze DataFrame: {available_cols}. "
                f"This may indicate that the incremental column was dropped or renamed in a previous transform. "
                f"Please ensure the incremental column '{incremental_col}' exists in the bronze DataFrame."
            )

        if "type" in error_msg or "cast" in error_msg:
            # Type-related error. Only the schema lookup is guarded, so the
            # detailed message is actually returned when the lookup succeeds.
            try:
                col_type = bronze_df.schema[incremental_col].dataType  # type: ignore[index]
            except Exception:
                return (
                    f"{failure_prefix}"
                    f"This may be a type mismatch between the incremental column and the cutoff value. "
                    f"Please ensure the incremental column type is compatible with the cutoff value type."
                )
            return (
                f"{failure_prefix}"
                f"Column type: {col_type}. "
                f"Cutoff value type: {type(cutoff_value).__name__}. "
                f"Incremental columns must be comparable types (numeric, date, timestamp). "
                f"Please ensure the incremental column type is compatible with the cutoff value."
            )

        # Generic error with context
        return (
            f"Silver step {step.name}: failed to filter bronze rows using "
            f"{incremental_col} > {cutoff_value}: {exc!r}. "
            f"Please check that the incremental column exists and is of a comparable type."
        )

In [None]:
# Module: pipeline_builder.step_executors.gold (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models, pipeline_builder.sql_source, pipeline_builder.step_executors.base, pipeline_builder_base.errors

"""Gold step executor.

This module provides the executor for gold steps, which aggregate silver data
into final business metrics and analytics. Gold steps are the final layer
in the Medallion architecture.
"""

from __future__ import annotations

import inspect
from typing import Any, Dict, Optional, cast
# from .errors import ExecutionError  # Removed: defined in notebook cells above

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..models import GoldStep  # Removed: defined in notebook cells above
# from .base import BaseStepExecutor  # Removed: defined in notebook cells above

class GoldStepExecutor(BaseStepExecutor):
    """Executor for gold steps in the pipeline.

    A gold step builds its output from previously produced silver DataFrames
    by calling ``step.transform``. This executor only produces that DataFrame;
    as the SQL-source comment in ``execute`` notes, validating and writing the
    result is left to the calling engine.

    Responsibilities visible in this class:
        - Short-circuit SQL-source steps via ``read_sql_source``
        - Delegate validation-only steps (no transform, ``existing`` set) to
          ``_handle_validation_only_step``
        - Collect the DataFrames named in ``step.source_silvers``
        - Call the transform with whichever optional arguments
          (``prior_golds``, ``params`` / keyword params) its signature accepts

    Example:
        >>> from pipeline_builder.step_executors.gold import GoldStepExecutor
        >>> from pipeline_builder.models import GoldStep
        >>>
        >>> executor = GoldStepExecutor(spark)
        >>> result = executor.execute(
        ...     step=GoldStep(
        ...         name="daily_metrics",
        ...         transform=lambda spark, silvers: (
        ...             silvers["clean_events"]
        ...             .groupBy("date")
        ...             .agg(F.count("*").alias("count"))
        ...         ),
        ...         rules={"count": [F.col("count") > 0]},
        ...         table_name="daily_metrics",
        ...         source_silvers=["clean_events"]
        ...     ),
        ...     context={"clean_events": silver_df}
        ... )
    """

    def execute(
        self,
        step: GoldStep,
        context: Dict[str, DataFrame],
        mode: Any = None,  # Mode not used for gold steps
        step_params: Optional[Dict[str, Any]] = None,
        step_types: Optional[Dict[str, str]] = None,
    ) -> DataFrame:
        """Execute a gold step and return the transformed DataFrame.

        Args:
            step: GoldStep instance to execute.
            context: Dictionary mapping step names to DataFrames. Must contain
                every name listed in ``step.source_silvers``.
            mode: Execution mode (not used for gold steps, can be None).
            step_params: Optional parameters for the transform. Only used when
                ``self._accepts_params(transform)`` is true: passed as
                ``params=step_params`` if the transform declares a ``params``
                parameter, otherwise expanded as keyword arguments.
            step_types: Optional mapping of step name to step type. Context
                entries typed ``"gold"`` (other than this step) are collected
                into ``prior_golds``. When None, ``prior_golds`` stays empty.

        Returns:
            The DataFrame produced by ``read_sql_source``, by the
            validation-only handler, or by the step's transform.

        Raises:
            ExecutionError: If the step has no transform and is not marked as
                existing, if a source silver step is missing from context, or
                if no transform is available when one must be called.

        Note:
            - ``prior_golds`` is passed (positionally, after ``silvers``) only
              when the transform's signature names a ``prior_golds`` parameter.
            - If calling the transform with params raises, the failure is
              logged as a warning and the transform is called again without
              params (backward-compatible best effort).
        """
        # SQL-source step: load from JDBC/SQLAlchemy and return (engine validates/writes)
        sql_src = getattr(step, "sql_source", None)
        if sql_src is not None:
            # read_sql_source returns a Spark DataFrame; cast for type checkers.
            return cast(DataFrame, read_sql_source(sql_src, self.spark))

        # Handle validation-only steps (no transform function)
        if step.transform is None:
            if not step.existing:
                raise ExecutionError(
                    f"Gold step '{step.name}' has no transform function and is not marked as existing"
                )
            # Use base class method for validation-only step handling
            result = self._handle_validation_only_step(step, "gold")
            if result is not None:
                return result

        # Build silvers dict from source_silvers; every declared source is required.
        silvers: Dict[str, DataFrame] = {}
        if step.source_silvers is not None:
            for silver_name in step.source_silvers:
                if silver_name not in context:
                    raise ExecutionError(
                        f"Source silver {silver_name} not found in context"
                    )
                silvers[silver_name] = context[silver_name]

        # prior_golds: every other context entry that step_types marks as gold.
        prior_golds: Dict[str, DataFrame] = {}
        if step_types is not None:
            prior_golds = {
                key: value
                for key, value in context.items()
                if key != step.name and step_types.get(key) == "gold"
            }

        # Explicit check instead of ``assert`` so it survives ``python -O``.
        transform = step.transform
        if transform is None:
            raise ExecutionError(
                f"Gold step '{step.name}' has no transform function to execute"
            )

        # Inspect the signature once; some callables do not expose one.
        try:
            parameters = inspect.signature(transform).parameters
        except (ValueError, TypeError):
            parameters = None

        has_prior_golds = parameters is not None and "prior_golds" in parameters
        call_args = [self.spark, silvers]
        if has_prior_golds:
            call_args.append(prior_golds)

        # Support backward-compatible params passing (signatures vary: call-arg ignored)
        if (
            step_params is not None
            and self._accepts_params(transform)
            and parameters is not None
        ):
            try:
                if "params" in parameters:
                    return transform(*call_args, params=step_params)  # type: ignore[call-arg]
                return transform(*call_args, **step_params)  # type: ignore[call-arg]
            except Exception as exc:
                # Deliberately broad for backward compatibility. An error raised
                # inside the transform itself also lands here, so surface it in
                # the log instead of discarding it before the retry.
                self.logger.warning(
                    f"Gold step {step.name}: transform failed when called with "
                    f"step_params ({exc!r}); retrying without params"
                )

        return transform(*call_args)  # type: ignore[call-arg]

In [None]:
# Module: pipeline_builder.storage.table_service (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.storage.schema_manager, pipeline_builder.table_operations, pipeline_builder_base.logging

"""Table service for table operations.

This module provides centralized table operations including existence checks,
schema management, and table lifecycle operations. The TableService acts as
a facade for table-related operations, delegating to SchemaManager for
schema-specific functionality.
"""

from __future__ import annotations

from typing import Any, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import SparkSession  # Removed: defined in notebook cells above
# from ..table_operations import fqn, table_exists  # Removed: defined in notebook cells above
# from .schema_manager import SchemaManager  # Removed: defined in notebook cells above

class TableService:
    """Facade over table-level operations.

    Bundles existence checks, schema lookups/validation, and drop/refresh
    helpers behind one object. Schema-related calls are forwarded to a
    SchemaManager; existence checks and name building use the module-level
    ``table_exists`` and ``fqn`` helpers.

    Attributes:
        spark: SparkSession used to issue SQL statements.
        logger: PipelineLogger used for debug messages.
        schema_manager: SchemaManager that handles schema operations.

    Example:
        >>> from pipeline_builder.storage.table_service import TableService
        >>> from pipeline_builder.compat import SparkSession
        >>>
        >>> service = TableService(spark)
        >>> service.ensure_schema_exists("analytics")
        >>> exists = service.table_exists("analytics.events")
        >>> schema = service.get_table_schema("analytics.events")
    """

    def __init__(
        self,
        spark: SparkSession,
        logger: Optional[PipelineLogger] = None,
    ):
        """Store the session, pick a logger, and build the schema manager.

        Args:
            spark: SparkSession used for all table operations.
            logger: PipelineLogger to use; a new PipelineLogger is created
                when this is None (or otherwise falsy).
        """
        self.spark = spark
        self.logger = logger if logger else PipelineLogger()
        # The manager is handed the caller's ``logger`` argument as given
        # (possibly None), not the fallback stored on ``self.logger``.
        self.schema_manager = SchemaManager(spark, logger)

    def ensure_schema_exists(self, schema: str) -> None:
        """Forward to ``SchemaManager.ensure_schema_exists``.

        Args:
            schema: Name of the schema to create or verify.

        Note:
            Any exception raised by the schema manager propagates unchanged.
        """
        manager = self.schema_manager
        manager.ensure_schema_exists(schema)

    def table_exists(self, table_name: str) -> bool:
        """Report whether a table exists.

        Args:
            table_name: Fully qualified table name (schema.table).

        Returns:
            Result of the module-level ``table_exists`` helper for this
            service's SparkSession.
        """
        exists = table_exists(self.spark, table_name)
        return exists

    def get_table_schema(
        self,
        table_name: str,
        refresh: bool = False,
    ) -> Optional[Any]:
        """Forward to ``SchemaManager.get_table_schema``.

        Args:
            table_name: Fully qualified table name (schema.table).
            refresh: Passed through to the schema manager; defaults to False.

        Returns:
            Whatever the schema manager returns for this table.
        """
        manager = self.schema_manager
        return manager.get_table_schema(table_name, refresh)

    def validate_schema_match(
        self,
        table_name: str,
        output_schema: Any,
        mode: Any,
        step_name: str,
    ) -> tuple[bool, list[str]]:
        """Forward to ``SchemaManager.validate_schema_match``.

        Args:
            table_name: Fully qualified table name (schema.table).
            output_schema: Schema of the DataFrame about to be written.
            mode: Execution mode value, passed through untouched.
            step_name: Name of the step being validated.

        Returns:
            The ``(matches, differences)`` tuple produced by the schema
            manager.

        Note:
            Any exception raised by the schema manager propagates unchanged.
        """
        manager = self.schema_manager
        return manager.validate_schema_match(
            table_name,
            output_schema,
            mode,
            step_name,
        )

    def drop_table_if_exists(self, table_name: str) -> None:
        """Drop a table when it is present; never raise.

        Issues ``DROP TABLE IF EXISTS`` only after ``self.table_exists``
        reports the table. Any exception from the check or the drop is
        logged at debug level and suppressed.

        Args:
            table_name: Fully qualified table name (schema.table).
        """
        try:
            if not self.table_exists(table_name):
                return
            self.spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        except Exception as drop_error:
            self.logger.debug(f"Could not drop table {table_name}: {drop_error}")

    def refresh_table(self, table_name: str) -> None:
        """Run ``REFRESH TABLE`` for a table; never raise.

        Args:
            table_name: Fully qualified table name (schema.table).

        Note:
            A failing refresh is logged at debug level and otherwise ignored.
        """
        try:
            statement = f"REFRESH TABLE {table_name}"
            self.spark.sql(statement)
        except Exception as exc:
            # Best effort: a failed refresh must not abort the caller.
            self.logger.debug(f"Could not refresh table {table_name}: {exc}")

    def fqn(self, schema: str, table: str) -> str:
        """Build a fully qualified table name.

        Args:
            schema: Schema name.
            table: Table name.

        Returns:
            Result of the module-level ``fqn`` helper for ``schema`` and
            ``table``.
        """
        qualified_name = fqn(schema, table)
        return qualified_name

In [None]:
# Module: pipeline_builder.pipeline.debug_session (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models, pipeline_builder.pipeline.models, pipeline_builder.pipeline.runner, pipeline_builder_base.models

"""Pipeline debug session for interactive stepwise execution.

This module provides a PipelineDebugSession class that simplifies interactive
debugging and iterative refinement of pipeline steps in notebook environments.
"""

from __future__ import annotations

from typing import Any, Dict, Optional
# from .models import PipelineConfig  # Removed: defined in notebook cells above

# from ..compat import DataFrame  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .models import PipelineMode, PipelineReport  # Removed: defined in notebook cells above
# from .runner import SimplePipelineRunner  # Removed: defined in notebook cells above

class PipelineDebugSession:
    """Notebook-oriented helper for running pipeline steps one at a time.

    The session owns a ``SimplePipelineRunner`` and keeps three pieces of
    mutable state between calls: the step list, the latest execution context
    and any per-step parameter overrides. Every ``run_*`` method forwards that
    state to the runner and then replaces ``self.context`` with the context
    the runner hands back.

    Attributes:
        runner: SimplePipelineRunner instance that performs the execution.
        steps: The list of steps given at construction time.
        mode: Execution mode forwarded to every runner call.
        context: Dictionary mapping step names to DataFrames.
        step_params: Dictionary mapping step names to parameter dictionaries.

    Example:
        >>> from pipeline_builder.pipeline.debug_session import PipelineDebugSession
        >>> from pipeline_builder_base.models import PipelineConfig
        >>>
        >>> config = PipelineConfig.create_default(schema="my_schema")
        >>> session = PipelineDebugSession(spark, config, steps=[bronze, silver, gold])
        >>>
        >>> # Execute everything up to and including one step
        >>> report, context = session.run_until("clean_events")
        >>>
        >>> # Execute only one step
        >>> report, context = session.run_step("clean_events")
        >>>
        >>> # Override parameters, then rerun
        >>> session.step_params["clean_events"] = {"threshold": 0.9}
        >>> report, context = session.rerun_step("clean_events")
    """

    def __init__(
        self,
        spark: Any,  # SparkSession
        config: PipelineConfig,
        steps: list[BronzeStep | SilverStep | GoldStep],
        mode: PipelineMode = PipelineMode.INITIAL,
        bronze_sources: Optional[Dict[str, DataFrame]] = None,
        logger: Optional[Any] = None,  # PipelineLogger
        functions: Optional[Any] = None,  # FunctionsProtocol
    ):
        """Create the runner and initialise the session state.

        Args:
            spark: Active SparkSession instance.
            config: Pipeline configuration.
            steps: List of pipeline steps (Bronze, Silver, Gold).
            mode: Initial execution mode. Defaults to INITIAL.
            bronze_sources: Optional mapping copied into the initial context.
            logger: Optional logger instance passed to the runner.
            functions: Optional functions protocol instance passed to the runner.
        """
        # The runner expects one name -> step mapping per layer.
        bronze_steps: Dict[str, BronzeStep] = {}
        silver_steps: Dict[str, SilverStep] = {}
        gold_steps: Dict[str, GoldStep] = {}
        registry_by_layer = {
            "bronze": bronze_steps,
            "silver": silver_steps,
            "gold": gold_steps,
        }
        for pipeline_step in steps:
            registry = registry_by_layer.get(pipeline_step.step_type.value)
            # Steps whose type is none of the three layers are left out.
            if registry is not None:
                registry[pipeline_step.name] = pipeline_step

        self.runner = SimplePipelineRunner(
            spark=spark,
            config=config,
            bronze_steps=bronze_steps,
            silver_steps=silver_steps,
            gold_steps=gold_steps,
            logger=logger,
            functions=functions,
        )
        self.steps = steps
        self.mode = mode
        self.context: Dict[str, DataFrame] = {}
        self.step_params: Dict[str, Dict[str, Any]] = {}

        # Seed the context with any caller-provided bronze data.
        if bronze_sources:
            self.context.update(bronze_sources)

    def _remember_context(
        self, outcome: tuple[PipelineReport, Dict[str, DataFrame]]
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:
        """Store the context from a runner result and return the result pair."""
        report, context = outcome
        self.context = context
        return report, context

    def run_until(
        self,
        step_name: str,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:
        """Run the pipeline up to and including ``step_name``.

        Args:
            step_name: Name of the step to stop after (inclusive).
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context dictionary). The returned context
            also replaces ``self.context``.

        Example:
            >>> report, context = session.run_until("clean_events")
        """
        # Only context entries that belong to bronze steps are handed over as
        # bronze sources; an empty selection is passed as None.
        bronze_sources = {
            pipeline_step.name: self.context[pipeline_step.name]
            for pipeline_step in self.steps
            if pipeline_step.step_type.value == "bronze"
            and pipeline_step.name in self.context
        }

        return self._remember_context(
            self.runner.run_until(
                step_name=step_name,
                steps=self.steps,
                mode=self.mode,
                bronze_sources=bronze_sources or None,
                step_params=self.step_params,
                write_outputs=write_outputs,
            )
        )

    def run_step(
        self,
        step_name: str,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:
        """Run exactly one step via the runner's ``run_step``.

        The current context is passed to the runner so it can resolve the
        step's inputs.

        Args:
            step_name: Name of the step to execute.
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context dictionary). The returned context
            also replaces ``self.context``.

        Example:
            >>> report, context = session.run_step("clean_events")
        """
        return self._remember_context(
            self.runner.run_step(
                step_name=step_name,
                steps=self.steps,
                mode=self.mode,
                context=self.context,
                step_params=self.step_params,
                write_outputs=write_outputs,
            )
        )

    def rerun_step(
        self,
        step_name: str,
        invalidate_downstream: bool = True,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:
        """Rerun one step using whatever is currently in ``self.step_params``.

        Change ``self.step_params`` (directly or with ``set_step_params``)
        before calling this method to apply different overrides.

        Args:
            step_name: Name of the step to rerun.
            invalidate_downstream: Forwarded to the runner; if True, downstream
                outputs are removed from the context.
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context dictionary). The returned context
            also replaces ``self.context``.

        Example:
            >>> session.step_params["clean_events"] = {"threshold": 0.9}
            >>> report, context = session.rerun_step("clean_events")
        """
        return self._remember_context(
            self.runner.rerun_step(
                step_name=step_name,
                steps=self.steps,
                mode=self.mode,
                context=self.context,
                step_params=self.step_params,
                invalidate_downstream=invalidate_downstream,
                write_outputs=write_outputs,
            )
        )

    def set_step_params(self, step_name: str, params: Dict[str, Any]) -> None:
        """Register parameter overrides for one step.

        Replaces any overrides previously stored for ``step_name``.

        Args:
            step_name: Name of the step.
            params: Parameter dictionary to pass to the step's transform function.

        Example:
            >>> session.set_step_params("clean_events", {"threshold": 0.9})
        """
        self.step_params[step_name] = params

    def clear_step_params(self, step_name: Optional[str] = None) -> None:
        """Drop parameter overrides for one step, or for every step.

        Args:
            step_name: Name of the step. If None, all overrides are removed.
                An unknown step name is ignored.

        Example:
            >>> session.clear_step_params("clean_events")  # one step
            >>> session.clear_step_params()  # all steps
        """
        if step_name is not None:
            self.step_params.pop(step_name, None)
            return
        self.step_params.clear()

In [None]:
# Module: pipeline_builder_base.reporting (pipeline_builder_base)
#
# Dependencies: models.execution, performance, pipeline_builder_base.models, pipeline_builder_base.validation, validation.utils

"""
Reporting utilities for the pipeline framework.

This module contains functions for creating reports, statistics, and summaries
for pipeline execution.

"""

from __future__ import annotations

from datetime import datetime
from typing import Optional, TypedDict

# from .models import StageStats  # Removed: defined in notebook cells above
# from .validation import safe_divide  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================

class ValidationReport(TypedDict):
    """Validation report structure.

    Shape of the dictionary built by ``create_validation_dict``. When that
    function is called without stage statistics, ``stage`` and ``step`` are
    None, the row counts and rate are zero, and the duration is derived from
    the two timestamps.
    """

    # Stage name copied from the stats object; None when no stats are supplied.
    stage: Optional[str]
    # Step name copied from the stats object; None when no stats are supplied.
    step: Optional[str]
    # Row counts copied from the stats object (all 0 when no stats are supplied).
    total_rows: int
    valid_rows: int
    invalid_rows: int
    # Copied from stats.validation_rate (0.0 without stats). The scale
    # (fraction vs. percent) is not established in this module -- TODO confirm.
    validation_rate: float
    # stats.duration_secs when stats are supplied, otherwise the number of
    # seconds between start_at and end_at.
    duration_secs: float
    # Caller-supplied timestamps bracketing the validation.
    start_at: datetime
    end_at: datetime

class TransformReport(TypedDict):
    """Transform operation report structure.

    Shape of the dictionary built by ``create_transform_dict``.
    """

    # Caller-supplied row counts before and after the transform.
    input_rows: int
    output_rows: int
    # Seconds between start_at and end_at.
    duration_secs: float
    # True when the transform was skipped (the builder defaults this to False).
    skipped: bool
    # Caller-supplied timestamps bracketing the transform.
    start_at: datetime
    end_at: datetime

class WriteReport(TypedDict):
    """Write operation report structure.

    Shape of the dictionary built by ``create_write_dict``.
    """

    # Write mode string exactly as passed by the caller.
    mode: str
    # Caller-supplied number of rows written.
    rows_written: int
    # Seconds between start_at and end_at.
    duration_secs: float
    # Fully qualified name of the target table, as passed by the caller.
    table_fqn: str
    # True when the write was skipped (the builder defaults this to False).
    skipped: bool
    # Caller-supplied timestamps bracketing the write.
    start_at: datetime
    end_at: datetime

class ExecutionSummary(TypedDict):
    """Execution summary nested structure.

    Populated by ``create_summary_report`` under the "execution_summary" key.
    """

    # Step counts exactly as passed to create_summary_report.
    total_steps: int
    successful_steps: int
    failed_steps: int
    # Percentage: safe_divide(successful_steps, total_steps, 0.0) * 100.
    success_rate: float
    # Percentage computed as 100.0 - success_rate (not from failed_steps).
    failure_rate: float

class PerformanceMetrics(TypedDict):
    """Performance metrics nested structure.

    Populated by ``create_summary_report`` under the "performance_metrics" key.
    """

    # Total duration in seconds, as passed by the caller.
    total_duration_secs: float
    # Result of format_duration(total_duration_secs).
    formatted_duration: str
    # Passed through unchanged from the caller.
    avg_validation_rate: float

class DataMetrics(TypedDict):
    """Data metrics nested structure.

    Populated by ``create_summary_report`` under the "data_metrics" key.
    """

    # Row totals exactly as passed to create_summary_report.
    total_rows_processed: int
    total_rows_written: int
    # Percentage: safe_divide(total_rows_written, total_rows_processed, 0.0) * 100.
    processing_efficiency: float

class SummaryReport(TypedDict):
    """Complete summary report structure.

    Top-level dictionary returned by ``create_summary_report``; each key holds
    one of the nested TypedDicts declared in this module.
    """

    # Step counts and success/failure percentages.
    execution_summary: ExecutionSummary
    # Duration (raw and formatted) plus the average validation rate.
    performance_metrics: PerformanceMetrics
    # Row totals and the written/processed percentage.
    data_metrics: DataMetrics

def create_validation_dict(
    stats: Optional[StageStats], *, start_at: datetime, end_at: datetime
) -> ValidationReport:
    """
    Build a validation report dictionary from optional stage statistics.

    With statistics, every measured field (including ``duration_secs``) is
    copied from the ``stats`` object. Without statistics, a zeroed report is
    produced whose duration is the time between ``start_at`` and ``end_at``.

    Args:
        stats: Stage statistics, or None when validation produced none
        start_at: Start time
        end_at: End time

    Returns:
        Validation report dictionary
    """
    if stats is None:
        # Nothing was measured: report zeros and fall back to wall-clock time.
        measured = {
            "stage": None,
            "step": None,
            "total_rows": 0,
            "valid_rows": 0,
            "invalid_rows": 0,
            "validation_rate": 0.0,
            "duration_secs": (end_at - start_at).total_seconds(),
        }
    else:
        measured = {
            "stage": stats.stage,
            "step": stats.step,
            "total_rows": stats.total_rows,
            "valid_rows": stats.valid_rows,
            "invalid_rows": stats.invalid_rows,
            "validation_rate": stats.validation_rate,
            "duration_secs": stats.duration_secs,
        }

    # The timestamps are always the caller's values and come last in the dict.
    return {**measured, "start_at": start_at, "end_at": end_at}

def create_transform_dict(
    *,
    input_rows: int,
    output_rows: int,
    start_at: datetime,
    end_at: datetime,
    skipped: bool = False,
) -> TransformReport:
    """
    Build a transform report dictionary.

    The duration is not passed in; it is the number of seconds between
    ``start_at`` and ``end_at``.

    Args:
        input_rows: Number of input rows
        output_rows: Number of output rows
        start_at: Start time
        end_at: End time
        skipped: Whether the transform was skipped

    Returns:
        Transform report dictionary
    """
    elapsed = end_at - start_at
    report: TransformReport = {
        "input_rows": input_rows,
        "output_rows": output_rows,
        "duration_secs": elapsed.total_seconds(),
        "skipped": skipped,
        "start_at": start_at,
        "end_at": end_at,
    }
    return report

def create_write_dict(
    *,
    mode: str,
    rows_written: int,
    table_fqn: str,
    start_at: datetime,
    end_at: datetime,
    skipped: bool = False,
) -> WriteReport:
    """
    Build a write report dictionary.

    The duration is not passed in; it is the number of seconds between
    ``start_at`` and ``end_at``.

    Args:
        mode: Write mode
        rows_written: Number of rows written
        table_fqn: Fully qualified table name
        start_at: Start time
        end_at: End time
        skipped: Whether the write was skipped

    Returns:
        Write report dictionary
    """
    elapsed = end_at - start_at
    report: WriteReport = {
        "mode": mode,
        "rows_written": rows_written,
        "duration_secs": elapsed.total_seconds(),
        "table_fqn": table_fqn,
        "skipped": skipped,
        "start_at": start_at,
        "end_at": end_at,
    }
    return report

def format_duration(seconds: float) -> str:
    """
    Format duration in seconds to human-readable string.

    The value is rounded to two decimal places *before* it is split into
    hours, minutes and seconds. Splitting first and rounding only while
    formatting lets the seconds component round up to 60 (for example
    ``59.999`` became ``"60.00s"`` and ``119.999`` became ``"1m 60.00s"``).

    Args:
        seconds: Duration in seconds

    Returns:
        Formatted duration string: ``"S.SSs"`` below one minute,
        ``"Mm S.SSs"`` below one hour, otherwise ``"Hh Mm S.SSs"``
    """
    # Round once up front so the unit boundaries and the displayed seconds
    # are derived from the same value.
    total = round(seconds, 2)
    if total < 60:
        return f"{total:.2f}s"

    whole_minutes, secs = divmod(total, 60)
    whole_minutes = int(whole_minutes)
    if whole_minutes < 60:
        return f"{whole_minutes}m {secs:.2f}s"

    hours, minutes = divmod(whole_minutes, 60)
    return f"{hours}h {minutes}m {secs:.2f}s"

def create_summary_report(
    *,
    total_steps: int,
    successful_steps: int,
    failed_steps: int,
    total_duration_secs: float,
    total_rows_processed: int,
    total_rows_written: int,
    avg_validation_rate: float,
) -> SummaryReport:
    """
    Assemble the three-part summary report for a pipeline run.

    Rates are expressed as percentages. Both divisions go through
    ``safe_divide`` with ``0.0`` passed as its third argument.

    Args:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        total_duration_secs: Total duration in seconds
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate

    Returns:
        Complete summary report
    """
    pct_succeeded = safe_divide(successful_steps, total_steps, 0.0) * 100
    # NOTE(review): the failure rate is the complement of the success rate;
    # ``failed_steps`` is reported as-is but does not feed into this figure.
    pct_failed = 100.0 - pct_succeeded
    pct_rows_kept = safe_divide(total_rows_written, total_rows_processed, 0.0) * 100

    execution_summary: ExecutionSummary = {
        "total_steps": total_steps,
        "successful_steps": successful_steps,
        "failed_steps": failed_steps,
        "success_rate": pct_succeeded,
        "failure_rate": pct_failed,
    }
    performance_metrics: PerformanceMetrics = {
        "total_duration_secs": total_duration_secs,
        "formatted_duration": format_duration(total_duration_secs),
        "avg_validation_rate": avg_validation_rate,
    }
    data_metrics: DataMetrics = {
        "total_rows_processed": total_rows_processed,
        "total_rows_written": total_rows_written,
        "processing_efficiency": pct_rows_kept,
    }

    return {
        "execution_summary": execution_summary,
        "performance_metrics": performance_metrics,
        "data_metrics": data_metrics,
    }

In [None]:
# Module: pipeline_builder.storage.write_service (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.storage.table_service, pipeline_builder.table_operations, pipeline_builder_base.errors, pipeline_builder_base.logging, pipeline_builder_base.models

"""Write service for handling all write operations.

This module provides a service for writing DataFrames to tables with proper
handling of write modes, schema overrides, and Delta Lake operations. The
WriteService centralizes all write logic, making it testable and maintainable.
"""

from __future__ import annotations

from typing import Any, Optional
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionMode  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..table_operations import (  # Removed: defined in notebook cells above
    # create_dataframe_writer,
# )
# from .table_service import TableService  # Removed: defined in notebook cells above

class WriteService:
    """Central place for persisting step output DataFrames to tables.

    Chooses the write mode, optionally validates the DataFrame schema against
    an existing table, applies a step-level schema override, and performs the
    write (with one retry for a specific truncate failure).

    Attributes:
        spark: SparkSession instance for DataFrame operations.
        table_service: TableService instance for table operations.
        logger: PipelineLogger instance for logging.

    Example:
        >>> from pipeline_builder.storage.write_service import WriteService
        >>> from pipeline_builder.storage.table_service import TableService
        >>> from pipeline_builder_base.models import ExecutionMode
        >>>
        >>> table_service = TableService(spark)
        >>> write_service = WriteService(spark, table_service)
        >>> rows = write_service.write_step_output(
        ...     df=output_df,
        ...     step=silver_step,
        ...     schema="analytics",
        ...     table_name="clean_events",
        ...     mode=ExecutionMode.INITIAL
        ... )
    """

    def __init__(
        self,
        spark: SparkSession,
        table_service: TableService,
        logger: Optional[PipelineLogger] = None,
    ):
        """Store the collaborators used for writing.

        Args:
            spark: Active SparkSession instance for DataFrame operations.
            table_service: TableService instance for table operations.
            logger: Optional PipelineLogger instance. A default
                ``PipelineLogger()`` is created when none (or a falsy value)
                is given.
        """
        self.spark = spark
        self.table_service = table_service
        self.logger = logger if logger else PipelineLogger()

    def write_step_output(
        self,
        df: DataFrame,
        step: Any,
        schema: str,
        table_name: str,
        mode: ExecutionMode,
    ) -> int:
        """Write a step's output DataFrame to ``schema.table_name``.

        Processing order:
            1. Resolve the fully qualified table name and make sure the
               schema exists.
            2. Pick the write mode for the step.
            3. For INCREMENTAL and FULL_REFRESH, validate the DataFrame schema
               against the existing table (if there is one).
            4. Apply ``step.schema_override`` when the step defines one.
            5. Execute the write.

        Args:
            df: DataFrame to write.
            step: Step object (SilverStep or GoldStep) containing step
                configuration.
            schema: Schema name for the target table.
            table_name: Table name (without schema).
            mode: ExecutionMode enum value.

        Returns:
            Number of rows written to the table.

        Raises:
            ExecutionError: If the write fails or the schema override cannot
                be applied. Errors raised by the table service during schema
                validation propagate unchanged.
        """
        target_table = self.table_service.fqn(schema, table_name)
        self.table_service.ensure_schema_exists(schema)

        write_mode_str = self._determine_write_mode(step, mode)

        needs_schema_check = mode in (
            ExecutionMode.INCREMENTAL,
            ExecutionMode.FULL_REFRESH,
        )
        if needs_schema_check:
            self._validate_schema_for_mode(df, target_table, mode, step.name)

        # NOTE: We intentionally do NOT drop existing tables in INITIAL mode.
        # Dropping is destructive and can leave users with missing tables if a run fails
        # after the drop but before the overwrite commit. Delta overwrite is transactional.

        # Steps without a schema_override attribute are treated like steps
        # whose override is None.
        override = getattr(step, "schema_override", None)
        frame = (
            df
            if override is None
            else self._apply_schema_override(
                df, override, step, target_table, write_mode_str
            )
        )

        return self._execute_write(frame, step, target_table, write_mode_str, mode)

    def _determine_write_mode(
        self,
        step: Any,
        mode: ExecutionMode,
    ) -> str:
        """Pick "overwrite" or "append" for a step.

        A step whose class is named ``GoldStep`` is always overwritten (to
        prevent duplicate aggregates). Any other step is appended in
        INCREMENTAL mode and overwritten in every other mode.

        Args:
            step: Step object (SilverStep or GoldStep).
            mode: ExecutionMode enum value.

        Returns:
            Write mode string ("overwrite" or "append").
        """
        if step.__class__.__name__ == "GoldStep":
            return "overwrite"
        return "append" if mode == ExecutionMode.INCREMENTAL else "overwrite"

    def _validate_schema_for_mode(
        self,
        df: DataFrame,
        table_name: str,
        mode: ExecutionMode,
        step_name: str,
    ) -> None:
        """
        Check the DataFrame schema against an existing table.

        Does nothing when the table does not exist yet. Otherwise the table
        metadata is refreshed and the comparison is delegated to
        ``table_service.validate_schema_match``.

        Args:
            df: DataFrame to validate
            table_name: Fully qualified table name
            mode: Execution mode
            step_name: Name of the step

        Raises:
            ExecutionError: If schema validation fails (raised by the table
                service -- presumably; its implementation is not visible here)
        """
        if self.table_service.table_exists(table_name):
            # Refresh first so the comparison runs against current metadata.
            self.table_service.refresh_table(table_name)
            self.table_service.validate_schema_match(
                table_name, df.schema, mode, step_name
            )

    def _apply_schema_override(
        self,
        df: DataFrame,
        schema_override: Any,
        step: Any,
        output_table: str,
        write_mode_str: str,
    ) -> DataFrame:
        """
        Rebuild the DataFrame with the step's override schema.

        Args:
            df: DataFrame to apply schema to
            schema_override: Schema to apply
            step: Step object
            output_table: Fully qualified table name (used for error context)
            write_mode_str: Write mode string (currently unused)

        Returns:
            DataFrame created from ``df.rdd`` with ``schema_override``

        Raises:
            ExecutionError: If the DataFrame cannot be created with the
                override schema
        """
        try:
            recast = self.spark.createDataFrame(df.rdd, schema_override)  # type: ignore[attr-defined]
        except Exception as e:
            raise ExecutionError(
                f"Failed to apply schema_override to step '{step.name}': {e}",
                context={
                    "step_name": step.name,
                    "table": output_table,
                    "schema_override": str(schema_override),
                },
                suggestions=[
                    "Verify that the schema_override matches the DataFrame structure",
                    "Check that all required columns are present in the DataFrame",
                    "Ensure data types are compatible",
                ],
            ) from e
        else:
            return recast

    def _execute_write(
        self,
        df: DataFrame,
        step: Any,
        output_table: str,
        write_mode_str: str,
        mode: ExecutionMode,
    ) -> int:
        """
        Perform the write and return the row count.

        For overwrite writes ``prepare_delta_overwrite`` runs first. If the
        write fails with an error mentioning both "truncate" and "batch mode",
        the table is dropped and the write is attempted once more; every other
        failure is wrapped in an ExecutionError immediately.

        Args:
            df: DataFrame to write
            step: Step object
            output_table: Fully qualified table name
            write_mode_str: Write mode string
            mode: Execution mode

        Returns:
            Number of rows written (``df.count()`` taken after the write)

        Raises:
            ExecutionError: If the write fails, or if the retry fails
        """
        # Preparation step for Delta overwrites; it exists to prevent
        # "Table does not support truncate in batch mode" errors.
        if write_mode_str == "overwrite":
            prepare_delta_overwrite(self.spark, output_table)

        writer = create_dataframe_writer(
            df, self.spark, write_mode_str, table_name=output_table
        )

        try:
            writer.saveAsTable(output_table)
            return df.count()
        except Exception as e:
            error_msg = str(e).lower()
            truncate_unsupported = "truncate" in error_msg and "batch mode" in error_msg

            if not truncate_unsupported:
                raise ExecutionError(
                    f"Failed to write table '{output_table}': {e}",
                    context={
                        "step_name": step.name,
                        "table": output_table,
                        "mode": mode.value,
                        "write_mode": write_mode_str,
                    },
                ) from e

            # Truncate failure: drop the table and try the write once more.
            self.logger.warning(
                f"Write failed with truncate error for Delta table, "
                f"dropping table and retrying: {e}"
            )
            try:
                # Plain DROP (no CASCADE - not supported in all Spark versions)
                self.spark.sql(f"DROP TABLE IF EXISTS {output_table}")

                # Brief pause so the catalog can reflect the drop.
                import time

                time.sleep(0.1)

                # The table should be gone now, so build a fresh writer.
                writer = create_dataframe_writer(
                    df, self.spark, write_mode_str, table_name=output_table
                )
                writer.saveAsTable(output_table)
                rows_written = df.count()
                self.logger.info(
                    f"Successfully wrote {rows_written} rows after retry"
                )
                return rows_written
            except Exception as retry_error:
                raise ExecutionError(
                    f"Failed to write table '{output_table}' even after retry: {retry_error}",
                    context={
                        "step_name": step.name,
                        "table": output_table,
                        "mode": mode.value,
                        "write_mode": write_mode_str,
                        "original_error": str(e),
                    },
                ) from retry_error

In [None]:
# Module: pipeline_builder.execution (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.models, pipeline_builder.step_executors, pipeline_builder.storage, pipeline_builder.table_operations, pipeline_builder.validation.execution_validator, pipeline_builder_base.dependencies, pipeline_builder_base.errors, pipeline_builder_base.logging, pipeline_builder_base.models

# mypy: ignore-errors
"""Production-ready execution system for pipeline execution.

This module provides a robust execution engine that handles pipeline execution
with comprehensive error handling, step-by-step processing, and detailed reporting.
The engine uses a service-oriented architecture with dedicated step executors,
validation services, and storage services for clean separation of concerns.

Key Features:
    - Step-by-Step Execution: Process pipeline steps individually with detailed tracking
    - Comprehensive Error Handling: Detailed error messages with context and suggestions
    - Multiple Execution Modes: Initial load, incremental, full refresh, and validation-only
    - Dependency-Aware Execution: Automatically analyzes step dependencies and executes
      in correct order
    - Detailed Reporting: Comprehensive execution reports with metrics and timing
    - Validation Integration: Built-in validation with configurable thresholds
    - Service-Oriented Architecture: Clean separation with step executors, validators,
      and storage services

Execution Modes:
    INITIAL: First-time pipeline execution with full data processing. Allows schema
        changes and creates tables from scratch.
    INCREMENTAL: Process only new data based on watermark columns. Requires exact
        schema matching with existing tables.
    FULL_REFRESH: Reprocess all data, overwriting existing results. Requires exact
        schema matching.
    VALIDATION_ONLY: Validate data without writing results. Useful for testing
        validation rules.

Dependency Analysis:
    The engine automatically analyzes step dependencies and executes steps
    sequentially in the correct order using topological sort. Steps execute
    one at a time in dependency order to respect dependency constraints.

Service Architecture:
    The execution engine delegates to specialized services:
    - Step Executors: BronzeStepExecutor, SilverStepExecutor, GoldStepExecutor
        handle step-specific execution logic
    - ExecutionValidator: Validates data according to step rules
    - TableService: Manages table operations and schema management
    - WriteService: Handles all write operations to Delta Lake

Example:
    Basic usage with a single step:

    >>> from pipeline_builder.execution import ExecutionEngine
    >>> from pipeline_builder_base.models import ExecutionMode, PipelineConfig
    >>> from pipeline_builder.models import BronzeStep
    >>> from pipeline_builder.functions import get_default_functions
    >>> F = get_default_functions()
    >>>
    >>> # Create execution engine
    >>> config = PipelineConfig.create_default(schema="my_schema")
    >>> engine = ExecutionEngine(spark, config)
    >>>
    >>> # Execute a single step
    >>> result = engine.execute_step(
    ...     step=BronzeStep(name="events", rules={"id": [F.col("id").isNotNull()]}),
    ...     context={"events": source_df},
    ...     mode=ExecutionMode.INITIAL
    ... )
    >>> print(f"Step completed: {result.status}, rows: {result.rows_processed}")

    Full pipeline execution:

    >>> result = engine.execute_pipeline(
    ...     steps=[bronze_step, silver_step, gold_step],
    ...     mode=ExecutionMode.INITIAL,
    ...     context={"events": source_df}
    ... )
    >>> print(f"Pipeline completed: {result.status}")
    >>> print(f"Steps executed: {len(result.steps) if result.steps else 0}")

Note:
    This module depends on:
    - compat: Spark compatibility layer
    - dependencies: Dependency analysis
    - errors: Error handling
    - functions: PySpark function protocols
    - logging: Pipeline logging
    - models.pipeline: Pipeline configuration models
    - models.steps: Step models
    - table_operations: Table utility functions
    - validation.data_validation: Data validation logic
"""

from __future__ import annotations

import os
import tempfile
import uuid
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional, Union, cast

try:
    import psutil

    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False
    psutil = None  # type: ignore[assignment, unused-ignore]
# from .dependencies import DependencyAnalyzer  # Removed: defined in notebook cells above
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
    # ExecutionMode,
    # PipelineConfig,
# )

# from .compat import DataFrame, F, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from .functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from .models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .step_executors import (  # Removed: defined in notebook cells above
    # BronzeStepExecutor,
    # GoldStepExecutor,
    # SilverStepExecutor,
# )
# from .storage import TableService, WriteService  # Removed: defined in notebook cells above
# from .table_operations import fqn, table_exists, table_schema_is_empty  # Removed: defined in notebook cells above
# from .validation.execution_validator import ExecutionValidator  # Removed: defined in notebook cells above

# Handle optional Delta Lake dependency
try:
    from delta.tables import DeltaTable

    HAS_DELTA = True
except (ImportError, AttributeError, RuntimeError):
    DeltaTable = None  # type: ignore[misc, assignment]
    HAS_DELTA = False

# Cache of Delta Lake availability results, one entry per Spark session.
# Keys are the string ids computed by _is_delta_lake_available_execution
# (str(id(...)) of the session's _jsparkSession, or of the session itself).
# NOTE(review): Python ids can be reused once an object is garbage-collected,
# so a stale entry could be served to a later session - confirm this is acceptable.
_delta_availability_cache_execution: Dict[str, bool] = {}

def _is_delta_lake_available_execution(spark: SparkSession) -> bool:  # type: ignore[valid-type]
    """Report whether Delta Lake can be used with the given Spark session.

    The answer is memoised in ``_delta_availability_cache_execution`` so the
    probing below runs at most once per cache key.

    Args:
        spark: SparkSession instance to probe.

    Returns:
        True when Delta Lake is considered usable, False otherwise.

    Note:
        Checks run in this order, stopping at the first decisive one:

        1. A cached answer for this session, if present.
        2. ``HAS_DELTA`` is False (the delta package failed to import):
           the answer is False.
        3. Both ``spark.sql.extensions`` and
           ``spark.sql.catalog.spark_catalog`` mention Delta: the answer is
           True, without performing any write.
        4. Only the extension is configured: a one-row DataFrame is written
           in Delta format to a temporary directory; success means True.
        5. Anything else, including any exception raised along the way,
           means False.

        The cache key is the Python ``id()`` of ``spark._jsparkSession`` when
        that attribute exists, otherwise ``id(spark)``.
    """
    # Derive the cache key; fall back to the session object's own id on any error.
    try:
        key_source = spark._jsparkSession if hasattr(spark, "_jsparkSession") else spark
        cache_key = str(id(key_source))
    except Exception:
        cache_key = str(id(spark))

    # Serve a previously computed answer when there is one.
    try:
        return _delta_availability_cache_execution[cache_key]
    except KeyError:
        pass

    def _remember(available: bool) -> bool:
        """Store the answer for this session and hand it back."""
        _delta_availability_cache_execution[cache_key] = available
        return available

    # Without the delta package nothing else can succeed.
    if not HAS_DELTA:
        return _remember(False)

    # Fast path: extension and catalog are both configured for Delta.
    try:
        extensions = spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
        catalog = spark.conf.get("spark.sql.catalog.spark_catalog", "")  # type: ignore[attr-defined]
        fully_configured = (
            extensions
            and catalog
            and "DeltaSparkSessionExtension" in extensions
            and "DeltaCatalog" in catalog
        )
        if fully_configured:
            return _remember(True)
    except Exception:
        pass  # Config lookup failed; fall through to the write probe

    # Slow path: extension only - prove Delta works with a tiny throwaway write.
    try:
        extensions = spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
        if extensions and "DeltaSparkSessionExtension" in extensions:
            probe_df = spark.createDataFrame([(1, "test")], ["id", "name"])
            # A fresh temporary directory keeps concurrent probes from colliding
            with tempfile.TemporaryDirectory() as scratch_dir:
                probe_path = os.path.join(scratch_dir, "delta_test")
                try:
                    probe_df.write.format("delta").mode("overwrite").save(probe_path)
                    return _remember(True)
                except Exception:
                    pass  # Delta write failed - treated as unavailable
    except Exception:
        pass  # Probe could not be set up or torn down

    # Nothing proved Delta usable for this session.
    return _remember(False)

# Removed _check_batch_mode_with_delta() - Delta Lake does support batch operations
# in real Spark mode. The previous restriction was incorrect.

def _create_dataframe_writer(
    df: DataFrame,
    spark: SparkSession,  # type: ignore[valid-type]
    mode: str,
    table_name: Optional[str] = None,
    **options: Any,
) -> Any:
    """Create a DataFrameWriter using the standardized Delta write pattern.

    Creates a DataFrameWriter configured for Delta Lake format with appropriate
    options based on the write mode. For overwrite mode, includes
    overwriteSchema option to allow schema evolution.

    Args:
        df: DataFrame to write.
        spark: SparkSession instance (used for Delta overwrite preparation).
        mode: Write mode ("overwrite", "append", etc.).
        table_name: Optional fully qualified table name for preparing Delta
            overwrite operations.
        **options: Additional write options to apply to the writer.

    Returns:
        Configured DataFrameWriter instance ready for saveAsTable().

    Note:
        Always uses Delta format. Failures will propagate if Delta is not
        available. For overwrite mode, uses format("delta").mode("overwrite")
        .option("overwriteSchema", "true").
    """
    # Use standardized overwrite pattern: overwrite + overwriteSchema
    if mode == "overwrite":
        writer = (
            df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
        )
    else:
        # Append or other modes - always use Delta
        writer = df.write.format("delta").mode(mode)

    for key, value in options.items():
        writer = writer.option(key, value)

    return writer

def _get_existing_schema_safe(spark: Any, table_name: str) -> Optional[Any]:
    """Safely get the schema of an existing table.

    Attempts multiple methods to retrieve the table schema, handling catalog
    sync issues where Spark may report empty schemas. Tries progressively
    more expensive methods until schema is found or all methods are exhausted.

    Args:
        spark: SparkSession instance.
        table_name: Fully qualified table name (schema.table).

    Returns:
        StructType schema if table exists and schema is readable (may be
        empty struct<>), None if table doesn't exist or schema can't be read.

    Note:
        Tries methods in order:
        1. Direct schema from spark.table()
        2. If empty schema (catalog sync issue), try DESCRIBE TABLE
        3. If still empty, try reading a sample row to infer schema

        Returns empty struct<> if table exists but schema cannot be determined,
        allowing callers to handle catalog sync issues appropriately.
    """
    try:
        table_df = spark.table(table_name)  # type: ignore[attr-defined]
        schema = table_df.schema  # type: ignore[attr-defined]

        # If schema is empty (catalog sync issue), try DESCRIBE TABLE as fallback
        if not schema.fields or len(schema.fields) == 0:
            try:
                # Try DESCRIBE TABLE to get schema information
                describe_df = spark.sql(f"DESCRIBE TABLE {table_name}")  # type: ignore[attr-defined]
                describe_rows = describe_df.collect()  # type: ignore[attr-defined]

                # If DESCRIBE returns rows with column info, try to read schema from data
                if describe_rows and len(describe_rows) > 0:
                    # Try reading a sample row to infer schema
                    try:
                        sample_df = spark.sql(f"SELECT * FROM {table_name} LIMIT 1")  # type: ignore[attr-defined]
                        inferred_schema = sample_df.schema  # type: ignore[attr-defined]
                        if inferred_schema.fields and len(inferred_schema.fields) > 0:
                            return inferred_schema
                    except Exception:
                        pass  # Schema inference from sample failed
            except Exception:
                pass  # DESCRIBE or sample read failed

        # Return schema even if empty (struct<>) - caller will handle empty schemas specially
        return schema
    except Exception:
        pass  # Schema recovery failed; caller gets None
    return None

def _schemas_match(existing_schema: Any, output_schema: Any) -> tuple[bool, list[str]]:
    """Compare two schemas and determine if they match exactly.

    Compares field names, types, and nullability between existing and output
    schemas. Returns detailed information about any mismatches found.

    Args:
        existing_schema: StructType schema of the existing table.
        output_schema: StructType schema of the output DataFrame.

    Returns:
        Tuple of (matches: bool, differences: list[str]) where:
        - matches: True if schemas match exactly, False otherwise
        - differences: List of human-readable descriptions of mismatches

    Note:
        Checks for:
        - Missing columns in output
        - New columns in output
        - Type mismatches in common columns
        - Nullable changes (informational only - doesn't fail validation)

        Column order differences are noted but don't affect the match result.
    """
    differences = []

    # Extract field dictionaries
    existing_fields = (
        {f.name: f for f in existing_schema.fields} if existing_schema.fields else {}
    )
    output_fields = (
        {f.name: f for f in output_schema.fields} if output_schema.fields else {}
    )

    existing_columns = set(existing_fields.keys())
    output_columns = set(output_fields.keys())

    # Check for missing columns in output
    missing_in_output = existing_columns - output_columns
    if missing_in_output:
        differences.append(f"Missing columns in output: {sorted(missing_in_output)}")

    # Check for new columns in output
    new_in_output = output_columns - existing_columns
    if new_in_output:
        differences.append(
            f"New columns in output (not in existing table): {sorted(new_in_output)}"
        )

    # Check for type mismatches and nullable changes in common columns
    common_columns = existing_columns & output_columns
    type_mismatches = []
    nullable_changes = []
    for col in common_columns:
        existing_field = existing_fields[col]
        output_field = output_fields[col]

        # Check type mismatch
        if existing_field.dataType != output_field.dataType:
            type_mismatches.append(
                f"{col}: existing={existing_field.dataType}, "
                f"output={output_field.dataType}"
            )

        # Check nullable changes (nullable -> non-nullable is stricter, non-nullable -> nullable is more lenient)
        existing_nullable = getattr(existing_field, "nullable", True)
        output_nullable = getattr(output_field, "nullable", True)
        if existing_nullable != output_nullable:
            if not existing_nullable and output_nullable:
                # Existing is non-nullable, output is nullable - this is usually OK (more lenient)
                nullable_changes.append(
                    f"{col}: nullable changed from False to True (more lenient - usually OK)"
                )
            else:
                # Existing is nullable, output is non-nullable - this is stricter and may cause issues
                nullable_changes.append(
                    f"{col}: nullable changed from True to False (stricter - may cause issues if data has nulls)"
                )

    if type_mismatches:
        differences.append(f"Type mismatches: {', '.join(type_mismatches)}")

    if nullable_changes:
        # Note nullable changes but don't fail validation for them (Delta Lake handles this)
        differences.append(
            f"Nullable changes (informational): {', '.join(nullable_changes)}"
        )

    # Check for column order differences (informational only - order doesn't affect functionality)
    existing_order = list(existing_fields.keys())
    output_order = list(output_fields.keys())
    if (
        existing_order != output_order
        and common_columns == existing_columns == output_columns
    ):
        # All columns match, just order is different
        differences.append(
            f"Column order differs (informational - order doesn't affect functionality): "
            f"existing={existing_order}, output={output_order}"
        )

    return len(
        [d for d in differences if "informational" not in d.lower()]
    ) == 0, differences

def _recover_table_schema(spark: Any, table_name: str) -> Optional[Any]:
    """Attempt to recover table schema when catalog shows empty schema.

    Attempts to recover schema information when Spark catalog reports an
    empty schema (struct<>), which can occur due to catalog sync issues
    with Delta Lake tables.

    Args:
        spark: SparkSession instance.
        table_name: Fully qualified table name (schema.table).

    Returns:
        Recovered StructType schema if recovery succeeds, None if all
        recovery methods fail.

    Note:
        Tries methods in order:
        1. REFRESH TABLE and re-read schema
        2. DESCRIBE TABLE to get column information
        3. Read sample data to force schema resolution
        4. Force schema evaluation by reading a row

        This is a best-effort recovery - may return None if table doesn't
        exist or all recovery methods fail.
    """
    try:
        # Method 1: Try DESCRIBE TABLE and REFRESH to force catalog sync
        try:
            spark.sql(f"REFRESH TABLE {table_name}")  # type: ignore[attr-defined]
        except Exception:
            pass  # Ignore refresh errors

        # Re-read table after refresh
        table_df = spark.table(table_name)  # type: ignore[attr-defined]
        df_schema = table_df.schema  # type: ignore[attr-defined]

        # Check if schema now has fields
        if (
            hasattr(df_schema, "fields")
            and df_schema.fields
            and len(df_schema.fields) > 0
        ):
            return df_schema

        # Method 2: Even if schema.fields is empty, check if DataFrame has columns
        # This indicates the table exists and has data, just catalog is out of sync
        if table_df.columns and len(table_df.columns) > 0:
            # DataFrame has columns - try DESCRIBE TABLE to get schema information
            try:
                describe_df = spark.sql(f"DESCRIBE TABLE {table_name}")  # type: ignore[attr-defined]
                describe_rows = describe_df.collect()  # type: ignore[attr-defined]

                # If DESCRIBE returns column information, try to re-read the table
                # Sometimes DESCRIBE helps Spark refresh its understanding of the schema
                if describe_rows and len(describe_rows) > 0:
                    # Try reading the table again after DESCRIBE
                    table_df_retry = spark.table(table_name)  # type: ignore[attr-defined]
                    df_schema_retry = table_df_retry.schema  # type: ignore[attr-defined]
                    if (
                        hasattr(df_schema_retry, "fields")
                        and df_schema_retry.fields
                        and len(df_schema_retry.fields) > 0
                    ):
                        return df_schema_retry
            except Exception:
                # DESCRIBE or re-read failed
                pass

            # If DESCRIBE didn't help, try to force schema resolution by reading data
            try:
                # Attempt to read a row to force Spark to resolve schema
                sample = table_df.limit(1)
                sample.collect()  # Force execution

                # Re-read schema after forcing execution
                df_schema_retry = table_df.schema  # type: ignore[attr-defined]
                if (
                    hasattr(df_schema_retry, "fields")
                    and df_schema_retry.fields
                    and len(df_schema_retry.fields) > 0
                ):
                    return df_schema_retry
            except Exception:
                # Schema recovery via data read failed
                pass
    except Exception:
        # Schema recovery failed
        pass

    return None

class StepStatus(Enum):
    """Lifecycle state of a single pipeline step.

    Each member's value is the lower-case form of its name.

    Attributes:
        PENDING: Queued, not started yet.
        RUNNING: Currently executing.
        COMPLETED: Finished successfully.
        FAILED: Finished with an error.
        SKIPPED: Not executed (e.g., due to dependencies).
    """

    # Not yet started
    PENDING = "pending"
    # In progress
    RUNNING = "running"
    # Terminal states
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"

class StepType(Enum):
    """Layer of the Medallion architecture a pipeline step belongs to.

    Each member's value is the lower-case form of its name.

    Attributes:
        BRONZE: Raw data ingestion and validation.
        SILVER: Cleaned and enriched data.
        GOLD: Business analytics and aggregation.
    """

    # Listed from the rawest layer to the most refined one
    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

@dataclass
class StepExecutionResult:
    """Outcome record for one executed pipeline step.

    Bundles identity, status, timing, row counts, validation rate and
    resource metrics for a single step run.

    Attributes:
        step_name: Name of the executed step.
        step_type: Type of step (BRONZE, SILVER, or GOLD).
        status: Execution status of the step.
        start_time: When step execution started.
        end_time: When step execution finished; None while unfinished.
        duration: Elapsed seconds between start_time and end_time.
        error: Error message for a failed step, otherwise None.
        rows_processed: Number of rows processed by the step.
        output_table: Fully qualified output table name, if any.
        write_mode: Write mode used ("overwrite", "append", or None).
        validation_rate: Percentage of rows that passed validation (0-100);
            defaults to 100.0.
        rows_written: Number of rows written to the output table, if any.
        input_rows: Number of input rows.
        memory_usage_mb: Memory usage in megabytes, when collected.
        cpu_usage_percent: CPU usage percentage, when collected.

    Note:
        ``duration`` is derived only once, in ``__post_init__``, and only
        when both timestamps are supplied to the constructor; in that case
        it replaces any ``duration`` passed in. Setting ``end_time`` later
        does not update it.
    """

    # Identity and state (required)
    step_name: str
    step_type: StepType
    status: StepStatus
    start_time: datetime
    # Timing
    end_time: Optional[datetime] = None
    duration: Optional[float] = None
    # Outcome details
    error: Optional[str] = None
    rows_processed: Optional[int] = None
    output_table: Optional[str] = None
    write_mode: Optional[str] = None
    validation_rate: float = 100.0
    rows_written: Optional[int] = None
    input_rows: Optional[int] = None
    # Resource metrics
    memory_usage_mb: Optional[float] = None
    cpu_usage_percent: Optional[float] = None

    def __post_init__(self) -> None:
        """Derive ``duration`` when both timestamps were provided."""
        if not (self.end_time and self.start_time):
            return
        elapsed = self.end_time - self.start_time
        self.duration = elapsed.total_seconds()

@dataclass
class ExecutionResult:
    """Outcome record for a complete pipeline run.

    Attributes:
        execution_id: Unique identifier for this execution run.
        mode: Execution mode used for the run.
        start_time: When pipeline execution started.
        end_time: When pipeline execution finished; None while unfinished.
        duration: Elapsed seconds between start_time and end_time.
        status: Overall pipeline status ("running", "completed", "failed");
            defaults to "running".
        steps: StepExecutionResult entries, one per executed step.
        error: Error message for a failed run, otherwise None.

    Note:
        ``__post_init__`` replaces a ``steps`` value of None with a new empty
        list, and derives ``duration`` when both timestamps are supplied to
        the constructor (replacing any ``duration`` passed in).
    """

    # Identity (required)
    execution_id: str
    mode: ExecutionMode
    start_time: datetime
    # Timing
    end_time: Optional[datetime] = None
    duration: Optional[float] = None
    # Outcome
    status: str = "running"
    steps: Optional[list[StepExecutionResult]] = None
    error: Optional[str] = None

    def __post_init__(self) -> None:
        """Default ``steps`` to an empty list and derive ``duration``."""
        # None default avoids sharing one list between instances
        if self.steps is None:
            self.steps = []

        started, finished = self.start_time, self.end_time
        if finished and started:
            elapsed = finished - started
            self.duration = elapsed.total_seconds()

class ExecutionEngine:
    """Execution engine for pipeline execution with service-oriented architecture.

    This engine orchestrates pipeline execution using specialized services for
    clean separation of concerns. It handles both individual step execution and
    full pipeline execution with dependency-aware sequential processing.

    The engine uses a service-oriented architecture:
    - Step Executors: BronzeStepExecutor, SilverStepExecutor, GoldStepExecutor
      handle step-specific execution logic
    - ExecutionValidator: Validates data according to step rules
    - TableService: Manages table operations and schema management
    - WriteService: Handles all write operations to Delta Lake

    Key Features:
        - Dependency-aware execution: Automatically analyzes and respects step
          dependencies
        - Sequential execution: Steps execute in correct dependency order (topological sort)
        - Comprehensive validation: Built-in validation with configurable thresholds
        - Error handling: Detailed error messages with context and suggestions
        - Resource tracking: Monitors memory and CPU usage (if psutil available)

    Example:
        >>> from pipeline_builder.execution import ExecutionEngine
        >>> from pipeline_builder_base.models import PipelineConfig, ExecutionMode
        >>>
        >>> config = PipelineConfig.create_default(schema="analytics")
        >>> engine = ExecutionEngine(spark, config)
        >>>
        >>> # Execute a single step
        >>> result = engine.execute_step(
        ...     step=bronze_step,
        ...     context={"events": source_df},
        ...     mode=ExecutionMode.INITIAL
        ... )
        >>>
        >>> # Execute full pipeline
        >>> result = engine.execute_pipeline(
        ...     steps=[bronze, silver, gold],
        ...     mode=ExecutionMode.INITIAL,
        ...     context={"events": source_df}
        ... )
    """

    def __init__(
        self,
        spark: SparkSession,  # type: ignore[valid-type]
        config: PipelineConfig,
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
    ):
        """Wire up the engine and the services it delegates to.

        Args:
            spark: SparkSession handed to the step executors and the storage
                services.
            config: PipelineConfig stored on the engine as ``self.config``.
            logger: Optional PipelineLogger. When None, a new
                ``PipelineLogger()`` is created.
            functions: Optional FunctionsProtocol implementation. When None,
                the result of ``get_default_functions()`` is used.

        Note:
            Construction creates, in order: the bronze, silver and gold step
            executors, the ExecutionValidator, the TableService and the
            WriteService (which is given the TableService). The executors and
            the validator share the same logger and functions objects.
        """
        self.spark: SparkSession = spark  # type: ignore[valid-type]
        self.config = config

        # Fall back to defaults for the optional collaborators
        self.logger = PipelineLogger() if logger is None else logger
        # get_default_functions is defined in an earlier notebook cell
        self.functions = get_default_functions() if functions is None else functions

        # One executor per step type, all sharing the same collaborators
        executor_args = (spark, self.logger, self.functions)
        self.bronze_executor = BronzeStepExecutor(*executor_args)
        self.silver_executor = SilverStepExecutor(*executor_args)
        self.gold_executor = GoldStepExecutor(*executor_args)

        # Validation service
        self.validator = ExecutionValidator(self.logger, self.functions)

        # Storage services; the write service builds on the table service
        self.table_service = TableService(spark, self.logger)
        self.write_service = WriteService(spark, self.table_service, self.logger)

    def _ensure_schema_exists(self, schema: str) -> None:
        """Ensure a schema exists, creating it if necessary.

        Looks the schema up in the catalog first and returns immediately when
        it is already listed. Otherwise runs ``CREATE SCHEMA IF NOT EXISTS``
        and verifies that the schema then appears in the catalog.

        Args:
            schema: Schema name to create or verify.

        Raises:
            ExecutionError: If the CREATE statement fails, or if the schema is
                still not listed in the catalog afterwards.

        Note:
            Names are compared case-insensitively. With Spark's default
            settings (``spark.sql.caseSensitive=false``) database names are
            stored lower-cased, so a request for "Analytics" is listed as
            "analytics"; an exact string comparison would report a successful
            creation as a failure.
            NOTE(review): with case-sensitive analysis enabled on a catalog
            that preserves case, two schemas differing only by case would be
            treated as the same here - confirm that is acceptable.
        """
        wanted = schema.lower()

        def _is_listed(names: list) -> bool:
            """True when ``schema`` appears in ``names``, ignoring case."""
            return any(str(name).lower() == wanted for name in names)

        # Fast path: nothing to do when the catalog already lists the schema
        try:
            databases = [db.name for db in self.spark.catalog.listDatabases()]
            if _is_listed(databases):
                return  # Schema already exists, nothing to do
        except Exception:
            pass  # If we can't check, try to create anyway

        try:
            # Use SQL CREATE SCHEMA (works for both PySpark and mock-spark)
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")  # type: ignore[attr-defined]
            # Verify it was created
            databases = [db.name for db in self.spark.catalog.listDatabases()]  # type: ignore[attr-defined]
            if not _is_listed(databases):
                raise ExecutionError(
                    f"Schema '{schema}' creation via SQL failed - schema not in catalog. "
                    f"Available databases: {databases}"
                )
        except ExecutionError:
            raise  # Already the right type; don't wrap it again
        except Exception as e:
            # Wrap other exceptions so callers only deal with ExecutionError
            raise ExecutionError(f"Failed to create schema '{schema}': {str(e)}") from e

    @staticmethod
    def _collect_resource_metrics() -> tuple[Optional[float], Optional[float]]:
        """Sample memory and CPU usage of the current process.

        Returns:
            Tuple of (memory_usage_mb, cpu_usage_percent) where:
            - memory_usage_mb: Resident set size (RSS) converted from bytes
              to megabytes.
            - cpu_usage_percent: Value of ``cpu_percent(interval=0.1)``.

            (None, None) when ``HAS_PSUTIL`` is False or when any psutil
            call raises.

        Note:
            The CPU sample uses a 0.1 second measurement interval.
        """
        # psutil is optional; without it there is nothing to measure
        if not HAS_PSUTIL:
            return None, None

        try:
            current_process = psutil.Process()
            rss_mb = current_process.memory_info().rss / (1024 * 1024)  # bytes -> MB
            return rss_mb, current_process.cpu_percent(interval=0.1)
        except Exception:
            # Metrics are best-effort: report "unknown" instead of failing the step
            return None, None

    def execute_step(
        self,
        step: Union[BronzeStep, SilverStep, GoldStep],
        context: Dict[str, DataFrame],  # type: ignore[valid-type]
        mode: ExecutionMode = ExecutionMode.INITIAL,
        step_params: Optional[Dict[str, Any]] = None,
        write_outputs: bool = True,
        step_types: Optional[Dict[str, str]] = None,
    ) -> StepExecutionResult:
        """Execute a single pipeline step.

        Executes a single step (Bronze, Silver, or Gold) with validation,
        transformation, and optional table writing. Uses specialized step
        executors for step-specific logic.

        Args:
            step: The step to execute (BronzeStep, SilverStep, or GoldStep).
            context: Dictionary mapping step names to DataFrames. Must contain
                required source data for the step (e.g., bronze data for Silver
                steps, silver data for Gold steps).
            mode: Execution mode (INITIAL, INCREMENTAL, FULL_REFRESH,
                VALIDATION_ONLY). Defaults to INITIAL.
            step_params: Optional dictionary of parameters to pass to the step's
                transform function. Only used for Silver and Gold steps. If the
                transform function accepts a 'params' argument or **kwargs,
                these will be passed. Otherwise, ignored for backward compatibility.
            write_outputs: If True, write step outputs to Delta Lake tables.
                If False, skip writing (useful for debugging/iteration).
                Defaults to True.
            step_types: Optional dictionary mapping step names to step types
                ("bronze", "silver", "gold"). Used to build prior_golds for
                transform functions. If None, prior_golds will not be built.

        Returns:
            StepExecutionResult containing execution details including:
            - Status (COMPLETED, FAILED)
            - Timing information
            - Row counts (processed, written)
            - Validation metrics
            - Resource usage (if available)

        Raises:
            ExecutionError: If step execution fails for any reason.

        Example:
            >>> result = engine.execute_step(
            ...     step=silver_step,
            ...     context={"events": bronze_df},
            ...     mode=ExecutionMode.INITIAL
            ... )
            >>> print(f"Status: {result.status}, Rows: {result.rows_processed}")

        Note:
            - Bronze steps only validate data, they don't write to tables
            - Silver and Gold steps write to Delta Lake tables
            - Validation is applied according to step rules
            - Schema validation is performed for INCREMENTAL and FULL_REFRESH modes
        """
        start_time = datetime.now()
        # Collect initial resource metrics
        start_memory, start_cpu = self._collect_resource_metrics()

        # Determine step type using step_type property (avoids isinstance issues in Python 3.8)
        # Initialize step_type to None to ensure it's always defined for exception handling
        step_type = None
        try:
            phase = step.step_type
            if phase.value == "bronze":
                step_type = StepType.BRONZE
            elif phase.value == "silver":
                step_type = StepType.SILVER
            elif phase.value == "gold":
                step_type = StepType.GOLD
            else:
                raise ValueError(f"Unknown step type: {phase.value}")
        except AttributeError as err:
            raise ValueError(
                f"Unknown step type: Step must have step_type property (BronzeStep, SilverStep, or GoldStep), got {type(step)}"
            ) from err

        result = StepExecutionResult(
            step_name=step.name,
            step_type=step_type,
            status=StepStatus.RUNNING,
            start_time=start_time,
        )

        try:
            # Use logger's step_start method for consistent formatting with emoji and uppercase
            self.logger.step_start(step_type.value, step.name)

            # Execute the step based on type using executors
            output_df: DataFrame  # type: ignore[valid-type]
            if step_type == StepType.BRONZE:
                output_df = self.bronze_executor.execute(step, context, mode)  # type: ignore[arg-type]
            elif step_type == StepType.SILVER:
                output_df = self.silver_executor.execute(
                    step, context, mode, step_params=step_params, step_types=step_types
                )  # type: ignore[arg-type]
                # Store output DataFrame in context immediately after execution for downstream steps
                # This ensures prior_silvers is populated for subsequent silver steps
                context[step.name] = output_df  # type: ignore[assignment]
            elif step_type == StepType.GOLD:
                output_df = self.gold_executor.execute(
                    step, context, mode, step_params=step_params, step_types=step_types
                )  # type: ignore[arg-type]
                # Store output DataFrame in context immediately after execution for downstream steps
                context[step.name] = output_df  # type: ignore[assignment]
            else:
                raise ExecutionError(f"Unknown step type: {step_type}")

            # Apply validation if not in validation-only mode
            validation_rate = 100.0
            invalid_rows = 0
            if mode != ExecutionMode.VALIDATION_ONLY:
                # All step types (Bronze, Silver, Gold) have rules attribute
                if step.rules:
                    # Use validation service for validation
                    output_df, _, validation_stats = (
                        self.validator.validate_step_output(
                            output_df,
                            step.name,
                            step.rules,
                            "pipeline",
                        )
                    )
                    # Extract validation metrics
                    validation_rate, invalid_rows = (
                        self.validator.get_validation_metrics(validation_stats)
                    )

            # Write output if not in validation-only mode and write_outputs is True
            # Note: Bronze steps only validate data, they don't write to tables
            # Validation-only steps (with_silver_rules / with_gold_rules) only read existing tables
            _is_validation_only_step = (
                getattr(step, "existing", False) and step.transform is None
            )
            if (
                mode != ExecutionMode.VALIDATION_ONLY
                and step_type != StepType.BRONZE
                and write_outputs
                and not _is_validation_only_step
            ):
                # Use table_name attribute for SilverStep and GoldStep
                table_name = getattr(step, "table_name", step.name)
                schema = getattr(step, "schema", None)

                # Validate schema is provided
                if schema is None:
                    raise ExecutionError(
                        f"Step '{step.name}' requires a schema to be specified. "
                        f"Silver and Gold steps must have a valid schema for table operations. "
                        f"Please provide a schema when creating the step."
                    )

                output_table = fqn(schema, table_name)

                # Ensure schema exists before creating table
                # Use SQL CREATE SCHEMA (works for both PySpark and mock-spark)
                try:
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")  # type: ignore[attr-defined]
                except Exception as e:
                    # Tolerate errors whose message says the schema "already exists"
                    # or is a "duplicate"; any other failure is re-raised as an
                    # ExecutionError because schema creation is critical.
                    error_msg = str(e).lower()
                    if (
                        "already exists" not in error_msg
                        and "duplicate" not in error_msg
                    ):
                        raise ExecutionError(
                            f"Failed to create schema '{schema}' before table creation: {e}"
                        ) from e

                # Determine write mode
                # - Gold steps always use overwrite to prevent duplicate aggregates
                # - Silver steps append during incremental runs to preserve history
                # - INITIAL mode uses overwrite
                # - FULL_REFRESH uses overwrite
                if step_type == StepType.GOLD:
                    write_mode_str = "overwrite"
                elif mode == ExecutionMode.INCREMENTAL:
                    write_mode_str = "append"
                else:  # INITIAL or FULL_REFRESH
                    write_mode_str = "overwrite"

                # Validate schema based on execution mode
                # For INCREMENTAL and FULL_REFRESH modes, schema must match exactly
                # For INITIAL mode, schema changes are allowed
                if mode in (ExecutionMode.INCREMENTAL, ExecutionMode.FULL_REFRESH):
                    if table_exists(self.spark, output_table):
                        # Refresh table metadata to ensure catalog is in sync (especially important for Delta tables)
                        try:
                            self.spark.sql(f"REFRESH TABLE {output_table}")  # type: ignore[attr-defined]
                        except Exception as refresh_error:
                            # Refresh might fail for some table types - log but continue
                            self.logger.debug(
                                f"Could not refresh table {output_table} before schema validation: {refresh_error}"
                            )

                        existing_schema = _get_existing_schema_safe(
                            self.spark, output_table
                        )
                        if existing_schema is None:
                            # Cannot read schema - raise error
                            raise ExecutionError(
                                f"Cannot read schema for table '{output_table}' in {mode} mode. "
                                "Schema validation is required for INCREMENTAL and FULL_REFRESH modes.",
                                context={
                                    "step_name": step.name,
                                    "table": output_table,
                                    "mode": mode.value,
                                },
                                suggestions=[
                                    "Ensure the table exists and is accessible",
                                    "Check that the table schema is readable",
                                    "Use INITIAL mode if you need to recreate the table",
                                ],
                            )

                        # If catalog reports empty schema, treat as mismatch with explicit guidance
                        schema_is_empty = (
                            not existing_schema.fields
                            or len(existing_schema.fields) == 0
                        )
                        if schema_is_empty:
                            output_schema = output_df.schema  # type: ignore[attr-defined]
                            raise ExecutionError(
                                f"Schema mismatch for table '{output_table}' in {mode} mode. "
                                f"Catalog reports empty schema (struct<>), but output schema has {len(output_schema.fields)} fields: {[f.name for f in output_schema.fields]}. "
                                f"Use INITIAL mode to recreate the table or provide schema_override explicitly.",
                                context={
                                    "step_name": step.name,
                                    "table": output_table,
                                    "mode": mode.value,
                                    "existing_schema": "struct<> (empty - catalog sync issue)",
                                    "output_schema": str(output_schema),
                                },
                                suggestions=[
                                    "Run initial_load/full_refresh to recreate the table with the desired schema",
                                    "Provide schema_override to force the schema in allowed modes",
                                ],
                            )

                        output_schema = output_df.schema  # type: ignore[attr-defined]
                        schemas_match, differences = _schemas_match(
                            existing_schema, output_schema
                        )

                        if not schemas_match:
                            raise ExecutionError(
                                f"Schema mismatch for table '{output_table}' in {mode} mode. "
                                f"Schema changes are only allowed in INITIAL mode.\n"
                                f"{chr(10).join(differences)}\n\n"
                                f"Existing table schema: {existing_schema}\n"
                                f"Output DataFrame schema: {output_schema}",
                                context={
                                    "step_name": step.name,
                                    "table": output_table,
                                    "mode": mode.value,
                                    "existing_schema": str(existing_schema),
                                    "output_schema": str(output_schema),
                                },
                                suggestions=[
                                    "Ensure the output schema matches the existing table schema exactly",
                                    "Run with INITIAL mode to recreate the table with the new schema",
                                    "Manually update the existing table schema to match the new schema",
                                ],
                            )

                # NOTE: We intentionally do NOT drop existing tables in INITIAL mode.
                # Dropping is destructive and can leave downstream users with missing tables
                # if a pipeline run fails after the drop but before the overwrite commit.
                # Delta overwrite is transactional; prefer `.mode("overwrite")` + schema options.

                # Handle schema override if provided
                schema_override = getattr(step, "schema_override", None)
                should_apply_schema_override = False

                if schema_override is not None:
                    # Determine when to apply schema override:
                    # - Gold steps: Always apply (always use overwrite mode)
                    # - Silver steps in initial/full refresh: Always apply
                    # - Silver steps in incremental: Only if table doesn't exist
                    if step_type == StepType.GOLD:
                        should_apply_schema_override = True
                    elif step_type == StepType.SILVER:
                        if mode != ExecutionMode.INCREMENTAL:
                            should_apply_schema_override = True
                        else:
                            should_apply_schema_override = not table_exists(
                                self.spark, output_table
                            )

                # Apply schema override if needed
                if should_apply_schema_override:
                    try:
                        # Cast DataFrame to the override schema
                        output_df = self.spark.createDataFrame(  # type: ignore[attr-defined]
                            output_df.rdd, schema_override
                        )  # type: ignore[attr-defined]
                        # For overwrite mode, use DELETE + INSERT pattern
                        if write_mode_str == "overwrite":
                            # Delete existing data if table exists
                            delete_succeeded = False
                            if table_exists(self.spark, output_table):
                                try:
                                    self.spark.sql(f"DELETE FROM {output_table}")  # type: ignore[attr-defined]
                                    delete_succeeded = True
                                except Exception as e:
                                    # DELETE might fail for non-Delta tables
                                    error_msg = str(e).lower()
                                    if (
                                        "does not support delete" in error_msg
                                        or "unsupported_feature" in error_msg
                                    ):
                                        self.logger.info(
                                            f"Table '{output_table}' does not support DELETE. "
                                            f"Using overwrite mode instead."
                                        )
                                    else:
                                        self.logger.warning(
                                            f"Could not delete from table '{output_table}' before overwrite: {e}"
                                        )

                            if delete_succeeded:
                                # Write with overwrite mode and overwriteSchema option
                                # Note: Delta Lake doesn't support append in batch mode
                                try:
                                    (
                                        output_df.write.format("delta")
                                        .mode("overwrite")
                                        .option("overwriteSchema", "true")
                                        .saveAsTable(output_table)  # type: ignore[attr-defined]
                                    )
                                except Exception as write_error:
                                    # Handle truncate error for Delta tables
                                    error_msg = str(write_error).lower()
                                    if (
                                        "truncate" in error_msg
                                        and "batch mode" in error_msg
                                    ):
                                        # Delta table doesn't support truncate - drop and retry
                                        self.logger.warning(
                                            f"Delta table truncate error, dropping table and retrying: {write_error}"
                                        )
                                        try:
                                            self.spark.sql(
                                                f"DROP TABLE IF EXISTS {output_table}"
                                            )  # type: ignore[attr-defined]
                                            import time

                                            time.sleep(
                                                0.1
                                            )  # Brief delay for catalog sync
                                            (
                                                output_df.write.format("delta")
                                                .mode("overwrite")
                                                .option("overwriteSchema", "true")
                                                .saveAsTable(output_table)  # type: ignore[attr-defined]
                                            )
                                            # Successfully wrote after retry
                                            write_error = None
                                        except Exception as retry_error:
                                            raise ExecutionError(
                                                f"Failed to write table '{output_table}' even after retry: {retry_error}",
                                                context={
                                                    "step_name": step.name,
                                                    "table": output_table,
                                                    "mode": mode.value,
                                                    "original_error": str(write_error),
                                                },
                                            ) from retry_error

                                    # If we handled truncate error successfully, skip other error handling
                                    if write_error is None:
                                        pass  # Write succeeded after retry, continue execution
                                    # Handle race condition where table might be created by another thread
                                    elif (
                                        "already exists" in error_msg
                                        or "table_or_view_already_exists" in error_msg
                                    ):
                                        # Table was created by another thread - verify it exists and retry with overwrite mode
                                        if table_exists(self.spark, output_table):
                                            self.logger.debug(
                                                f"Table {output_table} was created by another thread, retrying with overwrite mode"
                                            )
                                            # Retry with overwrite mode (original mode) and overwriteSchema
                                            retry_writer = _create_dataframe_writer(
                                                output_df,
                                                self.spark,
                                                "overwrite",
                                                table_name=output_table,
                                                overwriteSchema="true",
                                            )
                                            retry_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                        else:
                                            raise
                                    else:
                                        raise
                            else:
                                # DELETE failed - use overwrite mode directly
                                try:
                                    writer = _create_dataframe_writer(
                                        output_df,
                                        self.spark,
                                        "overwrite",
                                        table_name=output_table,
                                        overwriteSchema="true",
                                    )
                                    writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                except Exception as write_error:
                                    # Handle truncate error for Delta tables
                                    error_msg = str(write_error).lower()
                                    if (
                                        "truncate" in error_msg
                                        and "batch mode" in error_msg
                                    ):
                                        # Delta table doesn't support truncate - drop and retry
                                        self.logger.warning(
                                            f"Delta table truncate error, dropping table and retrying: {write_error}"
                                        )
                                        try:
                                            self.spark.sql(
                                                f"DROP TABLE IF EXISTS {output_table}"
                                            )  # type: ignore[attr-defined]
                                            import time

                                            time.sleep(
                                                0.1
                                            )  # Brief delay for catalog sync
                                            retry_writer = _create_dataframe_writer(
                                                output_df,
                                                self.spark,
                                                "overwrite",
                                                table_name=output_table,
                                                overwriteSchema="true",
                                            )
                                            retry_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                        except Exception as retry_error:
                                            raise ExecutionError(
                                                f"Failed to write table '{output_table}' even after retry: {retry_error}",
                                                context={
                                                    "step_name": step.name,
                                                    "table": output_table,
                                                    "mode": mode.value,
                                                    "original_error": str(write_error),
                                                },
                                            ) from retry_error
                                        # Successfully wrote after retry - exit exception handler
                                        write_error = None

                                    # If we handled truncate error successfully, skip other error handling
                                    if write_error is None:
                                        pass  # Write succeeded after retry, continue execution
                                    # Handle race condition where table might be created by another thread
                                    elif (
                                        "already exists" in error_msg
                                        or "table_or_view_already_exists" in error_msg
                                    ):
                                        # Table was created by another thread - verify it exists and retry with overwrite mode
                                        if table_exists(self.spark, output_table):
                                            self.logger.debug(
                                                f"Table {output_table} was created by another thread, retrying with overwrite mode"
                                            )
                                            # Retry with overwrite mode (original mode) and overwriteSchema
                                            retry_writer = _create_dataframe_writer(
                                                output_df,
                                                self.spark,
                                                "overwrite",
                                                table_name=output_table,
                                                overwriteSchema="true",
                                            )
                                            retry_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                        else:
                                            raise
                                    else:
                                        raise
                        else:
                            # For append mode, use normal write
                            try:
                                writer = _create_dataframe_writer(
                                    output_df,
                                    self.spark,
                                    write_mode_str,
                                    table_name=output_table,
                                    overwriteSchema="true",
                                )
                                writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                            except Exception as write_error:
                                # Handle race condition where table might be created by another thread
                                error_msg = str(write_error).lower()
                                if (
                                    "already exists" in error_msg
                                    or "table_or_view_already_exists" in error_msg
                                ):
                                    # Table was created by another thread - verify it exists and retry
                                    if table_exists(self.spark, output_table):
                                        self.logger.debug(
                                            f"Table {output_table} was created by another thread, retrying with overwrite mode"
                                        )
                                        # Retry with overwrite mode (append not supported in batch mode for Delta)
                                        retry_writer = _create_dataframe_writer(
                                            output_df,
                                            self.spark,
                                            "overwrite",
                                            overwriteSchema="true",
                                        )
                                        retry_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                    else:
                                        raise
                                else:
                                    raise
                    except Exception as e:
                        raise ExecutionError(
                            f"Failed to write table '{output_table}' with schema override: {e}",
                            context={
                                "step_name": step.name,
                                "table": output_table,
                                "schema_override": str(schema_override),
                            },
                            suggestions=[
                                "Verify that the schema_override matches the DataFrame structure",
                                "Check that all required columns are present in the DataFrame",
                                "Ensure data types are compatible",
                            ],
                        ) from e
                else:
                    # Normal write without schema override
                    # Handle INITIAL mode schema changes - allow schema changes via CREATE OR REPLACE TABLE
                    # For INCREMENTAL/FULL_REFRESH, schema validation already done above
                    if mode == ExecutionMode.INITIAL and write_mode_str == "append":
                        existing_schema = _get_existing_schema_safe(
                            self.spark, output_table
                        )
                        if existing_schema is not None:
                            # Check if schema is empty (catalog sync issue)
                            schema_is_empty = (
                                not existing_schema.fields
                                or len(existing_schema.fields) == 0
                            )
                            output_schema = output_df.schema  # type: ignore[attr-defined]

                            if schema_is_empty:
                                # Catalog reports empty schema but table exists - use CREATE OR REPLACE TABLE
                                self.logger.info(
                                    f"Table '{output_table}' exists but catalog reports empty schema. "
                                    f"Using CREATE OR REPLACE TABLE for atomic schema replacement."
                                )
                                temp_view_name = (
                                    f"_temp_{step.name}_{uuid.uuid4().hex[:8]}"
                                )
                                output_df.createOrReplaceTempView(temp_view_name)  # type: ignore[attr-defined]

                                # For Delta tables, DROP then CREATE (CREATE OR REPLACE doesn't work with Delta)
                                self.spark.sql(f"DROP TABLE IF EXISTS {output_table}")  # type: ignore[attr-defined]
                                self.spark.sql(f"""
                                    CREATE TABLE {output_table}
                                    USING DELTA
                                    AS SELECT * FROM {temp_view_name}
                                """)  # type: ignore[attr-defined]

                                try:
                                    self.spark.sql(
                                        f"DROP VIEW IF EXISTS {temp_view_name}"
                                    )  # type: ignore[attr-defined]
                                except Exception:
                                    pass  # DROP VIEW cleanup best-effort; ignore failure

                                # Skip normal write path - table already written
                                writer = None
                            else:
                                # Schema exists and is not empty - check if it matches
                                schemas_match, differences = _schemas_match(
                                    existing_schema, output_schema
                                )
                                if not schemas_match:
                                    # Schema differs - INITIAL mode allows schema changes via CREATE OR REPLACE TABLE
                                    self.logger.info(
                                        f"Schema change detected for '{output_table}' in INITIAL mode. "
                                        f"Using CREATE OR REPLACE TABLE for atomic schema replacement."
                                    )
                                    temp_view_name = (
                                        f"_temp_{step.name}_{uuid.uuid4().hex[:8]}"
                                    )
                                    output_df.createOrReplaceTempView(temp_view_name)  # type: ignore[attr-defined]

                                    # For Delta tables, DROP then CREATE (CREATE OR REPLACE doesn't work with Delta)
                                    self.spark.sql(
                                        f"DROP TABLE IF EXISTS {output_table}"
                                    )  # type: ignore[attr-defined]
                                    self.spark.sql(f"""
                                        CREATE TABLE {output_table}
                                        USING DELTA
                                        AS SELECT * FROM {temp_view_name}
                                    """)  # type: ignore[attr-defined]

                                    try:
                                        self.spark.sql(
                                            f"DROP VIEW IF EXISTS {temp_view_name}"
                                        )  # type: ignore[attr-defined]
                                    except Exception:
                                        pass  # DROP VIEW cleanup best-effort; ignore failure

                                    # Skip normal write path - table already written
                                    writer = None
                                else:
                                    # Schema matches - use DELETE + overwrite for Delta tables
                                    # or CREATE OR REPLACE TABLE for atomic replacement
                                    # Note: Delta Lake doesn't support append in batch mode, so we use overwrite
                                    if _is_delta_lake_available_execution(self.spark):
                                        # For Delta tables, use DELETE + overwrite (append not supported in batch mode)
                                        try:
                                            self.spark.sql(
                                                f"DELETE FROM {output_table}"
                                            )  # type: ignore[attr-defined]
                                            # DELETE succeeded, use overwrite mode (append not supported in batch mode)
                                            writer = _create_dataframe_writer(
                                                output_df,
                                                self.spark,
                                                "overwrite",
                                                table_name=output_table,
                                            )
                                        except Exception as delete_error:
                                            # If DELETE fails, fall back to CREATE OR REPLACE TABLE
                                            self.logger.warning(
                                                f"DELETE FROM failed for '{output_table}': {delete_error}. "
                                                f"Using CREATE OR REPLACE TABLE instead."
                                            )
                                            temp_view_name = f"_temp_{step.name}_{uuid.uuid4().hex[:8]}"
                                            output_df.createOrReplaceTempView(
                                                temp_view_name
                                            )  # type: ignore[attr-defined]
                                            self.spark.sql(f"""
                                                CREATE OR REPLACE TABLE {output_table}
                                                USING DELTA
                                                AS SELECT * FROM {temp_view_name}
                                            """)  # type: ignore[attr-defined]
                                            try:
                                                self.spark.sql(
                                                    f"DROP VIEW IF EXISTS {temp_view_name}"
                                                )  # type: ignore[attr-defined]
                                            except Exception:
                                                pass  # DROP VIEW cleanup best-effort
                                            writer = None
                                    else:
                                        # Not Delta table, use normal overwrite
                                        writer = _create_dataframe_writer(
                                            output_df,
                                            self.spark,
                                            "overwrite",
                                            table_name=output_table,
                                        )
                        else:
                            # Table doesn't exist - proceed with normal write
                            writer = _create_dataframe_writer(
                                output_df,
                                self.spark,
                                write_mode_str,
                                table_name=output_table,
                            )
                    # Heal catalog entries that report empty schema (struct<>) before choosing writer
                    if table_exists(self.spark, output_table) and table_schema_is_empty(
                        self.spark, output_table
                    ):
                        self.logger.warning(
                            f"Catalog reports empty schema for '{output_table}'. Dropping table to recreate with correct schema."
                        )
                        try:
                            self.spark.sql(f"DROP TABLE IF EXISTS {output_table}")  # type: ignore[attr-defined]
                        except Exception:
                            pass  # DROP TABLE best-effort before overwrite; write may still succeed

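                    # Create writer with appropriate mode.
                    # NOTE(review): this assignment is unconditional, so it replaces whatever the
                    # branches above stored in `writer`, including the `writer = None` markers
                    # meant to skip the normal write path - confirm this is intended.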
                    writer = _create_dataframe_writer(
                        output_df, self.spark, write_mode_str, table_name=output_table
                    )

                    # Execute write
                    if writer is not None:
                        try:
                            writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                        except Exception as write_error:
                            error_msg = str(write_error).lower()
                            # Handle catalog sync issues where Spark reports empty schema (struct<>)
                            if (
                                "struct<>" in error_msg
                                or "column number of the existing table" in error_msg
                            ):
                                # This is a catalog sync issue - try refreshing the table and retrying
                                self.logger.warning(
                                    f"Catalog sync issue detected for table '{output_table}'. "
                                    f"Refreshing table and retrying write."
                                )
                                try:
                                    # Refresh table and force schema re-read by reading actual data
                                    self.spark.sql(f"REFRESH TABLE {output_table}")  # type: ignore[attr-defined]
                                    # Force Spark to re-read schema by reading a sample row
                                    try:
                                        sample_df = self.spark.sql(
                                            f"SELECT * FROM {output_table} LIMIT 1"
                                        )  # type: ignore[attr-defined]
                                        _ = sample_df.schema  # Force schema evaluation
                                    except Exception:
                                        pass  # Ignore errors when reading sample

                                    # Try SQL-based INSERT for Delta tables (works even with catalog sync issues)
                                    if (
                                        _is_delta_lake_available_execution(self.spark)
                                        and write_mode_str == "append"
                                    ):
                                        temp_view_name = (
                                            f"_temp_{step.name}_{uuid.uuid4().hex[:8]}"
                                        )
                                        output_df.createOrReplaceTempView(
                                            temp_view_name
                                        )  # type: ignore[attr-defined]
                                        try:
                                            self.spark.sql(
                                                f"INSERT INTO {output_table} SELECT * FROM {temp_view_name}"
                                            )  # type: ignore[attr-defined]
                                            # Success - clean up and skip normal write path
                                            try:
                                                self.spark.sql(
                                                    f"DROP VIEW IF EXISTS {temp_view_name}"
                                                )  # type: ignore[attr-defined]
                                            except Exception:
                                                pass  # DROP VIEW cleanup best-effort
                                            writer = None  # Mark writer as None to skip normal write
                                            # Exit the retry block - write succeeded via SQL INSERT
                                        except Exception:
                                            # SQL INSERT also failed - try normal write as fallback
                                            try:
                                                self.spark.sql(
                                                    f"DROP VIEW IF EXISTS {temp_view_name}"
                                                )  # type: ignore[attr-defined]
                                            except Exception:
                                                pass  # DROP VIEW cleanup best-effort before fallback write
                                            writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                    else:
                                        # Retry the write after refresh
                                        writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                except Exception as retry_error:
                                    # Attempt to heal catalog by recreating table with existing + new data
                                    healed = False
                                    # For Gold steps, we can safely rebuild the table from the fresh output
                                    if step_type == StepType.GOLD:
                                        try:
                                            self.spark.sql(
                                                f"DROP TABLE IF EXISTS {output_table}"
                                            )  # type: ignore[attr-defined]
                                            direct_writer = _create_dataframe_writer(
                                                output_df,
                                                self.spark,
                                                "overwrite",
                                                table_name=output_table,
                                                overwriteSchema="true",
                                            )
                                            direct_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                            healed = True
                                            writer = None
                                        except Exception:
                                            healed = False

                                    if not healed:
                                        try:
                                            base_df = None
                                            if _is_delta_lake_available_execution(
                                                self.spark
                                            ):
                                                try:
                                                    delta_tbl = DeltaTable.forName(
                                                        self.spark, output_table
                                                    )  # type: ignore[attr-defined]
                                                    base_df = delta_tbl.toDF()
                                                except Exception:
                                                    base_df = None
                                            if base_df is not None:
                                                combined_df = base_df.unionByName(  # type: ignore[attr-defined]
                                                    output_df, allowMissingColumns=True
                                                )
                                            else:
                                                combined_df = output_df

                                            temp_view_name = f"_heal_{step.name}_{uuid.uuid4().hex[:8]}"
                                            combined_df.createOrReplaceTempView(
                                                temp_view_name
                                            )  # type: ignore[attr-defined]
                                            try:
                                                # Always drop then create to avoid truncate/replace limitations
                                                self.spark.sql(
                                                    f"DROP TABLE IF EXISTS {output_table}"
                                                )  # type: ignore[attr-defined]
                                                if _is_delta_lake_available_execution(
                                                    self.spark
                                                ):
                                                    self.spark.sql(  # type: ignore[attr-defined]
                                                        f"CREATE TABLE {output_table} USING DELTA AS SELECT * FROM {temp_view_name}"
                                                    )
                                                else:
                                                    self.spark.sql(  # type: ignore[attr-defined]
                                                        f"CREATE TABLE {output_table} USING PARQUET AS SELECT * FROM {temp_view_name}"
                                                    )
                                                healed = True
                                            finally:
                                                try:
                                                    self.spark.sql(
                                                        f"DROP VIEW IF EXISTS {temp_view_name}"
                                                    )  # type: ignore[attr-defined]
                                                except Exception:
                                                    pass  # DROP VIEW cleanup best-effort
                                        except Exception:
                                            healed = False

                                    if not healed:
                                        # Last-resort fix for catalog sync issues: drop and recreate the table when safe.
                                        try:
                                            if mode == ExecutionMode.INITIAL:
                                                self.spark.sql(
                                                    f"DROP TABLE IF EXISTS {output_table}"
                                                )  # type: ignore[attr-defined]
                                                writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                            else:
                                                raise retry_error
                                        except Exception as drop_error:
                                            # If drop+rewrite fails, convert to ExecutionError with helpful message
                                            raise ExecutionError(
                                                f"Schema validation failed for table '{output_table}' in {mode} mode. "
                                                f"Catalog reports empty schema (struct<>), indicating a catalog sync issue. "
                                                f"Original error: {write_error}",
                                                context={
                                                    "step_name": step.name,
                                                    "table": output_table,
                                                    "mode": mode.value,
                                                    "original_error": str(write_error),
                                                    "retry_error": str(retry_error),
                                                    "drop_error": str(drop_error),
                                                },
                                                suggestions=[
                                                    "This may be a Spark/Delta Lake catalog sync issue",
                                                    "Try running the pipeline again",
                                                    "If the issue persists, use INITIAL mode to recreate the table",
                                                ],
                                            ) from drop_error
                                    else:
                                        # Table healed via recreate; skip normal writer path
                                        writer = None
                            # Handle race condition where table might be created by another thread
                            elif (
                                "already exists" in error_msg
                                or "table_or_view_already_exists" in error_msg
                            ):
                                # Table was created by another thread - verify it exists and retry with overwrite mode
                                if table_exists(self.spark, output_table):
                                    self.logger.debug(
                                        f"Table {output_table} was created by another thread, retrying with overwrite mode"
                                    )
                                    # Retry with overwrite mode (append not supported in batch mode for Delta)
                                    retry_writer = _create_dataframe_writer(
                                        output_df,
                                        self.spark,
                                        "overwrite",
                                        table_name=output_table,
                                    )
                                    retry_writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                                else:
                                    raise
                            else:
                                # Different error - re-raise
                                raise
                        finally:
                            pass

                        # Refresh table metadata after write to ensure subsequent reads see the latest data
                        try:
                            self.spark.sql(f"REFRESH TABLE {output_table}")  # type: ignore[attr-defined]
                        except Exception as refresh_error:
                            # Refresh might fail for some table types or if table doesn't exist - log but don't fail
                            self.logger.debug(
                                f"Could not refresh table {output_table} after write: {refresh_error}"
                            )

                result.output_table = output_table
                result.rows_processed = output_df.count()  # type: ignore[attr-defined]

                # Set write mode in result for tracking
                result.write_mode = write_mode_str  # type: ignore[attr-defined]
            elif step_type == StepType.BRONZE:
                # Bronze steps only validate data, don't write to tables
                result.rows_processed = output_df.count()  # type: ignore[attr-defined]
                result.write_mode = None  # type: ignore[attr-defined]
            else:
                # No write: VALIDATION_ONLY execution mode or validation-only step (with_silver_rules / with_gold_rules)
                result.rows_processed = output_df.count()  # type: ignore[attr-defined]
                result.write_mode = None  # type: ignore[attr-defined]

            # Validation-only steps (with_silver_rules / with_gold_rules) never write; ensure report reflects that
            if _is_validation_only_step:
                result.write_mode = None  # type: ignore[attr-defined]
                result.output_table = None  # type: ignore[attr-defined]

            # Collect final resource metrics
            end_memory, end_cpu = self._collect_resource_metrics()

            # Calculate metrics (use end values, or delta if both available)
            if end_memory is not None:
                if start_memory is not None:
                    # Report the larger of the start-to-end delta and the end reading;
                    # this equals end_memory whenever start_memory is non-negative.
                    result.memory_usage_mb = max(end_memory - start_memory, end_memory)
                else:
                    result.memory_usage_mb = end_memory
            if end_cpu is not None:
                result.cpu_usage_percent = end_cpu

            result.status = StepStatus.COMPLETED
            result.end_time = datetime.now()
            result.duration = (result.end_time - result.start_time).total_seconds()

            # Populate result fields
            rows_processed = result.rows_processed or 0
            # For Silver/Gold steps that write, rows_written equals rows_processed
            # Bronze and validation-only steps (with_silver_rules / with_gold_rules) don't write
            if step_type == StepType.BRONZE or _is_validation_only_step:
                rows_written = None
            else:
                rows_written = rows_processed

            result.rows_written = rows_written
            result.input_rows = rows_processed
            result.validation_rate = (
                validation_rate if validation_rate is not None else 100.0
            )

            # Note: output_df is already stored in context immediately after execution
            # (for Silver and Gold steps) to ensure prior_silvers is populated for downstream steps

            # Use logger's step_complete method for consistent formatting with emoji and uppercase
            # rows_written can be None for Bronze steps, but logger expects int, so use 0 as fallback
            self.logger.step_complete(
                step_type.value,
                step.name,
                result.duration,
                rows_processed=rows_processed,
                rows_written=rows_written if rows_written is not None else 0,
                invalid_rows=invalid_rows,
                validation_rate=validation_rate,
            )

        except Exception as e:
            # Handle truncate error for Delta tables - retry with table drop
            error_msg = str(e).lower()
            if (
                "truncate" in error_msg
                and "batch mode" in error_msg
                and step_type != StepType.BRONZE
            ):
                # This is a Delta table truncate error - try to fix it
                table_name = getattr(step, "table_name", step.name)
                schema = getattr(step, "schema", None)
                if (
                    schema is not None
                    and hasattr(self, "write_service")
                    and output_df is not None
                ):
                    output_table = fqn(schema, table_name)
                    self.logger.warning(
                        f"Delta table truncate error for {output_table}, attempting to drop and retry write"
                    )
                    try:
                        # Drop the table (without CASCADE - not supported in all Spark versions)
                        self.spark.sql(f"DROP TABLE IF EXISTS {output_table}")  # type: ignore[attr-defined]
                        import time

                        time.sleep(0.2)  # Brief delay for catalog sync
                        # Retry the write using append mode (since table is dropped, append will create it)
                        # This avoids the truncate issue entirely
                        writer = _create_dataframe_writer(
                            output_df,
                            self.spark,
                            "append",  # Use append after drop to avoid truncate
                            table_name=output_table,
                        )
                        writer.saveAsTable(output_table)  # type: ignore[attr-defined]
                        rows_written = output_df.count()  # type: ignore[attr-defined]
                        # If we get here, the retry succeeded - update result and continue
                        result.status = StepStatus.COMPLETED
                        result.rows_written = rows_written
                        result.rows_processed = rows_written
                        result.end_time = datetime.now()
                        result.duration = (
                            result.end_time - result.start_time
                        ).total_seconds()
                        self.logger.info(
                            f"✅ Completed {step_type.value.upper()} step: {step.name} ({result.duration:.2f}s) - {rows_written} rows written (after truncate retry)"
                        )
                        return result
                    except Exception as retry_error:
                        # Retry also failed - fall through to normal error handling
                        self.logger.error(
                            f"Retry after truncate error also failed: {retry_error}"
                        )
                        e = retry_error  # Use retry error for final error message

            # Collect final resource metrics even on failure
            end_memory, end_cpu = self._collect_resource_metrics()

            # Calculate metrics (use end values, or delta if both available)
            if end_memory is not None:
                if start_memory is not None:
                    # Report the larger of the start-to-end delta and the end reading;
                    # this equals end_memory whenever start_memory is non-negative.
                    result.memory_usage_mb = max(end_memory - start_memory, end_memory)
                else:
                    result.memory_usage_mb = end_memory
            if end_cpu is not None:
                result.cpu_usage_percent = end_cpu

            result.status = StepStatus.FAILED
            result.error = str(e)
            result.end_time = datetime.now()
            result.duration = (result.end_time - result.start_time).total_seconds()

            # Log step failure
            self.logger.error(
                f"❌ Failed {step_type.value.upper()} step: {step.name} ({result.duration:.2f}s) - {str(e)}"
            )
            raise ExecutionError(f"Step execution failed: {e}") from e

        return result

    def execute_pipeline(
        self,
        steps: list[Union[BronzeStep, SilverStep, GoldStep]],
        mode: ExecutionMode = ExecutionMode.INITIAL,
        context: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
        step_params: Optional[Dict[str, Dict[str, Any]]] = None,
        stop_after_step: Optional[str] = None,
        start_at_step: Optional[str] = None,
        write_outputs: bool = True,
        execution_order: Optional[list[str]] = None,
    ) -> ExecutionResult:
        """Execute a complete pipeline with dependency-aware sequential execution.

        Analyzes step dependencies and executes steps sequentially in the correct
        order using topological sort. Steps execute one at a time in dependency
        order to respect dependency constraints.

        Args:
            steps: List of steps to execute. Can include Bronze, Silver, and
                Gold steps in any order - dependencies are automatically analyzed.
            mode: Execution mode (INITIAL, INCREMENTAL, FULL_REFRESH,
                VALIDATION_ONLY). Defaults to INITIAL.
            context: Optional initial execution context dictionary mapping step
                names to DataFrames. Must contain bronze source data. If None,
                empty dictionary is used.
            step_params: Optional dictionary mapping step names to parameter
                dictionaries. These parameters will be passed to the step's
                transform function if it accepts a 'params' argument or **kwargs.
            stop_after_step: Optional step name. If provided, execution stops
                after this step completes (inclusive). Useful for debugging
                or partial pipeline execution.
            start_at_step: Optional step name. If provided, execution begins
                at this step, skipping earlier steps. Earlier step outputs
                must exist in context or be readable from tables.
            write_outputs: If True, write step outputs to Delta Lake tables.
                If False, skip writing (useful for debugging/iteration).
                Defaults to True.
            execution_order: Optional pre-computed step order (e.g. from PipelineBuilder).
                When provided, this order is used so execution matches to_pipeline() report.

        Returns:
            ExecutionResult containing:
            - Overall pipeline status
            - List of StepExecutionResult for each step
            - Execution timing
            - Dependency analysis results

        Raises:
            ExecutionError: If pipeline execution fails.
            TypeError: If context is not a dictionary.

        Example:
            >>> config = PipelineConfig.create_default(schema="my_schema")
            >>> engine = ExecutionEngine(spark, config)
            >>> result = engine.execute_pipeline(
            ...     steps=[bronze, silver1, silver2, gold],
            ...     mode=ExecutionMode.INITIAL,
            ...     context={"events": source_df}
            ... )
            >>> print(f"Status: {result.status}")
            >>> print(f"Steps executed: {len(result.steps) if result.steps else 0}")
            >>> print(f"Execution order: {[s.step_name for s in result.steps] if result.steps else []}")
            >>> print(f"Steps completed: {len([s for s in result.steps if s.status == StepStatus.COMPLETED])}")

        Note:
            - All required schemas are created upfront before execution
            - Steps are ordered by dependencies using DependencyAnalyzer (topological sort)
            - Steps execute sequentially one at a time in dependency order
            - Context is updated after each step completion for downstream steps
            - Failed steps are recorded but don't stop execution of remaining steps
        """
        execution_id = str(uuid.uuid4())
        start_time = datetime.now()

        result = ExecutionResult(
            execution_id=execution_id,
            mode=mode,
            start_time=start_time,
            status="running",
        )

        # Disable Delta's strict schema-on-read check for the whole run so validation-only
        # steps (with_silver_rules / with_gold_rules) can read existing tables even if
        # schema evolved. Restored in finally.
        _delta_check_key = "spark.databricks.delta.checkLatestSchemaOnRead"
        _delta_check_prev = None
        try:
            _delta_check_prev = self.spark.conf.get(_delta_check_key)  # type: ignore[attr-defined]
        except Exception:
            pass  # Spark config get may not exist; use set/unset only
        try:
            self.spark.conf.set(_delta_check_key, "false")  # type: ignore[attr-defined]
        except Exception:
            pass  # Disable Delta constraint check best-effort
        try:
            self.spark.sql(f"SET {_delta_check_key}=false")  # type: ignore[attr-defined]
        except Exception:
            pass  # SQL SET may not be supported; continue

        try:
            # Logging is handled by the runner to avoid duplicate messages
            # Ensure all required schemas exist before execution
            # Collect unique schemas from all steps
            required_schemas = set()
            for step in steps:
                if hasattr(step, "schema") and step.schema:  # type: ignore[attr-defined]
                    schema_value = step.schema  # type: ignore[attr-defined]
                    # Handle both string schemas and Mock objects (for tests)
                    if isinstance(schema_value, str):
                        required_schemas.add(schema_value)
            # Create all required schemas upfront - always try to create, don't rely on catalog checks
            for schema in required_schemas:
                try:
                    # Always try to create schema - CREATE SCHEMA IF NOT EXISTS is idempotent
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")  # type: ignore[attr-defined]
                    # Also use _ensure_schema_exists as backup (tries multiple methods)
                    self._ensure_schema_exists(schema)
                except Exception as e:
                    # Log but don't fail - schema might already exist or creation might work later
                    self.logger.debug(f"Schema '{schema}' pre-creation attempt: {e}")

            # Validate context parameter
            if context is None:
                context = {}
            elif not isinstance(context, dict):
                raise TypeError(f"context must be a dictionary, got {type(context)}")

            # Create a mapping of step names to step objects (needed for start_at_step handling)
            step_map = {s.name: s for s in steps}

            # Use provided execution order (from builder) or compute via dependency analysis
            if execution_order:
                # Restrict to steps actually in this run; use only if it covers all steps
                ordered = [n for n in execution_order if n in step_map]
                if set(ordered) != set(step_map):
                    execution_order = None  # subset or mismatch: fall back to analyzer
                else:
                    execution_order = ordered
            if not execution_order:
                # Group steps by type for dependency analysis
                bronze_steps = [s for s in steps if s.step_type.value == "bronze"]
                silver_steps = [s for s in steps if s.step_type.value == "silver"]
                gold_steps = [s for s in steps if s.step_type.value == "gold"]
                analyzer = DependencyAnalyzer()
                analysis = analyzer.analyze_dependencies(
                    bronze_steps={s.name: s for s in bronze_steps},
                    silver_steps={s.name: s for s in silver_steps},
                    gold_steps={s.name: s for s in gold_steps},
                )
                execution_order = analysis.execution_order

            # Create a mapping of step names to step types (needed for prior_golds building)
            step_types = {s.name: s.step_type.value for s in steps}

            # Handle start_at_step: filter execution order and load earlier outputs
            if start_at_step is not None:
                if start_at_step not in execution_order:
                    raise ExecutionError(
                        f"start_at_step '{start_at_step}' not found in execution order. "
                        f"Available steps: {execution_order}"
                    )
                start_index = execution_order.index(start_at_step)
                skipped_steps = execution_order[:start_index]
                execution_order = execution_order[start_index:]

                # Try to load outputs from skipped steps if not in context
                for skipped_name in skipped_steps:
                    if skipped_name not in context:
                        skipped_step = step_map.get(skipped_name)
                        if skipped_step is not None:
                            # Try to read from table if it's a Silver/Gold step
                            if skipped_step.step_type.value in ("silver", "gold"):
                                table_name = getattr(
                                    skipped_step, "table_name", skipped_name
                                )
                                schema = getattr(skipped_step, "schema", None)
                                if schema is not None:
                                    table_fqn = fqn(schema, table_name)
                                    try:
                                        if table_exists(self.spark, table_fqn):
                                            context[skipped_name] = self.spark.table(
                                                table_fqn
                                            )  # type: ignore[attr-defined,valid-type]
                                            self.logger.info(
                                                f"Loaded output for skipped step '{skipped_name}' from table '{table_fqn}'"
                                            )
                                        else:
                                            self.logger.warning(
                                                f"Step '{skipped_name}' output not in context and table '{table_fqn}' does not exist. "
                                                f"Downstream steps may fail."
                                            )
                                    except Exception as e:
                                        self.logger.warning(
                                            f"Could not load output for skipped step '{skipped_name}' from table '{table_fqn}': {e}"
                                        )
                            else:
                                # Bronze step - must be in context
                                self.logger.warning(
                                    f"Bronze step '{skipped_name}' output not in context. "
                                    f"Bronze steps must be provided in context when using start_at_step."
                                )

                self.logger.info(
                    f"Starting execution at step '{start_at_step}' (skipped {len(skipped_steps)} earlier steps)"
                )

            # Log dependency analysis results
            self.logger.info(
                f"Dependency analysis complete: {len(execution_order)} steps to execute"
            )

            # Execute steps in dependency order
            for step_name in execution_order:
                if step_name not in step_map:
                    self.logger.warning(
                        f"Step {step_name} in execution order but not found in step list"
                    )
                    continue

                step = step_map[step_name]
                try:
                    # Get step-specific params if provided
                    current_step_params = None
                    if step_params is not None and step_name in step_params:
                        current_step_params = step_params[step_name]

                    step_result = self.execute_step(
                        step,
                        context,
                        mode,
                        step_params=current_step_params,
                        write_outputs=write_outputs,
                        step_types=step_types,
                    )
                    if result.steps is not None:
                        result.steps.append(step_result)

                    # Update context with step output for downstream steps
                    # Note: output_df is already stored in context by execute_step after completion
                    # If write_outputs is True, refresh from table to ensure we have the latest persisted data
                    # If write_outputs is False, keep the in-memory DataFrame (no table was written)
                    if (
                        step_result.status == StepStatus.COMPLETED
                        and step_result.step_type != StepType.BRONZE
                        and write_outputs
                    ):
                        table_name = getattr(step, "table_name", step.name)
                        schema = getattr(step, "schema", None)
                        if schema is not None:
                            table_fqn = fqn(schema, table_name)
                            try:
                                # Refresh table first to ensure we see the latest data
                                try:
                                    self.spark.sql(f"REFRESH TABLE {table_fqn}")  # type: ignore[attr-defined]
                                except Exception:
                                    pass  # Refresh might fail for some table types - continue anyway
                                # Read table and add to context (overwrites output_df with table data)
                                # Only update if read succeeds - if it fails, keep the output_df from execute_step
                                table_df = self.spark.table(table_fqn)  # type: ignore[attr-defined,valid-type]
                                context[step.name] = table_df  # type: ignore[valid-type]
                            except Exception as e:
                                # If reading fails, the output_df stored by execute_step is still in context
                                # This is fine - we'll use the output_df that was stored during execution
                                # Only log if the step name is not already in context (meaning execute_step didn't store it)
                                if step.name not in context:
                                    self.logger.warning(
                                        f"Could not read table '{table_fqn}' to add to context, "
                                        f"and execute_step did not store output_df. "
                                        f"Downstream steps may not have access to this step's output. Error: {e}"
                                    )
                                else:
                                    self.logger.debug(
                                        f"Could not read table '{table_fqn}' to refresh context. "
                                        f"Using output_df stored during execution. Error: {e}"
                                    )
                    elif (
                        step_result.status == StepStatus.COMPLETED
                        and step_result.step_type != StepType.BRONZE
                        and not write_outputs
                    ):
                        # When write_outputs is False, ensure context has the in-memory DataFrame
                        # (execute_step already stored it, but we want to make sure it's there)
                        if step.name not in context:
                            self.logger.warning(
                                f"Step '{step.name}' completed but output not in context. "
                                f"This may indicate an issue with execute_step."
                            )

                    if step_result.status == StepStatus.FAILED:
                        self.logger.error(
                            f"Step {step_name} failed: {step_result.error}"
                        )

                    # Check if we should stop after this step
                    if stop_after_step is not None and step_name == stop_after_step:
                        self.logger.info(
                            f"Stopping execution after step '{stop_after_step}' as requested"
                        )
                        # Mark result as completed (even if some steps were skipped)
                        result.status = "completed"
                        result.end_time = datetime.now()
                        return result

                except Exception as e:
                    self.logger.error(f"Exception executing step {step_name}: {e}")
                    # Determine correct step type
                    step_obj = step_map.get(step_name)
                    if step_obj is not None:
                        phase = step_obj.step_type
                        if phase.value == "bronze":
                            step_type_enum = StepType.BRONZE
                        elif phase.value == "silver":
                            step_type_enum = StepType.SILVER
                        elif phase.value == "gold":
                            step_type_enum = StepType.GOLD
                        else:
                            step_type_enum = StepType.BRONZE  # fallback
                    else:
                        step_type_enum = StepType.BRONZE  # fallback

                    step_result = StepExecutionResult(
                        step_name=step_name,
                        step_type=step_type_enum,
                        status=StepStatus.FAILED,
                        error=str(e),
                        start_time=datetime.now(),
                        end_time=datetime.now(),
                        duration=0.0,
                    )
                    if result.steps is not None:
                        result.steps.append(step_result)

            # Determine overall pipeline status based on step results
            if result.steps is None:
                result.steps = []
            step_results: list[StepExecutionResult] = result.steps
            failed_steps = [s for s in step_results if s.status == StepStatus.FAILED]

            if failed_steps:
                result.status = "failed"
                self.logger.error(
                    f"Pipeline execution failed: {len(failed_steps)} steps failed"
                )
            else:
                result.status = "completed"
                self.logger.info(f"Completed pipeline execution: {execution_id}")

            result.end_time = datetime.now()

        except Exception as e:
            result.status = "failed"
            result.error = str(e)
            result.end_time = datetime.now()
            self.logger.error(f"Pipeline execution failed: {e}")
            raise ExecutionError(f"Pipeline execution failed: {e}") from e
        finally:
            if _delta_check_prev is not None:
                try:
                    self.spark.conf.set(_delta_check_key, _delta_check_prev)  # type: ignore[attr-defined]
                except Exception:
                    pass  # Restore config best-effort; test isolation may not need it
            else:
                try:
                    self.spark.conf.unset(_delta_check_key)  # type: ignore[attr-defined]
                except Exception:
                    pass  # Unset config best-effort

        return result

    def _execute_bronze_step(
        self,
        step: BronzeStep,
        context: Dict[str, DataFrame],  # type: ignore[valid-type]  # type: ignore[valid-type]
    ) -> DataFrame:  # type: ignore[valid-type]
        """Execute a bronze step.

        Bronze steps validate existing data but don't transform or write it.
        The step name must exist in the context dictionary with the source DataFrame.

        Args:
            step: BronzeStep instance to execute.
            context: Dictionary mapping step names to DataFrames. Must contain
                the step name as a key.

        Returns:
            DataFrame from context (validated but unchanged).

        Raises:
            ExecutionError: If step name not found in context or DataFrame is
                invalid.

        Note:
            Bronze steps are for validating raw data. They don't write to tables
            or perform transformations. Validation is applied separately by the
            execution engine.
        """
        # Bronze steps require data to be provided in context
        # This is the expected behavior - bronze steps validate existing data
        if step.name not in context:
            raise ExecutionError(
                f"Bronze step '{step.name}' requires data to be provided in context. "
                f"Bronze steps are for validating existing data, not creating it. "
                f"Please provide data using bronze_sources parameter or context dictionary. "
                f"Available context keys: {list(context.keys())}"
            )

        df: DataFrame = context[step.name]  # type: ignore[valid-type]

        # Validate that the DataFrame is not empty (optional check)
        if df.count() == 0:  # type: ignore[attr-defined]
            self.logger.warning(
                f"Bronze step '{step.name}' received empty DataFrame. "
                f"This may indicate missing or invalid data source."
            )

        return df

    def _execute_silver_step(
        self,
        step: SilverStep,
        context: Dict[str, DataFrame],  # type: ignore[valid-type]
        mode: ExecutionMode,
    ) -> DataFrame:  # type: ignore[valid-type]
        """Run a silver step's transform on its bronze input.

        The bronze DataFrame is taken from ``context[step.source_bronze]`` and,
        when ``mode`` is ``ExecutionMode.INCREMENTAL``, first passed through
        ``_filter_incremental_bronze_input``. A ``prior_silvers`` mapping is
        then assembled from ``context`` and handed to ``step.transform``.

        Args:
            step: Silver step to execute.
            context: Mapping of step names to DataFrames; must contain
                ``step.source_bronze``.
            mode: Execution mode; only INCREMENTAL changes behavior here.

        Returns:
            Whatever ``step.transform(self.spark, bronze_df, prior_silvers)``
            returns.

        Raises:
            ExecutionError: If ``step.source_bronze`` is not in ``context``.

        Note:
            - With a non-empty ``step.source_silvers``, only those names are
              passed on; a name missing from ``context`` is logged as a warning
              and the step's own name is never included.
            - Otherwise every ``context`` entry except the step itself and its
              source bronze is passed on.
        """
        if step.source_bronze not in context:
            raise ExecutionError(
                f"Source bronze step {step.source_bronze} not found in context"
            )

        input_df: DataFrame = context[step.source_bronze]  # type: ignore[valid-type]
        if mode == ExecutionMode.INCREMENTAL:
            input_df = self._filter_incremental_bronze_input(step, input_df)

        upstream: Dict[str, DataFrame] = {}  # type: ignore[valid-type]
        declared = getattr(step, "source_silvers", None)

        if not declared:
            # No explicit dependencies: expose everything already in context
            # apart from this step and the bronze it reads from.
            for name, frame in context.items():
                if name == step.name or name == step.source_bronze:
                    continue
                upstream[name] = frame  # type: ignore[assignment]
        else:
            for silver_name in declared:
                if silver_name not in context:
                    # A declared dependency is missing; report what is
                    # available to make the ordering problem easier to trace.
                    available_keys = [
                        k
                        for k in context.keys()
                        if k != step.name and k != step.source_bronze
                    ]
                    self.logger.warning(
                        f"Silver step {step.name} expects {silver_name} in prior_silvers "
                        f"(via source_silvers), but it's not in context. "
                        f"Available keys: {list(context.keys())}, "
                        f"Other silver steps in context: {available_keys}"
                    )
                    continue
                if silver_name != step.name:
                    upstream[silver_name] = context[silver_name]  # type: ignore[assignment]

        return step.transform(self.spark, input_df, upstream)

    def _filter_incremental_bronze_input(
        self,
        step: SilverStep,
        bronze_df: DataFrame,  # type: ignore[valid-type]
    ) -> DataFrame:  # type: ignore[valid-type]
        """Drop bronze rows already covered by the step's existing output table.

        Finds the largest non-null value of ``step.watermark_col`` in the
        step's existing output table and keeps only bronze rows whose
        ``step.source_incremental_col`` is strictly greater than that value.

        Args:
            step: Silver step supplying ``source_incremental_col``,
                ``watermark_col``, ``schema`` and ``table_name`` (the latter
                falls back to ``step.name``).
            bronze_df: Bronze DataFrame to filter.

        Returns:
            The filtered DataFrame, or ``bronze_df`` unchanged when filtering
            is skipped: a required attribute is missing/empty, the incremental
            column is absent from ``bronze_df``, the output table cannot be
            read, the watermark column is absent from it, collecting the
            watermark values fails, or no non-null watermark value exists.

        Raises:
            ExecutionError: If building the filter on ``bronze_df`` fails. The
                message is tailored to column-related, type-related or other
                failures based on the text of the underlying exception.

        Note:
            The watermark is computed by collecting the watermark column and
            taking the maximum in Python.
            NOTE(review): an aggregate such as ``F.max`` would avoid pulling
            the whole column into the driver - confirm it is supported by
            every Spark backend this engine runs against before switching.
        """

        incremental_col = getattr(step, "source_incremental_col", None)
        watermark_col = getattr(step, "watermark_col", None)
        schema = getattr(step, "schema", None)
        table_name = getattr(step, "table_name", step.name)

        if not incremental_col or not watermark_col or schema is None:
            return bronze_df

        if incremental_col not in getattr(bronze_df, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: incremental column '{incremental_col}' "
                f"not present in bronze DataFrame; skipping incremental filter"
            )
            return bronze_df

        # Advisory check only: warn when the column type looks unsuitable for
        # ordering comparisons, but never block the run because of it.
        try:
            # Use a dedicated name for the DataFrame schema. Rebinding `schema`
            # here would replace the step's schema *name*, which is still
            # needed below to build the output table identifier.
            bronze_schema = bronze_df.schema  # type: ignore[attr-defined]
            col_type_name = str(bronze_schema[incremental_col].dataType)  # type: ignore[index]

            # Matched as substrings of the lower-cased type name.
            non_comparable_types = ("boolean", "array", "map", "struct", "binary")
            lowered_type_name = col_type_name.lower()
            if any(marker in lowered_type_name for marker in non_comparable_types):
                self.logger.warning(
                    f"Silver step {step.name}: incremental column '{incremental_col}' "
                    f"has type '{col_type_name}' which may not be suitable for comparison operations. "
                    f"Filtering may fail or produce unexpected results. "
                    f"Consider using a numeric, date, timestamp, or string column for incremental processing."
                )
        except Exception as e:
            # Schema inspection is best-effort; continue without the check.
            self.logger.debug(
                f"Silver step {step.name}: could not validate incremental column type: {e}"
            )

        output_table = fqn(schema, table_name)

        try:
            existing_table = self.spark.table(output_table)  # type: ignore[attr-defined]
        except Exception as exc:
            # Unreadable table (e.g. it does not exist yet): process everything.
            self.logger.debug(
                f"Silver step {step.name}: unable to read existing table {output_table} "
                f"for incremental filter: {exc}"
            )
            return bronze_df

        if watermark_col not in getattr(existing_table, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: watermark column '{watermark_col}' "
                f"not present in existing table {output_table}; skipping incremental filter"
            )
            return bronze_df

        try:
            watermark_rows = existing_table.select(watermark_col).collect()  # type: ignore[attr-defined]
        except Exception as exc:
            self.logger.warning(
                f"Silver step {step.name}: failed to collect watermark values "
                f"from {output_table}: {exc}"
            )
            return bronze_df

        if not watermark_rows:
            return bronze_df

        # Largest non-null watermark seen so far; None means "nothing usable".
        cutoff_value = None
        for row in watermark_rows:
            value = self._extract_row_value(row, watermark_col)
            if value is None:
                continue
            cutoff_value = value if cutoff_value is None else max(cutoff_value, value)

        if cutoff_value is None:
            return bronze_df

        try:
            filtered_df = bronze_df.filter(F.col(incremental_col) > F.lit(cutoff_value))  # type: ignore[attr-defined]
        except Exception as exc:
            # Classify the failure from its message to give a targeted hint.
            error_msg = str(exc).lower()
            if "cannot resolve" in error_msg or "column" in error_msg:
                # Column-related error - provide schema context
                available_cols = sorted(getattr(bronze_df, "columns", []))
                raise ExecutionError(
                    f"Silver step {step.name}: failed to filter bronze rows using incremental column '{incremental_col}'. "
                    f"Error: {exc!r}. "
                    f"Available columns in bronze DataFrame: {available_cols}. "
                    f"This may indicate that the incremental column was dropped or renamed in a previous transform. "
                    f"Please ensure the incremental column '{incremental_col}' exists in the bronze DataFrame."
                ) from exc
            elif "type" in error_msg or "cast" in error_msg:
                # Type-related error - provide type information when available.
                # Only the type lookup is guarded: raising the detailed error
                # inside the same try would let the broad handler catch it and
                # replace it with the generic message.
                try:
                    col_type = bronze_df.schema[incremental_col].dataType  # type: ignore[attr-defined]
                except Exception:
                    # If we can't get type info, provide generic error
                    raise ExecutionError(
                        f"Silver step {step.name}: failed to filter bronze rows using incremental column '{incremental_col}'. "
                        f"Error: {exc!r}. "
                        f"This may be a type mismatch between the incremental column and the cutoff value. "
                        f"Please ensure the incremental column type is compatible with the cutoff value type."
                    ) from exc
                raise ExecutionError(
                    f"Silver step {step.name}: failed to filter bronze rows using incremental column '{incremental_col}'. "
                    f"Error: {exc!r}. "
                    f"Column type: {col_type}. "
                    f"Cutoff value type: {type(cutoff_value).__name__}. "
                    f"Incremental columns must be comparable types (numeric, date, timestamp). "
                    f"Please ensure the incremental column type is compatible with the cutoff value."
                ) from exc
            else:
                # Generic error with context
                raise ExecutionError(
                    f"Silver step {step.name}: failed to filter bronze rows using "
                    f"{incremental_col} > {cutoff_value}: {exc!r}. "
                    f"Please check that the incremental column exists and is of a comparable type."
                ) from exc

        # The message names the rows being excluded; the kept rows are those
        # with incremental_col > cutoff_value.
        self.logger.info(
            f"Silver step {step.name}: filtering bronze rows where "
            f"{incremental_col} <= {cutoff_value}"
        )
        return filtered_df

    @staticmethod
    def _extract_row_value(row: Any, column: str) -> Optional[object]:
        """Safely extract a column value from a Row-like object.

        Attempts multiple methods to extract a column value from Spark Row objects,
        handling different Row implementations and access patterns.

        Args:
            row: Row-like object (Spark Row, dict, or similar).
            column: Column name to extract.

        Returns:
            Extracted value if found, None otherwise.

        Note:
            Tries methods in order:
            1. Direct indexing: row[column]
            2. Positional indexing: row[0]
            3. Dictionary access: row.asDict().get(column)

            Returns None if all methods fail or value is not found.
        """
        if hasattr(row, "__getitem__"):
            try:
                result: Optional[object] = row[column]  # type: ignore[assignment]
                return result
            except Exception:
                try:
                    result = row[0]  # type: ignore[assignment]
                    return cast(Optional[object], result)
                except Exception:
                    pass  # Row may not support asDict; try other access
        if hasattr(row, "asDict"):
            try:
                result = row.asDict().get(column)  # type: ignore[assignment]
                return cast(Optional[object], result)
            except Exception:
                return None
        return None

    def _execute_gold_step(
        self,
        step: GoldStep,
        context: Dict[str, DataFrame],  # type: ignore[valid-type]  # type: ignore[valid-type]
    ) -> DataFrame:  # type: ignore[valid-type]
        """Execute a gold step.

        Gold steps transform silver data into business analytics and aggregations.
        Builds a dictionary of source silver DataFrames from step.source_silvers.

        Args:
            step: GoldStep instance to execute.
            context: Dictionary mapping step names to DataFrames. Must contain
                all source silver step names listed in step.source_silvers.

        Returns:
            Transformed DataFrame ready for validation and writing.

        Raises:
            ExecutionError: If any source silver step not found in context.

        Note:
            - Builds silvers dictionary from step.source_silvers
            - Calls step.transform() with SparkSession and silvers dictionary
            - Transformation logic is defined in the step's transform function
            - Gold steps typically perform aggregations and business metrics
        """

        # Build silvers dict from source_silvers
        silvers = {}
        if step.source_silvers is not None:
            for silver_name in step.source_silvers:
                if silver_name not in context:
                    raise ExecutionError(
                        f"Source silver {silver_name} not found in context"
                    )
                silvers[silver_name] = context[silver_name]  # type: ignore[valid-type]

        return step.transform(self.spark, silvers)

# Backward compatibility aliases: the "Unified*" names are bound to the very
# same objects as ExecutionEngine and StepExecutionResult, so code that still
# uses the prefixed spellings keeps working.
UnifiedExecutionEngine = ExecutionEngine
UnifiedStepExecutionResult = StepExecutionResult

In [None]:
# Module: pipeline_builder.writer.storage (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.functions, pipeline_builder.table_operations, pipeline_builder.table_operations, pipeline_builder.writer.exceptions, pipeline_builder.writer.models, pipeline_builder.writer.models, pipeline_builder_base.logging, pipeline_builder_base.logging, writer.exceptions

# mypy: ignore-errors
"""
Writer storage module for Delta Lake and table operations.

This module handles all storage-related operations including Delta Lake
integration, table management, and data persistence.

"""

from __future__ import annotations

import os
import tempfile
from datetime import datetime
from typing import Dict, Optional, TypedDict, Union, cast
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession, types  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
from pyspark.sql import types  # types from pyspark (not from compat)
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..table_operations import (  # Removed: defined in notebook cells above
    # prepare_delta_overwrite,
    # table_exists,
    # table_schema_is_empty,
# )
# from .exceptions import WriterTableError  # Removed: defined in notebook cells above
# from .models import LogRow, WriteMode, WriterConfig, create_log_schema  # Removed: defined in notebook cells above

# Handle optional Delta Lake dependency.
# HAS_DELTA records whether `delta.tables.DeltaTable` could be imported. When the
# import fails, DeltaTable is bound to None so the name still exists for later checks.
try:
    from delta.tables import DeltaTable

    HAS_DELTA = True
except (ImportError, AttributeError, RuntimeError):
    # Catch ImportError, AttributeError (delta-spark compatibility issues),
    # and RuntimeError (Spark session not initialized)
    DeltaTable = None  # type: ignore[misc, assignment]
    HAS_DELTA = False

# Cache for Delta Lake availability per Spark session.
# Key: str(id(...)) of the session's `_jsparkSession` attribute when present, otherwise
#      of the session object itself (computed in _is_delta_lake_available()).
# Value: boolean indicating if Delta works for that session.
_delta_availability_cache: Dict[str, bool] = {}

# Legacy function - use prepare_delta_overwrite from table_operations instead
def _prepare_delta_overwrite_storage(
    spark: SparkSession,  # type: ignore[valid-type]
    table_name: str,
) -> None:
    """
    Backward-compatible wrapper around prepare_delta_overwrite().

    The call is forwarded only when the delta package was importable
    (HAS_DELTA) and _is_delta_lake_available() reports Delta as usable for
    the given session; in every other case the function does nothing.

    Args:
        spark: Spark session used for the availability check and the call
        table_name: Table name handed to prepare_delta_overwrite()
    """
    # Guard 1: delta package missing -> nothing to prepare
    if not HAS_DELTA:
        return
    # Guard 2: package present but Delta not usable in this session
    if not _is_delta_lake_available(spark):
        return
    prepare_delta_overwrite(spark, table_name)

def _is_delta_lake_available(spark: SparkSession) -> bool:  # type: ignore[valid-type]
    """
    Report whether Delta Lake can be used with the given Spark session.

    The answer is memoized in ``_delta_availability_cache`` so the probing
    below runs at most once per cache key. Diagnostic progress is emitted
    with ``print``.

    Decision order:
        1. A cached answer, when one exists for this session.
        2. ``False`` when the delta package could not be imported (HAS_DELTA).
        3. ``True`` when both ``spark.sql.extensions`` and
           ``spark.sql.catalog.spark_catalog`` mention Delta.
        4. When the Delta extension is configured, the outcome of a small
           test write in Delta format into a temporary directory.
        5. ``False`` otherwise.

    Args:
        spark: Spark session to test

    Returns:
        True if Delta Lake is available and working, False otherwise
    """

    # Cache key: id() of the session's _jsparkSession attribute when it has one,
    # otherwise id() of the Python session object itself.
    try:
        key_owner = spark._jsparkSession if hasattr(spark, "_jsparkSession") else spark
        spark_id = str(id(key_owner))
    except Exception:
        # Attribute access failed; fall back to the Python object id
        spark_id = str(id(spark))

    # Diagnostics: identify process and session before any decision is made
    print("🔍 _is_delta_lake_available: Checking Delta availability")
    print(f"🔍 _is_delta_lake_available: PID={os.getpid()}")
    print(f"🔍 _is_delta_lake_available: Session ID (Python)={id(spark)}")
    try:
        if hasattr(spark, "_jsparkSession"):
            jvm_session_id = id(spark._jsparkSession)
            print(f"🔍 _is_delta_lake_available: Session ID (JVM)={jvm_session_id}")
    except Exception:
        pass  # JVM id is informational only

    # 1. Previously computed answer
    if spark_id in _delta_availability_cache:
        cached_result = _delta_availability_cache[spark_id]
        print(f"🔍 _is_delta_lake_available: Using cached result: {cached_result}")
        return cached_result

    # 2. Without the delta package nothing else needs to be checked
    if not HAS_DELTA:
        print(
            "⚠️ _is_delta_lake_available: Delta package not installed (HAS_DELTA=False)"
        )
        _delta_availability_cache[spark_id] = False
        return False

    # 3. Fast path: both the extension and the catalog are configured for Delta
    try:
        sql_extensions = spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
        spark_catalog = spark.conf.get("spark.sql.catalog.spark_catalog", "")  # type: ignore[attr-defined]

        print(
            f"🔍 _is_delta_lake_available: Config check - Extensions: '{sql_extensions}', Catalog: '{spark_catalog}'"
        )

        fully_configured = bool(
            sql_extensions
            and spark_catalog
            and "DeltaSparkSessionExtension" in sql_extensions
            and "DeltaCatalog" in spark_catalog
        )
        if fully_configured:
            print("✅ _is_delta_lake_available: Delta configured via config check")
            _delta_availability_cache[spark_id] = True
            return True
    except Exception as config_error:
        # Config lookup failed; continue with the write probe below
        print(f"⚠️ _is_delta_lake_available: Error checking config: {config_error}")

    # 4. Extension configured (catalog not confirmed): probe with a real Delta write
    try:
        sql_extensions = spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
        if sql_extensions and "DeltaSparkSessionExtension" in sql_extensions:
            print(
                "🔍 _is_delta_lake_available: Extension found, testing with actual write..."
            )
            probe_df = spark.createDataFrame([(1, "test")], ["id", "name"])
            # A fresh temporary directory keeps the probe from colliding with other data
            with tempfile.TemporaryDirectory() as scratch_dir:
                probe_path = os.path.join(scratch_dir, "delta_test")
                try:
                    probe_writer = probe_df.write.format("delta").mode("overwrite")
                    probe_writer.save(probe_path)
                    print("✅ _is_delta_lake_available: Delta test write succeeded")
                    _delta_availability_cache[spark_id] = True
                    return True
                except Exception as test_error:
                    # The Delta write itself failed -> fall through to "not available"
                    print(
                        f"⚠️ _is_delta_lake_available: Delta test write failed: {test_error}"
                    )
    except Exception as probe_error:
        # Anything else around the probe failed; treat Delta as unavailable
        print(f"⚠️ _is_delta_lake_available: Error during test write: {probe_error}")

    # 5. No check succeeded
    print("❌ _is_delta_lake_available: Delta NOT available in this session")
    _delta_availability_cache[spark_id] = False
    return False

# ============================================================================
# TypedDict Definitions
# ============================================================================

class WriteResult(TypedDict):
    """Write operation result structure.

    Shape of the dictionary that StorageManager.write_dataframe() builds and
    returns after a successful write.
    """

    table_name: str  # fully qualified name of the table written (StorageManager.table_fqn)
    write_mode: str  # `.value` of the WriteMode passed to the write
    rows_written: int  # count() of the prepared DataFrame, taken after the write
    timestamp: str  # datetime.now().isoformat() at the time the result is built
    success: bool  # set to True by write_dataframe(); failures raise instead

class OptimizeResultSkipped(TypedDict):
    """Optimize operation result when skipped.

    Returned by StorageManager.optimize_table() when HAS_DELTA is False.
    """

    table_name: str  # fully qualified table name (StorageManager.table_fqn)
    optimization_completed: bool  # False
    skipped: bool  # True
    reason: str  # human-readable explanation, e.g. "Delta Lake not available"
    timestamp: str  # datetime.now().isoformat() when the result is built

class TableInfo(TypedDict, total=False):
    """Table information structure.

    Declared with ``total=False``, so every key is optional. The non-Delta
    branch of StorageManager.get_table_info() fills only ``table_name``,
    ``row_count``, ``details``, ``history`` and ``timestamp``.
    """

    table_name: str  # fully qualified table name (StorageManager.table_fqn)
    row_count: int  # number of rows in the table
    details: list[dict[str, Union[str, int, float, Optional[bool]]]]  # table detail records; [] without Delta
    history_count: int  # presumably the number of history entries — TODO confirm against get_table_info()
    last_modified: Optional[str]  # presumably the latest modification time — TODO confirm against get_table_info()
    history: list[dict[str, Union[str, int, float, Optional[bool]]]]  # table history records; [] without Delta
    timestamp: str  # datetime.now().isoformat() when the info is gathered

class OptimizeResultCompleted(TypedDict):
    """Optimize operation result when completed.

    Built by StorageManager.optimize_table() after the optimize step and the
    follow-up get_table_info() call have both returned.
    """

    table_name: str  # fully qualified table name (StorageManager.table_fqn)
    optimization_completed: bool  # True
    timestamp: str  # datetime.now().isoformat() when the result is built
    table_info: TableInfo  # value returned by get_table_info() after optimizing

# Union type for optimize result: callers can tell the two shapes apart via
# the "optimization_completed" flag (and the presence of "skipped").
OptimizeResult = Union[OptimizeResultSkipped, OptimizeResultCompleted]

class VacuumResultSkipped(TypedDict):
    """Vacuum operation result when skipped.

    Returned by StorageManager.vacuum_table() when HAS_DELTA is False.
    """

    table_name: str  # fully qualified table name (StorageManager.table_fqn)
    vacuum_completed: bool  # False
    skipped: bool  # True
    reason: str  # human-readable explanation, e.g. "Delta Lake not available"
    retention_hours: int  # the retention_hours argument, echoed back unchanged
    timestamp: str  # datetime.now().isoformat() when the result is built

class VacuumResultCompleted(TypedDict):
    """Vacuum operation result when completed.

    Built by StorageManager.vacuum_table() after DeltaTable.vacuum() returned.
    """

    table_name: str  # fully qualified table name (StorageManager.table_fqn)
    vacuum_completed: bool  # True
    retention_hours: int  # retention (in hours) that was passed to vacuum()
    timestamp: str  # datetime.now().isoformat() when the result is built

# Union type for vacuum result: callers can tell the two shapes apart via
# the "vacuum_completed" flag (and the presence of "skipped").
VacuumResult = Union[VacuumResultSkipped, VacuumResultCompleted]

class StorageManager:
    """Handles storage operations for the writer."""

    def __init__(
        self,
        spark: SparkSession,  # type: ignore[valid-type]
        config: WriterConfig,
        functions: Optional[FunctionsProtocol] = None,
        logger: Optional[PipelineLogger] = None,
    ):
        """
        Initialize the storage manager.

        Args:
            spark: Spark session used for all table operations
            config: Writer configuration; ``table_schema`` and ``table_name``
                are combined into the fully qualified name ``table_fqn``
            functions: Functions implementation; get_default_functions() is
                used when omitted
            logger: Logger to use; a PipelineLogger named "StorageManager"
                is created when omitted
        """
        self.spark = spark
        self.config = config
        # Fall back to the defaults only when the caller passed nothing (None)
        self.functions = get_default_functions() if functions is None else functions
        self.logger = PipelineLogger("StorageManager") if logger is None else logger
        # "<schema>.<table>" - the name used by every SQL statement and write
        self.table_fqn = "{}.{}".format(config.table_schema, config.table_name)

    def create_table_if_not_exists(self, schema: types.StructType) -> None:
        """
        Create the log table if it doesn't exist.

        Steps, in order:
            1. Issue ``CREATE SCHEMA IF NOT EXISTS`` for the part of
               ``table_fqn`` before the first "." (failures are only logged).
            2. If the table exists: drop it when its catalog schema is empty,
               then (when HAS_DELTA) try to confirm it is a Delta table and
               drop it when it is positively identified as non-Delta. When
               verification itself errors out, the table is kept.
            3. If the table does not exist at this point, create it as an
               empty Delta table with the given schema. An "already exists"
               error from a concurrent creator is tolerated.

        Note that steps 2 can issue ``DROP TABLE`` on an existing table.

        Args:
            schema: Spark schema for the table

        Raises:
            WriterTableError: If any step above raises; the original
                exception is chained as the cause
        """
        try:
            self.logger.info(f"Creating table if not exists: {self.table_fqn}")

            # Extract schema name from table_fqn (format: "schema.table")
            schema_name = (
                self.table_fqn.split(".")[0] if "." in self.table_fqn else None
            )

            # Ensure schema exists before creating table
            # This is especially important for LogWriter which creates tables in different schemas
            if schema_name:
                try:
                    # Use SQL to ensure schema exists (works for both PySpark and mock-spark)
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")  # type: ignore[attr-defined]
                except Exception as e:
                    # If SQL fails, log at debug level but continue (schema might already exist)
                    self.logger.debug(f"Could not create schema '{schema_name}': {e}")

            # Check if table exists and is a Delta table
            table_is_delta = False
            if table_exists(self.spark, self.table_fqn):
                # Heal catalog entries that report empty schema (struct<>)
                if table_schema_is_empty(self.spark, self.table_fqn):
                    self.logger.warning(
                        f"Table {self.table_fqn} reports empty schema; dropping and recreating."
                    )
                    self.spark.sql(f"DROP TABLE IF EXISTS {self.table_fqn}")  # type: ignore[attr-defined]
                # The verification below still runs after the drop above; it ends in one
                # of the fallback branches and the table is recreated further down.
                try:
                    # Check if table is a Delta table by checking table properties
                    if HAS_DELTA:
                        try:
                            # Try to get table details using DESCRIBE DETAIL (more reliable)
                            detail_df = self.spark.sql(
                                f"DESCRIBE DETAIL {self.table_fqn}"
                            )  # type: ignore[attr-defined]
                            detail_rows = detail_df.collect()
                            if detail_rows:
                                # Check if provider is delta
                                # NOTE(review): `.get` is called on a collected row. pyspark.sql.Row
                                # does not appear to define `get`; if so, real PySpark raises
                                # AttributeError here and control moves to the `except` fallback
                                # below. Also confirm the detail output exposes a "provider" column
                                # (Delta documents the column as "format").
                                provider = detail_rows[0].get("provider", "")
                                if provider == "delta":
                                    table_is_delta = True
                                    self.logger.info(
                                        f"Table {self.table_fqn} exists and is a Delta table"
                                    )
                                else:
                                    # Table exists but is not a Delta table - drop it
                                    self.logger.warning(
                                        f"Table {self.table_fqn} exists but is not a Delta table (provider: {provider}). Dropping and recreating."
                                    )
                                    self.spark.sql(
                                        f"DROP TABLE IF EXISTS {self.table_fqn}"
                                    )  # type: ignore[attr-defined]
                            else:
                                # Could not get details, try DeltaTable.forName as fallback
                                try:
                                    DeltaTable.forName(self.spark, self.table_fqn)  # type: ignore[attr-defined]
                                    table_is_delta = True
                                    self.logger.info(
                                        f"Table {self.table_fqn} exists and is a Delta table (verified via DeltaTable)"
                                    )
                                except Exception:
                                    # If both methods fail, assume it's not a Delta table
                                    self.logger.warning(
                                        f"Table {self.table_fqn} exists but could not verify as Delta table. Dropping and recreating."
                                    )
                                    self.spark.sql(
                                        f"DROP TABLE IF EXISTS {self.table_fqn}"
                                    )  # type: ignore[attr-defined]
                        except Exception as e:
                            # DESCRIBE DETAIL (or reading its result) failed, try DeltaTable.forName as fallback
                            try:
                                DeltaTable.forName(self.spark, self.table_fqn)  # type: ignore[attr-defined]
                                table_is_delta = True
                                self.logger.info(
                                    f"Table {self.table_fqn} exists and is a Delta table (verified via DeltaTable)"
                                )
                            except Exception:
                                # If both methods fail, log warning but don't drop - might be a temporary issue
                                self.logger.warning(
                                    f"Could not verify if table {self.table_fqn} is Delta: {e}. Assuming it's valid and continuing."
                                )
                                table_is_delta = True  # Assume it's valid to avoid dropping existing data
                    else:
                        # Delta Lake not available, but table exists - assume it's okay
                        table_is_delta = True
                except Exception as e:
                    # If all checks fail, assume table is valid to avoid data loss
                    self.logger.warning(
                        f"Could not verify table {self.table_fqn} Delta status: {e}. Assuming valid and continuing."
                    )
                    table_is_delta = True

            # Re-check existence: the table may never have existed, or may have been
            # dropped by one of the branches above.
            if not table_exists(self.spark, self.table_fqn):
                # Create empty DataFrame with schema
                empty_df = self.spark.createDataFrame([], schema)  # type: ignore[attr-defined]

                # Ensure schema exists RIGHT BEFORE saveAsTable
                if schema_name:
                    try:
                        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")  # type: ignore[attr-defined]
                    except Exception:
                        pass  # Schema might already exist; continue

                # Always use Delta format - failures will propagate if Delta is not available
                # This ensures we know when Delta fails rather than silently falling back
                # The initial empty write uses overwrite mode with overwriteSchema.
                # NOTE(review): an earlier comment here claimed Delta Lake cannot append in
                # batch mode; write_dataframe() does append in batch mode, so that rationale
                # looks outdated - confirm why overwrite is preferred here.
                try:
                    (
                        empty_df.write.format("delta")
                        .mode("overwrite")
                        .option("overwriteSchema", "true")
                        .saveAsTable(self.table_fqn)  # type: ignore[attr-defined]
                    )
                    self.logger.info(
                        f"Delta table created successfully: {self.table_fqn}"
                    )
                except Exception as create_error:
                    # Handle race condition - table might already exist
                    error_msg = str(create_error).lower()
                    # Check for various "table already exists" error formats
                    if (
                        "already exists" in error_msg
                        or "table_or_view_already_exists" in error_msg
                    ):
                        self.logger.debug(
                            f"Table {self.table_fqn} already exists, continuing..."
                        )
                        # Verify the table really exists (existence only; its schema is
                        # not compared) - if not, re-raise
                        if not table_exists(self.spark, self.table_fqn):
                            raise  # Table should exist but doesn't - re-raise
                    else:
                        # Re-raise if it's a different error - this will propagate Delta failures
                        raise

                try:
                    self.spark.sql(f"REFRESH TABLE {self.table_fqn}")  # type: ignore[attr-defined]
                except Exception:
                    pass  # Refresh optional; table created
                self.logger.info(f"Table created successfully: {self.table_fqn}")
            elif not table_is_delta:
                # Table exists but wasn't verified as Delta - this shouldn't happen after the check above
                self.logger.warning(
                    f"Table {self.table_fqn} exists but Delta status unclear"
                )

        except Exception as e:
            # Single error boundary: every failure above surfaces as WriterTableError
            raise WriterTableError(
                f"Failed to create table {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="create_table",
                context={"schema": str(schema)},
                suggestions=[
                    "Check table permissions",
                    "Verify schema configuration",
                    "Ensure Delta Lake is properly configured",
                ],
            ) from e

    def write_dataframe(
        self,
        df: DataFrame,  # type: ignore[valid-type]
        write_mode: WriteMode = WriteMode.APPEND,
        partition_columns: Optional[list[str]] = None,
    ) -> WriteResult:
        """
        Write DataFrame to the log table.

        Steps, in order:
            1. Prepare the DataFrame via ``_prepare_dataframe_for_write``.
            2. Drop the target table when its catalog schema is empty, or
               when it is identified as a Parquet (non-Delta) table.
            3. Print session/config diagnostics.
            4. Write in Delta format with ``saveAsTable``: overwrite mode
               uses ``overwriteSchema``, any other mode uses ``mergeSchema``.
            5. On an "already exists" error while the table does exist,
               retry once in overwrite mode.
            6. Count the prepared DataFrame and build the result.

        Args:
            df: DataFrame to write
            write_mode: Write mode for the operation
            partition_columns: Columns to partition by

        Returns:
            Dictionary containing write results

        Raises:
            WriterTableError: If write operation fails; the original
                exception is chained as the cause
        """
        try:
            self.logger.info(
                f"Writing DataFrame to {self.table_fqn} with mode {write_mode.value}"
            )

            df_prepared = self._prepare_dataframe_for_write(df)

            # Heal catalog entries that report an empty schema before writing
            if table_exists(self.spark, self.table_fqn) and table_schema_is_empty(
                self.spark, self.table_fqn
            ):
                self.logger.warning(
                    f"Table {self.table_fqn} reports empty schema before write; dropping and recreating."
                )
                self.spark.sql(f"DROP TABLE IF EXISTS {self.table_fqn}")  # type: ignore[attr-defined]

            # Check if table exists and has format mismatch (e.g., Parquet table but we want Delta)
            if table_exists(self.spark, self.table_fqn):
                try:
                    # Get table provider/format
                    detail_df = self.spark.sql(
                        f"DESCRIBE DETAIL {self.table_fqn}"
                    ).collect()  # type: ignore[attr-defined]
                    if detail_df:
                        # NOTE(review): `.get` is called on a collected row. pyspark.sql.Row does
                        # not appear to define `get`; if so, real PySpark raises AttributeError
                        # here and the `except` below turns this check into a no-op. Also confirm
                        # the detail output exposes a "provider" column.
                        provider = detail_df[0].get("provider", "").lower()  # type: ignore[index]
                        # If table is Parquet but we're trying to write Delta, drop and recreate
                        if (
                            provider
                            and "parquet" in provider
                            and "delta" not in provider
                        ):
                            self.logger.warning(
                                f"Table {self.table_fqn} is Parquet but we need Delta format. "
                                f"Dropping and will recreate as Delta."
                            )
                            self.spark.sql(f"DROP TABLE IF EXISTS {self.table_fqn}")  # type: ignore[attr-defined]
                except Exception as e:
                    # If we can't check format, log at debug level but continue
                    self.logger.debug(f"Could not check table format: {e}")

            # Verify Delta configuration before write operation
            # (function-local import; the module also imports os at the top of this cell)
            import os

            print(f"🔍 write_dataframe: About to write Delta table {self.table_fqn}")
            print(f"🔍 write_dataframe: PID={os.getpid()}")
            print(f"🔍 write_dataframe: Session ID (Python)={id(self.spark)}")
            try:
                if hasattr(self.spark, "_jsparkSession"):
                    print(
                        f"🔍 write_dataframe: Session ID (JVM)={id(self.spark._jsparkSession)}"
                    )
            except Exception:
                pass  # Session/config log optional for diagnostics

            # Check configs before Delta operation (diagnostic only: a missing
            # config prints a warning but does not stop the write)
            try:
                ext = self.spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
                cat = self.spark.conf.get("spark.sql.catalog.spark_catalog", "")  # type: ignore[attr-defined]
                print(
                    f"🔍 write_dataframe: Pre-write config check - Extensions: '{ext}', Catalog: '{cat}'"
                )
                if "DeltaSparkSessionExtension" not in ext or "DeltaCatalog" not in cat:
                    print(
                        "⚠️ write_dataframe: WARNING - Delta configs missing before write!"
                    )
                    print(
                        "⚠️ write_dataframe: This will likely cause DeltaAnalysisException"
                    )
            except Exception as config_error:
                print(f"⚠️ write_dataframe: Could not check configs: {config_error}")

            # Standardize Delta overwrite pattern for overwrite; append uses mergeSchema
            # Always uses Delta format - failures will propagate if Delta is not available
            if write_mode == WriteMode.OVERWRITE:
                # Prepare for Delta overwrite by dropping existing Delta table if it exists
                prepare_delta_overwrite(self.spark, self.table_fqn)
                writer = (
                    df_prepared.write.format("delta")
                    .mode("overwrite")
                    .option("overwriteSchema", "true")
                )  # type: ignore[attr-defined]
            else:
                # Any non-overwrite mode (append by default) - use mergeSchema for schema evolution
                writer = (
                    df_prepared.write.format("delta")
                    .mode(write_mode.value)
                    .option("mergeSchema", "true")
                )  # type: ignore[attr-defined]

            if partition_columns:
                writer = writer.partitionBy(*partition_columns)

            try:
                print(f"🔍 write_dataframe: Executing saveAsTable({self.table_fqn})")
                writer.saveAsTable(self.table_fqn)  # type: ignore[attr-defined]
                print("✅ write_dataframe: saveAsTable succeeded")
            except Exception as write_error:
                import traceback

                error_msg = str(write_error).lower()

                # Log full error context for DeltaAnalysisException
                # (error_msg is already lower-cased, so the extra .lower() is a no-op)
                if "delta" in error_msg.lower() or "DELTA_CONFIGURE" in str(
                    write_error
                ):
                    print("❌ write_dataframe: Delta error occurred!")
                    print(
                        f"❌ write_dataframe: Error type: {type(write_error).__name__}"
                    )
                    print(f"❌ write_dataframe: Error message: {write_error}")
                    print(f"❌ write_dataframe: Session ID (Python)={id(self.spark)}")
                    try:
                        if hasattr(self.spark, "_jsparkSession"):
                            print(
                                f"❌ write_dataframe: Session ID (JVM)={id(self.spark._jsparkSession)}"
                            )
                    except Exception:
                        pass
                    try:
                        ext = self.spark.conf.get("spark.sql.extensions", "")  # type: ignore[attr-defined]
                        cat = self.spark.conf.get("spark.sql.catalog.spark_catalog", "")  # type: ignore[attr-defined]
                        print(
                            f"❌ write_dataframe: Configs at error time - Extensions: '{ext}', Catalog: '{cat}'"
                        )
                    except Exception:
                        pass
                    print("❌ write_dataframe: Stack trace:")
                    traceback.print_exc()

                # Only an "already exists" race on an existing table is retried;
                # every other error is re-raised unchanged.
                if (
                    "already exists" in error_msg
                    or "table_or_view_already_exists" in error_msg
                ):
                    if table_exists(self.spark, self.table_fqn):
                        self.logger.debug(
                            f"Table {self.table_fqn} was created by another thread, retrying with overwrite mode"
                        )
                        # Always use Delta format
                        # The retry uses overwrite mode regardless of the requested write_mode.
                        # NOTE(review): with write_mode=APPEND this replaces whatever the other
                        # writer put into the table; the earlier comment justified it with
                        # "Delta Lake doesn't support append in batch mode" - confirm intended.
                        retry_writer = (
                            df_prepared.write.format("delta")
                            .mode("overwrite")
                            .option("overwriteSchema", "true")
                        )  # type: ignore[attr-defined]
                        if partition_columns:
                            retry_writer = retry_writer.partitionBy(*partition_columns)
                        retry_writer.saveAsTable(self.table_fqn)  # type: ignore[attr-defined]
                    else:
                        raise
                else:
                    raise

            # Row count comes from a separate count() on the prepared DataFrame,
            # executed after the write has finished.
            row_count = df_prepared.count()  # type: ignore[attr-defined]

            write_result = {
                "table_name": self.table_fqn,
                "write_mode": write_mode.value,
                "rows_written": row_count,
                "timestamp": datetime.now().isoformat(),
                "success": True,
            }

            self.logger.info(f"Successfully wrote {row_count} rows to {self.table_fqn}")
            return cast(WriteResult, write_result)

        except Exception as e:
            # Safely get row count for error context (counts the caller's
            # DataFrame, not the prepared one; falls back to 0 on any failure)
            try:
                row_count = df.count() if hasattr(df, "count") else 0  # type: ignore[attr-defined]
            except Exception:
                row_count = 0

            raise WriterTableError(
                f"Failed to write DataFrame to {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="write_dataframe",
                context={"write_mode": write_mode.value, "row_count": row_count},
                suggestions=[
                    "Check table permissions",
                    "Verify DataFrame schema matches table schema",
                    "Ensure sufficient storage space",
                    "Check for schema evolution conflicts",
                ],
            ) from e

    def write_batch(
        self, log_rows: list[LogRow], write_mode: WriteMode = WriteMode.APPEND
    ) -> WriteResult:
        """
        Persist a batch of log rows in the log table.

        The rows are turned into a DataFrame by
        ``_create_dataframe_from_log_rows`` and handed to ``write_dataframe``.
        Any failure is logged at error level and re-raised unchanged.

        Args:
            log_rows: List of log rows to write
            write_mode: Write mode for the operation

        Returns:
            Dictionary containing write results
        """
        try:
            self.logger.info(f"Writing batch of {len(log_rows)} log rows")

            # Rows -> DataFrame -> table, all within the same error boundary
            batch_df = self._create_dataframe_from_log_rows(log_rows)
            batch_result = self.write_dataframe(batch_df, write_mode)  # type: ignore[attr-defined]
            return batch_result

        except Exception as batch_error:
            self.logger.error(f"Failed to write batch: {batch_error}")
            raise

    def optimize_table(self) -> OptimizeResult:
        """
        Optimize (compact) the Delta table for better performance.

        When the delta package is not importable (HAS_DELTA is False) the
        operation is skipped and a "skipped" result is returned. Otherwise
        the table is compacted and the result carries fresh table info from
        ``get_table_info()``.

        Returns:
            Dictionary containing optimization results
            (``OptimizeResultSkipped`` or ``OptimizeResultCompleted``)

        Raises:
            WriterTableError: If the optimize step or the follow-up table
                info lookup fails; the original exception is chained
        """
        if not HAS_DELTA:
            self.logger.warning(
                f"Delta Lake not available, optimize operation skipped for {self.table_fqn}"
            )
            return {
                "table_name": self.table_fqn,
                "optimization_completed": False,
                "skipped": True,
                "reason": "Delta Lake not available",
                "timestamp": datetime.now().isoformat(),
            }

        try:
            self.logger.info(f"Optimizing table: {self.table_fqn}")

            # Run OPTIMIZE command using Delta Lake Python API
            delta_table = DeltaTable.forName(self.spark, self.table_fqn)
            # Note: optimize() method may not be available in all Delta Lake versions
            if hasattr(delta_table, "optimize"):
                # DeltaTable.optimize() only returns a DeltaOptimizeBuilder; nothing is
                # compacted until executeCompaction() is called on that builder.
                optimize_builder = delta_table.optimize()
                # Stand-in implementations may run the optimization inside optimize()
                # and return something without executeCompaction(); leave those alone.
                if hasattr(optimize_builder, "executeCompaction"):
                    optimize_builder.executeCompaction()
            else:
                # Fallback: use SQL command
                self.spark.sql(f"OPTIMIZE {self.table_fqn}")  # type: ignore[attr-defined]

            # Get table statistics
            table_info = self.get_table_info()

            optimization_result = {
                "table_name": self.table_fqn,
                "optimization_completed": True,
                "timestamp": datetime.now().isoformat(),
                "table_info": table_info,
            }

            self.logger.info(f"Table optimization completed: {self.table_fqn}")
            return cast(OptimizeResult, optimization_result)

        except Exception as e:
            self.logger.error(f"Failed to optimize table {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to optimize table {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="optimize_table",
                suggestions=[
                    "Check table permissions",
                    "Verify table exists",
                    "Ensure sufficient resources for optimization",
                ],
            ) from e

    def vacuum_table(self, retention_hours: int = 168) -> VacuumResult:
        """
        Remove old files from the Delta table via ``DeltaTable.vacuum``.

        When the delta package is not importable (HAS_DELTA is False) the
        operation is skipped and a "skipped" result is returned instead.

        Args:
            retention_hours: Hours of retention for old files

        Returns:
            Dictionary containing vacuum results
            (``VacuumResultSkipped`` or ``VacuumResultCompleted``)

        Raises:
            WriterTableError: If the vacuum operation fails; the original
                exception is chained
        """
        if not HAS_DELTA:
            self.logger.warning(
                f"Delta Lake not available, vacuum operation skipped for {self.table_fqn}"
            )
            return dict(
                table_name=self.table_fqn,
                vacuum_completed=False,
                skipped=True,
                reason="Delta Lake not available",
                retention_hours=retention_hours,
                timestamp=datetime.now().isoformat(),
            )

        try:
            self.logger.info(
                f"Vacuuming table: {self.table_fqn} (retention: {retention_hours}h)"
            )

            # Resolve the table by name and vacuum it through the Delta Lake API
            DeltaTable.forName(self.spark, self.table_fqn).vacuum(
                retentionHours=retention_hours
            )

            completed = dict(
                table_name=self.table_fqn,
                vacuum_completed=True,
                retention_hours=retention_hours,
                timestamp=datetime.now().isoformat(),
            )

            self.logger.info(f"Table vacuum completed: {self.table_fqn}")
            return cast(VacuumResult, completed)

        except Exception as vacuum_error:
            self.logger.error(
                f"Failed to vacuum table {self.table_fqn}: {vacuum_error}"
            )
            raise WriterTableError(
                f"Failed to vacuum table {self.table_fqn}: {vacuum_error}",
                table_name=self.table_fqn,
                operation="vacuum_table",
                suggestions=[
                    "Check table permissions",
                    "Verify retention period is valid",
                    "Ensure table exists",
                ],
            ) from vacuum_error

    def get_table_info(self) -> TableInfo:
        """
        Collect descriptive information about the log table.

        Without Delta Lake only the row count is gathered (details and
        history are returned empty). With Delta Lake the table detail rows,
        the number of history entries and the timestamp of the first history
        entry are included as well.

        Returns:
            Dictionary containing table information

        Raises:
            WriterTableError: If the Delta-based lookup fails
        """
        if not HAS_DELTA:
            self.logger.warning(
                f"Delta Lake not available, using basic table info for {self.table_fqn}"
            )
            # Plain Spark fallback: row count only
            basic_count = self.spark.table(self.table_fqn).count()  # type: ignore[attr-defined]
            return {
                "table_name": self.table_fqn,
                "row_count": basic_count,
                "details": [],
                "history": [],
                "timestamp": datetime.now().isoformat(),
            }

        try:
            self.logger.info(f"Getting table info for: {self.table_fqn}")

            handle = DeltaTable.forName(self.spark, self.table_fqn)

            # detail() may be missing on some Delta Lake versions; fall back
            # to the equivalent SQL command in that case
            if not hasattr(handle, "detail"):
                detail_rows = self.spark.sql(
                    f"DESCRIBE DETAIL {self.table_fqn}"
                ).collect()  # type: ignore[attr-defined]
            else:
                detail_rows = handle.detail().collect()

            history_rows = handle.history().collect()
            total_rows = self.spark.table(self.table_fqn).count()  # type: ignore[attr-defined]

            details = [dict(entry.asDict()) for entry in detail_rows]
            if history_rows:
                last_modified = history_rows[0]["timestamp"]
            else:
                last_modified = None

            summary = {
                "table_name": self.table_fqn,
                "row_count": total_rows,
                "details": details,
                "history_count": len(history_rows),
                "last_modified": last_modified,
            }

            self.logger.info(f"Table info retrieved: {total_rows} rows")
            return cast(TableInfo, summary)

        except Exception as e:
            self.logger.error(f"Failed to get table info for {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to get table info for {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="get_table_info",
            ) from e

    def query_logs(
        self,
        limit: Optional[int] = None,
        filters: Union[Dict[str, Union[str, int, float, bool]], None] = None,
    ) -> DataFrame:  # type: ignore[valid-type]
        """
        Query logs from the table, newest first.

        Args:
            limit: Maximum number of rows to return. ``None`` (the default)
                applies no limit; ``0`` yields an empty result.
            filters: Mapping of column name to the value that column must
                equal. Each entry is applied as a separate filter, so all
                entries must match.

        Returns:
            DataFrame containing query results, ordered by ``created_at``
            descending

        Raises:
            WriterTableError: If building or executing the query fails
        """
        try:
            self.logger.info(f"Querying logs from: {self.table_fqn}")

            # Start with the base table
            result_df = self.spark.table(self.table_fqn)  # type: ignore[attr-defined]

            # Apply equality filters if provided
            if filters:
                for column, value in filters.items():
                    if isinstance(value, str):
                        # String values are wrapped in an explicit literal
                        result_df = result_df.filter(
                            self.functions.col(column) == self.functions.lit(value)  # type: ignore[attr-defined]
                        )
                    else:
                        result_df = result_df.filter(
                            self.functions.col(column) == value  # type: ignore[attr-defined]
                        )

            # `desc` is provided by the compat layer defined in earlier notebook cells
            result_df = result_df.orderBy(desc("created_at"))

            # Compare against None rather than truthiness: limit=0 is a valid
            # request for zero rows and must not be treated as "no limit".
            if limit is not None:
                result_df = result_df.limit(limit)  # type: ignore[attr-defined]

            # NOTE: count() executes the query in order to report the row count
            self.logger.info(f"Query executed successfully: {result_df.count()} rows")  # type: ignore[attr-defined]

            return result_df

        except Exception as e:
            self.logger.error(f"Failed to query logs from {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to query logs: {e}",
                table_name=self.table_fqn,
                operation="query_logs",
                suggestions=[
                    "Check table exists",
                    "Verify query syntax",
                    "Check column names in filters",
                ],
            ) from e

    def _prepare_dataframe_for_write(self, df: DataFrame) -> DataFrame:  # type: ignore[valid-type]
        """Ensure the DataFrame carries ``created_at`` and ``updated_at`` columns.

        Each missing column is added as a literal holding the current time in
        ISO format; columns that already exist are left untouched. Any failure
        is logged and re-raised unchanged.
        """
        try:
            from datetime import datetime

            # One timestamp shared by every column added in this call
            stamp = datetime.now().isoformat()

            for audit_column in ("created_at", "updated_at"):
                if audit_column in df.columns:  # type: ignore[attr-defined]
                    continue
                df = df.withColumn(audit_column, self.functions.lit(stamp))

            return df

        except Exception as e:
            self.logger.error(f"Failed to prepare DataFrame for write: {e}")
            raise

    def _create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> DataFrame:  # type: ignore[valid-type]
        """Build a DataFrame from log rows using the explicit log schema.

        Every row is copied field by field into a plain dictionary.
        ``table_total_rows`` is optional (``None`` when absent); all other
        fields are required and raise ``KeyError`` when missing. A
        ``created_at`` ISO timestamp, shared by all rows of the call, is
        added to each record. Any failure is logged and re-raised unchanged.
        """
        # Required fields, split around the optional table_total_rows so the
        # resulting record keeps its field order.
        fields_before_total = (
            "run_id",
            "run_mode",
            "run_started_at",
            "run_ended_at",
            "execution_id",
            "pipeline_id",
            "schema",
            "phase",
            "step_name",
            "step_type",
            "start_time",
            "end_time",
            "duration_secs",
            "table_fqn",
            "write_mode",
            "input_rows",
            "output_rows",
            "rows_written",
            "rows_processed",
        )
        fields_after_total = (
            "valid_rows",
            "invalid_rows",
            "validation_rate",
            "success",
            "error_message",
            "memory_usage_mb",
            "cpu_usage_percent",
            "metadata",
        )

        try:
            from datetime import datetime

            stamp = datetime.now().isoformat()

            records = []
            for entry in log_rows:
                record = {name: entry[name] for name in fields_before_total}
                # Optional metric: tolerate rows that do not carry it
                record["table_total_rows"] = entry.get("table_total_rows")
                record.update((name, entry[name]) for name in fields_after_total)
                # Timestamp is stored directly as a string
                record["created_at"] = stamp
                records.append(record)

            # Explicit schema gives type safety and handles None values
            log_schema = create_log_schema()
            return self.spark.createDataFrame(records, log_schema)  # type: ignore[attr-defined,type-var]

        except Exception as e:
            self.logger.error(f"Failed to create DataFrame from log rows: {e}")
            raise

    @property
    def table_schema(self) -> str:
        """Return the ``table_schema`` value held by the writer config."""
        writer_config = self.config
        return writer_config.table_schema

    @property
    def table_name(self) -> str:
        """Return the ``table_name`` value held by the writer config."""
        writer_config = self.config
        return writer_config.table_name

In [None]:
# Module: pipeline_builder.engine.spark_engine (pipeline_builder)
#
# Dependencies: abstracts.engine, abstracts.reports.transform, abstracts.reports.validation, abstracts.reports.write, abstracts.source, abstracts.step, pipeline_builder.execution, pipeline_builder.functions, pipeline_builder.models, pipeline_builder.protocols, pipeline_builder.table_operations, pipeline_builder.validation, pipeline_builder_base.logging

# mypy: ignore-errors
"""
SparkEngine implementation of abstracts.Engine.

This engine wraps ExecutionEngine and adapts between the abstracts interface
and the concrete pipeline_builder implementation.
"""

from __future__ import annotations

from typing import Any, Optional, Union
# from .engine import Engine  # Removed: defined in notebook cells above
# from .reports.transform import TransformReport  # Removed: defined in notebook cells above
# from .reports.validation import ValidationReport  # Removed: defined in notebook cells above
# from .reports.write import WriteReport  # Removed: defined in notebook cells above
# from .source import Source  # Removed: defined in notebook cells above
# from .step import Step  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# from ..execution import ExecutionEngine, _create_dataframe_writer  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from ..protocols import (  # Removed: defined in notebook cells above
    # DataFrameProtocol as DataFrame,
# )
# from ..protocols import (  # Removed: defined in notebook cells above
    # SparkSessionProtocol as SparkSession,
# )
# from ..table_operations import fqn  # Removed: defined in notebook cells above
# from ..validation import apply_column_rules  # Removed: defined in notebook cells above

class SparkEngine(Engine):
    """
    SparkEngine implements abstracts.Engine using ExecutionEngine.

    This engine adapts between the abstracts interface (Step, Source protocols)
    and the concrete pipeline_builder types (BronzeStep/SilverStep/GoldStep, DataFrame).

    Validation, transformation and write failures are reported through the
    ``error`` field of the returned report; only malformed arguments (wrong
    source/step shape, missing schema or table name, schema creation failure)
    raise directly.
    """

    def __init__(
        self,
        spark: SparkSession,  # type: ignore[valid-type]
        config: Any,  # PipelineConfig
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
    ):
        """
        Initialize SparkEngine.

        Args:
            spark: SparkSession instance
            config: PipelineConfig instance
            logger: Optional logger instance; a new PipelineLogger is created
                when omitted
            functions: Optional functions protocol for PySpark operations
        """
        self.spark = spark
        self.config = config
        self.logger = logger or PipelineLogger()
        self.functions = functions
        self._execution_engine = ExecutionEngine(spark, config, self.logger, functions)

    def validate_source(self, step: Step, source: Source) -> ValidationReport:
        """
        Validate a data source according to step rules.

        Args:
            step: Step with validation rules
            source: Source data to validate (DataFrame)

        Returns:
            ValidationReport with the valid subset and row counts. If applying
            the rules fails, the report carries the original source, zero
            valid rows and the exception in ``error``.

        Raises:
            TypeError: If ``source`` does not look like a DataFrame
        """
        # Duck-type: must expose DataFrameProtocol surface
        if not hasattr(source, "schema") or not hasattr(source, "count"):
            raise TypeError(f"Source must be DataFrame-like, got {type(source)}")

        df: DataFrame = source  # type: ignore[valid-type]

        concrete_step: Union[BronzeStep, SilverStep, GoldStep] = step  # type: ignore[assignment]

        # Apply validation rules
        try:
            # Rules type compatibility - Step Protocol uses Rules, concrete steps use ColumnRules
            # mypy doesn't understand Protocol structural typing here, so we use Any
            rules: Any = concrete_step.rules
            valid_df, invalid_df, _validation_stats = apply_column_rules(
                df,
                rules,
                "pipeline",
                concrete_step.name,
                functions=self.functions,
            )

            valid_rows = valid_df.count()  # type: ignore[attr-defined]
            invalid_rows = invalid_df.count()  # type: ignore[attr-defined]

            return ValidationReport(
                source=valid_df,  # Return validated source
                valid_rows=valid_rows,
                invalid_rows=invalid_rows,
                error=None,
            )
        except Exception as e:
            # On failure every input row is reported as invalid
            return ValidationReport(
                source=df,
                valid_rows=0,
                invalid_rows=df.count() if df is not None else 0,  # type: ignore[attr-defined]
                error=e,
            )

    def transform_source(self, step: Step, source: Source) -> TransformReport:
        """
        Transform a data source according to step transformation logic.

        Bronze steps return the source unchanged. Silver steps call
        ``step.transform(spark, df, {})``. Gold steps call
        ``step.transform(spark, silvers)`` where ``silvers`` is the dict
        passed as ``source``.

        Args:
            step: Step with transformation function
            source: Source data to transform: a DataFrame for bronze and
                silver steps, a dict of silver DataFrames for gold steps

        Returns:
            TransformReport with the transformed source, or with the original
            source and the exception in ``error`` if the transform fails

        Raises:
            TypeError: If ``source`` is neither DataFrame-like nor a dict
                handed to a gold step
        """
        if isinstance(source, dict):
            # A dict has neither `schema` nor `count`, so the DataFrame check
            # below would reject it and gold steps could never run. Accept a
            # dict here, but only for gold steps.
            phase = getattr(getattr(step, "step_type", None), "value", None)
            if phase != "gold":
                raise TypeError(f"Source must be DataFrame-like, got {type(source)}")
        elif not hasattr(source, "schema") or not hasattr(source, "count"):
            raise TypeError(f"Source must be DataFrame-like, got {type(source)}")

        df: DataFrame = source  # type: ignore[valid-type]

        concrete_step: Union[BronzeStep, SilverStep, GoldStep] = step  # type: ignore[assignment]

        try:
            # Bronze steps: no transformation, just return source
            step_phase = concrete_step.step_type
            if step_phase.value == "bronze":
                return TransformReport(source=df, error=None)

            elif step_phase.value == "silver":
                if concrete_step.transform is None:  # type: ignore[attr-defined]
                    raise ValueError(
                        f"Silver step '{concrete_step.name}' requires a transform function"
                    )
                transformed_df = concrete_step.transform(self.spark, df, {})  # type: ignore[attr-defined]
                return TransformReport(source=transformed_df, error=None)

            # Gold steps: transform with silvers dict
            # Note: For gold steps, the "source" parameter is actually a dict of silvers
            # This is a limitation of the abstracts.Engine interface for gold steps
            elif step_phase.value == "gold":
                if concrete_step.transform is None:  # type: ignore[attr-defined]
                    raise ValueError(
                        f"Gold step '{concrete_step.name}' requires a transform function"
                    )
                # The abstracts interface expects Source, but gold steps take
                # a dict of silvers (Dict[str, DataFrame]).
                if not isinstance(source, dict):
                    # A single DataFrame is an error for gold steps
                    raise TypeError(
                        f"Gold step '{concrete_step.name}' requires a dict of silvers, got {type(source)}"
                    )
                silvers = source
                transformed_df = concrete_step.transform(self.spark, silvers)
                return TransformReport(source=transformed_df, error=None)

            else:
                raise ValueError(f"Unknown step type: {type(step)}")

        except Exception as e:
            return TransformReport(source=df, error=e)

    def write_target(self, step: Step, source: Source) -> WriteReport:
        """
        Write a data source to target table.

        Bronze steps are not written; their row count is reported as written.
        For other steps the target schema is created if needed and the
        DataFrame is saved as ``schema.table`` using the step's write mode
        (``overwrite`` when unset).

        Args:
            step: Step with target configuration
            source: Source data to write (DataFrame)

        Returns:
            WriteReport with write results; a failed write is reported via
            ``error`` with all rows counted as failed

        Raises:
            TypeError: If ``source`` is not DataFrame-like or ``step`` has no
                ``step_type``
            ValueError: If no schema or table name can be determined
            RuntimeError: If the target schema cannot be created
        """
        # Duck-type: must expose DataFrameProtocol surface (avoids isinstance issues in Python 3.8)
        if not hasattr(source, "schema") or not hasattr(source, "count"):
            raise TypeError(f"Source must be DataFrame-like, got {type(source)}")

        df: DataFrame = source  # type: ignore[valid-type]

        # Type check: step should have step_type property (avoids isinstance issues in Python 3.8)
        if not hasattr(step, "step_type"):
            raise TypeError(
                f"Step must have step_type property (BronzeStep, SilverStep, or GoldStep), got {type(step)}"
            )
        # Cast to help mypy - we know it's one of the concrete types after checking step_type
        concrete_step: Union[BronzeStep, SilverStep, GoldStep] = step  # type: ignore[assignment]

        # Bronze steps don't write to tables
        step_phase = concrete_step.step_type
        if step_phase.value == "bronze":
            rows_written = df.count()  # type: ignore[attr-defined]
            return WriteReport(
                source=df,
                written_rows=rows_written,
                failed_rows=0,
                error=None,
            )

        # Get table name and schema, preferring the primary attribute and
        # falling back to the alternative one
        table_name = getattr(concrete_step, "table_name", None) or getattr(
            concrete_step, "target", concrete_step.name
        )
        schema = getattr(concrete_step, "schema", None) or getattr(
            concrete_step, "write_schema", None
        )

        if schema is None:
            raise ValueError(
                f"Step '{concrete_step.name}' requires a schema to be specified for writing"
            )

        if table_name is None:
            raise ValueError(
                f"Step '{concrete_step.name}' requires a table_name or target to be specified"
            )

        output_table = fqn(schema, table_name)

        # Determine write mode
        write_mode = getattr(concrete_step, "write_mode", "overwrite")
        if write_mode is None:
            write_mode = "overwrite"

        # Create schema if needed
        try:
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")  # type: ignore[attr-defined]
        except Exception as e:
            raise RuntimeError(f"Failed to create schema '{schema}': {e}") from e

        # Write to table
        try:
            rows_before = df.count()  # type: ignore[attr-defined]
            # Use helper function to ensure correct format (delta or parquet) based on availability
            writer = _create_dataframe_writer(df, self.spark, write_mode)
            writer.saveAsTable(output_table)  # type: ignore[attr-defined]
            rows_written = rows_before  # Assuming all rows were written successfully
            return WriteReport(
                source=df,
                written_rows=rows_written,
                failed_rows=0,
                error=None,
            )
        except Exception as e:
            return WriteReport(
                source=df,
                written_rows=0,
                failed_rows=df.count() if df is not None else 0,  # type: ignore[attr-defined]
                error=e,
            )

In [None]:
# Module: pipeline_builder.writer.core (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.pipeline.models, pipeline_builder.table_operations, pipeline_builder.validation.utils, pipeline_builder.writer.analytics, pipeline_builder.writer.exceptions, pipeline_builder.writer.models, pipeline_builder.writer.monitoring, pipeline_builder.writer.operations, pipeline_builder.writer.storage, pipeline_builder_base.logging, pipeline_builder_base.models

"""
Refactored LogWriter implementation with modular architecture.

This module contains the main LogWriter class that orchestrates the various
writer components for comprehensive logging functionality. The LogWriter
provides a simplified API for writing pipeline execution results to Delta
Lake tables.

**New Simplified API:**
    The LogWriter now supports a simplified initialization API using `schema`
    and `table_name` parameters directly, replacing the previous `WriterConfig`
    approach. The old API is still supported but deprecated.

**Key Features:**
    - Simplified initialization with schema and table_name
    - Modular architecture with dedicated components
    - Comprehensive error handling and validation
    - Performance monitoring and optimization
    - Data quality analysis and trend detection
    - Delta Lake integration for persistent logging

**Migration Guide:**
    Old API (deprecated):
        >>> config = WriterConfig(table_schema="analytics", table_name="logs")
        >>> writer = LogWriter(spark, config=config)

    New API (recommended):
        >>> writer = LogWriter(spark, schema="analytics", table_name="logs")

Dependencies:
    - compat: Spark/PySpark compatibility layer
    - functions: Functions protocol for DataFrame operations
    - logging: Pipeline logging utilities
    - models.execution: ExecutionResult and StepResult models
    - validation.utils: Data validation utilities
    - writer.analytics: Analytics and trend analysis
    - writer.exceptions: Writer-specific exceptions
    - writer.models: Writer models and type definitions
    - writer.monitoring: Performance monitoring
    - writer.operations: Data processing operations
    - writer.storage: Delta Lake storage management

Example:
    >>> from pipeline_builder.writer import LogWriter
    >>> from pipeline_builder.models.execution import ExecutionResult
    >>>
    >>> # Initialize with new simplified API
    >>> writer = LogWriter(spark, schema="analytics", table_name="pipeline_logs")
    >>>
    >>> # Write execution result
    >>> result = writer.write_execution_result(execution_result)
    >>> print(f"Wrote {result['rows_written']} rows")
"""

from __future__ import annotations

import uuid
from datetime import datetime
from typing import Any, Dict, Optional
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionResult, StepResult  # Removed: defined in notebook cells above

# from ..compat import SparkSession  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..pipeline.models import PipelineReport  # Removed: defined in notebook cells above
# from ..table_operations import table_exists  # Removed: defined in notebook cells above
# from .analytics import (  # Removed: defined in notebook cells above
    # DataQualityAnalyzer,
    # ExecutionTrends,
    # QualityAnomalies,
    # QualityTrends,
    # TrendAnalyzer,
# )
# from .exceptions import WriterConfigurationError, WriterError  # Removed: defined in notebook cells above
# from .models import LogRow, WriteMode, WriterConfig, WriterMetrics, create_log_schema  # Removed: defined in notebook cells above
# from .monitoring import (  # Removed: defined in notebook cells above
    # AnalyticsEngine,
    # AnomalyReport,
    # MemoryUsageInfo,
    # PerformanceMonitor,
    # PerformanceReport,
# )
# from .operations import DataProcessor, DataQualityReport  # Removed: defined in notebook cells above
# from .storage import (  # Removed: defined in notebook cells above
    # OptimizeResult,
    # StorageManager,
    # TableInfo,
    # VacuumResult,
    # WriteResult,
# )

def time_write_operation(
    operation_func: Any, *args: Any, **kwargs: Any
) -> tuple[int, float, Any, Any]:
    """
    Time a write operation and return metrics.

    The operation is run best-effort: if it raises, the exception is logged
    as a warning and the call is reported with zero rows written rather than
    propagating the error.

    Args:
        operation_func: Function to time
        *args: Arguments for the function
        **kwargs: Keyword arguments for the function

    Returns:
        Tuple of (rows_written, duration_secs, start_time, end_time).
        ``rows_written`` is the ``"rows_written"`` entry of the result when
        the function returns a dict, otherwise 0. ``start_time`` and
        ``end_time`` are ``datetime`` objects.
    """
    import logging
    import time
    from datetime import datetime

    start_time = datetime.now()
    # perf_counter is monotonic, so the duration cannot be skewed (or made
    # negative) by system clock adjustments, unlike time.time().
    started = time.perf_counter()

    try:
        result = operation_func(*args, **kwargs)
        rows_written = result.get("rows_written", 0) if isinstance(result, dict) else 0
    except Exception as exc:
        # Keep the best-effort contract, but leave a trace of the failure
        logging.getLogger(__name__).warning(
            "Timed write operation failed: %s", exc
        )
        rows_written = 0

    end_time = datetime.now()
    duration_secs = time.perf_counter() - started

    return rows_written, duration_secs, start_time, end_time

def validate_log_data(log_rows: list[LogRow]) -> None:
    """
    Check that every log row carries the mandatory identifying fields.

    An empty (or falsy) input is accepted without any checks. Validation
    stops at the first row that lacks one of ``run_id``, ``phase`` or
    ``step_name``.

    Args:
        log_rows: List of log rows to validate

    Raises:
        WriterValidationError: If a row is missing a required field
    """
    if not log_rows:
        return

    mandatory = {"run_id", "phase", "step_name"}
    for index, entry in enumerate(log_rows):
        absent = mandatory - set(entry.keys())
        if not absent:
            continue

        # First offending row aborts validation
        raise WriterValidationError(
            f"Log row {index} missing required fields: {absent}",
            validation_errors=[f"Missing fields: {absent}"],
            context={"row_index": index, "row": entry},
        )

def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: str = "initial",
    metadata: Optional[Dict[str, Any]] = None,
) -> list[LogRow]:
    """
    Create log rows from an execution result.

    The first row summarizes the whole execution (step name
    ``pipeline_execution``). One further row is added per entry of
    ``execution_result.steps`` when that attribute is a non-empty list or
    tuple. Attributes missing from the result or a step fall back to
    defaults.

    Args:
        execution_result: The execution result
        run_id: Run identifier; also used as ``execution_id`` when the result
            has none
        run_mode: Mode of the run
        metadata: Additional metadata attached to every row. Each row gets
            its own shallow copy; ``None`` results in an empty dict.

    Returns:
        List of log rows, the execution-level row first
    """

    # Fields shared by the execution-level row and every step row
    run_fields = {
        "run_id": run_id,
        "run_mode": run_mode,
        "run_started_at": getattr(execution_result, "start_time", None),
        "run_ended_at": getattr(execution_result, "end_time", None),
        "execution_id": getattr(execution_result, "execution_id", run_id),
        "pipeline_id": getattr(execution_result, "pipeline_id", "unknown"),
        "schema": getattr(execution_result, "schema", "default"),
    }

    # Create a main log row for the execution
    main_row: LogRow = {  # type: ignore[typeddict-item,misc]
        **run_fields,
        "phase": "bronze",
        "step_name": "pipeline_execution",
        "step_type": "pipeline",
        "start_time": getattr(execution_result, "start_time", None),
        "end_time": getattr(execution_result, "end_time", None),
        "duration_secs": getattr(execution_result, "duration", 0.0) or 0.0,
        "table_fqn": None,
        "write_mode": None,
        "input_rows": 0,
        "output_rows": 0,
        "rows_written": 0,
        "rows_processed": 0,
        "table_total_rows": None,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "success": getattr(execution_result, "status", "unknown") == "completed",
        "error_message": getattr(execution_result, "error", None),
        "memory_usage_mb": 0.0,
        "cpu_usage_percent": 0.0,
        # Copy so rows never share (or mutate) the caller's dict
        "metadata": dict(metadata) if metadata else {},
    }

    log_rows = [main_row]

    # Add step results if available
    steps = getattr(execution_result, "steps", None)
    if steps and isinstance(steps, (list, tuple)):
        for step in steps:
            # step_type may be a plain string, an Enum member or None;
            # unwrap Enum members so `.lower()` cannot fail on them.
            raw_step_type = getattr(step, "step_type", None)
            step_type_value = getattr(raw_step_type, "value", raw_step_type)
            if step_type_value is None:
                phase = "bronze"
                step_type = "unknown"
            else:
                phase = str(step_type_value).lower()
                step_type = step_type_value

            step_row: LogRow = {  # type: ignore[typeddict-item,misc]
                **run_fields,
                "phase": phase,
                "step_name": getattr(step, "step_name", "unknown"),
                "step_type": step_type,
                "start_time": getattr(step, "start_time", None),
                "end_time": getattr(step, "end_time", None),
                # Treat an explicit None duration like the main row does
                "duration_secs": getattr(step, "duration", 0.0) or 0.0,
                "table_fqn": getattr(step, "output_table", None),
                "write_mode": getattr(step, "write_mode", None),
                "input_rows": getattr(step, "input_rows", 0),
                "output_rows": getattr(step, "rows_processed", 0),
                "rows_written": getattr(step, "rows_written", 0),
                "rows_processed": getattr(step, "rows_processed", 0),
                "table_total_rows": None,
                "valid_rows": 0,
                "invalid_rows": 0,
                "validation_rate": 100.0,
                "success": getattr(step, "status", "unknown") == "completed",
                "error_message": getattr(step, "error", None),
                "memory_usage_mb": 0.0,
                "cpu_usage_percent": 0.0,
                "metadata": dict(metadata) if metadata else {},
            }
            log_rows.append(step_row)

    return log_rows

class LogWriter:
    """Refactored LogWriter with modular architecture.

    Main class for writing pipeline execution results to Delta Lake tables.
    Provides a simplified API for logging pipeline runs, steps, and metrics
    with comprehensive error handling, performance monitoring, and data quality
    analysis.

    **New Simplified API:**
        The LogWriter now supports direct initialization with `schema` and
        `table_name` parameters, making it easier to use:

        >>> writer = LogWriter(spark, schema="analytics", table_name="logs")

    **Deprecated API:**
        The old API using `WriterConfig` is still supported but deprecated:

        >>> config = WriterConfig(table_schema="analytics", table_name="logs")
        >>> writer = LogWriter(spark, config=config)  # Deprecated

    **Components:**
        - **DataProcessor**: Handles data processing and transformations
        - **StorageManager**: Manages Delta Lake storage operations
        - **PerformanceMonitor**: Tracks performance metrics
        - **AnalyticsEngine**: Provides analytics and trend analysis
        - **DataQualityAnalyzer**: Analyzes data quality metrics
        - **TrendAnalyzer**: Analyzes execution trends

    **Key Methods:**
        - `write_execution_result()`: Write ExecutionResult to log table
        - `write_step_results()`: Write StepResult dictionary to log table
        - `write_log_rows()`: Write raw LogRow list to log table
        - `create_table()`: Create/overwrite table from PipelineReport
        - `append()`: Append PipelineReport to existing table
        - `optimize_table()`: Optimize Delta table for performance
        - `vacuum_table()`: Vacuum Delta table to remove old files
        - `analyze_quality_trends()`: Analyze data quality trends
        - `analyze_execution_trends()`: Analyze execution trends

    Example:
        >>> from pipeline_builder.writer import LogWriter
        >>> from pipeline_builder.models.execution import ExecutionResult
        >>>
        >>> # Initialize with new simplified API
        >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
        >>>
        >>> # Write execution result
        >>> result = writer.write_execution_result(execution_result)
        >>> print(f"Success: {result['success']}")
        >>> print(f"Rows written: {result['rows_written']}")
    """

    def __init__(
        self,
        spark: SparkSession,
        schema: Optional[str] = None,
        table_name: Optional[str] = None,
        config: Optional[WriterConfig] = None,
        functions: Optional[FunctionsProtocol] = None,
        logger: Optional[PipelineLogger] = None,
    ) -> None:
        """Set up a LogWriter bound to one log table.

        Two construction styles are accepted:

        * **Recommended:** pass ``schema`` and ``table_name``; a
          ``WriterConfig`` in append mode is built from them.

          >>> writer = LogWriter(spark, schema="analytics", table_name="logs")

        * **Deprecated:** pass a ready-made ``config``. This takes
          precedence over ``schema``/``table_name`` and emits a
          ``DeprecationWarning``.

          >>> config = WriterConfig(table_schema="analytics", table_name="logs")
          >>> writer = LogWriter(spark, config=config)  # DeprecationWarning

        Args:
            spark: SparkSession used for all DataFrame operations.
            schema: Schema (database) name of the log table. Used only
                when ``config`` is not given.
            table_name: Name of the log table. Used only when ``config``
                is not given.
            config: Deprecated. Pre-built WriterConfig; when supplied,
                ``schema`` and ``table_name`` are ignored.
            functions: Functions implementation to use. Falls back to
                ``get_default_functions()`` when omitted.
            logger: Logger to use. A ``PipelineLogger("LogWriter")`` is
                created when omitted.

        Raises:
            WriterConfigurationError: If neither ``config`` nor both
                ``schema`` and ``table_name`` are supplied, or if
                ``config.validate()`` raises ``ValueError``.
        """
        self.spark = spark

        if config is None:
            # New-style construction: both names are mandatory.
            if schema is None or table_name is None:
                raise WriterConfigurationError(
                    "Must provide either (schema and table_name) or config parameter",
                    config_errors=["Missing required parameters"],
                    suggestions=[
                        "Use: LogWriter(spark, schema='my_schema', table_name='my_table')",
                        "Or: LogWriter(spark, config=WriterConfig(...))",
                    ],
                )
            config = WriterConfig(
                table_schema=schema, table_name=table_name, write_mode=WriteMode.APPEND
            )
        else:
            # Old-style construction: still honoured, but flagged to the caller.
            import warnings

            warnings.warn(
                "Passing WriterConfig is deprecated. Use LogWriter(spark, schema='...', table_name='...') instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        self.config = config

        self.functions = get_default_functions() if functions is None else functions
        self.logger = PipelineLogger("LogWriter") if logger is None else logger

        # Surface configuration problems as a writer-specific error.
        try:
            self.config.validate()
        except ValueError as validation_error:
            raise WriterConfigurationError(
                f"Invalid writer configuration: {validation_error}",
                config_errors=[str(validation_error)],
                context={"config": self.config.__dict__},
                suggestions=[
                    "Check configuration values",
                    "Ensure all required fields are provided",
                    "Verify numeric values are positive",
                ],
            ) from validation_error

        # Collaborators (processor, storage, monitoring, analytics).
        self._initialize_components()

        # Writer-level counters, all starting from zero.
        self.metrics: WriterMetrics = dict(
            total_writes=0,
            successful_writes=0,
            failed_writes=0,
            total_duration_secs=0.0,
            avg_write_duration_secs=0.0,
            total_rows_written=0,
            memory_usage_peak_mb=0.0,
        )

        # Spark schema used when creating the log table / DataFrames.
        self.schema = create_log_schema()

        # Fully qualified table name, kept as an attribute for compatibility.
        self.table_fqn = f"{self.config.table_schema}.{self.config.table_name}"

        # Per-operation cache of table row counts (avoids repeated counts).
        self._table_total_rows_cache: dict[str, Optional[int]] = {}

        self.logger.info(f"LogWriter initialized for table: {self.table_fqn}")

    def _initialize_components(self) -> None:
        """Create the collaborator objects the writer delegates to.

        Builds, in order: the data processor, the storage manager, the
        performance monitor, and the three analytics helpers. All of them
        share this writer's Spark session and logger.
        """
        spark, functions, logger = self.spark, self.functions, self.logger

        # Row construction from execution / step results.
        self.data_processor = DataProcessor(spark, functions, logger)

        # Table creation, writes and maintenance.
        self.storage_manager = StorageManager(spark, self.config, functions, logger)

        # Timing and resource tracking for each operation.
        self.performance_monitor = PerformanceMonitor(spark, logger)

        # Reporting and trend analysis over the stored logs.
        self.analytics_engine = AnalyticsEngine(spark, logger)
        self.quality_analyzer = DataQualityAnalyzer(spark, logger)
        self.trend_analyzer = TrendAnalyzer(spark, logger)

    def write_execution_result(
        self,
        execution_result: ExecutionResult,
        run_id: Optional[str] = None,
        run_mode: str = "initial",
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Write execution result to log table.

        Writes an ExecutionResult instance to the log table, creating one log
        row per step in the execution. The table is automatically created if
        it doesn't exist.

        Args:
            execution_result: ExecutionResult instance containing execution
                context, step results, and metrics. Required.
            run_id: Unique run identifier. Optional. If not provided, a UUID
                is automatically generated.
            run_mode: Execution mode string. Defaults to "initial". Valid values:
                - "initial": First-time execution
                - "incremental": Incremental processing
                - "full_refresh": Full refresh execution
                - "validation_only": Validation-only execution
            metadata: Additional metadata dictionary to include in log rows.
                Optional. Can contain any key-value pairs.

        Returns:
            Dictionary containing write results with the following keys:
                - `success`: Boolean indicating if write succeeded
                - `run_id`: The run identifier used
                - `operation_id`: Unique operation identifier
                - `rows_written`: Number of rows written to the table
                - `write_result`: WriteResult dictionary with detailed results
                - `operation_metrics`: Performance metrics for the operation
                - `threshold_violations`: List of performance threshold violations

        Raises:
            WriterValidationError: If log data validation fails (e.g., missing
                required fields, invalid data types)
            WriterTableError: If table operations fail (e.g., table creation,
                write operation)
            WriterPerformanceError: If performance thresholds are exceeded
                (if enabled in configuration)

        Example:
            >>> from pipeline_builder.writer import LogWriter
            >>> from pipeline_builder.models.execution import ExecutionResult
            >>>
            >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
            >>> result = writer.write_execution_result(
            ...     execution_result=execution_result,
            ...     run_id="run_123",
            ...     run_mode="incremental",
            ...     metadata={"environment": "production", "version": "1.0"}
            ... )
            >>> print(f"Wrote {result['rows_written']} rows")
            >>> print(f"Success: {result['success']}")
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Reset per-operation cache
            self._reset_table_total_rows_cache()

            # Start performance monitoring
            self.performance_monitor.start_operation(
                operation_id, "write_execution_result"
            )

            # Log operation start
            self.logger.info(f"Writing execution result for run {run_id}")

            # Process execution result
            log_rows = self.data_processor.process_execution_result(
                execution_result,
                run_id,
                run_mode,
                metadata,
                table_total_rows_provider=self._get_table_total_rows,
            )

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            # Check performance thresholds
            threshold_violations = (
                self.performance_monitor.check_performance_thresholds(operation_metrics)
            )
            if threshold_violations:
                self.logger.warning(
                    f"Performance threshold violations: {threshold_violations}"
                )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
                "threshold_violations": threshold_violations,
            }

            self.logger.info(f"Successfully wrote execution result for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write execution result for run {run_id}: {e}")
            raise

    def write_step_results(
        self,
        step_results: Dict[str, StepResult],
        run_id: Optional[str] = None,
        run_mode: str = "initial",
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Write step results to log table.

        Args:
            step_results: Dictionary of step results
            run_id: Unique run identifier (generated if not provided)
            run_mode: Mode of the run
            metadata: Additional metadata

        Returns:
            Dict containing write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(operation_id, "write_step_results")

            # Log operation start
            self.logger.info(
                f"Writing {len(step_results)} step results for run {run_id}"
            )

            # Process step results
            log_rows = self.data_processor.process_step_results(
                step_results, run_id, run_mode, metadata
            )

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(f"Successfully wrote step results for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write step results for run {run_id}: {e}")
            raise

    def write_log_rows(
        self,
        log_rows: list[LogRow],
        run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Write log rows directly to the table.

        Args:
            log_rows: List of log rows to write
            run_id: Unique run identifier (generated if not provided)

        Returns:
            Dict containing write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(operation_id, "write_log_rows")

            # Log operation start
            self.logger.info(f"Writing {len(log_rows)} log rows for run {run_id}")

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(f"Successfully wrote log rows for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write log rows for run {run_id}: {e}")
            raise

    def write_execution_result_batch(
        self,
        execution_results: list[ExecutionResult],
        run_ids: Optional[list[str]] = None,
        run_mode: str = "initial",
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Write multiple execution results in batch.

        Args:
            execution_results: List of execution results to write
            run_ids: List of run identifiers (generated if not provided)
            run_mode: Mode of the runs
            metadata: Additional metadata

        Returns:
            Dict containing batch write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_ids is None:
            run_ids = [str(uuid.uuid4()) for _ in execution_results]

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(
                operation_id, "write_execution_result_batch"
            )

            # Log operation start
            self.logger.info(
                f"Writing batch of {len(execution_results)} execution results"
            )

            # Process all execution results
            all_log_rows = []
            self._reset_table_total_rows_cache()
            for i, execution_result in enumerate(execution_results):
                run_id = run_ids[i] if i < len(run_ids) else str(uuid.uuid4())
                log_rows = self.data_processor.process_execution_result(
                    execution_result,
                    run_id,
                    run_mode,
                    metadata,
                    table_total_rows_provider=self._get_table_total_rows,
                )
                all_log_rows.extend(log_rows)

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                all_log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "operation_id": operation_id,
                "execution_results_count": len(execution_results),
                "total_rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(
                f"Successfully wrote batch of {len(execution_results)} execution results"
            )
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write execution result batch: {e}")
            raise

    def show_logs(self, limit: Optional[int] = None) -> None:
        """
        Display logs from the table.

        Args:
            limit: Maximum number of rows to display
        """
        try:
            self.logger.info(
                f"Displaying logs from {self.config.table_schema}.{self.config.table_name}"
            )

            # Query logs using spark.table for compatibility
            df = self.spark.table(
                f"{self.config.table_schema}.{self.config.table_name}"
            )

            # Show DataFrame
            if limit is not None:
                df.show(limit)
            else:
                df.show()

            self.logger.info("Logs displayed successfully")

        except Exception as e:
            self.logger.error(f"Failed to display logs: {e}")
            raise

    def get_table_info(self) -> TableInfo:
        """
        Get information about the log table.

        Returns:
            Dictionary containing table information
        """
        try:
            return self.storage_manager.get_table_info()
        except Exception as e:
            self.logger.error(f"Failed to get table info: {e}")
            raise WriterError(f"Failed to get table info: {e}") from e

    def _reset_table_total_rows_cache(self) -> None:
        """Clear cached table counts so subsequent operations refresh totals."""
        self._table_total_rows_cache.clear()

    def _get_table_total_rows(self, table_fqn: Optional[str]) -> Optional[int]:
        """
        Determine the total number of rows for a given table.

        Args:
            table_fqn: Fully qualified table name.

        Returns:
            Row count if available, otherwise None.
        """
        if not table_fqn:
            return None

        if table_fqn in self._table_total_rows_cache:
            return self._table_total_rows_cache[table_fqn]

        try:
            table_accessor = getattr(self.spark, "table", None)
            if not callable(table_accessor):
                self.logger.debug(
                    "table_total_rows: spark session does not expose table(); skipping count"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            if not table_exists(self.spark, table_fqn):
                self.logger.debug(
                    f"table_total_rows: table {table_fqn} does not exist; leaving value as None"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            table_df = table_accessor(table_fqn)
            count_method = getattr(table_df, "count", None)
            if not callable(count_method):
                self.logger.debug(
                    f"table_total_rows: object for {table_fqn} lacks count(); skipping"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            raw_count = count_method()
            if isinstance(raw_count, (int, float)):
                row_count = int(raw_count)
            else:
                row_count = None

            self._table_total_rows_cache[table_fqn] = row_count
            return row_count
        except Exception as exc:  # pragma: no cover - defensive logging path
            self.logger.warning(
                f"table_total_rows: unable to compute row count for {table_fqn}: {exc}"
            )
            self._table_total_rows_cache[table_fqn] = None
            return None

    def optimize_table(self) -> OptimizeResult:
        """
        Optimize the Delta table for better performance.

        Returns:
            Dictionary containing optimization results
        """
        try:
            self.logger.info("Optimizing Delta table")
            return self.storage_manager.optimize_table()
        except Exception as e:
            self.logger.error(f"Failed to optimize table: {e}")
            raise

    def vacuum_table(self, retention_hours: int = 168) -> VacuumResult:
        """
        Vacuum the Delta table to remove old files.

        Args:
            retention_hours: Hours of retention for old files

        Returns:
            Dictionary containing vacuum results
        """
        try:
            self.logger.info(f"Vacuuming Delta table (retention: {retention_hours}h)")
            return self.storage_manager.vacuum_table(retention_hours)
        except Exception as e:
            self.logger.error(f"Failed to vacuum table: {e}")
            raise

    def analyze_quality_trends(self, days: int = 30) -> QualityTrends:
        """
        Analyze data quality trends.

        Args:
            days: Number of days to analyze

        Returns:
            Dictionary containing quality trend analysis
        """
        try:
            self.logger.info(f"Analyzing quality trends for last {days} days")

            # Query recent logs
            df = self.storage_manager.query_logs()

            # Analyze quality trends
            return self.quality_analyzer.analyze_quality_trends(df, days)

        except Exception as e:
            self.logger.error(f"Failed to analyze quality trends: {e}")
            raise WriterError(f"Failed to analyze quality trends: {e}") from e

    def analyze_execution_trends(self, days: int = 30) -> ExecutionTrends:
        """
        Analyze execution trends.

        Args:
            days: Number of days to analyze

        Returns:
            Dictionary containing execution trend analysis
        """
        try:
            self.logger.info(f"Analyzing execution trends for last {days} days")

            # Query recent logs
            df = self.storage_manager.query_logs()

            # Analyze execution trends
            return self.trend_analyzer.analyze_execution_trends(df, days)

        except Exception as e:
            self.logger.error(f"Failed to analyze execution trends: {e}")
            raise WriterError(f"Failed to analyze execution trends: {e}") from e

    def detect_quality_anomalies(self) -> QualityAnomalies:
        """
        Detect data quality anomalies.

        Returns:
            Dictionary containing anomaly detection results
        """
        try:
            self.logger.info("Detecting quality anomalies")

            # Query logs
            df = self.storage_manager.query_logs()

            # Detect anomalies
            return self.quality_analyzer.detect_quality_anomalies(df)

        except Exception as e:
            self.logger.error(f"Failed to detect quality anomalies: {e}")
            raise WriterError(f"Failed to detect quality anomalies: {e}") from e

    def generate_performance_report(self) -> PerformanceReport:
        """
        Generate comprehensive performance report.

        Returns:
            Dictionary containing performance report
        """
        try:
            self.logger.info("Generating performance report")

            # Query logs
            df = self.storage_manager.query_logs()

            # Generate report
            return self.analytics_engine.generate_performance_report(df)

        except Exception as e:
            self.logger.error(f"Failed to generate performance report: {e}")
            raise WriterError(f"Failed to generate performance report: {e}") from e

    def get_metrics(self) -> WriterMetrics:
        """Get current writer metrics."""
        return self.performance_monitor.get_metrics()

    def reset_metrics(self) -> None:
        """Reset writer metrics."""
        # Reset LogWriter metrics
        self.metrics = {
            "total_writes": 0,
            "successful_writes": 0,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }
        # Reset performance monitor metrics
        self.performance_monitor.reset_metrics()

    def get_memory_usage(self) -> MemoryUsageInfo:
        """Get current memory usage information."""
        return self.performance_monitor.get_memory_usage()

    def _update_metrics(self, write_result: WriteResult, success: bool) -> None:
        """Update writer metrics."""
        try:
            self.metrics["total_writes"] += 1
            if success:
                self.metrics["successful_writes"] += 1
            else:
                self.metrics["failed_writes"] += 1

            if "rows_written" in write_result:
                self.metrics["total_rows_written"] += write_result["rows_written"]

            # Update performance monitor metrics
            self.performance_monitor.metrics.update(self.metrics)

        except Exception as e:
            self.logger.error(f"Failed to update metrics: {e}")

    # Backward compatibility methods for tests
    def _write_log_rows(
        self,
        log_rows: list[LogRow],
        run_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> WriteResult:
        """Hand ``log_rows`` to the storage manager using the configured write mode.

        Kept for backward compatibility with tests. ``run_id`` and
        ``metadata`` are accepted but not used by this method.

        Returns:
            Whatever ``storage_manager.write_batch`` returns.
        """
        write_batch = self.storage_manager.write_batch
        configured_mode = self.config.write_mode
        return write_batch(log_rows, configured_mode)

    def _write_log_rows_batch(
        self, log_rows: list[LogRow], run_id: str, batch_size: int = 100
    ) -> WriteResult:
        """Write log rows in consecutive batches (kept for backward compatibility with tests).

        Each slice of at most ``batch_size`` rows is written through
        ``self._write_log_rows`` and the per-batch results are combined into
        one summary.

        Args:
            log_rows: Rows to write. An empty list performs no writes.
            run_id: Run identifier forwarded to ``_write_log_rows``.
            batch_size: Maximum number of rows per batch; must be positive.

        Returns:
            Summary WriteResult: ``rows_written`` is the sum over all batches
            and ``success`` is True only if no batch reported
            ``success=False`` (a batch result without a ``success`` key
            counts as successful).

        Raises:
            ValueError: If ``batch_size`` is not positive. A negative step
                would otherwise make ``range()`` empty and silently drop
                every row while still reporting success.
        """
        from datetime import datetime

        if batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size}"
            )

        results = [
            self._write_log_rows(log_rows[start : start + batch_size], run_id)
            for start in range(0, len(log_rows), batch_size)
        ]

        return {
            "table_name": self.storage_manager.table_fqn,
            "write_mode": self.config.write_mode.value,
            "rows_written": sum(r.get("rows_written", 0) for r in results),
            "timestamp": datetime.now().isoformat(),
            # Report the real outcome instead of an unconditional True.
            "success": all(r.get("success", True) for r in results),
        }

    def _create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> Any:
        """Build a DataFrame from log rows via ``self.spark.createDataFrame``.

        Kept for backward compatibility with tests. Every row is first copied
        into a plain ``dict`` and the copies are passed to ``createDataFrame``
        together with ``self.schema``.
        """
        plain_rows = list(map(dict, log_rows))
        return self.spark.createDataFrame(plain_rows, schema=self.schema)

    def detect_anomalies(self, log_rows: list[LogRow]) -> AnomalyReport:
        """Flag log rows whose duration exceeds twice the average duration.

        Kept for backward compatibility with tests. Only performance
        anomalies are produced; ``quality_anomalies`` is always empty.

        Args:
            log_rows: Rows to inspect. Only rows that contain a
                ``"duration_secs"`` key contribute to the average.

        Returns:
            AnomalyReport dict. ``anomaly_score`` is the percentage of rows
            flagged, rounded to two decimals. An all-zero report is returned
            when detection is disabled in the config, when there are no rows
            or no durations, or when detection raises (the error is logged
            as a warning rather than propagated).
        """

        def empty_report(executions: int) -> AnomalyReport:
            # Fresh lists on every call so callers never share state.
            return {
                "performance_anomalies": [],
                "quality_anomalies": [],
                "anomaly_score": 0.0,
                "total_anomalies": 0,
                "total_executions": executions,
            }

        if not self.config.enable_anomaly_detection:
            return empty_report(0)

        try:
            if not log_rows:
                return empty_report(len(log_rows))

            known_durations = [
                row["duration_secs"] for row in log_rows if "duration_secs" in row
            ]
            if not known_durations:
                return empty_report(len(log_rows))

            # Very simple heuristic: anything slower than 2x the mean is anomalous.
            threshold = sum(known_durations) / len(known_durations) * 2

            # Lazily pair each row with its duration so rows are examined one
            # at a time, in order; rows without a duration count as 0.
            timed_rows = ((row, row.get("duration_secs", 0)) for row in log_rows)
            performance_anomalies = [
                {
                    "step": row.get("step_name", "unknown"),
                    "execution_time": float(duration),
                    "validation_rate": float(row.get("validation_rate", 0.0)),
                    "success": bool(row.get("success", False)),
                }
                for row, duration in timed_rows
                if duration > threshold
            ]

            total_anomalies = len(performance_anomalies)
            total_executions = len(log_rows)
            if total_executions > 0:
                anomaly_score = total_anomalies / total_executions * 100
            else:
                anomaly_score = 0.0

            return {
                "performance_anomalies": performance_anomalies,
                "quality_anomalies": [],
                "anomaly_score": round(anomaly_score, 2),
                "total_anomalies": total_anomalies,
                "total_executions": total_executions,
            }
        except Exception as e:
            self.logger.warning(f"Anomaly detection failed: {e}")
            return empty_report(len(log_rows) if log_rows else 0)

    # Additional methods expected by tests
    def validate_log_data_quality(self, log_rows: list[LogRow]) -> DataQualityReport:
        """Validate log data quality (kept for backward compatibility with tests).

        Builds a DataFrame from the rows, reads basic info from it, and
        scores the batch by the share of rows whose ``"success"`` value is
        falsy (a missing ``"success"`` key counts as successful).

        Args:
            log_rows: Rows to validate. An empty input is reported as valid
                with a score of 100.0.

        Returns:
            DataQualityReport dict. ``null_counts`` is currently always empty
            (no null checks are performed here). If validation itself raises,
            the error is logged as a warning and a report with
            ``is_valid=False``, score 0.0 and the error text as the only
            validation issue is returned instead of propagating.
        """
        try:
            if not log_rows:
                return {
                    "is_valid": True,
                    "total_rows": 0,
                    "null_counts": {},
                    "validation_issues": [],
                    "failed_executions": 0,
                    "data_quality_score": 100.0,
                }

            # Create DataFrame for validation; get_dataframe_info is defined
            # in an earlier notebook cell.
            df = self._create_dataframe_from_log_rows(log_rows)
            df_info = get_dataframe_info(df)

            failed_executions = sum(
                1 for row in log_rows if not row.get("success", True)
            )

            # Score starts at 100 and loses the percentage of failed rows.
            total_rows = df_info.get("row_count", len(log_rows))
            validation_rate = 100.0  # Simplified
            if failed_executions == 0:
                data_quality_score = validation_rate
            else:
                # 0.0 (not 0) keeps the score a float on every path.
                data_quality_score = max(
                    0.0, validation_rate - (failed_executions / total_rows * 100)
                )

            # Placeholder: no null checks are computed yet.
            null_counts: Dict[str, int] = {}

            validation_issues = []
            if failed_executions > 0:
                validation_issues.append(f"{failed_executions} failed executions")

            return {
                # Issues are only recorded for failed executions, so an empty
                # list means there were none.
                "is_valid": not validation_issues,
                "total_rows": total_rows,
                "null_counts": null_counts,
                "validation_issues": validation_issues,
                "failed_executions": failed_executions,
                "data_quality_score": round(data_quality_score, 2),
            }

        except Exception as e:
            # Surface the failure in the log (as detect_anomalies does)
            # instead of swallowing it silently.
            self.logger.warning(f"Log data quality validation failed: {e}")
            return {
                "is_valid": False,
                "total_rows": len(log_rows) if log_rows else 0,
                "null_counts": {},
                "validation_issues": [str(e)],
                "failed_executions": 0,
                "data_quality_score": 0.0,
            }

    # ========================================================================
    # New simplified API methods for working with PipelineReport
    # ========================================================================

    def _convert_report_to_log_rows(
        self, report: PipelineReport, run_id: Optional[str] = None
    ) -> list[LogRow]:
        """
        Convert a PipelineReport to log rows for storage.

        Produces one log row per step found in ``report.bronze_results``,
        ``report.silver_results`` and ``report.gold_results`` (in that
        order). Run-level fields are copied from the report; step-level
        fields are read from each step's result mapping.

        Args:
            report: PipelineReport to convert
            run_id: Optional run ID (generated if not provided)

        Returns:
            List of LogRow dictionaries ready for storage (one per step)

        Notes:
            - A missing ``validation_rate`` is treated as 100.0.
            - Missing or falsy ``rows_written`` / ``input_rows`` fall back to
              ``rows_processed``.
            - A missing or ``None`` ``duration`` is logged as 0.0.
            - When the step result carries no ``table_total_rows`` the value
              is obtained from ``self._get_table_total_rows``.
        """
        if run_id is None:
            run_id = str(uuid.uuid4())

        def parse_datetime(value: Any) -> Optional[datetime]:
            """Return ``value`` as a datetime, or None when it cannot be parsed."""
            # Already-parsed datetimes pass through; fromisoformat() would
            # otherwise reject them with TypeError.
            if value is None or isinstance(value, datetime):
                return value
            try:
                return datetime.fromisoformat(value)
            except (TypeError, ValueError, AttributeError):
                return None

        def build_log_row(phase: str, step_name: str, step_info: Any) -> LogRow:
            """Assemble the log row for one step of the given phase."""
            # Derive valid/invalid row counts from the validation rate.
            rows_processed = int(step_info.get("rows_processed") or 0)
            validation_rate_val = step_info.get("validation_rate")
            validation_rate = float(
                validation_rate_val if validation_rate_val is not None else 100.0
            )
            valid_rows = int(rows_processed * validation_rate / 100.0)
            invalid_rows = rows_processed - valid_rows

            table_fqn = step_info.get("output_table")
            table_total_rows = step_info.get("table_total_rows")
            if table_total_rows is None:
                table_total_rows = self._get_table_total_rows(table_fqn)

            rows_written = int(step_info.get("rows_written") or rows_processed)

            return {
                # Run-level information
                "run_id": run_id,
                "run_mode": report.mode.value,
                "run_started_at": report.start_time,
                "run_ended_at": report.end_time,
                # Execution context
                "execution_id": report.execution_id,
                "pipeline_id": report.pipeline_id,
                "schema": self.config.table_schema,
                # Step-level information
                "phase": phase,
                "step_name": step_name,
                "step_type": phase,
                # Timing information
                "start_time": parse_datetime(step_info.get("start_time")),
                "end_time": parse_datetime(step_info.get("end_time")),
                # `or 0.0` guards an explicit None, matching the other fields.
                "duration_secs": float(step_info.get("duration") or 0.0),
                # Table information
                "table_fqn": table_fqn,
                "write_mode": step_info.get("write_mode"),
                # Data metrics
                "rows_processed": rows_processed,
                "rows_written": rows_written,
                "input_rows": int(step_info.get("input_rows") or rows_processed),
                "output_rows": rows_written,
                "table_total_rows": table_total_rows,
                # Validation metrics
                "valid_rows": valid_rows,
                "invalid_rows": invalid_rows,
                "validation_rate": validation_rate,
                # Execution status
                "success": step_info.get("status") == "completed",
                "error_message": step_info.get("error"),
                # Performance metrics
                "memory_usage_mb": None,
                "cpu_usage_percent": None,
                # Metadata
                "metadata": {},
            }

        # Medallion order: bronze first, then silver, then gold.
        phase_results = (
            ("bronze", report.bronze_results),
            ("silver", report.silver_results),
            ("gold", report.gold_results),
        )

        log_rows: list[LogRow] = []
        for phase, step_results in phase_results:
            for step_name, step_info in step_results.items():
                log_rows.append(build_log_row(phase, step_name, step_info))

        return log_rows

    def create_table(
        self, report: PipelineReport, run_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Write a PipelineReport to the log table in OVERWRITE mode.

        The report is converted to log rows, the table is created if it does
        not exist yet, and the rows are written with ``WriteMode.OVERWRITE``.
        Use this for initial creation or a full refresh.

        **Note:** OVERWRITE mode replaces all existing data in the table.
        Use `append()` to add rows to an existing table instead.

        Args:
            report: PipelineReport instance holding the pipeline execution
                results to log. Required.
            run_id: Unique run identifier. Optional. A UUID is generated
                when omitted.

        Returns:
            Dictionary containing:
                - `success`: Always True on the return path
                - `run_id`: The run identifier used
                - `operation_id`: UUID generated for this operation
                - `rows_written`: `rows_written` from the write result (0 if absent)
                - `table_fqn`: This writer's `table_fqn`
                - `write_result`: Result returned by the storage manager
                - `operation_metrics`: Value returned by the performance
                  monitor when the operation is ended

        Raises:
            Exception: Any error raised during conversion, table creation or
                the write is recorded as a failed operation, logged, and
                re-raised unchanged (presumably WriterTableError or
                WriterValidationError from the underlying components; not
                verifiable from this method alone).

        Example:
            >>> from pipeline_builder.writer import LogWriter
            >>> from pipeline_builder.pipeline.models import PipelineReport
            >>>
            >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
            >>> result = writer.create_table(pipeline_report, run_id="run_123")
            >>> print(f"Created table with {result['rows_written']} rows")
            >>> print(f"Table: {result['table_fqn']}")
        """
        operation_id = str(uuid.uuid4())
        run_id = str(uuid.uuid4()) if run_id is None else run_id

        try:
            # The table-total-rows cache is only valid within one operation.
            self._reset_table_total_rows_cache()

            monitor = self.performance_monitor
            monitor.start_operation(operation_id, "create_table")

            self.logger.info(f"📊 Creating log table {self.table_fqn} for run {run_id}")

            log_rows = self._convert_report_to_log_rows(report, run_id)

            # Ensure the table exists, then replace its contents.
            storage = self.storage_manager
            storage.create_table_if_not_exists(self.schema)
            write_result = storage.write_batch(log_rows, WriteMode.OVERWRITE)

            self._update_metrics(write_result, True)

            rows_written = write_result.get("rows_written", 0)
            operation_metrics = monitor.end_operation(operation_id, True, rows_written)

            result = dict(
                success=True,
                run_id=run_id,
                operation_id=operation_id,
                rows_written=rows_written,
                table_fqn=self.table_fqn,
                write_result=write_result,
                operation_metrics=operation_metrics,
            )

            self.logger.info(
                f"✅ Successfully created log table {self.table_fqn} with "
                f"{rows_written} row(s) for run {run_id}"
            )
            return result

        except Exception as e:
            # Close the monitored operation as failed before re-raising.
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))

            # Placeholder WriteResult so the failure is counted in the metrics.
            failed_write: WriteResult = dict(
                table_name=self.storage_manager.table_fqn,
                write_mode=self.config.write_mode.value,
                rows_written=0,
                timestamp="",
                success=False,
            )
            self._update_metrics(failed_write, False)

            self.logger.error(f"❌ Failed to create log table for run {run_id}: {e}")
            raise

    def append(
        self, report: PipelineReport, run_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Write a PipelineReport to the log table in APPEND mode.

        The report is converted to log rows, the table is created if it does
        not exist yet (so the first append works), and the rows are written
        with ``WriteMode.APPEND``. Use this for incremental logging where
        earlier runs must be preserved.

        **Note:** APPEND mode adds rows to the existing table. Use
        `create_table()` to replace all existing data instead.

        Args:
            report: PipelineReport instance holding the pipeline execution
                results to log. Required.
            run_id: Unique run identifier. Optional. A UUID is generated
                when omitted.

        Returns:
            Dictionary containing:
                - `success`: Always True on the return path
                - `run_id`: The run identifier used
                - `operation_id`: UUID generated for this operation
                - `rows_written`: `rows_written` from the write result (0 if absent)
                - `table_fqn`: This writer's `table_fqn`
                - `write_result`: Result returned by the storage manager
                - `operation_metrics`: Value returned by the performance
                  monitor when the operation is ended

        Raises:
            Exception: Any error raised during conversion, table creation or
                the write is recorded as a failed operation, logged, and
                re-raised unchanged (presumably WriterTableError or
                WriterValidationError from the underlying components; not
                verifiable from this method alone).

        Example:
            >>> from pipeline_builder.writer import LogWriter
            >>> from pipeline_builder.pipeline.models import PipelineReport
            >>>
            >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
            >>> result = writer.append(pipeline_report, run_id="run_123")
            >>> print(f"Appended {result['rows_written']} rows to {result['table_fqn']}")
        """
        operation_id = str(uuid.uuid4())
        run_id = str(uuid.uuid4()) if run_id is None else run_id

        try:
            # The table-total-rows cache is only valid within one operation.
            self._reset_table_total_rows_cache()

            monitor = self.performance_monitor
            monitor.start_operation(operation_id, "append")

            self.logger.info(
                f"📊 Appending to log table {self.table_fqn} for run {run_id}"
            )

            log_rows = self._convert_report_to_log_rows(report, run_id)

            # Ensure the table exists, then add the new rows to it.
            storage = self.storage_manager
            storage.create_table_if_not_exists(self.schema)
            write_result = storage.write_batch(log_rows, WriteMode.APPEND)

            self._update_metrics(write_result, True)

            rows_written = write_result.get("rows_written", 0)
            operation_metrics = monitor.end_operation(operation_id, True, rows_written)

            result = dict(
                success=True,
                run_id=run_id,
                operation_id=operation_id,
                rows_written=rows_written,
                table_fqn=self.table_fqn,
                write_result=write_result,
                operation_metrics=operation_metrics,
            )

            self.logger.info(
                f"✅ Successfully appended {rows_written} row(s) to "
                f"{self.table_fqn} for run {run_id}"
            )
            return result

        except Exception as e:
            # Close the monitored operation as failed before re-raising.
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))

            # Placeholder WriteResult so the failure is counted in the metrics.
            failed_write: WriteResult = dict(
                table_name=self.storage_manager.table_fqn,
                write_mode=self.config.write_mode.value,
                rows_written=0,
                timestamp="",
                success=False,
            )
            self._update_metrics(failed_write, False)

            self.logger.error(f"❌ Failed to append to log table for run {run_id}: {e}")
            raise

In [None]:
# Module: pipeline_builder.pipeline.builder (pipeline_builder)
#
# Dependencies: abstracts.builder, abstracts.step, pipeline_builder.compat, pipeline_builder.engine, pipeline_builder.functions, pipeline_builder.models, pipeline_builder.pipeline.runner, pipeline_builder.sql_source, pipeline_builder.table_operations, pipeline_builder.types, pipeline_builder.validation, pipeline_builder_base.builder, pipeline_builder_base.dependencies, pipeline_builder_base.errors, pipeline_builder_base.logging, pipeline_builder_base.models, pipeline_builder_base.validation, pipeline_builder_base.validation.pipeline_validator

"""PipelineBuilder for constructing data pipelines.

This module provides a clean, maintainable PipelineBuilder that handles
pipeline construction with the Medallion Architecture (Bronze → Silver → Gold).
The builder creates pipelines that can be executed with the execution engine
using a service-oriented architecture.

Key Features:
    - Fluent API for intuitive pipeline construction
    - Automatic dependency management
    - String-based validation rules
    - Multi-schema support
    - Comprehensive validation and error handling

The builder uses a service-oriented architecture internally:
    - StepFactory: Creates step instances
    - ExecutionValidator: Validates pipeline configuration
    - Step Executors: Execute steps during pipeline run

Example:
    >>> from pipeline_builder.pipeline.builder import PipelineBuilder
    >>> from pipeline_builder.functions import get_default_functions
    >>> F = get_default_functions()
    >>>
    >>> builder = PipelineBuilder(spark=spark, schema="analytics")
    >>> builder.with_bronze_rules(
    ...     name="events",
    ...     rules={"user_id": ["not_null"], "timestamp": ["not_null"]},
    ...     incremental_col="timestamp"
    ... )
    >>> builder.add_silver_transform(
    ...     name="clean_events",
    ...     source_bronze="events",
    ...     transform=lambda spark, df, silvers: df.filter(F.col("value") > 0),
    ...     rules={"value": ["gt", 0]},
    ...     table_name="clean_events"
    ... )
    >>> pipeline = builder.to_pipeline()
    >>> result = pipeline.run_initial_load(bronze_sources={"events": source_df})

Note:
    This module depends on:
    - compat: Spark compatibility layer
    - errors: Error handling
    - functions: PySpark function protocols
    - logging: Pipeline logging
    - models: Pipeline and step models
    - pipeline.runner: Pipeline execution
    - types: Type definitions
    - validation: Data and pipeline validation
"""

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, List, Optional, Union

# Engine-specific StructType should satisfy the TypesProtocol.StructType
# from .builder import PipelineBuilder as _AbstractsPipelineBuilderClass  # Removed: defined in notebook cells above
# from .builder import BasePipelineBuilder  # Removed: defined in notebook cells above
# from .errors import (  # Removed: defined in notebook cells above
    # ConfigurationError as PipelineConfigurationError,
# )
# from .errors import (  # Removed: defined in notebook cells above
    # ExecutionError as StepError,
# )
# from .errors import (  # Removed: defined in notebook cells above
    # ValidationError,
# )
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
    # PipelineConfig,
    # ValidationThresholds,
# )

# from ..compat import SparkSession  # Removed: defined in notebook cells above
# from ..engine import SparkEngine  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..models import (  # Removed: defined in notebook cells above
    # BronzeStep,
    # GoldStep,
    # SilverStep,
# )
# from ..sql_source import JdbcSource, SqlAlchemySource  # Removed: defined in notebook cells above
# from ..table_operations import fqn, table_exists  # Removed: defined in notebook cells above
# from ..types import (  # Removed: defined in notebook cells above
    # ColumnRules,
    # GoldTransformFunction,
    # SilverTransformFunction,
    # StepName,
    # TableName,
# )
# from ..validation import ValidationResult, _convert_rules_to_expressions  # Removed: defined in notebook cells above
# from .runner import PipelineRunner  # Removed: defined in notebook cells above

class PipelineBuilder(BasePipelineBuilder):
    """Production-ready builder for creating data pipelines.

    The PipelineBuilder provides a fluent API for constructing robust data
    pipelines with comprehensive validation, automatic dependency management,
    and enterprise-grade features. Uses the Medallion Architecture
    (Bronze → Silver → Gold) for data layering.

    Key Features:
        - Fluent API: Chain methods for intuitive pipeline construction
        - Robust Validation: Early error detection with clear validation messages
        - Auto-inference: Automatic dependency detection and validation
        - String Rules: Convert human-readable rules to PySpark expressions
        - Multi-schema Support: Cross-schema data flows for enterprise environments
        - Comprehensive Error Handling: Detailed error messages with suggestions
        - Service-Oriented Architecture: Clean separation of concerns internally

    Validation Requirements:
        All pipeline steps must have validation rules. Invalid configurations
        are rejected during construction with clear error messages.

    String Rules Support:
        You can use human-readable string rules that are automatically converted
        to PySpark expressions:

        - "not_null" → F.col("column").isNotNull()
        - "gt", value → F.col("column") > value
        - "lt", value → F.col("column") < value
        - "eq", value → F.col("column") == value
        - "in", [values] → F.col("column").isin(values)
        - "between", min, max → F.col("column").between(min, max)

    Attributes:
        spark: SparkSession instance for DataFrame operations.
        schema: Target schema name for pipeline tables.
        config: PipelineConfig instance with pipeline configuration.
        logger: PipelineLogger instance for logging.
        functions: FunctionsProtocol instance for PySpark operations.
        bronze_steps: Dictionary of BronzeStep instances.
        silver_steps: Dictionary of SilverStep instances.
        gold_steps: Dictionary of GoldStep instances.
        execution_order: List of step names in execution order (topological sort).
            Populated automatically after successful pipeline validation.
            None if validation hasn't been run or failed.

    Example:
        Basic pipeline construction:

        >>> from pipeline_builder.pipeline.builder import PipelineBuilder
        >>> from pipeline_builder.functions import get_default_functions
        >>> F = get_default_functions()
        >>>
        >>> builder = PipelineBuilder(spark=spark, schema="analytics")
        >>> builder.with_bronze_rules(
        ...     name="events",
        ...     rules={"user_id": ["not_null"], "timestamp": ["not_null"]},
        ...     incremental_col="timestamp"
        ... )
        >>> builder.add_silver_transform(
        ...     name="clean_events",
        ...     source_bronze="events",
        ...     transform=lambda spark, df, silvers: df.filter(F.col("value") > 0),
        ...     rules={"value": ["gt", 0]},
        ...     table_name="clean_events"
        ... )
        >>> builder.add_gold_transform(
        ...     name="daily_metrics",
        ...     transform=lambda spark, silvers: silvers["clean_events"]
        ...     .groupBy("date")
        ...     .agg(F.count("*").alias("count")),
        ...     rules={"count": ["gt", 0]},
        ...     table_name="daily_metrics",
        ...     source_silvers=["clean_events"]
        ... )
        >>> pipeline = builder.to_pipeline()
        >>> result = pipeline.run_initial_load(bronze_sources={"events": source_df})

    Raises:
        ValidationError: If validation rules are invalid or missing.
        ConfigurationError: If configuration parameters are invalid.
        StepError: If step dependencies cannot be resolved.
    """

    def __init__(
        self,
        *,
        spark: SparkSession,
        schema: str,
        min_bronze_rate: float = 95.0,
        min_silver_rate: float = 98.0,
        min_gold_rate: float = 99.0,
        verbose: bool = True,
        functions: Optional[FunctionsProtocol] = None,
    ) -> None:
        """Initialize a new PipelineBuilder instance.

        Builds a PipelineConfig from the arguments, initializes the base
        builder with it, then attaches the Spark-specific collaborators
        (validator, engine, abstracts builder) and empty step registries.

        Args:
            spark: SparkSession used for data processing. Must be truthy.
            schema: Schema name stored on the config and on ``self.schema``.
                Must be a non-empty string.
            min_bronze_rate: Bronze threshold passed to ValidationThresholds.
                Defaults to 95.0.
            min_silver_rate: Silver threshold passed to ValidationThresholds.
                Defaults to 98.0.
            min_gold_rate: Gold threshold passed to ValidationThresholds.
                Defaults to 99.0.
            verbose: Forwarded to both PipelineConfig and PipelineLogger.
                Defaults to True.
            functions: Optional FunctionsProtocol instance. When None, the
                result of get_default_functions() is used.

        Raises:
            PipelineConfigurationError: If ``spark`` or ``schema`` is falsy.
                NOTE(review): rate-range validation is not performed in this
                method; presumably ValidationThresholds / PipelineConfig
                reject invalid rates - confirm.

        Note:
            Attributes assigned here: spark, functions, schema, pipeline_id,
            spark_validator, _base_validator, validator, validators,
            execution_order, _step_creation_order, _creation_counter,
            bronze_steps, silver_steps, gold_steps, spark_engine and
            _abstracts_builder. ``config`` and ``logger`` are read back from
            ``self`` after the base initializer runs.
        """
        # Reject missing required inputs before any configuration is built.
        # These are truthiness checks, so None and "" are both rejected.
        if not spark:
            raise PipelineConfigurationError(
                "Spark session is required",
                suggestions=[
                    "Ensure SparkSession is properly initialized",
                    "Check Spark configuration",
                ],
            )
        if not schema:
            raise PipelineConfigurationError(
                "Schema name cannot be empty",
                suggestions=[
                    "Provide a valid schema name",
                    "Check database configuration",
                ],
            )

        # Bundle the per-layer quality thresholds into the pipeline config.
        thresholds = ValidationThresholds(
            bronze=min_bronze_rate, silver=min_silver_rate, gold=min_gold_rate
        )
        config = PipelineConfig(
            schema=schema,
            thresholds=thresholds,
            verbose=verbose,
        )

        # Base initializer must run first: the code below reads self.logger,
        # self.validator and self.config, which this method never assigns itself.
        super().__init__(config, logger=PipelineLogger(verbose=verbose))

        # Spark-specific collaborators.
        self.spark = spark
        self.functions = functions if functions is not None else get_default_functions()

        # Schema is also exposed directly on the builder (backward compatibility).
        self.schema = schema
        # The id embeds a local timestamp with one-second resolution, so two
        # builders created for the same schema within one second share an id.
        self.pipeline_id = (
            f"pipeline_{schema}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        )

        # Spark-side validator, kept alongside the validator created by the base class.
        # (UnifiedValidator is defined in an earlier notebook cell; the original
        # package imported it from ..validation at this point.)
        self.spark_validator = UnifiedValidator(self.logger)

        # Preserve the base-class validator before self.validator is replaced below.
        # The cast is for static typing only; it performs no runtime check.
        # NOTE(review): in this flattened notebook the base and Spark validators
        # are both referred to as `UnifiedValidator`; confirm which definition
        # is actually bound at this point.
        from typing import cast

        validator: UnifiedValidator = cast(UnifiedValidator, self.validator)  # type: ignore[redundant-cast]
        self._base_validator: UnifiedValidator = validator

        # Not computed here; starts as None until something else assigns it.
        self.execution_order: Optional[List[str]] = None

        # Monotonic counter + name->index map recording the order steps were added.
        self._step_creation_order: Dict[str, int] = {}
        self._creation_counter: int = 0

        # From here on self.validator is the Spark validator, so code that goes
        # through self.validator (e.g. add_validator) operates on it.
        self.validator = cast(UnifiedValidator, self.spark_validator)
        # Alias to the Spark validator's custom validator collection (same object,
        # not a copy).
        self.validators = self.spark_validator.custom_validators

        # Re-create the step registries with Spark step types. Any registries the
        # base initializer may have created are replaced by these empty dicts.
        self.bronze_steps: Dict[str, BronzeStep] = {}
        self.silver_steps: Dict[str, SilverStep] = {}
        self.gold_steps: Dict[str, GoldStep] = {}

        # Execution engine wired with this builder's session, config, logger and functions.
        self.spark_engine = SparkEngine(
            spark=self.spark,
            config=self.config,
            logger=self.logger,
            functions=self.functions,
        )

        # Abstracts-layer builder, given PipelineRunner as runner class and the
        # SparkEngine created above.
        self._abstracts_builder = _AbstractsPipelineBuilderClass(
            runner_cls=PipelineRunner,
            engine=self.spark_engine,
        )

        self.logger.info(f"🔧 PipelineBuilder initialized (schema: {schema})")

    def with_bronze_rules(
        self,
        *,
        name: StepName,
        rules: ColumnRules,
        incremental_col: Optional[str] = None,
        description: Optional[str] = None,
        schema: Optional[str] = None,
    ) -> PipelineBuilder:
        """Register a Bronze step described only by validation rules.

        The step is stored in ``self.bronze_steps`` under ``name`` and its
        position in the overall creation sequence is recorded.

        Args:
            name: Identifier for the Bronze step; must be truthy and must
                pass the base-class duplicate-name check.
            rules: Column-to-rule mapping. It is passed through
                ``_convert_rules_to_expressions`` together with the builder's
                functions object, so both Column expressions and string
                rules (see the class docstring) are accepted:

                - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                - String: {"user_id": ["not_null"], "age": ["gt", 0]}
            incremental_col: Optional column name forwarded to BronzeStep
                (e.g. "timestamp", "updated_at").
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.
            schema: Optional schema name forwarded to BronzeStep. When given
                it is checked with the base-class schema validation first.

        Returns:
            This builder, to allow method chaining.

        Raises:
            StepError: If ``name`` is empty, the duplicate-name check fails,
                or schema validation fails (the original exception is
                chained as the cause).

        Example:
            >>> builder.with_bronze_rules(
            ...     name="users",
            ...     rules={"user_id": ["not_null"], "age": ["gt", 0]},
            ...     incremental_col="updated_at",
            ...     schema="raw_data",
            ... )

        Note:
            Errors raised by rule conversion or by BronzeStep construction
            are not wrapped here and propagate unchanged.
        """
        layer = "bronze"

        # Guard clause: an unnamed step cannot be registered.
        if not name:
            raise StepError(
                "Bronze step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": layer},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Delegate duplicate detection to the base class, surfacing any
        # failure as a StepError so callers see one exception type.
        try:
            self._check_duplicate_step_name(name, layer)
        except Exception as duplicate_error:
            raise StepError(
                str(duplicate_error),
                context={"step_name": name, "step_type": layer},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            ) from duplicate_error

        # An explicit schema is validated by the base class; None skips the check.
        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as schema_error:
                raise StepError(
                    str(schema_error),
                    context={
                        "step_name": name,
                        "step_type": layer,
                        "schema": schema,
                    },
                ) from schema_error

        # Rules are converted while building the step, which is then registered.
        self.bronze_steps[name] = BronzeStep(
            name=name,
            rules=_convert_rules_to_expressions(rules, self.functions),
            incremental_col=incremental_col,
            schema=schema,
        )

        # Remember when this step was added relative to the others.
        creation_index = self._creation_counter
        self._step_creation_order[name] = creation_index
        self._creation_counter = creation_index + 1

        self.logger.info(f"✅ Added Bronze step: {name}")
        return self

    def with_bronze_sql_source(
        self,
        *,
        name: StepName,
        sql_source: Union[JdbcSource, SqlAlchemySource],
        rules: ColumnRules,
        incremental_col: Optional[str] = None,
        schema: Optional[str] = None,
        description: Optional[str] = None,
    ) -> PipelineBuilder:
        """Register a Bronze step whose data comes from a SQL source.

        Works like with_bronze_rules, except that the created BronzeStep
        also carries ``sql_source`` and the rules are checked up front to be
        a non-empty dict.

        Args:
            name: Identifier for the Bronze step; must be truthy and must
                pass the base-class duplicate-name check.
            sql_source: A JdbcSource or SqlAlchemySource instance.
            rules: Non-empty dict of validation rules; converted with
                ``_convert_rules_to_expressions`` before being stored.
            incremental_col: Optional column name forwarded to BronzeStep.
            schema: Optional schema name forwarded to BronzeStep; validated
                by the base class when provided.
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.

        Returns:
            This builder, to allow method chaining.

        Raises:
            StepError: If ``name`` is empty, ``sql_source`` has the wrong
                type, ``rules`` is empty or not a dict, the duplicate-name
                check fails, or schema validation fails.
        """
        layer = "bronze"

        if not name:
            raise StepError(
                "Bronze step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": layer},
                suggestions=["Provide a valid step name"],
            )

        accepted_sources = (JdbcSource, SqlAlchemySource)
        if not isinstance(sql_source, accepted_sources):
            raise StepError(
                "sql_source must be JdbcSource or SqlAlchemySource",
                context={"step_name": name, "step_type": layer},
            )

        # Emptiness is tested before the type, matching the established order.
        if not rules or not isinstance(rules, dict):
            raise StepError(
                "Rules must be a non-empty dictionary (same as with_bronze_rules)",
                context={"step_name": name, "step_type": layer},
            )

        # Base-class checks; any failure is re-raised as StepError with the
        # original exception chained as the cause.
        try:
            self._check_duplicate_step_name(name, layer)
        except Exception as duplicate_error:
            raise StepError(
                str(duplicate_error),
                context={"step_name": name, "step_type": layer},
            ) from duplicate_error

        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as schema_error:
                raise StepError(
                    str(schema_error),
                    context={
                        "step_name": name,
                        "step_type": layer,
                        "schema": schema,
                    },
                ) from schema_error

        # Build and register the step; rule conversion happens as part of
        # constructing the step's arguments.
        self.bronze_steps[name] = BronzeStep(
            name=name,
            rules=_convert_rules_to_expressions(rules, self.functions),
            incremental_col=incremental_col,
            schema=schema,
            sql_source=sql_source,
        )

        # Record creation order for this step.
        creation_index = self._creation_counter
        self._step_creation_order[name] = creation_index
        self._creation_counter = creation_index + 1

        self.logger.info(f"✅ Added Bronze step (SQL source): {name}")
        return self

    def with_silver_rules(
        self,
        *,
        name: StepName,
        table_name: TableName,
        rules: ColumnRules,
        description: Optional[str] = None,
        schema: Optional[str] = None,
        optional: bool = False,
    ) -> PipelineBuilder:
        """Register a validation-only Silver step for an existing table.

        The created SilverStep has no transform and no source bronze
        (``transform=None``, ``source_bronze=""``) and is flagged with
        ``existing=True``. It is stored in ``self.silver_steps`` under
        ``name``.

        Args:
            name: Identifier for the Silver step; must be truthy and must
                pass the base-class duplicate-name check.
            table_name: Table name (without schema) forwarded to SilverStep.
            rules: Column-to-rule mapping, converted with
                ``_convert_rules_to_expressions``. Both forms are accepted:

                - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                - String: {"user_id": ["not_null"], "age": ["gt", 0]}
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.
            schema: Optional schema name. Validated by the base class when
                provided, then resolved through ``_get_effective_schema``
                (which is also called when ``schema`` is None).
            optional: Forwarded to SilverStep as ``optional``.
                NOTE(review): intended to let the step tolerate a missing
                table; that behaviour is implemented outside this method.

        Returns:
            This builder, to allow method chaining.

        Raises:
            StepError: If ``name`` is empty, the duplicate-name check fails,
                or schema validation fails (the original exception is
                chained as the cause).

        Example:
            >>> builder.with_silver_rules(
            ...     name="validated_events",
            ...     table_name="events",
            ...     rules={"user_id": ["not_null"], "value": ["gt", 0]},
            ...     schema="staging",
            ... )

        Note:
            Errors raised by rule conversion, schema resolution or SilverStep
            construction are not wrapped here and propagate unchanged.
        """
        layer = "silver"

        # Guard clause: an unnamed step cannot be registered.
        if not name:
            raise StepError(
                "Silver step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": layer},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Duplicate detection lives in the base class; failures are
        # re-raised as StepError for a uniform exception type.
        try:
            self._check_duplicate_step_name(name, layer)
        except Exception as duplicate_error:
            raise StepError(
                str(duplicate_error),
                context={"step_name": name, "step_type": layer},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            ) from duplicate_error

        # Only an explicitly supplied schema is validated.
        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as schema_error:
                raise StepError(
                    str(schema_error),
                    context={
                        "step_name": name,
                        "step_type": layer,
                        "schema": schema,
                    },
                ) from schema_error

        # Convert rules first, then resolve the schema the step will use.
        rule_expressions = _convert_rules_to_expressions(rules, self.functions)
        target_schema = self._get_effective_schema(schema)

        # Validation-only step: nothing to transform, no bronze input, and
        # no watermark / incremental column.
        self.silver_steps[name] = SilverStep(
            name=name,
            source_bronze="",
            transform=None,
            rules=rule_expressions,
            table_name=table_name,
            watermark_col=None,
            existing=True,
            optional=optional,
            schema=target_schema,
            source_incremental_col=None,
        )

        # Remember when this step was added relative to the others.
        creation_index = self._creation_counter
        self._step_creation_order[name] = creation_index
        self._creation_counter = creation_index + 1

        self.logger.info(f"✅ Added Silver step (validation-only): {name}")
        return self

    def with_silver_sql_source(
        self,
        *,
        name: StepName,
        sql_source: Union[JdbcSource, SqlAlchemySource],
        table_name: TableName,
        rules: ColumnRules,
        schema: Optional[str] = None,
        description: Optional[str] = None,
        optional: bool = False,
    ) -> PipelineBuilder:
        """Add Silver layer step that reads from a SQL database (JDBC or SQLAlchemy).

        SQL alternative to with_silver_rules: the created SilverStep carries
        ``sql_source``, has no transform and no source bronze, and is flagged
        with ``existing=False``.

        Args:
            name: Unique identifier for this Silver step; must be truthy.
            sql_source: JdbcSource or SqlAlchemySource instance.
            table_name: Target table name (without schema); must be truthy.
            rules: Validation rules; must be a non-empty dict. Converted with
                ``_convert_rules_to_expressions`` before being stored.
            schema: Optional schema name. Validated by the base class when
                provided, then resolved through ``_get_effective_schema``.
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.
            optional: Forwarded to SilverStep as ``optional``.
                NOTE(review): intended to let the step tolerate a failed SQL
                read; that behaviour is implemented outside this method.

        Returns:
            Self for method chaining.

        Raises:
            StepError: If ``name`` or ``table_name`` is empty, ``sql_source``
                has the wrong type, ``rules`` is empty or not a dict, the
                duplicate-name check fails, or schema validation fails.
                Every error raised here carries ``step_name`` and
                ``step_type`` in its context.
        """
        if not name:
            raise StepError(
                "Silver step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": "silver"},
            )

        # Shared context so every failure below identifies the offending
        # step, matching with_bronze_sql_source and with_silver_rules.
        step_context = {"step_name": name, "step_type": "silver"}

        if not isinstance(sql_source, (JdbcSource, SqlAlchemySource)):
            raise StepError(
                "sql_source must be JdbcSource or SqlAlchemySource",
                context=dict(step_context),
            )
        if not rules or not isinstance(rules, dict):
            raise StepError(
                "Rules must be a non-empty dictionary",
                context=dict(step_context),
            )
        if not table_name:
            raise StepError(
                "table_name cannot be empty",
                context=dict(step_context),
            )

        # Base-class checks are re-raised as StepError with the original
        # exception chained as the cause.
        try:
            self._check_duplicate_step_name(name, "silver")
        except Exception as e:
            raise StepError(str(e), context=dict(step_context)) from e
        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as e:
                raise StepError(
                    str(e),
                    context={**step_context, "schema": schema},
                ) from e

        converted_rules = _convert_rules_to_expressions(rules, self.functions)
        effective_schema = self._get_effective_schema(schema)
        silver_step = SilverStep(
            name=name,
            source_bronze="",  # data comes from sql_source, not a bronze step
            transform=None,
            rules=converted_rules,
            table_name=table_name,
            existing=False,
            optional=optional,
            schema=effective_schema,
            sql_source=sql_source,
        )
        self.silver_steps[name] = silver_step
        # Track creation order for deterministic ordering
        self._step_creation_order[name] = self._creation_counter
        self._creation_counter += 1
        self.logger.info(f"✅ Added Silver step (SQL source): {name}")
        return self

    def with_gold_rules(
        self,
        *,
        name: StepName,
        table_name: TableName,
        rules: ColumnRules,
        description: Optional[str] = None,
        schema: Optional[str] = None,
        optional: bool = False,
    ) -> PipelineBuilder:
        """Register a validation-only Gold step for an existing table.

        The created GoldStep has no transform and no source silvers
        (``transform=None``, ``source_silvers=None``) and is flagged with
        ``existing=True``. It is stored in ``self.gold_steps`` under
        ``name``.

        Args:
            name: Identifier for the Gold step; must be truthy and must pass
                the base-class duplicate-name check.
            table_name: Table name (without schema) forwarded to GoldStep.
            rules: Column-to-rule mapping, converted with
                ``_convert_rules_to_expressions``. Both forms are accepted:

                - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                - String: {"user_id": ["not_null"], "count": ["gt", 0]}
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.
            schema: Optional schema name. Validated by the base class when
                provided, then resolved through ``_get_effective_schema``
                (which is also called when ``schema`` is None).
            optional: Forwarded to GoldStep as ``optional``.
                NOTE(review): intended to let the step tolerate a missing
                table; that behaviour is implemented outside this method.

        Returns:
            This builder, to allow method chaining.

        Raises:
            StepError: If ``name`` is empty, the duplicate-name check fails,
                or schema validation fails (the original exception is
                chained as the cause).

        Example:
            >>> builder.with_gold_rules(
            ...     name="validated_metrics",
            ...     table_name="metrics",
            ...     rules={"user_id": ["not_null"], "count": ["gt", 0]},
            ...     schema="analytics",
            ... )

        Note:
            Errors raised by rule conversion, schema resolution or GoldStep
            construction are not wrapped here and propagate unchanged.
        """
        layer = "gold"

        # Guard clause: an unnamed step cannot be registered.
        if not name:
            raise StepError(
                "Gold step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": layer},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Duplicate detection lives in the base class; failures are
        # re-raised as StepError for a uniform exception type.
        try:
            self._check_duplicate_step_name(name, layer)
        except Exception as duplicate_error:
            raise StepError(
                str(duplicate_error),
                context={"step_name": name, "step_type": layer},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            ) from duplicate_error

        # Only an explicitly supplied schema is validated.
        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as schema_error:
                raise StepError(
                    str(schema_error),
                    context={
                        "step_name": name,
                        "step_type": layer,
                        "schema": schema,
                    },
                ) from schema_error

        # Convert rules first, then resolve the schema the step will use.
        rule_expressions = _convert_rules_to_expressions(rules, self.functions)
        target_schema = self._get_effective_schema(schema)

        # Validation-only step: nothing to transform and no silver inputs.
        self.gold_steps[name] = GoldStep(
            name=name,
            transform=None,
            rules=rule_expressions,
            table_name=table_name,
            existing=True,
            optional=optional,
            schema=target_schema,
            source_silvers=None,
        )

        # Remember when this step was added relative to the others.
        creation_index = self._creation_counter
        self._step_creation_order[name] = creation_index
        self._creation_counter = creation_index + 1

        self.logger.info(f"✅ Added Gold step (validation-only): {name}")
        return self

    def with_gold_sql_source(
        self,
        *,
        name: StepName,
        sql_source: Union[JdbcSource, SqlAlchemySource],
        table_name: TableName,
        rules: ColumnRules,
        schema: Optional[str] = None,
        description: Optional[str] = None,
        optional: bool = False,
    ) -> PipelineBuilder:
        """Add Gold layer step that reads from a SQL database (JDBC or SQLAlchemy).

        SQL alternative to with_gold_rules: the created GoldStep carries
        ``sql_source``, has no transform and no source silvers, and is
        flagged with ``existing=False``.

        Args:
            name: Unique identifier for this Gold step; must be truthy.
            sql_source: JdbcSource or SqlAlchemySource instance.
            table_name: Target table name (without schema); must be truthy.
            rules: Validation rules; must be a non-empty dict. Converted with
                ``_convert_rules_to_expressions`` before being stored.
            schema: Optional schema name. Validated by the base class when
                provided, then resolved through ``_get_effective_schema``.
            description: Accepted for API compatibility; this method does
                not currently forward it to the step.
            optional: Forwarded to GoldStep as ``optional``.
                NOTE(review): intended to let the step tolerate a failed SQL
                read; that behaviour is implemented outside this method.

        Returns:
            Self for method chaining.

        Raises:
            StepError: If ``name`` or ``table_name`` is empty, ``sql_source``
                has the wrong type, ``rules`` is empty or not a dict, the
                duplicate-name check fails, or schema validation fails.
                Every error raised here carries ``step_name`` and
                ``step_type`` in its context.
        """
        if not name:
            raise StepError(
                "Gold step name cannot be empty",
                context={"step_name": name or "unknown", "step_type": "gold"},
            )

        # Shared context so every failure below identifies the offending
        # step, matching with_bronze_sql_source and with_gold_rules.
        step_context = {"step_name": name, "step_type": "gold"}

        if not isinstance(sql_source, (JdbcSource, SqlAlchemySource)):
            raise StepError(
                "sql_source must be JdbcSource or SqlAlchemySource",
                context=dict(step_context),
            )
        if not rules or not isinstance(rules, dict):
            raise StepError(
                "Rules must be a non-empty dictionary",
                context=dict(step_context),
            )
        if not table_name:
            raise StepError(
                "table_name cannot be empty",
                context=dict(step_context),
            )

        # Base-class checks are re-raised as StepError with the original
        # exception chained as the cause.
        try:
            self._check_duplicate_step_name(name, "gold")
        except Exception as e:
            raise StepError(str(e), context=dict(step_context)) from e
        if schema is not None:
            try:
                self._validate_schema(schema)
            except Exception as e:
                raise StepError(
                    str(e),
                    context={**step_context, "schema": schema},
                ) from e

        converted_rules = _convert_rules_to_expressions(rules, self.functions)
        effective_schema = self._get_effective_schema(schema)
        gold_step = GoldStep(
            name=name,
            transform=None,
            rules=converted_rules,
            table_name=table_name,
            existing=False,
            optional=optional,
            schema=effective_schema,
            source_silvers=None,  # data comes from sql_source, not silver steps
            sql_source=sql_source,
        )
        self.gold_steps[name] = gold_step
        # Track creation order for deterministic ordering
        self._step_creation_order[name] = self._creation_counter
        self._creation_counter += 1
        self.logger.info(f"✅ Added Gold step (SQL source): {name}")
        return self

    def add_validator(self, validator: Any) -> PipelineBuilder:
        """Register a custom step validator with the builder's Spark validator.

        The object is handed to ``self.spark_validator.add_validator``
        unchanged; this method performs no checks of its own.

        Args:
            validator: Custom validator object. The original documentation
                describes it as implementing the StepValidator protocol
                (a ``validate(step, context)`` method returning either a
                ValidationResult or a list of error messages); nothing in this
                method enforces that.

        Returns:
            This builder, so calls can be chained.

        Example:
            >>> class CustomValidator(StepValidator):
            ...     def validate(self, step, context):
            ...         if step.name == "special_step":
            ...             return ["Special validation failed"]
            ...         return []
            >>>
            >>> builder.add_validator(CustomValidator())
        """
        validator_registry = self.spark_validator
        validator_registry.add_validator(validator)
        return self

    def add_silver_transform(
        self,
        *,
        name: StepName,
        source_bronze: Optional[StepName] = None,
        transform: SilverTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        watermark_col: Optional[str] = None,
        description: Optional[str] = None,
        depends_on: Optional[list[StepName]] = None,
        source_silvers: Optional[list[StepName]] = None,
        schema: Optional[str] = None,
        schema_override: Optional[Any] = None,
    ) -> PipelineBuilder:
        """Register a Silver transformation step on top of a Bronze step.

        The method checks the step name, resolves the Bronze source, resolves
        the schema, converts the rules with ``_convert_rules_to_expressions``,
        builds a SilverStep and stores it in ``self.silver_steps`` together
        with its creation index. Silver-to-silver dependencies are not checked
        here; the original note defers that to validate_pipeline().

        Args:
            name: Unique identifier for this Silver step; must be truthy.
            source_bronze: Name of the Bronze step to read from. When None,
                the last key of ``self.bronze_steps`` is used and logged.
            transform: Transformation callable, forwarded to SilverStep
                unchanged. The example below uses the
                ``(spark, bronze_df, prior_silvers)`` calling convention.
            rules: Column rules, passed through
                ``_convert_rules_to_expressions`` before being stored. The
                original documentation allows both Column expressions and
                string rules such as ``{"user_id": ["not_null"]}``.
            table_name: Target table name, forwarded to SilverStep.
            watermark_col: Optional watermark column, forwarded to SilverStep.
            description: Accepted for API symmetry; not used by this method.
            depends_on: Deprecated alias of ``source_silvers``; only consulted
                when ``source_silvers`` is None.
            source_silvers: Optional list of Silver step names this step
                depends on, forwarded to SilverStep.
            schema: Optional schema name. When None the builder's
                ``self.config.schema`` is used; otherwise the value is checked
                with ``self._validate_schema``.
            schema_override: Optional schema object, forwarded to SilverStep.

        Returns:
            This builder, so calls can be chained.

        Raises:
            StepError: If the name is empty, the duplicate-name check fails,
                no Bronze step exists for auto-inference, or ``source_bronze``
                is not a registered Bronze step.
            Exception: Anything raised by ``self._validate_schema``, rule
                conversion or SilverStep construction propagates unchanged.

        Example:
            >>> def clean_user_events(spark, bronze_df, prior_silvers):
            ...     return bronze_df.filter(F.col("user_id").isNotNull())
            >>>
            >>> builder.add_silver_transform(
            ...     name="clean_events",
            ...     source_bronze="user_events",
            ...     transform=clean_user_events,
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     table_name="clean_events"
            ... )
        """
        # Inside this guard ``name`` is falsy, so the context always reports "unknown".
        if not name:
            raise StepError(
                "Silver step name cannot be empty",
                context={"step_name": "unknown", "step_type": "silver"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # At most one of the errors below is raised, so they can share one context.
        step_context = {"step_name": name, "step_type": "silver"}

        try:
            self._check_duplicate_step_name(name, "silver")
        except Exception as duplicate_error:
            # Re-raise as StepError so callers see one exception type.
            raise StepError(
                str(duplicate_error),
                context=step_context,
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            ) from duplicate_error

        if source_bronze is None:
            if not self.bronze_steps:
                raise StepError(
                    "No bronze steps available for auto-inference",
                    context=step_context,
                    suggestions=[
                        "Add a bronze step first using with_bronze_rules()",
                        "Explicitly specify source_bronze parameter",
                    ],
                )
            # Last key = most recently added step, assuming bronze_steps keeps
            # insertion order (as a dict does).
            source_bronze = list(self.bronze_steps)[-1]
            self.logger.info(f"🔍 Auto-inferred source_bronze: {source_bronze}")
        elif source_bronze not in self.bronze_steps:
            raise StepError(
                f"Bronze step '{source_bronze}' not found",
                context=step_context,
                suggestions=[
                    f"Available bronze steps: {list(self.bronze_steps.keys())}",
                    "Add the bronze step first using with_bronze_rules()",
                ],
            )

        # An explicit schema is validated; otherwise the builder default applies.
        if schema is not None:
            self._validate_schema(schema)
        else:
            schema = self.config.schema

        rule_expressions = _convert_rules_to_expressions(rules, self.functions)

        # Carry the Bronze step's incremental column (possibly None) onto the Silver step.
        bronze_incremental_col = self.bronze_steps[source_bronze].incremental_col

        # ``source_silvers`` wins; ``depends_on`` is only the legacy fallback.
        if source_silvers is not None:
            upstream_silvers = source_silvers
        else:
            upstream_silvers = depends_on

        new_step = SilverStep(
            name=name,
            source_bronze=source_bronze,
            transform=transform,
            rules=rule_expressions,
            table_name=table_name,
            watermark_col=watermark_col,
            schema=schema,
            source_incremental_col=bronze_incremental_col,
            schema_override=schema_override,
            source_silvers=upstream_silvers,
        )
        self.silver_steps[name] = new_step

        # Remember the position at which this step was added.
        self._step_creation_order[name] = self._creation_counter
        self._creation_counter += 1

        self.logger.info(f"✅ Added Silver step: {name} (source: {source_bronze})")
        return self

    def add_gold_transform(
        self,
        *,
        name: StepName,
        transform: GoldTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        source_silvers: Optional[list[StepName]] = None,
        description: Optional[str] = None,
        schema: Optional[str] = None,
        schema_override: Optional[Any] = None,
    ) -> PipelineBuilder:
        """Register a Gold transformation step that consumes Silver steps.

        The method checks the step name, resolves and verifies the Silver
        sources, resolves the schema, converts the rules with
        ``_convert_rules_to_expressions``, builds a GoldStep and stores it in
        ``self.gold_steps`` together with its creation index. Every name in
        ``source_silvers`` must already be registered when this method runs;
        any further dependency validation is expected to happen in
        validate_pipeline().

        Args:
            name: Unique identifier for this Gold step; must be truthy.
            transform: Transformation callable, forwarded to GoldStep
                unchanged. The example below uses the ``(spark, silvers)``
                calling convention.
            rules: Column rules, passed through
                ``_convert_rules_to_expressions`` before being stored. The
                original documentation allows both Column expressions and
                string rules such as ``{"count": ["gt", 0]}``.
            table_name: Target table name, forwarded to GoldStep.
            source_silvers: Names of the Silver steps this step reads. When
                None, all keys of ``self.silver_steps`` are used and logged.
            description: Accepted for API symmetry; not used by this method.
            schema: Optional schema name. When None the builder's
                ``self.config.schema`` is used; otherwise the value is checked
                with ``self._validate_schema``.
            schema_override: Optional schema object, forwarded to GoldStep.

        Returns:
            This builder, so calls can be chained.

        Raises:
            StepError: If the name is empty, the duplicate-name check fails,
                no Silver step exists for auto-inference, or a requested
                Silver step is not registered.
            Exception: Anything raised by ``self._validate_schema``, rule
                conversion or GoldStep construction propagates unchanged.

        Example:
            >>> def user_daily_metrics(spark, silvers):
            ...     events_df = silvers["clean_events"]
            ...     return (events_df
            ...         .groupBy("user_id", "event_date")
            ...         .agg(F.count("*").alias("event_count"))
            ...     )
            >>>
            >>> builder.add_gold_transform(
            ...     name="user_metrics",
            ...     transform=user_daily_metrics,
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     table_name="user_daily_metrics",
            ...     source_silvers=["clean_events"]
            ... )
        """
        # Inside this guard ``name`` is falsy, so the context always reports "unknown".
        if not name:
            raise StepError(
                "Gold step name cannot be empty",
                context={"step_name": "unknown", "step_type": "gold"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # At most one of the errors below is raised, so they can share one context.
        step_context = {"step_name": name, "step_type": "gold"}

        try:
            self._check_duplicate_step_name(name, "gold")
        except Exception as duplicate_error:
            # Re-raise as StepError so callers see one exception type.
            raise StepError(
                str(duplicate_error),
                context=step_context,
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            ) from duplicate_error

        if source_silvers is None:
            if not self.silver_steps:
                raise StepError(
                    "No silver steps available for auto-inference",
                    context=step_context,
                    suggestions=[
                        "Add a silver step first using add_silver_transform()",
                        "Explicitly specify source_silvers parameter",
                    ],
                )
            # Default to every Silver step registered so far.
            source_silvers = list(self.silver_steps.keys())
            self.logger.info(f"🔍 Auto-inferred source_silvers: {source_silvers}")

        # Collect unknown names in the order the caller listed them.
        missing_silvers = []
        for silver_name in source_silvers:
            if silver_name not in self.silver_steps:
                missing_silvers.append(silver_name)
        if missing_silvers:
            raise StepError(
                f"Silver steps not found: {missing_silvers}",
                context=step_context,
                suggestions=[
                    f"Available silver steps: {list(self.silver_steps.keys())}",
                    "Add the missing silver steps first using add_silver_transform()",
                ],
            )

        # An explicit schema is validated; otherwise the builder default applies.
        if schema is not None:
            self._validate_schema(schema)
        else:
            schema = self.config.schema

        rule_expressions = _convert_rules_to_expressions(rules, self.functions)

        new_step = GoldStep(
            name=name,
            transform=transform,
            rules=rule_expressions,
            table_name=table_name,
            source_silvers=source_silvers,
            schema=schema,
            schema_override=schema_override,
        )
        self.gold_steps[name] = new_step

        # Remember the position at which this step was added.
        self._step_creation_order[name] = self._creation_counter
        self._creation_counter += 1

        self.logger.info(f"✅ Added Gold step: {name} (sources: {source_silvers})")
        return self

    @staticmethod
    def _extract_errors_from_validator_result(
        result: Union[ValidationResult, List[str]],
        validator_name: str,
        logger: PipelineLogger,
    ) -> List[str]:
        """Normalize a validator's return value to a list of error strings.

        Validators used by this builder may hand back either a
        ValidationResult or a plain list of messages. This helper accepts
        both so the caller can concatenate the outcomes of several validators
        without caring which shape each one produced.

        Args:
            result: Value returned by a validator.
            validator_name: Label used in the error message when ``result``
                has an unsupported shape.
            logger: Logger that receives the error message before TypeError
                is raised.

        Returns:
            ``result.errors`` for a ValidationResult (returned as-is, its
            items are not inspected), or ``result`` itself when it is a list
            whose items are all strings.

        Raises:
            TypeError: If ``result`` is neither a ValidationResult nor a
                list, or if it is a list containing a non-string item.

        Example:
            >>> base_result = validator.validate_pipeline(...)
            >>> errors = PipelineBuilder._extract_errors_from_validator_result(
            ...     base_result, "base_validator", logger
            ... )
        """
        # Structured results already carry their own error list.
        if isinstance(result, ValidationResult):
            return result.errors

        # Reachable at runtime despite the type hint, so it is reported loudly.
        if not isinstance(result, list):
            error_msg = (
                f"Unexpected return type from {validator_name}: {type(result)}. "
                f"Expected ValidationResult or List[str]. Got: {result}"
            )
            logger.error(error_msg)
            raise TypeError(error_msg)

        # A plain list is only acceptable when every entry is a string.
        for entry in result:
            if isinstance(entry, str):
                continue
            error_msg = (
                f"Validator {validator_name} returned List with non-string items. "
                f"Expected List[str]. Got: {result}"
            )
            logger.error(error_msg)
            raise TypeError(error_msg)
        return result

    def validate_pipeline(self) -> List[str]:
        """Validate the pipeline configuration with both registered validators.

        ``self._base_validator`` runs first, then ``self.spark_validator``;
        each receives the config and the bronze, silver and gold step
        mappings. Both outcomes are normalized through
        ``_extract_errors_from_validator_result`` (which accepts either a
        ValidationResult or a list of strings) and concatenated, base errors
        first.

        Side effects:
            - No errors: logs success and calls
              ``self._calculate_execution_order()``.
            - Errors: logs the count and each message, then sets
              ``self.execution_order`` to None.

        Returns:
            The combined list of error messages; empty when the pipeline is
            valid.

        Raises:
            TypeError: Propagated from
                ``_extract_errors_from_validator_result`` when a validator
                returns an unsupported type.
        """
        pipeline_parts = (
            self.config,
            self.bronze_steps,
            self.silver_steps,
            self.gold_steps,
        )

        # Base validation first; its outcome is normalized before Spark validation runs.
        base_errors = self._extract_errors_from_validator_result(
            self._base_validator.validate_pipeline(*pipeline_parts),
            "base_validator",
            self.logger,
        )
        spark_errors = self._extract_errors_from_validator_result(
            self.spark_validator.validate_pipeline(*pipeline_parts),
            "spark_validator",
            self.logger,
        )

        all_errors = base_errors + spark_errors

        if not all_errors:
            self.logger.info("✅ Pipeline validation passed")
            # Execution order is only computed for a pipeline that validated cleanly.
            self._calculate_execution_order()
            return all_errors

        self.logger.error(f"Pipeline validation failed with {len(all_errors)} errors")
        for message in all_errors:
            self.logger.error(f"  - {message}")
        # Drop any previously computed order: it no longer describes a valid pipeline.
        self.execution_order = None
        return all_errors

    def _calculate_execution_order(self) -> None:
        """Compute the step execution order and store it on the builder.

        Copies of the bronze, silver and gold step mappings are passed to
        ``DependencyAnalyzer.analyze_dependencies`` together with
        ``self._step_creation_order``; the ``execution_order`` attribute of
        the returned analysis is stored in ``self.execution_order`` and
        logged. validate_pipeline() calls this after a clean validation.

        Any exception raised along the way is caught, reported as a warning,
        and leaves ``self.execution_order`` set to None, so this method never
        raises on a failed analysis.

        Note:
            The original documentation states that the creation order serves
            as a tie-breaker for steps without explicit dependencies; that
            logic lives in DependencyAnalyzer, not here.
        """
        try:
            # DependencyAnalyzer is defined in an earlier notebook cell.
            # The analyzer receives shallow copies of the step mappings.
            layer_steps = {
                "bronze_steps": dict(self.bronze_steps),
                "silver_steps": dict(self.silver_steps),
                "gold_steps": dict(self.gold_steps),
            }

            analysis = DependencyAnalyzer().analyze_dependencies(
                **layer_steps,  # type: ignore[arg-type]
                creation_order=self._step_creation_order,
            )

            ordered_steps = analysis.execution_order
            self.execution_order = ordered_steps

            if ordered_steps:
                step_chain = " → ".join(ordered_steps)
                self.logger.info(
                    f"📋 Execution order ({len(ordered_steps)} steps): {step_chain}"
                )
            else:
                self.logger.warning("Execution order is empty - no steps to execute")
        except Exception as analysis_error:
            # Best effort: a failed analysis must not break validation.
            self.logger.warning(
                f"Could not calculate execution order: {analysis_error}. "
                f"Execution order will not be available."
            )
            self.execution_order = None

    # ============================================================================
    # PRESET CONFIGURATIONS AND HELPER METHODS
    # ============================================================================

    @classmethod
    def for_development(
        cls,
        spark: SparkSession,
        schema: str,
        functions: Optional[FunctionsProtocol] = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """Build an instance with the development preset.

        Calls ``cls(...)`` with the given ``spark``, ``schema`` and
        ``functions`` plus these fixed settings:

        - min_bronze_rate: 80.0
        - min_silver_rate: 85.0
        - min_gold_rate: 90.0
        - verbose: True

        Args:
            spark: SparkSession forwarded to the constructor.
            schema: Schema name forwarded to the constructor.
            functions: Optional functions object forwarded to the constructor.
            **kwargs: Extra keyword arguments forwarded to the constructor.
                Repeating one of the preset keywords raises TypeError.

        Returns:
            The object returned by ``cls(...)``.

        Example:
            >>> builder = PipelineBuilder.for_development(
            ...     spark=spark,
            ...     schema="dev_schema"
            ... )
        """
        development_preset = {
            "min_bronze_rate": 80.0,
            "min_silver_rate": 85.0,
            "min_gold_rate": 90.0,
            "verbose": True,
        }
        # Two separate unpackings keep the TypeError for a keyword given twice.
        return cls(
            spark=spark,
            schema=schema,
            functions=functions,
            **development_preset,
            **kwargs,
        )

    @classmethod
    def for_production(
        cls,
        spark: SparkSession,
        schema: str,
        functions: Optional[FunctionsProtocol] = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """Build an instance with the production preset.

        Calls ``cls(...)`` with the given ``spark``, ``schema`` and
        ``functions`` plus these fixed settings:

        - min_bronze_rate: 95.0
        - min_silver_rate: 98.0
        - min_gold_rate: 99.0
        - verbose: False

        Args:
            spark: SparkSession forwarded to the constructor.
            schema: Schema name forwarded to the constructor.
            functions: Optional functions object forwarded to the constructor.
            **kwargs: Extra keyword arguments forwarded to the constructor.
                Repeating one of the preset keywords raises TypeError.

        Returns:
            The object returned by ``cls(...)``.

        Example:
            >>> builder = PipelineBuilder.for_production(
            ...     spark=spark,
            ...     schema="prod_schema"
            ... )
        """
        production_preset = {
            "min_bronze_rate": 95.0,
            "min_silver_rate": 98.0,
            "min_gold_rate": 99.0,
            "verbose": False,
        }
        # Two separate unpackings keep the TypeError for a keyword given twice.
        return cls(
            spark=spark,
            schema=schema,
            functions=functions,
            **production_preset,
            **kwargs,
        )

    @classmethod
    def for_testing(
        cls,
        spark: SparkSession,
        schema: str,
        functions: Optional[FunctionsProtocol] = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """Build an instance with the testing preset.

        Calls ``cls(...)`` with the given ``spark``, ``schema`` and
        ``functions`` plus these fixed settings:

        - min_bronze_rate: 70.0
        - min_silver_rate: 75.0
        - min_gold_rate: 80.0
        - verbose: True

        Args:
            spark: SparkSession forwarded to the constructor.
            schema: Schema name forwarded to the constructor.
            functions: Optional functions object forwarded to the constructor.
            **kwargs: Extra keyword arguments forwarded to the constructor.
                Repeating one of the preset keywords raises TypeError.

        Returns:
            The object returned by ``cls(...)``.

        Example:
            >>> builder = PipelineBuilder.for_testing(
            ...     spark=spark,
            ...     schema="test_schema"
            ... )
        """
        testing_preset = {
            "min_bronze_rate": 70.0,
            "min_silver_rate": 75.0,
            "min_gold_rate": 80.0,
            "verbose": True,
        }
        # Two separate unpackings keep the TypeError for a keyword given twice.
        return cls(
            spark=spark,
            schema=schema,
            functions=functions,
            **testing_preset,
            **kwargs,
        )

    # ============================================================================
    # VALIDATION HELPER METHODS
    # ============================================================================

    @staticmethod
    def not_null_rules(
        columns: list[str], functions: Optional[FunctionsProtocol] = None
    ) -> ColumnRules:
        """Build a not-null rule for each of the given columns.

        Args:
            columns: Column names to cover. A name listed more than once ends
                up with a single entry.
            functions: Object providing ``col(name)``. When None,
                ``get_default_functions()`` supplies it.

        Returns:
            Dictionary mapping each column name to a one-element list holding
            ``functions.col(name).isNotNull()``.

        Example:
            >>> rules = PipelineBuilder.not_null_rules(["user_id", "timestamp"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "user_id": [F.col("user_id").isNotNull()],
            >>> #     "timestamp": [F.col("timestamp").isNotNull()]
            >>> # }
        """
        column_api = get_default_functions() if functions is None else functions

        rules_by_column = {}
        for column_name in columns:
            rules_by_column[column_name] = [column_api.col(column_name).isNotNull()]
        return rules_by_column

    @staticmethod
    def positive_number_rules(
        columns: list[str], functions: Optional[FunctionsProtocol] = None
    ) -> ColumnRules:
        """Build not-null and greater-than-zero rules for each given column.

        Args:
            columns: Column names to cover. A name listed more than once ends
                up with a single entry.
            functions: Object providing ``col(name)``. When None,
                ``get_default_functions()`` supplies it.

        Returns:
            Dictionary mapping each column name to a two-element list:
            ``functions.col(name).isNotNull()`` followed by
            ``functions.col(name) > 0``.

        Example:
            >>> rules = PipelineBuilder.positive_number_rules(["value", "count"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "value": [F.col("value").isNotNull(), F.col("value") > 0],
            >>> #     "count": [F.col("count").isNotNull(), F.col("count") > 0]
            >>> # }
        """
        column_api = get_default_functions() if functions is None else functions

        rules_by_column = {}
        for column_name in columns:
            is_present = column_api.col(column_name).isNotNull()
            is_positive = column_api.col(column_name) > 0
            rules_by_column[column_name] = [is_present, is_positive]
        return rules_by_column

    @staticmethod
    def string_not_empty_rules(
        columns: list[str], functions: Optional[FunctionsProtocol] = None
    ) -> ColumnRules:
        """Create validation rules for non-empty string constraints on multiple columns.

        Helper method to quickly create validation rules requiring string columns
        to be non-null and have length greater than zero. Useful for name, category,
        and other string identifier columns.

        Args:
            columns: List of column names to validate for non-empty strings.
            functions: Optional FunctionsProtocol instance for column operations.
                If None, uses get_default_functions().

        Returns:
            Dictionary mapping column names to lists of validation rules.
            Each column gets two rules: isNotNull() and length() > 0.

        Example:
            >>> rules = PipelineBuilder.string_not_empty_rules(["name", "category"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "name": [F.col("name").isNotNull(), F.length(F.col("name")) > 0],
            >>> #     "category": [F.col("category").isNotNull(), F.length(F.col("category")) > 0]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {
            col: [
                functions.col(col).isNotNull(),
                functions.length(functions.col(col)) > 0,
            ]
            for col in columns
        }

    @staticmethod
    def timestamp_rules(
        columns: list[str], functions: Optional[FunctionsProtocol] = None
    ) -> ColumnRules:
        """Create validation rules for timestamp constraints on multiple columns.

        Helper method to quickly create validation rules requiring timestamp
        columns to be non-null. Useful for created_at, updated_at, and other
        timestamp columns.

        Args:
            columns: List of column names to validate as timestamps.
            functions: Optional FunctionsProtocol instance for column operations.
                If None, uses get_default_functions().

        Returns:
            Dictionary mapping column names to lists of validation rules.
            Each column gets a single rule: isNotNull() (applied twice in
            current implementation - may be simplified in future).

        Example:
            >>> rules = PipelineBuilder.timestamp_rules(["created_at", "updated_at"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "created_at": [F.col("created_at").isNotNull()],
            >>> #     "updated_at": [F.col("updated_at").isNotNull()]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {
            col: [functions.col(col).isNotNull(), functions.col(col).isNotNull()]
            for col in columns
        }

    @staticmethod
    def detect_timestamp_columns(df_schema: Any) -> list[str]:
        """Detect timestamp columns from a DataFrame schema.

        Analyzes column names to identify potential timestamp columns based
        on common naming patterns. Useful for automatically configuring
        incremental processing.

        Args:
            df_schema: DataFrame schema (StructType) or list of column names
                with types. Can also be a simple list of column name strings.

        Returns:
            List of column names that match timestamp naming patterns.
            Searches for keywords like "timestamp", "created_at", "updated_at",
            etc. in column names (case-insensitive).

        Example:
            >>> timestamp_cols = PipelineBuilder.detect_timestamp_columns(df.schema)
            >>> # Returns columns like ["timestamp", "created_at", "updated_at"]
            >>> # if they exist in the schema

        Note:
            Searches for these keywords in column names (case-insensitive):
            - timestamp, created_at, updated_at, event_time, process_time,
              ingestion_time, load_time, modified_at, date_time, ts
        """
        timestamp_keywords = [
            "timestamp",
            "created_at",
            "updated_at",
            "event_time",
            "process_time",
            "ingestion_time",
            "load_time",
            "modified_at",
            "date_time",
            "ts",
        ]

        if hasattr(df_schema, "fields"):
            # DataFrame schema
            columns = [field.name.lower() for field in df_schema.fields]
        else:
            # List of column names
            columns = [col.lower() for col in df_schema]

        # Find columns that match timestamp patterns
        timestamp_cols = []
        for col in columns:
            if any(keyword in col for keyword in timestamp_keywords):
                timestamp_cols.append(col)

        return timestamp_cols

    def _validate_schema(self, schema: str) -> None:
        """Validate that a schema exists and is accessible.

        Overrides the base class method to add Spark-specific schema validation.
        First validates schema name format, then checks if schema exists in
        Spark catalog.

        Args:
            schema: Schema name to validate.

        Raises:
            StepError: If schema doesn't exist, is not accessible, or name
                format is invalid.
            ValidationError: If schema name format validation fails.

        Note:
            Uses base validator for format validation, then checks Spark
            catalog for existence. Provides helpful suggestions if schema
            doesn't exist.
        """
        # First validate schema name format using base validator
        try:
            errors = self._base_validator.validate_schema(schema)
            if errors:
                raise ValidationError(errors[0])
        except ValidationError:
            raise
        except Exception as e:
            # Convert to StepError for consistency
            raise StepError(
                str(e),
                context={"step_name": "schema_validation", "step_type": "validation"},
            ) from e

        # Then check if schema exists in Spark catalog
        try:
            databases = [db.name for db in self.spark.catalog.listDatabases()]
            if schema not in databases:
                raise StepError(
                    f"Schema '{schema}' does not exist",
                    context={
                        "step_name": "schema_validation",
                        "step_type": "validation",
                    },
                    suggestions=[
                        f"Create the schema first: CREATE SCHEMA IF NOT EXISTS {schema}",
                        "Check schema permissions",
                        "Verify schema name spelling",
                    ],
                )
            self.logger.debug(f"✅ Schema '{schema}' is accessible")
        except StepError:
            # Re-raise StepError as-is
            raise
        except Exception as e:
            raise StepError(
                f"Schema '{schema}' is not accessible: {str(e)}",
                context={"step_name": "schema_validation", "step_type": "validation"},
                suggestions=[
                    f"Create the schema first: CREATE SCHEMA IF NOT EXISTS {schema}",
                    "Check schema permissions",
                    "Verify schema name spelling",
                ],
            ) from e

    def _create_schema_if_not_exists(self, schema: str) -> None:
        """Create a schema if it doesn't exist.

        Uses SQL CREATE SCHEMA IF NOT EXISTS to create the schema idempotently.
        Logs success or failure.

        Args:
            schema: Schema name to create.

        Raises:
            StepError: If schema creation fails.

        Note:
            Uses CREATE SCHEMA IF NOT EXISTS for idempotent operation.
            Errors are wrapped in StepError with helpful suggestions.
        """
        try:
            # Use SQL to create schema
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            self.logger.info(f"✅ Schema '{schema}' created or already exists")
        except Exception as e:
            raise StepError(
                f"Failed to create schema '{schema}': {str(e)}",
                context={"step_name": "schema_creation", "step_type": "validation"},
                suggestions=[
                    "Check schema permissions",
                    "Verify schema name is valid",
                    "Check for naming conflicts",
                ],
            ) from e

    def _get_effective_schema(self, step_schema: Optional[str]) -> str:
        """Get the effective schema for a step.

        Returns the step-specific schema if provided, otherwise falls back to
        the builder's default schema.

        Args:
            step_schema: Optional schema name specified for the step.

        Returns:
            The effective schema name (step_schema if provided, otherwise
            self.schema).
        """
        return step_schema if step_schema is not None else self.schema

    def to_pipeline(self) -> PipelineRunner:
        """Build and return a PipelineRunner for executing this pipeline.

        Validates the pipeline configuration, then creates a PipelineRunner
        instance ready for execution. The runner implements the abstracts.Runner
        interface and can execute the pipeline using various execution modes.

        Args:
            None (uses instance state).

        Returns:
            PipelineRunner instance ready for execution. Implements
            abstracts.Runner interface.

        Raises:
            ValueError: If pipeline validation fails or step validation fails.

        Example:
            >>> builder = PipelineBuilder(spark=spark, schema="analytics")
            >>> builder.with_bronze_rules(name="events", rules={"id": ["not_null"]})
            >>> builder.add_silver_transform(
            ...     name="clean_events",
            ...     transform=lambda spark, df, silvers: df.filter(F.col("status") == "active"),
            ...     rules={"status": ["not_null"]},
            ...     table_name="clean_events"
            ... )
            >>> pipeline = builder.to_pipeline()
            >>> result = pipeline.run_initial_load(bronze_sources={"events": source_df})

        Note:
            The pipeline is validated before building. All steps are validated
            using the abstracts.PipelineBuilder validation to ensure interface
            compatibility.
        """
        # Validate pipeline before building
        validation_errors = self.validate_pipeline()
        if validation_errors:
            raise ValueError(
                f"Pipeline validation failed with {len(validation_errors)} errors: {', '.join(validation_errors)}"
            )

        # Check that validation-only (with_silver_rules / with_gold_rules) target tables exist
        # when optional=False; fail early at build time instead of at run time.
        missing_tables: list[str] = []
        for step in self.silver_steps.values():
            if (
                getattr(step, "existing", False)
                and step.transform is None
                and not getattr(step, "optional", False)
            ):
                schema = getattr(step, "schema", None) or self.config.schema
                table_name = getattr(step, "table_name", step.name)
                table_fqn = fqn(schema, table_name)
                if not table_exists(self.spark, table_fqn):
                    missing_tables.append(
                        f"Silver step '{step.name}' requires existing table '{table_fqn}' (optional=False)"
                    )
        for gold_step in self.gold_steps.values():
            if (
                getattr(gold_step, "existing", False)
                and gold_step.transform is None
                and not getattr(gold_step, "optional", False)
            ):
                schema = getattr(gold_step, "schema", None) or self.config.schema
                table_name = getattr(gold_step, "table_name", gold_step.name)
                table_fqn = fqn(schema, table_name)
                if not table_exists(self.spark, table_fqn):
                    missing_tables.append(
                        f"Gold step '{gold_step.name}' requires existing table '{table_fqn}' (optional=False)"
                    )
        if missing_tables:
            raise ValueError(
                "Validation-only step target table(s) do not exist. "
                "Create the table(s) or use optional=True for those steps: "
                + "; ".join(missing_tables)
            )

        # Build steps list for abstracts.PipelineBuilder validation
        all_steps = (
            list(self.bronze_steps.values())
            + list(self.silver_steps.values())
            + list(self.gold_steps.values())
        )

        # Use abstracts.PipelineBuilder to validate steps
        # This ensures step validation follows the abstracts interface
        # Type cast needed because BronzeStep/SilverStep/GoldStep satisfy Step Protocol
        try:
# from .step import Step as AbstractsStep  # Removed: defined in notebook cells above

            # Type ignore needed because BronzeStep/SilverStep/GoldStep satisfy Step Protocol
            steps_for_validation: list[AbstractsStep] = all_steps  # type: ignore[assignment]
            self._abstracts_builder.validate_steps(steps_for_validation)
        except ValueError as e:
            raise ValueError(f"Step validation failed: {e}") from e

        # Create PipelineRunner with proper configuration
        # PipelineRunner implements abstracts.Runner, so this satisfies the interface
        # Note: steps and engine are optional parameters for abstracts compatibility
        # but we pass them to ensure the runner is properly initialized
        runner = PipelineRunner(
            spark=self.spark,
            config=self.config,
            bronze_steps=self.bronze_steps,
            silver_steps=self.silver_steps,
            gold_steps=self.gold_steps,
            logger=self.logger,
            functions=self.functions,
            steps=all_steps
            if all_steps
            else None,  # Pass steps for abstracts.Runner compatibility
            engine=self.spark_engine,  # Pass engine for abstracts.Runner compatibility
            execution_order=self.execution_order,  # Match to_pipeline() reported order at run time
        )

        self.logger.info(
            f"🚀 Pipeline built successfully with {len(self.bronze_steps)} bronze, {len(self.silver_steps)} silver, {len(self.gold_steps)} gold steps"
        )

        return runner

In [None]:
# Module: pipeline_builder.pipeline.runner (pipeline_builder)
#
# Dependencies: abstracts.reports.run, abstracts.runner, abstracts.source, models.pipeline, models.steps, pipeline.models, pipeline_builder.compat, pipeline_builder.compat, pipeline_builder.compat_helpers, pipeline_builder.execution, pipeline_builder.execution, pipeline_builder.functions, pipeline_builder.functions, pipeline_builder.models, pipeline_builder.pipeline.models, pipeline_builder.sql_source, pipeline_builder_base.dependencies, pipeline_builder_base.logging, pipeline_builder_base.logging, pipeline_builder_base.models, pipeline_builder_base.runner

"""
Simplified pipeline runner for the framework.

This module provides a clean, focused pipeline runner that delegates
execution to the simplified execution engine.

"""

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Optional, Union, cast
# from .reports.run import Report  # Removed: defined in notebook cells above
# from .runner import Runner  # Removed: defined in notebook cells above
# from .source import Source  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
    # ExecutionMode,
    # PipelineConfig,
    # PipelineMetrics,
# )
# from .runner import BaseRunner  # Removed: defined in notebook cells above

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
from pyspark.sql import functions as F  # F from pyspark (not from compat)
# from ..execution import ExecutionEngine  # Removed: defined in notebook cells above
# from ..execution import ExecutionResult as SparkExecutionResult  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .models import PipelineMode, PipelineReport, PipelineStatus  # Removed: defined in notebook cells above

class SimplePipelineRunner(BaseRunner, Runner):
    """
    Simplified pipeline runner that delegates to the execution engine.

    This runner focuses on orchestration and reporting, delegating
    actual execution to the simplified ExecutionEngine.

    Implements abstracts.Runner interface while maintaining backward compatibility
    with additional methods (run_full_refresh, run_validation_only).
    """

    def __init__(
        self,
        spark: SparkSession,  # type: ignore[valid-type]
        config: PipelineConfig,
        bronze_steps: Optional[Dict[str, BronzeStep]] = None,
        silver_steps: Optional[Dict[str, SilverStep]] = None,
        gold_steps: Optional[Dict[str, GoldStep]] = None,
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
        # Abstracts.Runner compatibility - these will be set if using abstracts interface
        steps: Optional[list[Union[BronzeStep, SilverStep, GoldStep]]] = None,
        engine: Optional[
            Any
        ] = None,  # Engine from abstracts, but we use ExecutionEngine
        execution_order: Optional[list[str]] = None,
    ):
        """
        Set up the runner's step registries and its execution engine.

        Args:
            spark: Active SparkSession instance
            config: Pipeline configuration
            bronze_steps: Bronze steps keyed by name (a falsy value becomes a new dict)
            silver_steps: Silver steps keyed by name (a falsy value becomes a new dict)
            gold_steps: Gold steps keyed by name (a falsy value becomes a new dict)
            logger: Optional logger instance, forwarded to BaseRunner
            functions: Optional functions object for PySpark operations
            steps: Optional list of steps (abstracts.Runner compatibility); each
                one is filed into the bronze/silver/gold registry matching its
                ``step_type.value``
            engine: Optional engine handed to ``Runner.__init__``; a
                ``_DummyEngine`` is used when falsy. Execution itself goes
                through an ExecutionEngine built here, not through this object.
            execution_order: Optional pre-computed step order from the builder,
                later passed to the execution engine on each run.
        """
        # BaseRunner first: it receives the config and logger
        super().__init__(config, logger=logger)

        # abstracts.Runner is initialised with an empty step list because this
        # class keeps its own per-layer registries.
        fallback_engine: Any = _DummyEngine()
        Runner.__init__(self, steps=[], engine=engine or fallback_engine)

        self.spark = spark
        self.bronze_steps = bronze_steps or {}
        self.silver_steps = silver_steps or {}
        self.gold_steps = gold_steps or {}
        self.functions = functions
        self.execution_engine = ExecutionEngine(spark, config, self.logger, functions)
        self.execution_order = execution_order

        # Steps supplied through the abstracts interface are filed by layer;
        # steps of any other type are left out.
        if steps:
            registries = {
                "bronze": self.bronze_steps,
                "silver": self.silver_steps,
                "gold": self.gold_steps,
            }
            for step in steps:
                registry = registries.get(step.step_type.value)
                if registry is not None:
                    registry[step.name] = step  # type: ignore[assignment]

    def run_pipeline(
        self,
        steps: list[Union[BronzeStep, SilverStep, GoldStep]],
        mode: PipelineMode = PipelineMode.INITIAL,
        bronze_sources: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
    ) -> PipelineReport:
        """
        Execute the given steps and summarise the outcome in a report.

        Failures are not propagated: any exception raised while preparing or
        executing the pipeline is logged and converted into an error report.

        Args:
            steps: List of pipeline steps to execute
            mode: Pipeline execution mode
            bronze_sources: Optional mapping of bronze step name to source data

        Returns:
            PipelineReport with execution results, or an error report when
            execution raised.
        """
        start_time = datetime.now()
        pipeline_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # The engine works with ExecutionMode rather than PipelineMode
        execution_mode = self._convert_mode(mode)

        try:
            self.logger.info(f"Starting pipeline execution: {pipeline_id}")

            # Seed the context with caller-supplied data for matching bronze steps
            if bronze_sources:
                context = {
                    step.name: bronze_sources[step.name]
                    for step in steps
                    if step.step_type.value == "bronze" and step.name in bronze_sources
                }
            else:
                context = {}

            # Bronze steps still without data are read from their SQL source,
            # when they declare one.
            for step in steps:
                if step.step_type.value != "bronze" or step.name in context:
                    continue
                sql_src = getattr(step, "sql_source", None)
                if sql_src is not None:
                    context[step.name] = read_sql_source(sql_src, self.spark)

            execution_result = self.execution_engine.execute_pipeline(
                steps,
                execution_mode,
                context=context,
                execution_order=self.execution_order,
            )

            report = self._create_spark_pipeline_report(
                pipeline_id=pipeline_id,
                mode=mode,
                start_time=start_time,
                execution_result=execution_result,
            )

            self.logger.info(f"Completed pipeline execution: {pipeline_id}")
            return report

        except Exception as exc:
            self.logger.error(f"Pipeline execution failed: {exc}")
            return self._create_error_report(
                pipeline_id=pipeline_id,
                mode=mode,
                start_time=start_time,
                error=str(exc),
            )

    def run_initial_load(
        self,
        bronze_sources: Union[Optional[Dict[str, Source]], list] = None,
        steps: Optional[
            list
        ] = None,  # Backward compatibility: old signature accepted steps as first arg
    ) -> Report:  # PipelineReport satisfies Report Protocol
        """
        Run the pipeline in INITIAL mode.

        Implements abstracts.Runner.run_initial_load and also accepts the older
        call form in which the first positional argument is a list of steps.

        Args:
            bronze_sources: Mapping of bronze step name to a DataFrame-like
                source, or None. A list passed here is treated as ``steps``.
            steps: Optional list of steps; when None, all stored bronze,
                silver and gold steps are used (in that order).

        Returns:
            The report produced by ``run_pipeline``.

        Raises:
            TypeError: If a value in ``bronze_sources`` is not DataFrame-like.
        """
        # Old signature: run_initial_load([steps])
        if isinstance(bronze_sources, list):
            steps, bronze_sources = bronze_sources, None

        # Every supplied source must be DataFrame-like
        bronze_sources_df: Optional[Dict[str, DataFrame]] = None  # type: ignore[valid-type]
        if bronze_sources:
            bronze_sources_df = {}
            for source_name, source in bronze_sources.items():
                if is_dataframe_like(source):
                    bronze_sources_df[source_name] = cast(DataFrame, source)
                    continue
                raise TypeError(
                    f"bronze_sources must contain DataFrame-like objects, got {type(source)}"
                )

        # Fall back to every stored step when none were given
        if steps is None:
            steps = [
                *self.bronze_steps.values(),
                *self.silver_steps.values(),
                *self.gold_steps.values(),
            ]

        # PipelineReport satisfies Report Protocol structurally
        return self.run_pipeline(steps, PipelineMode.INITIAL, bronze_sources_df)  # type: ignore[return-value]

    def run_incremental(
        self,
        bronze_sources: Union[Optional[Dict[str, Source]], list] = None,
        steps: Optional[
            list
        ] = None,  # Backward compatibility: old signature accepted steps as first arg
    ) -> Report:  # PipelineReport satisfies Report Protocol
        """
        Run the pipeline in INCREMENTAL mode.

        Implements abstracts.Runner.run_incremental and also accepts the older
        call form in which the first positional argument is a list of steps.

        Args:
            bronze_sources: Mapping of bronze step name to a DataFrame-like
                source, or None. A list passed here is treated as ``steps``.
            steps: Optional list of steps; when None, all stored bronze,
                silver and gold steps are used (in that order).

        Returns:
            Report (PipelineReport) produced by ``run_pipeline``.

        Raises:
            TypeError: If a value in ``bronze_sources`` is not DataFrame-like.
        """
        # Old signature: run_incremental([steps])
        if isinstance(bronze_sources, list):
            steps, bronze_sources = bronze_sources, None

        # Every supplied source must be DataFrame-like
        bronze_sources_df: Optional[Dict[str, DataFrame]] = None  # type: ignore[valid-type]
        if bronze_sources:
            bronze_sources_df = {}
            for source_name, source in bronze_sources.items():
                if is_dataframe_like(source):
                    bronze_sources_df[source_name] = cast(DataFrame, source)
                    continue
                raise TypeError(
                    f"bronze_sources must contain DataFrame-like objects, got {type(source)}"
                )

        # Fall back to every stored step when none were given
        if steps is None:
            steps = [
                *self.bronze_steps.values(),
                *self.silver_steps.values(),
                *self.gold_steps.values(),
            ]

        # PipelineReport satisfies Report Protocol structurally
        return self.run_pipeline(steps, PipelineMode.INCREMENTAL, bronze_sources_df)  # type: ignore[return-value]

    def run_full_refresh(
        self,
        bronze_sources: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
    ) -> PipelineReport:
        """
        Run every stored step in FULL_REFRESH mode.

        Args:
            bronze_sources: Optional mapping of bronze step name to DataFrame;
                passed to ``run_pipeline`` unchanged.

        Returns:
            PipelineReport produced by ``run_pipeline``.
        """
        # Stored steps, ordered bronze -> silver -> gold
        stored_steps = [
            *self.bronze_steps.values(),
            *self.silver_steps.values(),
            *self.gold_steps.values(),
        ]
        return self.run_pipeline(stored_steps, PipelineMode.FULL_REFRESH, bronze_sources)

    def run_validation_only(
        self,
        bronze_sources: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
    ) -> PipelineReport:
        """
        Run every stored step in VALIDATION_ONLY mode.

        Args:
            bronze_sources: Optional mapping of bronze step name to DataFrame;
                passed to ``run_pipeline`` unchanged.

        Returns:
            PipelineReport produced by ``run_pipeline``.
        """
        # Stored steps, ordered bronze -> silver -> gold
        stored_steps = [
            *self.bronze_steps.values(),
            *self.silver_steps.values(),
            *self.gold_steps.values(),
        ]
        return self.run_pipeline(stored_steps, PipelineMode.VALIDATION_ONLY, bronze_sources)

    def _get_all_steps(
        self, steps: Optional[list[Union[BronzeStep, SilverStep, GoldStep]]] = None
    ) -> list[Union[BronzeStep, SilverStep, GoldStep]]:
        """Get all steps from stored dictionaries or provided list.

        Args:
            steps: Optional list of steps. If None, returns all stored steps.

        Returns:
            List of all steps (bronze, silver, gold).
        """
        if steps is not None:
            return steps
        return (
            list(self.bronze_steps.values())
            + list(self.silver_steps.values())
            + list(self.gold_steps.values())
        )

    def run_until(
        self,
        step_name: str,
        steps: Optional[list[Union[BronzeStep, SilverStep, GoldStep]]] = None,
        mode: PipelineMode = PipelineMode.INITIAL,
        bronze_sources: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
        step_params: Optional[Dict[str, Dict[str, Any]]] = None,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:  # type: ignore[valid-type]
        """Run pipeline until a specific step completes (inclusive).

        The steps are handed to the execution engine with
        ``stop_after_step=step_name``. Useful for debugging or partial
        pipeline execution. Failures are not propagated: any exception is
        logged and reported through an error report.

        Args:
            step_name: Name of the step to stop after (inclusive).
            steps: Optional list of steps. If None, uses all stored steps.
            mode: Pipeline execution mode.
            bronze_sources: Optional bronze source data; entries whose key
                matches a bronze step name seed the context.
            step_params: Optional dictionary mapping step names to parameter
                dictionaries for transform functions.
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context). ``context`` is the dictionary
            passed to the execution engine, seeded with the matching bronze
            sources. On failure the report is an error report and the context
            holds whatever had been collected up to that point.
            NOTE(review): step outputs appear in ``context`` only if the
            engine adds them in place - confirm against ExecutionEngine.

        Example:
            >>> report, context = runner.run_until("clean_events")
            >>> # Now you can inspect context or continue execution
        """
        all_steps = self._get_all_steps(steps)
        start_time = datetime.now()
        pipeline_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        execution_mode = self._convert_mode(mode)

        # Bound before the try block so the except path can always return it.
        # If it were first assigned inside the try, a failure ahead of that
        # assignment would surface as UnboundLocalError from the handler.
        context: Dict[str, DataFrame] = {}  # type: ignore[valid-type]

        try:
            self.logger.info(
                f"Starting pipeline execution until step '{step_name}': {pipeline_id}"
            )

            # Seed the context with caller-supplied data for matching bronze steps
            if bronze_sources:
                for step in all_steps:
                    if step.step_type.value == "bronze" and step.name in bronze_sources:
                        context[step.name] = bronze_sources[step.name]

            # Execute pipeline with stop_after_step
            result = self.execution_engine.execute_pipeline(
                all_steps,
                execution_mode,
                context=context,
                step_params=step_params,
                stop_after_step=step_name,
                write_outputs=write_outputs,
                execution_order=self.execution_order,
            )

            # Create report
            report = self._create_spark_pipeline_report(
                pipeline_id=pipeline_id,
                mode=mode,
                start_time=start_time,
                execution_result=result,
            )

            self.logger.info(
                f"Completed pipeline execution until step '{step_name}': {pipeline_id}"
            )
            return report, context

        except Exception as e:
            self.logger.error(f"Pipeline execution failed: {e}")
            error_report = self._create_error_report(
                pipeline_id=pipeline_id, mode=mode, start_time=start_time, error=str(e)
            )
            return error_report, context

    def run_step(
        self,
        step_name: str,
        steps: Optional[list[Union[BronzeStep, SilverStep, GoldStep]]] = None,
        mode: PipelineMode = PipelineMode.INITIAL,
        context: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
        step_params: Optional[Dict[str, Dict[str, Any]]] = None,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:  # type: ignore[valid-type]
        """Execute exactly one step of the pipeline.

        The execution engine is asked to both start at and stop after
        ``step_name``, so no other step is requested. Handy when debugging a
        single step in isolation.

        Args:
            step_name: Name of the step to execute.
            steps: Optional list of steps. If None, uses all stored steps.
            mode: Pipeline execution mode.
            context: Optional execution context handed to the engine. A new
                empty dict is created when None is given; a supplied dict is
                passed through as-is (not copied).
            step_params: Optional dictionary mapping step names to parameter
                dictionaries for transform functions.
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context dictionary). Any exception is
            converted into an error report instead of being raised.

        Example:
            >>> report, context = runner.run_step("clean_events", context=context)
            >>> # Step executed, context updated with output
        """
        candidate_steps = self._get_all_steps(steps)
        started_at = datetime.now()
        run_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        engine_mode = self._convert_mode(mode)
        context = {} if context is None else context

        try:
            self.logger.info(f"Starting single step execution '{step_name}': {run_id}")

            # Restrict the engine to this one step: start and stop on the same name
            execute = self.execution_engine.execute_pipeline
            engine_options = dict(
                context=context,
                step_params=step_params,
                start_at_step=step_name,
                stop_after_step=step_name,
                write_outputs=write_outputs,
                execution_order=self.execution_order,
            )
            outcome = execute(candidate_steps, engine_mode, **engine_options)

            # Summarize the engine result for the caller
            summary = self._create_spark_pipeline_report(
                pipeline_id=run_id,
                mode=mode,
                start_time=started_at,
                execution_result=outcome,
            )

            self.logger.info(f"Completed step execution '{step_name}': {run_id}")
            return summary, context

        except Exception as exc:
            self.logger.error(f"Step execution failed: {exc}")
            failure = self._create_error_report(
                pipeline_id=run_id, mode=mode, start_time=started_at, error=str(exc)
            )
            return failure, context

    def rerun_step(
        self,
        step_name: str,
        steps: Optional[list[Union[BronzeStep, SilverStep, GoldStep]]] = None,
        mode: PipelineMode = PipelineMode.INITIAL,
        context: Optional[Dict[str, DataFrame]] = None,  # type: ignore[valid-type]
        step_params: Optional[Dict[str, Dict[str, Any]]] = None,
        invalidate_downstream: bool = True,
        write_outputs: bool = True,
    ) -> tuple[PipelineReport, Dict[str, DataFrame]]:  # type: ignore[valid-type]
        """Execute a single step again, optionally clearing later outputs first.

        When ``invalidate_downstream`` is set, a fresh dependency analysis is
        run over the given steps and every step that comes *after*
        ``step_name`` in the resulting execution order is dropped from the
        context before the step is executed. Useful for debugging and
        iterative refinement.

        Args:
            step_name: Name of the step to rerun.
            steps: Optional list of steps. If None, uses all stored steps.
            mode: Pipeline execution mode.
            context: Optional execution context. If None, empty dict is used.
                A supplied dict is modified in place.
            step_params: Optional dictionary mapping step names to parameter
                dictionaries for transform functions; passed through to the
                execution engine.
            invalidate_downstream: If True, remove the outputs of all steps
                ordered after ``step_name`` from the context. Defaults to True.
            write_outputs: If True, write outputs to tables. If False, skip writes.

        Returns:
            Tuple of (PipelineReport, context dictionary). Exceptions raised
            while executing the step are converted into an error report; the
            dependency analysis itself runs outside that protection.

        Example:
            >>> # First run
            >>> report, context = runner.run_step("clean_events")
            >>> # Rerun with different parameters
            >>> report2, context = runner.rerun_step(
            ...     "clean_events",
            ...     context=context,
            ...     step_params={"clean_events": {"filter_threshold": 0.9}}
            ... )
        """
        candidate_steps = self._get_all_steps(steps)
        started_at = datetime.now()
        run_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        engine_mode = self._convert_mode(mode)
        context = {} if context is None else context

        if invalidate_downstream:
            # DependencyAnalyzer is defined in an earlier notebook cell, so no
            # import is needed here.

            # Group the steps by layer, each group keyed by step name
            layered = {
                layer: {
                    s.name: s for s in candidate_steps if s.step_type.value == layer
                }
                for layer in ("bronze", "silver", "gold")
            }
            analysis = DependencyAnalyzer().analyze_dependencies(
                bronze_steps=layered["bronze"],
                silver_steps=layered["silver"],
                gold_steps=layered["gold"],
            )

            # Everything ordered after step_name is treated as downstream
            ordered_names = analysis.execution_order
            if step_name in ordered_names:
                cutoff = ordered_names.index(step_name) + 1
                for later_name in ordered_names[cutoff:]:
                    if later_name not in context:
                        continue
                    del context[later_name]
                    self.logger.debug(
                        f"Removed downstream step '{later_name}' from context"
                    )

        try:
            self.logger.info(f"Rerunning step '{step_name}': {run_id}")

            # Restrict the engine to this one step: start and stop on the same name
            execute = self.execution_engine.execute_pipeline
            engine_options = dict(
                context=context,
                step_params=step_params,
                start_at_step=step_name,
                stop_after_step=step_name,
                write_outputs=write_outputs,
                execution_order=self.execution_order,
            )
            outcome = execute(candidate_steps, engine_mode, **engine_options)

            # Summarize the engine result for the caller
            summary = self._create_spark_pipeline_report(
                pipeline_id=run_id,
                mode=mode,
                start_time=started_at,
                execution_result=outcome,
            )

            self.logger.info(f"Completed step rerun '{step_name}': {run_id}")
            return summary, context

        except Exception as exc:
            self.logger.error(f"Step rerun failed: {exc}")
            failure = self._create_error_report(
                pipeline_id=run_id, mode=mode, start_time=started_at, error=str(exc)
            )
            return failure, context

    def _convert_mode(self, mode: PipelineMode) -> ExecutionMode:
        """Translate a ``PipelineMode`` into the matching ``ExecutionMode``.

        A mode with no entry in the lookup table falls back to
        ``ExecutionMode.INITIAL``.
        """
        equivalents = {
            PipelineMode.INITIAL: ExecutionMode.INITIAL,
            PipelineMode.INCREMENTAL: ExecutionMode.INCREMENTAL,
            PipelineMode.FULL_REFRESH: ExecutionMode.FULL_REFRESH,
            PipelineMode.VALIDATION_ONLY: ExecutionMode.VALIDATION_ONLY,
        }
        try:
            return equivalents[mode]
        except KeyError:
            # Unknown mode: use the same default the table lookup always had
            return ExecutionMode.INITIAL

    def _create_spark_pipeline_report(
        self,
        pipeline_id: str,
        mode: PipelineMode,
        start_time: datetime,
        execution_result: SparkExecutionResult,
    ) -> PipelineReport:
        """Build a ``PipelineReport`` from an execution engine result.

        Args:
            pipeline_id: Identifier to record on the report.
            mode: Pipeline mode the run was started with.
            start_time: When the run started; used for the overall duration.
            execution_result: Engine result whose ``steps`` are summarized.
                If it carries a non-empty dict ``context``, the entry for each
                step name is attached to that step's info as ``"dataframe"``.

        Returns:
            A PipelineReport with per-layer step info, aggregated metrics and
            the error messages of all failed steps. The status is COMPLETED
            only when ``execution_result.status == "completed"``.
        """
        # Fall back to "now" when the engine did not record an end time
        end_time = execution_result.end_time or datetime.now()
        duration = (end_time - start_time).total_seconds()

        # Count successful and failed steps
        steps = execution_result.steps or []
        successful_steps = [s for s in steps if s.status.value == "completed"]
        failed_steps = [s for s in steps if s.status.value == "failed"]

        # The engine context does not change per step, so look it up once.
        # Only a non-empty dict is usable for attaching step outputs.
        result_context = getattr(execution_result, "context", None)
        has_context = bool(result_context) and isinstance(result_context, dict)

        # Organize step results by layer (bronze/silver/gold)
        bronze_results: Dict[str, Dict[str, Any]] = {}
        silver_results: Dict[str, Dict[str, Any]] = {}
        gold_results: Dict[str, Dict[str, Any]] = {}
        results_by_layer = {
            "bronze": bronze_results,
            "silver": silver_results,
            "gold": gold_results,
        }

        for step_result in steps:
            step_info = {
                "status": step_result.status.value,
                "duration": step_result.duration,
                "rows_processed": step_result.rows_processed,
                "output_table": step_result.output_table,
                # Guard both timestamps so a missing value yields None
                # instead of an AttributeError
                "start_time": step_result.start_time.isoformat()
                if step_result.start_time
                else None,
                "end_time": step_result.end_time.isoformat()
                if step_result.end_time
                else None,
                "write_mode": step_result.write_mode,  # type: ignore[attr-defined]
                "validation_rate": step_result.validation_rate,
                "rows_written": step_result.rows_written,
                "input_rows": step_result.input_rows,
            }

            # Add error if present
            if step_result.error:
                step_info["error"] = step_result.error

            # Add dataframe if available in context (for users who want to access output)
            if has_context and step_result.step_name in result_context:
                step_info["dataframe"] = result_context[step_result.step_name]

            # Categorize by step type; steps of any other type are not listed
            layer_bucket = results_by_layer.get(step_result.step_type.value)
            if layer_bucket is not None:
                layer_bucket[step_result.step_name] = step_info

        # Aggregate row counts from step results
        total_rows_processed = sum(s.rows_processed or 0 for s in steps)
        # Rows written: only steps with an output table count. Use the step's
        # own rows_written figure; fall back to rows_processed only when the
        # step did not report one.
        total_rows_written = sum(
            (s.rows_written if s.rows_written is not None else s.rows_processed) or 0
            for s in steps
            if s.output_table is not None
        )

        # Calculate durations by layer
        bronze_duration = sum(
            s.duration or 0 for s in steps if s.step_type == StepType.BRONZE
        )
        silver_duration = sum(
            s.duration or 0 for s in steps if s.step_type == StepType.SILVER
        )
        gold_duration = sum(
            s.duration or 0 for s in steps if s.step_type == StepType.GOLD
        )

        return PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=execution_result.execution_id,
            status=(
                PipelineStatus.COMPLETED
                if execution_result.status == "completed"
                else PipelineStatus.FAILED
            ),
            mode=mode,
            start_time=start_time,
            end_time=end_time,
            duration_seconds=duration,
            metrics=PipelineMetrics(
                total_steps=len(steps),
                successful_steps=len(successful_steps),
                failed_steps=len(failed_steps),
                total_duration=duration,
                bronze_duration=bronze_duration,
                silver_duration=silver_duration,
                gold_duration=gold_duration,
                total_rows_processed=total_rows_processed,
                total_rows_written=total_rows_written,
            ),
            bronze_results=bronze_results,
            silver_results=silver_results,
            gold_results=gold_results,
            errors=[s.error for s in failed_steps if s.error],
            warnings=[],
        )

    def _create_error_report(
        self, pipeline_id: str, mode: PipelineMode, start_time: datetime, error: str
    ) -> PipelineReport:
        """Build a FAILED ``PipelineReport`` for a run that raised.

        The report carries zeroed step counts, the elapsed time since
        ``start_time`` and ``error`` as its only error message. Its execution
        id is the pipeline id prefixed with ``error_``.
        """
        finished_at = datetime.now()
        elapsed = (finished_at - start_time).total_seconds()

        # No step information is available, so every counter is zero
        empty_metrics = PipelineMetrics(
            total_steps=0,
            successful_steps=0,
            failed_steps=0,
            total_duration=elapsed,
        )

        return PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=f"error_{pipeline_id}",
            status=PipelineStatus.FAILED,
            mode=mode,
            start_time=start_time,
            end_time=finished_at,
            duration_seconds=elapsed,
            metrics=empty_metrics,
            errors=[error],
            warnings=[],
        )

class _DummyEngine:
    """Placeholder engine kept for Runner.__init__ compatibility.

    Defines no attributes and no behavior of its own.
    """

# Backward-compatible name for SimplePipelineRunner
PipelineRunner = SimplePipelineRunner

# Empty the class's abstract-method set so it can be instantiated.
# Stated rationale: the ABC machinery sometimes fails to recognize
# implementations that use positional-only arguments.
# NOTE(review): this also hides any abstract method that is genuinely
# unimplemented - confirm every abstract method has a real implementation.
if getattr(SimplePipelineRunner, "__abstractmethods__", None) is not None:
    SimplePipelineRunner.__abstractmethods__ = frozenset()

In [None]:
# Engine Configuration Helper (PySpark-only)
# This helper automatically configures the engine with PySpark components
# In standalone notebooks, we use PySpark directly (no mock Spark support)

# Store a reference to the original configure_engine before it is wrapped below.
# The configure_engine function is already defined in the engine_config module cell above.
#
# Guard against re-running this cell: after the first run, `configure_engine`
# is the wrapper defined further down. Capturing that wrapper as the "original"
# would make it call itself and end in a RecursionError, so the reference is
# only taken while `configure_engine` is still the unwrapped function.
if getattr(configure_engine, "__name__", "") != "configure_engine_wrapper":
    _original_configure_engine_for_pyspark = configure_engine

def configure_engine_pyspark(spark):
    """Configure engine with PySpark components for standalone notebooks.

    This is a convenience function for notebooks that automatically configures
    the engine with PySpark components. In standalone notebooks, we only
    support PySpark (not mock Spark/sparkless).

    The original ``configure_engine`` is first called with the core components
    plus engine metadata (engine name and the concrete DataFrame, session and
    Column classes). If that call raises ``TypeError`` it is retried with the
    core components only.

    Args:
        spark: SparkSession instance
    """
    from pyspark.sql import functions as F
    from pyspark.sql.types import StructType
    from pyspark.sql.utils import AnalysisException
    from pyspark.sql.window import Window

    # Components passed on every attempt.
    # NOTE(review): `types` receives the StructType class, not the
    # pyspark.sql.types module - confirm this is what configure_engine expects.
    core_components = dict(
        functions=F,
        types=StructType,
        analysis_exception=AnalysisException,
        window=Window,
    )

    # Optional metadata. The classes are taken from live objects so they match
    # whatever the given session actually produces. This is computed outside
    # the try block so that a TypeError raised here is not mistaken for
    # "configure_engine does not accept these parameters".
    engine_metadata = dict(
        engine_name="pyspark",
        dataframe_cls=type(spark.createDataFrame([], "id int")),
        spark_session_cls=type(spark),
        column_cls=type(F.col("dummy")),
    )

    # Use the original configure_engine function (stored before wrapping)
    try:
        _original_configure_engine_for_pyspark(**core_components, **engine_metadata)
    except TypeError:
        # Fallback if some parameters aren't accepted
        _original_configure_engine_for_pyspark(**core_components)
    print("✅ Engine configured with PySpark components")

# Make configure_engine accept spark parameter for convenience
# This allows using configure_engine(spark=spark) like in examples
# We already stored the original function above as _original_configure_engine_for_pyspark

def configure_engine_wrapper(*, spark=None, **kwargs):
    """Configure engine - accepts spark parameter for convenience.

    In standalone notebooks, you can call configure_engine(spark=spark)
    and it will automatically configure with PySpark components.

    Args:
        spark: Optional SparkSession. When given, the engine is configured
            from it via ``configure_engine_pyspark`` and any other keyword
            arguments are ignored (a warning is emitted if there are any).
        **kwargs: Forwarded unchanged to the original ``configure_engine``
            when ``spark`` is None.
    """
    if spark is not None:
        if kwargs:
            # Previously these were dropped without any notice; keep the
            # behavior but make it visible to the caller.
            import warnings

            warnings.warn(
                "configure_engine(spark=...) auto-configures PySpark components; "
                f"ignoring extra arguments: {sorted(kwargs)}",
                stacklevel=2,
            )
        # Auto-configure with PySpark
        configure_engine_pyspark(spark)
    else:
        # Use original function
        _original_configure_engine_for_pyspark(**kwargs)

# Replace configure_engine in the current namespace
configure_engine = configure_engine_wrapper

In [None]:
# Usage Example
#
# Minimal setup: create a Spark session, configure the engine, then create
# a PipelineBuilder and a LogWriter.

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session with the Delta extension and catalog set
spark = (
    SparkSession.builder
    .appName("PipelineBuilder Example")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# The engine must be configured before use; passing spark selects PySpark
configure_engine(spark=spark)

# Pipeline builder bound to the "analytics" schema
builder = PipelineBuilder(spark=spark, schema="analytics")
print("✅ PipelineBuilder initialized")

# Log writer targeting the "pipeline_logs" table in the same schema
log_writer = LogWriter(spark, schema="analytics", table_name="pipeline_logs")
print("✅ LogWriter initialized")