In [None]:
# PipelineBuilder & LogWriter v2.1.2 - Standalone Notebook
#
# This notebook contains the complete PipelineBuilder and LogWriter implementation
# as a standalone, executable notebook. All dependencies are included as cells
# in the correct order.
#
# Usage:
# 1. Run all cells from top to bottom
# 2. The PipelineBuilder and LogWriter classes will be available after all cells execute
# 3. Use PipelineBuilder to build and execute data pipelines
# 4. Use LogWriter to log and analyze pipeline execution results
#
# Note: This is generated from version 2.1.2. Module dependencies are
# resolved automatically from source code analysis.

In [None]:
# External imports (PySpark, standard library)
from __future__ import annotations

import logging
import sys
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from functools import wraps
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Protocol,
    Tuple,
    TypedDict,
    TypeVar,
    Union,
    cast,
)

# PySpark imports
from pyspark.sql import Column, DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    BooleanType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Delta Lake imports
try:
    from delta.tables import DeltaTable
except ImportError:
    print("⚠️  Delta Lake not available. Some features may not work.")
    DeltaTable = None

# Optional imports
try:
    import psutil
except ImportError:
    print("⚠️  psutil not available. Memory monitoring disabled.")
    psutil = None

In [None]:
# Module: pipeline_builder_base.logging (pipeline_builder_base)
#
# Dependencies: None (base module)

from datetime import timezone


class PipelineLogger:
    """
    Simple, focused logging for pipeline operations.

    Wraps a named ``logging.Logger`` and adds:
    - Basic logging levels (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    - Console and file output
    - Simple context management
    - Performance timing

    Args:
        name: Name passed to ``logging.getLogger``.
        level: Threshold applied to the logger and to the handlers created here.
        log_file: Optional path; when set, records are also written to this file.
        verbose: When True, records are also written to ``sys.stdout``.

    Note:
        ``logging.getLogger(name)`` hands back the same logger object for the
        same name, so building a second ``PipelineLogger`` with an existing
        name closes and replaces the handlers installed by the first one.
    """

    def __init__(
        self,
        name: str = "PipelineRunner",
        level: int = logging.INFO,
        log_file: Optional[str] = None,
        verbose: bool = True,
    ):
        self.name = name
        self.level = level
        self.log_file = log_file
        self.verbose = verbose

        # Performance tracking: timer name -> UTC start time
        self._timers: Dict[str, datetime] = {}

        # Create logger
        self.logger = logging.getLogger(name)
        self.logger.setLevel(level)

        # Drop handlers left over from an earlier instance with the same name.
        # They are closed first: merely clearing the list would leave any
        # previously opened log file open for the rest of the session.
        self._close_and_remove_handlers()

        # Setup handlers
        self._setup_handlers()

    def _close_and_remove_handlers(self) -> None:
        """Close every handler attached to the logger, then detach it."""
        # Iterate over a copy because removeHandler mutates the list.
        for handler in list(self.logger.handlers):
            handler.close()
            self.logger.removeHandler(handler)

    def _setup_handlers(self) -> None:
        """Attach the console handler (if verbose) and file handler (if log_file)."""
        # Console handler
        if self.verbose:
            console_handler = logging.StreamHandler(sys.stdout)
            console_formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%H:%M:%S",
            )
            console_handler.setFormatter(console_formatter)
            console_handler.setLevel(self.level)
            self.logger.addHandler(console_handler)

        # File handler
        if self.log_file:
            # Explicit UTF-8: step_start/step_complete emit emoji, which a
            # platform-default codec such as cp1252 cannot encode.
            file_handler = logging.FileHandler(self.log_file, encoding="utf-8")
            file_formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
            file_handler.setFormatter(file_formatter)
            file_handler.setLevel(self.level)
            self.logger.addHandler(file_handler)

    # Basic logging methods
    def debug(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log debug message; keyword arguments are appended as ``key=value``."""
        self.logger.debug(self._format_message(message, kwargs))

    def info(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log info message; keyword arguments are appended as ``key=value``."""
        self.logger.info(self._format_message(message, kwargs))

    def warning(
        self, message: str, **kwargs: Union[str, int, float, bool, None]
    ) -> None:
        """Log warning message; keyword arguments are appended as ``key=value``."""
        self.logger.warning(self._format_message(message, kwargs))

    def error(self, message: str, **kwargs: Union[str, int, float, bool, None]) -> None:
        """Log error message; keyword arguments are appended as ``key=value``."""
        self.logger.error(self._format_message(message, kwargs))

    def critical(
        self, message: str, **kwargs: Union[str, int, float, bool, None]
    ) -> None:
        """Log critical message; keyword arguments are appended as ``key=value``."""
        self.logger.critical(self._format_message(message, kwargs))

    def _format_message(
        self, message: str, kwargs: Dict[str, Union[str, int, float, bool, None]]
    ) -> str:
        """Return ``message`` unchanged, or ``"message (k1=v1, k2=v2)"`` when kwargs are given."""
        if not kwargs:
            return message
        kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
        return f"{message} ({kwargs_str})"

    # Performance timing
    @contextmanager
    def time_operation(self, operation_name: str) -> Generator[None, None, None]:
        """Context manager that logs how long the wrapped block took.

        The duration is logged at INFO level even when the block raises; the
        exception itself is not swallowed.
        """
        start_time = datetime.now(timezone.utc)
        self._timers[operation_name] = start_time
        try:
            yield
        finally:
            end_time = datetime.now(timezone.utc)
            duration = (end_time - start_time).total_seconds()
            self.info(f"Operation '{operation_name}' took {duration:.2f}s")
            # Clean up timer after operation completes (it may already have
            # been removed by stop_timer inside the block).
            self._timers.pop(operation_name, None)

    def start_timer(self, timer_name: str) -> None:
        """Start (or restart) a named timer."""
        self._timers[timer_name] = datetime.now(timezone.utc)

    def stop_timer(self, timer_name: str) -> float:
        """Stop a named timer and return duration in seconds.

        Returns 0.0 and logs a warning if the timer was never started.
        """
        if timer_name not in self._timers:
            self.warning(f"Timer '{timer_name}' was not started")
            return 0.0
        start_time = self._timers.pop(timer_name)
        end_time = datetime.now(timezone.utc)
        return (end_time - start_time).total_seconds()

    def get_timer_duration(self, timer_name: str) -> float:
        """Get current duration of a running timer without stopping it.

        Returns 0.0 (silently) when no such timer is running.
        """
        if timer_name not in self._timers:
            return 0.0
        start_time = self._timers[timer_name]
        end_time = datetime.now(timezone.utc)
        return (end_time - start_time).total_seconds()

    # Context management
    @contextmanager
    def log_context(self, context_name: str) -> Generator[None, None, None]:
        """Log "Starting"/"Completed" around a block, or "Failed" and re-raise."""
        self.info(f"Starting: {context_name}")
        try:
            yield
            self.info(f"Completed: {context_name}")
        except Exception as e:
            self.error(f"Failed: {context_name}", error=str(e))
            raise

    # Step execution logging
    def step_start(self, step_type: str, step_name: str) -> None:
        """Log step start."""
        self.info(f"▶️ Starting {step_type.upper()} step: {step_name}")

    def step_complete(
        self,
        step_type: str,
        step_name: str,
        duration: float,
        rows_processed: int = 0,
        rows_written: int = 0,
        invalid_rows: int = 0,
        validation_rate: float = 100.0,
    ) -> None:
        """Log step completion with its duration and row/validation counters."""
        self.info(
            f"✅ Completed {step_type.upper()} step: {step_name} ({duration:.2f}s) - "
            f"{rows_processed} rows processed, {rows_written} rows written, "
            f"{invalid_rows} invalid, {validation_rate:.1f}% valid"
        )

    # Utility methods
    def set_level(self, level: int) -> None:
        """Set logging level on the logger and on every attached handler."""
        self.level = level
        self.logger.setLevel(level)
        for handler in self.logger.handlers:
            handler.setLevel(level)

    def add_handler(self, handler: logging.Handler) -> None:
        """Add a custom logging handler."""
        self.logger.addHandler(handler)

    def remove_handler(self, handler: logging.Handler) -> None:
        """Remove a logging handler (the handler is not closed)."""
        self.logger.removeHandler(handler)

    def clear_handlers(self) -> None:
        """Detach all logging handlers without closing them.

        Use ``close()`` instead when the handlers should also release their
        underlying resources (e.g. an open log file).
        """
        self.logger.handlers.clear()

    def close(self) -> None:
        """Close all logging handlers, especially file handlers, and detach them."""
        self._close_and_remove_handlers()

In [None]:
# Module: pipeline_builder_base.errors (pipeline_builder_base)
#
# Dependencies: None (base module)

from __future__ import annotations


class ErrorSeverity(Enum):
    """Severity levels for errors.

    String-valued enum; ``SparkForgeError.to_dict`` emits the member's
    ``.value`` under the ``"severity"`` key.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class ErrorCategory(Enum):
    """Categories of errors.

    String-valued enum; ``SparkForgeError.to_dict`` emits the member's
    ``.value`` under the ``"category"`` key. Each concrete error subclass in
    this cell passes one fixed category to the base constructor.
    """

    CONFIGURATION = "configuration"
    VALIDATION = "validation"
    EXECUTION = "execution"
    DATA = "data"
    SYSTEM = "system"
    PERFORMANCE = "performance"
    RESOURCE = "resource"


# Type definitions for error context
ErrorContextValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
ErrorContext = Dict[str, ErrorContextValue]
ErrorSuggestions = List[str]


class SparkForgeError(Exception):
    """
    Base exception for all framework errors.

    This is the root exception class that all other framework exceptions
    inherit from, providing consistent error handling patterns and rich context.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: str | None = None,
        category: ErrorCategory | None = None,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        context: ErrorContext | None = None,
        suggestions: ErrorSuggestions | None = None,
        timestamp: datetime | None = None,
        cause: Exception | None = None,
    ):
        """
        Initialize a framework error.

        Args:
            message: Human-readable error message
            error_code: Optional error code for programmatic handling
            category: Error category for classification
            severity: Error severity level
            context: Additional context information (shallow-copied)
            suggestions: Suggested actions to resolve the error (copied)
            timestamp: When the error occurred (defaults to now, in UTC)
            cause: The underlying exception that caused this error
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.category = category
        self.severity = severity
        # Copy the caller's containers: subclasses write extra keys into
        # self.context, which must not leak back into a dict the caller may
        # reuse for other errors.
        self.context = dict(context) if context else {}
        self.suggestions = list(suggestions) if suggestions else []
        self.timestamp = timestamp or datetime.now(timezone.utc)
        self.cause = cause

    def __str__(self) -> str:
        """Return ``message | [code] | Context: ... | Suggestions: ...`` (empty parts omitted)."""
        parts = [self.message]

        if self.error_code:
            parts.append(f"[{self.error_code}]")

        if self.context:
            context_str = ", ".join(f"{k}={v}" for k, v in self.context.items())
            parts.append(f"Context: {context_str}")

        if self.suggestions:
            parts.append(f"Suggestions: {'; '.join(self.suggestions)}")

        return " | ".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """Convert error to dictionary for serialization.

        Enum members are reduced to their ``.value``, the timestamp to an ISO
        8601 string and the cause to ``str(cause)``.
        """
        return {
            "message": self.message,
            "error_code": self.error_code,
            "category": self.category.value if self.category else None,
            "severity": self.severity.value if self.severity else None,
            "context": self.context,
            "suggestions": self.suggestions,
            "timestamp": self.timestamp.isoformat() if self.timestamp else None,
            "cause": str(self.cause) if self.cause else None,
        }


class ValidationError(SparkForgeError):
    """Raised when validation fails.

    Category is fixed to ``ErrorCategory.VALIDATION``. Severity defaults to
    ``ErrorSeverity.MEDIUM`` and may be overridden with ``severity=...``.

    Args:
        message: Human-readable error message.
        field: Name of the offending field; recorded in ``context["field"]``
            when truthy.
        value: Offending value; ``str(value)`` is recorded in
            ``context["value"]`` when it is not None.
        **kwargs: Forwarded to ``SparkForgeError`` (error_code, context, ...).
    """

    def __init__(
        self,
        message: str,
        *,
        field: str | None = None,
        value: Any = None,
        **kwargs: Any,
    ):
        # Default the severity instead of hard-coding the keyword: a caller (or
        # subclass) passing severity=... previously hit
        # "TypeError: got multiple values for keyword argument 'severity'".
        # This mirrors what ConfigurationError already does.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(
            message,
            category=ErrorCategory.VALIDATION,
            **kwargs,
        )
        self.field = field
        self.value = value
        if field:
            self.context["field"] = field
        if value is not None:
            self.context["value"] = str(value)


class PipelineValidationError(ValidationError):
    """Validation failure tied to a specific pipeline step and/or phase.

    Args:
        message: Human-readable error message.
        step_name: Step being validated; copied into ``context["step_name"]``
            when truthy.
        phase: Phase being validated; copied into ``context["phase"]`` when
            truthy.
        **kwargs: Forwarded unchanged to ``ValidationError``.
    """

    def __init__(
        self,
        message: str,
        *,
        step_name: str | None = None,
        phase: str | None = None,
        **kwargs: Any,
    ):
        super().__init__(message, **kwargs)
        self.step_name = step_name
        self.phase = phase
        # Only truthy identifiers are mirrored into the context mapping.
        for key, val in (("step_name", step_name), ("phase", phase)):
            if val:
                self.context[key] = val


class ConfigurationError(SparkForgeError):
    """Signals an invalid configuration.

    The category is always ``ErrorCategory.CONFIGURATION``. Severity is
    ``ErrorSeverity.MEDIUM`` unless the caller supplies ``severity=...``.
    Remaining keyword arguments go straight to ``SparkForgeError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        # Fill in the default only when the caller did not choose a severity.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(message, category=ErrorCategory.CONFIGURATION, **kwargs)


class ExecutionError(SparkForgeError):
    """Raised when execution fails.

    Category is fixed to ``ErrorCategory.EXECUTION``. Severity defaults to
    ``ErrorSeverity.HIGH`` and may be overridden with ``severity=...``.

    Args:
        message: Human-readable error message.
        step_name: Failing step; recorded in ``context["step_name"]`` when truthy.
        phase: Failing phase; recorded in ``context["phase"]`` when truthy.
        **kwargs: Forwarded to ``SparkForgeError`` (error_code, context, ...).
    """

    def __init__(
        self,
        message: str,
        *,
        step_name: str | None = None,
        phase: str | None = None,
        **kwargs: Any,
    ):
        # Default rather than force the severity so that severity=... in kwargs
        # no longer raises a duplicate-keyword TypeError (same approach as
        # ConfigurationError).
        kwargs.setdefault("severity", ErrorSeverity.HIGH)
        super().__init__(
            message,
            category=ErrorCategory.EXECUTION,
            **kwargs,
        )
        self.step_name = step_name
        self.phase = phase
        if step_name:
            self.context["step_name"] = step_name
        if phase:
            self.context["phase"] = phase


class DataError(SparkForgeError):
    """Raised when data operations fail.

    Category is fixed to ``ErrorCategory.DATA``. Severity defaults to
    ``ErrorSeverity.MEDIUM`` and may be overridden with ``severity=...``.
    All other keyword arguments are forwarded to ``SparkForgeError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        # setdefault instead of a hard-coded keyword: passing severity=... in
        # kwargs used to raise a duplicate-keyword TypeError.
        kwargs.setdefault("severity", ErrorSeverity.MEDIUM)
        super().__init__(
            message,
            category=ErrorCategory.DATA,
            **kwargs,
        )


# NOTE(review): this class shadows Python's built-in ``SystemError`` for all
# code that runs after this cell. The name is kept for backward compatibility.
class SystemError(SparkForgeError):
    """Raised when system operations fail.

    Category is fixed to ``ErrorCategory.SYSTEM``. Severity defaults to
    ``ErrorSeverity.CRITICAL`` and may be overridden with ``severity=...``.
    All other keyword arguments are forwarded to ``SparkForgeError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        # setdefault instead of a hard-coded keyword: passing severity=... in
        # kwargs used to raise a duplicate-keyword TypeError.
        kwargs.setdefault("severity", ErrorSeverity.CRITICAL)
        super().__init__(
            message,
            category=ErrorCategory.SYSTEM,
            **kwargs,
        )


class PerformanceError(SparkForgeError):
    """Raised when performance issues are detected.

    Category is fixed to ``ErrorCategory.PERFORMANCE``. Severity defaults to
    ``ErrorSeverity.LOW`` and may be overridden with ``severity=...``.
    All other keyword arguments are forwarded to ``SparkForgeError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        # setdefault instead of a hard-coded keyword: passing severity=... in
        # kwargs used to raise a duplicate-keyword TypeError.
        kwargs.setdefault("severity", ErrorSeverity.LOW)
        super().__init__(
            message,
            category=ErrorCategory.PERFORMANCE,
            **kwargs,
        )


class ResourceError(SparkForgeError):
    """Raised when resource operations fail.

    Category is fixed to ``ErrorCategory.RESOURCE``. Severity defaults to
    ``ErrorSeverity.HIGH`` and may be overridden with ``severity=...``.
    All other keyword arguments are forwarded to ``SparkForgeError``.
    """

    def __init__(self, message: str, **kwargs: Any):
        # setdefault instead of a hard-coded keyword: passing severity=... in
        # kwargs used to raise a duplicate-keyword TypeError.
        kwargs.setdefault("severity", ErrorSeverity.HIGH)
        super().__init__(
            message,
            category=ErrorCategory.RESOURCE,
            **kwargs,
        )

In [None]:
# Module: pipeline_builder_base.dependencies.graph (pipeline_builder_base)
#
# Dependencies: None (base module)

from __future__ import annotations

import logging
from enum import Enum
from typing import Dict

logger = logging.getLogger(__name__)


class StepType(Enum):
    """Types of pipeline steps.

    Used as ``StepNode.step_type``; ``DependencyGraph.get_stats`` counts nodes
    per ``.value``.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


@dataclass
class StepNode:
    """Represents a single step in the dependency graph.

    ``dependencies`` and ``dependents`` hold step names and are filled in by
    ``DependencyGraph.add_dependency``. The graph code in this cell does not
    read ``execution_group``, ``can_run_parallel``, ``estimated_duration`` or
    ``metadata``; they are carried as plain data.
    """

    # Key under which DependencyGraph.add_node stores this node.
    name: str
    step_type: StepType
    # Names of the steps this step depends on.
    dependencies: set[str] = field(default_factory=set)
    # Names of the steps that depend on this step.
    dependents: set[str] = field(default_factory=set)
    execution_group: int = 0
    can_run_parallel: bool = True
    estimated_duration: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)


class DependencyGraph:
    """
    Directed dependency graph over named pipeline steps.

    An edge added with ``add_dependency(A, B)`` means "A depends on B". The
    graph keeps a forward map (step -> its dependencies), a reverse map
    (step -> its dependents) and mirrors both on the ``StepNode`` objects.
    It supports cycle detection, topological ordering and level-based
    grouping for execution planning.
    """

    def __init__(self) -> None:
        self.nodes: Dict[str, StepNode] = {}
        # step -> steps it depends on
        self._adjacency_list: Dict[str, set[str]] = defaultdict(set)
        # step -> steps that depend on it
        self._reverse_adjacency_list: Dict[str, set[str]] = defaultdict(set)

    def add_node(self, node: StepNode) -> None:
        """Register ``node`` under its name with empty edge sets.

        Re-adding an existing name replaces the node and resets both of its
        edge sets.
        """
        key = node.name
        self.nodes[key] = node
        self._adjacency_list[key] = set()
        self._reverse_adjacency_list[key] = set()

    def add_dependency(self, from_step: str, to_step: str) -> None:
        """Record that ``from_step`` depends on ``to_step``.

        Raises:
            ValueError: If either step has not been added to the graph.
        """
        both_known = from_step in self.nodes and to_step in self.nodes
        if not both_known:
            raise ValueError(f"Steps {from_step} or {to_step} not found in graph")

        # Mirror the edge on the node objects ...
        self.nodes[to_step].dependents.add(from_step)
        self.nodes[from_step].dependencies.add(to_step)

        # ... and in the forward / reverse lookup tables.
        self._reverse_adjacency_list[to_step].add(from_step)
        self._adjacency_list[from_step].add(to_step)

    def get_dependencies(self, step_name: str) -> set[str]:
        """Return a copy of the step's direct dependencies (empty set if unknown)."""
        found = self.nodes.get(step_name)
        if found is None:
            return set()
        return found.dependencies.copy()

    def get_dependents(self, step_name: str) -> set[str]:
        """Return a copy of the step's direct dependents (empty set if unknown)."""
        found = self.nodes.get(step_name)
        if found is None:
            return set()
        return found.dependents.copy()

    def detect_cycles(self) -> list[list[str]]:
        """Find cycles with a depth-first search.

        Returns:
            One list per detected cycle; each list starts and ends with the
            same step name.
        """
        seen = set()
        on_stack = set()
        found = []

        def visit(current: str, trail: list[str]) -> None:
            # Reaching a step that is still on the active DFS path closes a loop.
            if current in on_stack:
                found.append(trail[trail.index(current):] + [current])
                return

            # Already fully explored from an earlier branch.
            if current in seen:
                return

            seen.add(current)
            on_stack.add(current)
            trail.append(current)

            for nxt in self._adjacency_list[current]:
                visit(nxt, trail)

            trail.pop()
            on_stack.remove(current)

        for start in self.nodes:
            if start not in seen:
                visit(start, [])

        return found

    def topological_sort(self) -> list[str]:
        """
        Order the steps so that every dependency precedes its dependents.

        Because ``add_dependency(A, B)`` means A depends on B, the ordering is
        driven by the reverse map (B -> A). Steps whose pending-dependency
        count never reaches zero (i.e. steps on or behind a cycle) are not
        included in the result.
        """
        pending = {name: 0 for name in self.nodes}

        # pending[x] ends up as the number of direct dependencies of x.
        for name in self.nodes:
            for dependent in self._reverse_adjacency_list[name]:
                pending[dependent] += 1

        # Start with the steps that depend on nothing.
        ready = deque(name for name, count in pending.items() if count == 0)
        ordered = []

        while ready:
            current = ready.popleft()
            ordered.append(current)

            # Releasing `current` may unblock the steps that depend on it.
            for dependent in self._reverse_adjacency_list[current]:
                pending[dependent] -= 1
                if pending[dependent] == 0:
                    ready.append(dependent)

        return ordered

    def get_execution_groups(self) -> list[list[str]]:
        """Group steps by dependency depth.

        Steps without dependencies are level 0; every other step is one level
        above its deepest already-levelled dependency. Returns the groups in
        ascending level order.
        """
        level_of = {}
        for name in self.topological_sort():
            deps = self.nodes[name].dependencies
            if not deps:
                level_of[name] = 0
                continue

            deepest = 0
            for dep in deps:
                if dep not in level_of:
                    # The dependency has not been levelled (e.g. it is absent
                    # from the graph); it contributes nothing to the depth.
                    logger.warning(
                        f"Dependency {dep} not found in levels for node {name}"
                    )
                    continue
                deepest = max(deepest, level_of[dep])
            level_of[name] = deepest + 1

        # Bucket the steps by level, preserving topological order per bucket.
        by_level = defaultdict(list)
        for name, level in level_of.items():
            by_level[level].append(name)

        return [by_level[level] for level in sorted(by_level)]

    def get_parallel_candidates(self) -> list[list[str]]:
        """Return the execution groups; steps within a group share a level."""
        return self.get_execution_groups()

    def validate(self) -> list[str]:
        """Return human-readable issues: cycles first, then missing dependencies."""
        issues = [
            f"Circular dependency detected: {' -> '.join(cycle)}"
            for cycle in self.detect_cycles()
        ]

        for node_name, node in self.nodes.items():
            issues.extend(
                f"Node {node_name} depends on missing node {dep}"
                for dep in node.dependencies
                if dep not in self.nodes
            )

        return issues

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the graph: node/edge totals, per-type counts, mean fan-out, cycle flag."""
        total_nodes = len(self.nodes)
        total_edges = sum(map(len, self._adjacency_list.values()))

        type_counts: Dict[str, int] = defaultdict(int)
        for node in self.nodes.values():
            type_counts[node.step_type.value] += 1

        if total_nodes > 0:
            avg_dependencies = total_edges / total_nodes
        else:
            avg_dependencies = 0

        return {
            "total_nodes": total_nodes,
            "total_edges": total_edges,
            "type_counts": dict(type_counts),
            "average_dependencies": avg_dependencies,
            "has_cycles": bool(self.detect_cycles()),
        }

In [None]:
# Module: pipeline_builder_base.dependencies.exceptions (pipeline_builder_base)
#
# Dependencies: None (base module)

from typing import List


class DependencyError(Exception):
    """Root of the dependency-related exception hierarchy.

    Args:
        message: Human-readable description; becomes the exception's only arg.
        step_name: Optional name of the step the error relates to.
    """

    def __init__(self, message: str, step_name: Optional[str] = None):
        self.step_name = step_name
        super().__init__(message)


class DependencyAnalysisError(DependencyError):
    """Signals that dependency analysis itself failed.

    Args:
        message: Human-readable description.
        analysis_step: Optional name of the analysis step that failed; it is
            also passed to the base class as ``step_name``.
    """

    def __init__(self, message: str, analysis_step: Optional[str] = None):
        self.analysis_step = analysis_step
        super().__init__(message, step_name=analysis_step)


class CircularDependencyError(DependencyError):
    """Signals that the dependency graph contains a cycle.

    Args:
        message: Human-readable description.
        cycle: Step names forming the cycle; stored as given on ``self.cycle``.
    """

    def __init__(self, message: str, cycle: List[str]):
        self.cycle = cycle
        super().__init__(message)


class InvalidDependencyError(DependencyError):
    """Signals that one or more declared dependencies are invalid.

    Args:
        message: Human-readable description.
        invalid_dependencies: The offending dependency names; stored as given
            on ``self.invalid_dependencies``.
    """

    def __init__(self, message: str, invalid_dependencies: List[str]):
        self.invalid_dependencies = invalid_dependencies
        super().__init__(message)


class DependencyConflictError(DependencyError):
    """Signals that steps have conflicting dependencies.

    Args:
        message: Human-readable description.
        conflicting_steps: Names of the steps in conflict; stored as given on
            ``self.conflicting_steps``.
    """

    def __init__(self, message: str, conflicting_steps: List[str]):
        self.conflicting_steps = conflicting_steps
        super().__init__(message)

In [None]:
# Module: pipeline_builder_base.models.steps (pipeline_builder_base)
#
# Dependencies: None (base module)

from typing import Dict, List


class StepProtocol(Protocol):
    """Protocol for all pipeline steps.

    Structural type: any object exposing ``name``, ``rules`` and a
    ``validate()`` method satisfies it; no inheritance is required.
    """

    name: str
    rules: Dict[str, Any]

    def validate(self) -> None:
        """Validate the step configuration."""
        # Protocol stub: implementers supply the actual checks.
        ...


class BronzeStepProtocol(StepProtocol, Protocol):
    """Protocol for bronze layer steps.

    Extends ``StepProtocol`` with an optional ``incremental_col``.
    """

    # NOTE(review): `str | None` is evaluated eagerly unless postponed
    # annotation evaluation is active for this cell (the cell does not import
    # it itself) — confirm the minimum supported Python version.
    incremental_col: str | None


class SilverStepProtocol(StepProtocol, Protocol):
    """Protocol for silver layer steps.

    Extends ``StepProtocol`` with the ``source_bronze`` and ``table_name``
    string attributes.
    """

    source_bronze: str
    table_name: str


class GoldStepProtocol(StepProtocol, Protocol):
    """Protocol for gold layer steps.

    Extends ``StepProtocol`` with an optional list of ``source_silvers`` and a
    ``table_name``.
    """

    # May be None; what None means is not defined in this cell.
    source_silvers: list[str] | None
    table_name: str

In [None]:
# Module: pipeline_builder_base.models.enums (pipeline_builder_base)
#
# Dependencies: None (base module)

from enum import Enum


class PipelinePhase(Enum):
    """Enumeration of pipeline phases.

    Carries the same three string values as ``StepType`` in the dependency
    graph cell, but is a distinct enum type.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


class ExecutionMode(Enum):
    """Enumeration of execution modes.

    ``ExecutionContext.__post_init__`` copies the member's ``.value`` into
    its ``run_mode`` string.
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"
    FULL_REFRESH = "full_refresh"
    VALIDATION_ONLY = "validation_only"


class WriteMode(Enum):
    """Enumeration of write modes (string-valued: "overwrite" / "append")."""

    OVERWRITE = "overwrite"
    APPEND = "append"


class ValidationResult(Enum):
    """Enumeration of validation results (string-valued)."""

    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"

In [None]:
# Module: pipeline_builder_base.models.execution (pipeline_builder_base)
#
# Dependencies: None (base module)

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict

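# from .base import BaseModel  # Removed. NOTE(review): BaseModel is NOT defined in any cell above this one; Run All fails here with NameError until its cell is placed earlier.
# from .enums import ExecutionMode, PipelinePhase  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError  # Removed. NOTE(review): PipelineConfigurationError is NOT defined in any cell above this one.
# from .pipeline import PipelineMetrics  # Removed. NOTE(review): PipelineMetrics is NOT defined in any cell above this one.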


@dataclass
class ExecutionContext(BaseModel):
    """
    Context for pipeline execution.

    Attributes:
        mode: Execution mode (initial/incremental)
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Total execution duration
        run_id: Unique run identifier
        execution_id: Unique identifier for this execution
        pipeline_id: Identifier for the pipeline being executed
        schema: Target schema for data storage
        started_at: When execution started (alias for start_time)
        ended_at: When execution ended (alias for end_time)
        run_mode: Mode of execution (alias for mode)
        config: Pipeline configuration as dictionary
    """

    mode: ExecutionMode
    start_time: datetime
    end_time: datetime | None = None
    duration_secs: float | None = None
    run_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Additional fields for writer compatibility
    execution_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    pipeline_id: str = "unknown"
    schema: str = "default"
    started_at: datetime | None = None
    ended_at: datetime | None = None
    run_mode: str = "initial"
    config: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Fill the alias fields that were left at their defaults."""
        if self.started_at is None:
            self.started_at = self.start_time
        if self.ended_at is None:
            self.ended_at = self.end_time
        # run_mode is only derived from `mode` while it still holds its default
        # value; any other explicit run_mode is kept as given.
        if self.run_mode == "initial":
            # Map mode to run_mode string
            if hasattr(self.mode, "value"):
                self.run_mode = self.mode.value
            elif hasattr(self.mode, "name"):
                self.run_mode = self.mode.name.lower()

    def validate(self) -> None:
        """Validate the execution context.

        Raises:
            ValueError: If ``run_id`` is empty or ``duration_secs`` is negative.
        """
        if not self.run_id:
            raise ValueError("Run ID cannot be empty")
        if self.duration_secs is not None and self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")

    def finish(self) -> None:
        """Mark execution as finished and calculate duration.

        Sets ``end_time`` (UTC, timezone-aware), keeps the ``ended_at`` alias in
        sync and derives ``duration_secs`` from ``start_time``.

        Note:
            ``start_time`` must be timezone-aware; subtracting a naive datetime
            from the aware ``end_time`` raises ``TypeError``.
        """
        self.end_time = datetime.now(timezone.utc)
        # __post_init__ copies end_time into ended_at only once, at construction
        # (when it is normally still None). Without this line the alias would
        # stay None after the run has finished.
        self.ended_at = self.end_time
        if self.start_time:
            self.duration_secs = (self.end_time - self.start_time).total_seconds()

    @property
    def is_finished(self) -> bool:
        """Check if execution is finished (``end_time`` has been set)."""
        return self.end_time is not None

    @property
    def is_running(self) -> bool:
        """Check if execution is currently running (i.e. not finished)."""
        return not self.is_finished


@dataclass
class StageStats(BaseModel):
    """
    Row-level statistics recorded for one step of one pipeline stage.

    Attributes:
        stage: Stage name (bronze/silver/gold)
        step: Step name
        total_rows: Total number of rows processed
        valid_rows: Number of rows that passed validation
        invalid_rows: Number of rows that failed validation
        validation_rate: Validation success rate as a percentage (0-100)
        duration_secs: Processing duration in seconds
        start_time: When processing started
        end_time: When processing ended
    """

    stage: str
    step: str
    total_rows: int
    valid_rows: int
    invalid_rows: int
    validation_rate: float
    duration_secs: float
    start_time: datetime | None = None
    end_time: datetime | None = None

    def validate(self) -> None:
        """Check the internal consistency of the statistics.

        Raises:
            PipelineConfigurationError: If valid + invalid rows do not add up
                to the total, the validation rate is outside 0-100, or the
                duration is negative.
        """
        accounted_rows = self.valid_rows + self.invalid_rows
        if self.total_rows != accounted_rows:
            raise PipelineConfigurationError(
                f"Total rows ({self.total_rows}) must equal valid ({self.valid_rows}) + invalid ({self.invalid_rows})"
            )

        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise PipelineConfigurationError(
                f"Validation rate must be between 0 and 100, got {self.validation_rate}"
            )

        if self.duration_secs < 0:
            raise PipelineConfigurationError(
                f"Duration must be non-negative, got {self.duration_secs}"
            )

    @property
    def is_valid(self) -> bool:
        """True when the validation rate meets the default 95% threshold."""
        return self.validation_rate >= 95.0  # Default threshold

    @property
    def error_rate(self) -> float:
        """Share of invalid rows as a percentage; 0.0 when there are no rows."""
        return (
            0.0
            if self.total_rows == 0
            else (self.invalid_rows / self.total_rows) * 100
        )

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second; 0.0 when the duration is zero."""
        return 0.0 if self.duration_secs == 0 else self.total_rows / self.duration_secs


@dataclass
class StepResult(BaseModel):
    """
    Outcome of executing a single pipeline step.

    Attributes:
        step_name: Name of the step
        phase: Pipeline phase
        success: Whether the step succeeded
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Execution duration in seconds
        rows_processed: Number of rows processed
        rows_written: Number of rows written
        validation_rate: Validation success rate (0-100)
        error_message: Error message if failed
        step_type: Type of step (bronze, silver, gold)
        table_fqn: Fully qualified table name if step writes to table
        write_mode: Write mode used (overwrite, append)
        input_rows: Number of input rows processed
    """

    step_name: str
    phase: PipelinePhase
    success: bool
    start_time: datetime
    end_time: datetime
    duration_secs: float
    rows_processed: int
    rows_written: int
    validation_rate: float
    error_message: str | None = None
    step_type: str | None = None
    table_fqn: str | None = None
    write_mode: str | None = None
    input_rows: int | None = None

    def validate(self) -> None:
        """
        Check the result for obviously invalid values.

        Checks run in field order and stop at the first failure.

        Raises:
            ValueError: If the step name is empty, a duration or row count is
                negative, or the validation rate lies outside 0-100.
        """
        if not self.step_name:
            raise ValueError("Step name cannot be empty")

        if self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")

        if self.rows_processed < 0:
            raise ValueError("Rows processed cannot be negative")

        if self.rows_written < 0:
            raise ValueError("Rows written cannot be negative")

        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise ValueError("Validation rate must be between 0 and 100")

    # ------------------------------------------------------------------
    # Derived values
    # ------------------------------------------------------------------

    @property
    def is_valid(self) -> bool:
        """True when the step succeeded with a validation rate of at least 95%."""
        if not self.success:
            return self.success
        return self.validation_rate >= 95.0

    @property
    def is_high_quality(self) -> bool:
        """True when the step succeeded with a validation rate of at least 98%."""
        if not self.success:
            return self.success
        return self.validation_rate >= 98.0

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second; 0.0 when the recorded duration is zero."""
        return (
            self.rows_processed / self.duration_secs
            if self.duration_secs != 0
            else 0.0
        )

    @property
    def error_rate(self) -> float:
        """Complement of the validation rate; 0.0 when no rows were processed."""
        return 100.0 - self.validation_rate if self.rows_processed != 0 else 0.0

    # ------------------------------------------------------------------
    # Alternate constructors
    # ------------------------------------------------------------------

    @classmethod
    def create_success(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        rows_processed: int,
        rows_written: int,
        validation_rate: float,
        step_type: str | None = None,
        table_fqn: str | None = None,
        write_mode: str | None = None,
        input_rows: int | None = None,
    ) -> StepResult:
        """
        Build a result for a step that completed successfully.

        ``duration_secs`` is derived from ``end_time - start_time`` and
        ``error_message`` is left as None.
        """
        elapsed = end_time - start_time
        field_values = {
            "step_name": step_name,
            "phase": phase,
            "success": True,
            "start_time": start_time,
            "end_time": end_time,
            "duration_secs": elapsed.total_seconds(),
            "rows_processed": rows_processed,
            "rows_written": rows_written,
            "validation_rate": validation_rate,
            "error_message": None,
            "step_type": step_type,
            "table_fqn": table_fqn,
            "write_mode": write_mode,
            "input_rows": input_rows,
        }
        return cls(**field_values)

    @classmethod
    def create_failure(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        error_message: str,
        step_type: str | None = None,
        table_fqn: str | None = None,
        write_mode: str | None = None,
        input_rows: int | None = None,
    ) -> StepResult:
        """
        Build a result for a step that failed.

        Row counts are set to 0 and the validation rate to 0.0;
        ``duration_secs`` is derived from ``end_time - start_time``.
        """
        elapsed = end_time - start_time
        field_values = {
            "step_name": step_name,
            "phase": phase,
            "success": False,
            "start_time": start_time,
            "end_time": end_time,
            "duration_secs": elapsed.total_seconds(),
            "rows_processed": 0,
            "rows_written": 0,
            "validation_rate": 0.0,
            "error_message": error_message,
            "step_type": step_type,
            "table_fqn": table_fqn,
            "write_mode": write_mode,
            "input_rows": input_rows,
        }
        return cls(**field_values)


@dataclass
class ExecutionResult(BaseModel):
    """
    Aggregate outcome of one pipeline run.

    Attributes:
        context: Execution context
        step_results: Results for each step
        metrics: Overall execution metrics
        success: Whether the entire pipeline succeeded
    """

    context: ExecutionContext
    step_results: list[StepResult]
    metrics: PipelineMetrics
    success: bool

    def validate(self) -> None:
        """
        Verify that every field holds a value of the expected type.

        Raises:
            PipelineConfigurationError: For the first field (in declaration
                order) whose value has the wrong type.
        """
        type_checks = (
            (self.context, ExecutionContext, "Context must be an ExecutionContext instance"),
            (self.step_results, list, "Step results must be a list"),
            (self.metrics, PipelineMetrics, "Metrics must be a PipelineMetrics instance"),
            (self.success, bool, "Success must be a boolean"),
        )
        for value, expected_type, message in type_checks:
            if not isinstance(value, expected_type):
                raise PipelineConfigurationError(message)

    @classmethod
    def from_context_and_results(
        cls, context: ExecutionContext, step_results: list[StepResult]
    ) -> ExecutionResult:
        """
        Build an execution result from a context and its step results.

        Metrics are computed with ``PipelineMetrics.from_step_results``. The
        run counts as successful when every step succeeded, which is also the
        case for an empty list.
        """
        aggregated_metrics = PipelineMetrics.from_step_results(step_results)
        every_step_ok = all(step.success for step in step_results)
        return cls(
            context=context,
            step_results=step_results,
            metrics=aggregated_metrics,
            success=every_step_ok,
        )

In [None]:
# Module: pipeline_builder_base.models.types (pipeline_builder_base)
#
# Dependencies: None (base module)

from typing import Dict, List, Protocol, Union

# Specific types for model values instead of Any.
# ModelValue: scalar, list-of-str or str-to-str mapping, or None.
ModelValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
# ResourceValue: the same union as ModelValue, but None is not allowed.
ResourceValue = Union[str, int, float, bool, List[str], Dict[str, str]]

# Generic type for pipeline results.
# TypeVar is not imported in this cell; it comes from the notebook's
# top-level import cell.
T = TypeVar("T")


class Validatable(Protocol):
    """
    Structural type for objects that can check their own consistency.

    Any class that exposes a ``validate()`` method with this signature
    satisfies the protocol.
    """

    def validate(self) -> None:
        """
        Validate the object and raise an exception if it is invalid.

        The protocol does not fix the exception type. The ``validate``
        implementations visible in this notebook raise ``ValueError`` or
        ``PipelineConfigurationError``.
        """
        ...


class Serializable(Protocol):
    """
    Structural type for objects that can be serialized.

    Implementers provide both a dictionary form and a JSON string form.
    """

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert the object to a dictionary of string keys to ``ModelValue`` values."""
        ...

    def to_json(self) -> str:
        """Convert the object to a JSON string."""
        ...

In [None]:
# Module: pipeline_builder_base.models.exceptions (pipeline_builder_base)
#
# Dependencies: None (base module)


class PipelineConfigurationError(ValueError):
    """
    Signals an invalid pipeline configuration.

    Derives from ``ValueError``, so ``except ValueError`` handlers catch it.

    NOTE(review): a later cell (module ``pipeline_builder.errors``) rebinds
    this name to ``ConfigurationError``; after that cell runs, this class is
    no longer what the name refers to.
    """


class PipelineExecutionError(RuntimeError):
    """
    Signals that executing a pipeline failed.

    Derives from ``RuntimeError``, so ``except RuntimeError`` handlers catch it.

    NOTE(review): a later cell (module ``pipeline_builder.errors``) rebinds
    this name to ``ExecutionError``; after that cell runs, this class is no
    longer what the name refers to.
    """

In [None]:
# Module: pipeline_builder_base.writer.models (pipeline_builder_base)
#
# Dependencies: ExecutionResult, StepResult (models module; defined in earlier notebook cells)

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import Dict, Literal

# from ..models import ExecutionResult, StepResult  # Removed: defined in notebook cells above


# ============================================================================
# Enums
# ============================================================================


class WriteMode(Enum):
    """
    Write mode for log operations.

    The member is passed through to the engine-specific ``_write_log_rows``
    implementation. How each mode is realised is decided by that
    implementation and is not shown in this cell.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"  # default mode of WriterConfig and BaseLogWriter.write
    MERGE = "merge"
    IGNORE = "ignore"


# ============================================================================
# TypedDict Definitions
# ============================================================================


class LogRow(TypedDict):
    """
    Enhanced log row with full type safety and framework integration.

    This is an engine-agnostic log row structure that can be used
    by both Spark and SQL implementations.

    ``create_log_rows_from_execution_result`` emits one row with
    ``phase == "pipeline"`` for the run as a whole, followed by one row per
    step result.
    """

    # Run-level information (identical on every row of the same run)
    run_id: str
    run_mode: Literal["initial", "incremental", "full_refresh", "validation_only"]
    run_started_at: datetime | None
    run_ended_at: datetime | None

    # Execution context
    execution_id: str
    pipeline_id: str
    schema: str

    # Step-level information
    phase: Literal["bronze", "silver", "gold", "pipeline"]
    step_name: str
    step_type: str

    # Timing information
    start_time: datetime | None
    end_time: datetime | None
    duration_secs: float

    # Table information
    table_fqn: str | None
    write_mode: Literal["overwrite", "append"] | None

    # Data metrics
    input_rows: int | None
    output_rows: int | None
    rows_written: int | None
    rows_processed: int
    table_total_rows: int | None  # Total rows in table after this write

    # Validation metrics
    valid_rows: int
    invalid_rows: int
    validation_rate: float  # percentage; StepResult.validate enforces 0-100

    # Execution status
    success: bool
    error_message: str | None

    # Performance metrics
    memory_usage_mb: float | None
    cpu_usage_percent: float | None

    # Metadata
    metadata: Dict[str, Any]


class WriterMetrics(TypedDict):
    """
    Metrics for writer operations.

    This is the shape returned by ``BaseLogWriter._get_metrics`` and therefore
    by the writer's public ``append`` and ``write`` methods.
    """

    total_writes: int
    successful_writes: int
    failed_writes: int
    total_duration_secs: float  # seconds
    avg_write_duration_secs: float  # seconds
    total_rows_written: int
    memory_usage_peak_mb: float  # megabytes


# ============================================================================
# Configuration Models
# ============================================================================


@dataclass
class WriterConfig:
    """
    Settings that control how a LogWriter behaves.

    Holds the target table location, the default write mode, feature flags
    and batching/retry tuning values.
    """

    table_schema: str
    table_name: str
    write_mode: WriteMode = WriteMode.APPEND
    enable_analytics: bool = True
    enable_monitoring: bool = True
    enable_quality_checks: bool = True
    batch_size: int = 1000
    max_retries: int = 3
    retry_delay_secs: float = 1.0

    def validate(self) -> None:
        """
        Check the configuration values.

        Raises:
            ValueError: If ``table_schema`` or ``table_name`` is empty or not
                a string, ``batch_size`` is below 1, or ``max_retries`` /
                ``retry_delay_secs`` is negative.
        """
        # Both identifier fields share one rule: a non-empty str.
        for field_name in ("table_schema", "table_name"):
            candidate = getattr(self, field_name)
            if not candidate or not isinstance(candidate, str):
                raise ValueError(f"{field_name} must be a non-empty string")

        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if self.max_retries < 0:
            raise ValueError("max_retries must be non-negative")

        if self.retry_delay_secs < 0:
            raise ValueError("retry_delay_secs must be non-negative")


# ============================================================================
# Utility Functions
# ============================================================================


def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: str = "initial",
    metadata: Dict[str, Any] | None = None,
) -> list[LogRow]:
    """
    Turn an execution result into a flat list of log rows.

    The first row summarises the whole run (``phase == "pipeline"``); it is
    followed by one row per entry in ``execution_result.step_results``, in
    order. The function is engine-agnostic, so engine-specific writers can
    post-process the rows as needed.

    Args:
        execution_result: The execution result to convert
        run_id: Run identifier stamped on every row
        run_mode: Mode of the run, stamped on every row
        metadata: Additional metadata; attached to the pipeline-level row
            only (step rows get an empty dict)

    Returns:
        List of log rows, pipeline-level row first
    """
    ctx = execution_result.context

    def run_columns() -> dict:
        # Run- and context-level columns that lead every row.
        return {
            "run_id": run_id,
            "run_mode": run_mode,  # type: ignore[typeddict-item]
            "run_started_at": ctx.start_time,
            "run_ended_at": ctx.end_time,
            "execution_id": ctx.execution_id,
            "pipeline_id": ctx.pipeline_id,
            "schema": ctx.schema,
        }

    # Pipeline-level summary row: no table and no row counts of its own.
    pipeline_row: LogRow = {  # type: ignore[typeddict-item]
        **run_columns(),
        "phase": "pipeline",
        "step_name": "pipeline_execution",
        "step_type": "pipeline",
        "start_time": ctx.start_time,
        "end_time": ctx.end_time,
        "duration_secs": ctx.duration_secs or 0.0,
        "table_fqn": None,
        "write_mode": None,
        "input_rows": None,
        "output_rows": None,
        "rows_written": None,
        "rows_processed": 0,
        "table_total_rows": None,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "success": execution_result.success,
        "error_message": None,
        "memory_usage_mb": None,
        "cpu_usage_percent": None,
        "metadata": metadata or {},
    }
    rows = [pipeline_row]

    # One row per executed step, in execution order.
    for step in execution_result.step_results:
        rows.append(
            {  # type: ignore[typeddict-item]
                **run_columns(),
                "phase": step.phase.value,
                "step_name": step.step_name,
                "step_type": step.step_type or "unknown",
                "start_time": step.start_time,
                "end_time": step.end_time,
                "duration_secs": step.duration_secs,
                "table_fqn": step.table_fqn,
                "write_mode": step.write_mode,
                "input_rows": step.input_rows,
                "output_rows": step.rows_written,
                "rows_written": step.rows_written,
                "rows_processed": step.rows_processed,
                "table_total_rows": None,
                "valid_rows": step.rows_processed,
                "invalid_rows": 0,
                "validation_rate": step.validation_rate,
                "success": step.success,
                "error_message": step.error_message,
                "memory_usage_mb": None,
                "cpu_usage_percent": None,
                "metadata": {},
            }
        )

    return rows


def validate_log_data(log_rows: list[LogRow]) -> None:
    """
    Check that every log row carries the mandatory keys.

    Only the presence of ``run_id``, ``phase`` and ``step_name`` is checked;
    values are not inspected. An empty (or falsy) input passes.

    Args:
        log_rows: List of log rows to validate

    Raises:
        ValueError: On the first row that lacks one or more mandatory keys
    """
    mandatory_keys = {"run_id", "phase", "step_name"}

    for position, entry in enumerate(log_rows or ()):
        absent = mandatory_keys.difference(entry.keys())
        if not absent:
            continue
        raise ValueError(f"Log row {position} missing required fields: {absent}")

In [None]:
# Module: pipeline_builder_base.writer.exceptions (pipeline_builder_base)
#
# Dependencies: SparkForgeError (errors module; defined in an earlier notebook cell)

# from ..errors import SparkForgeError  # Removed: defined in notebook cells above


class WriterError(SparkForgeError):
    """Root of the writer exception hierarchy; every writer error derives from it."""


class WriterConfigurationError(WriterError):
    """Signals an invalid writer configuration."""


class WriterValidationError(WriterError):
    """Signals that validation performed by the writer failed."""


class WriterTableError(WriterError):
    """Signals that an operation on the log table failed."""


class WriterDataQualityError(WriterError):
    """Signals that a data quality check failed."""


class WriterPerformanceError(WriterError):
    """Signals that a performance problem was detected."""

In [None]:
# Module: pipeline_builder_base.writer.base (pipeline_builder_base)
#
# Dependencies: PipelineLogger, ExecutionResult, and the writer models (all defined in earlier notebook cells)

from __future__ import annotations

from typing import Dict

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..models import ExecutionResult  # Removed: defined in notebook cells above
# from .models import LogRow, WriteMode, WriterConfig, WriterMetrics, create_log_rows_from_execution_result  # Removed: defined in notebook cells above


class BaseLogWriter(ABC):
    """
    Abstract base class for LogWriter implementations.

    The public methods (``create_table``, ``append``, ``write``, ``read``)
    hold the engine-independent control flow: converting an execution result
    into log rows and deciding whether the table must be created first. All
    storage access is delegated to four hooks.

    Subclasses must implement:
    - _write_log_rows() - Engine-specific write operation
    - _read_log_table() - Engine-specific read operation
    - _table_exists() - Engine-specific table existence check
    - _create_table() - Engine-specific table creation
    """

    def __init__(
        self,
        schema: str,
        table_name: str,
        config: WriterConfig | None = None,
        logger: PipelineLogger | None = None,
    ) -> None:
        """
        Initialize the base LogWriter.

        Args:
            schema: Database schema name
            table_name: Table name
            config: Writer configuration (optional). When omitted, a default
                APPEND-mode configuration is built from ``schema`` and
                ``table_name``.
            logger: Pipeline logger (optional). When omitted, a new
                ``PipelineLogger`` is created.

        Raises:
            ValueError: If the (given or generated) configuration fails
                ``WriterConfig.validate``.
        """
        self.schema = schema
        self.table_name = table_name
        self.logger = logger or PipelineLogger()

        # Create config from schema/table_name if not provided
        if config is None:
            config = WriterConfig(
                table_schema=schema,
                table_name=table_name,
                write_mode=WriteMode.APPEND,
            )
        self.config = config
        self.config.validate()

    @property
    def table_fqn(self) -> str:
        """Fully qualified table name, built as ``<schema>.<table_name>``."""
        return f"{self.schema}.{self.table_name}"

    def create_table(self, execution_result: ExecutionResult) -> None:
        """
        Create the log table from the first execution result.

        If the table already exists, a warning is logged and nothing is
        written. Otherwise the table is created and the rows derived from
        ``execution_result`` are written to it in APPEND mode.

        Args:
            execution_result: The execution result to create table from
        """
        if self._table_exists():
            self.logger.warning(
                f"Table {self.table_fqn} already exists, skipping creation"
            )
            return

        self.logger.info(f"Creating log table {self.table_fqn}")
        log_rows = create_log_rows_from_execution_result(
            execution_result,
            run_id=execution_result.context.run_id,
            run_mode=execution_result.context.run_mode,
        )

        if not log_rows:
            self.logger.warning("No log rows to create table from")
            return

        # The rows double as the schema sample and as the first content.
        self._create_table(log_rows)
        self._write_log_rows(log_rows, WriteMode.APPEND)

    def append(self, execution_result: ExecutionResult) -> WriterMetrics:
        """
        Append execution result to the log table.

        If the table does not exist yet, it is created via ``create_table``,
        which also writes the rows.

        Args:
            execution_result: The execution result to append

        Returns:
            Writer metrics
        """
        if not self._table_exists():
            self.logger.warning(f"Table {self.table_fqn} does not exist, creating it")
            self.create_table(execution_result)
            return self._get_metrics()

        log_rows = create_log_rows_from_execution_result(
            execution_result,
            run_id=execution_result.context.run_id,
            run_mode=execution_result.context.run_mode,
        )

        if not log_rows:
            self.logger.warning("No log rows to append")
            return self._get_metrics()

        self._write_log_rows(log_rows, WriteMode.APPEND)
        return self._get_metrics()

    def write(
        self, execution_result: ExecutionResult, mode: WriteMode = WriteMode.APPEND
    ) -> WriterMetrics:
        """
        Write execution result to the log table.

        If the table does not exist yet, it is created via ``create_table``
        and the rows are written exactly once, whatever ``mode`` is. If the
        table exists, the rows are written with ``mode``; for OVERWRITE the
        table is cleared first.

        Args:
            execution_result: The execution result to write
            mode: Write mode (APPEND or OVERWRITE)

        Returns:
            Writer metrics
        """
        if not self._table_exists():
            self.logger.info(f"Table {self.table_fqn} does not exist, creating it")
            # create_table() already persists this result's rows. Return here,
            # as append() does; falling through would write the same rows a
            # second time and duplicate them in APPEND mode.
            self.create_table(execution_result)
            return self._get_metrics()

        if mode == WriteMode.OVERWRITE:
            self.logger.info(f"Overwriting table {self.table_fqn}")
            # For overwrite, we need to clear the table first
            # This is engine-specific, so subclasses should override if needed
            self._write_log_rows([], WriteMode.OVERWRITE)

        log_rows = create_log_rows_from_execution_result(
            execution_result,
            run_id=execution_result.context.run_id,
            run_mode=execution_result.context.run_mode,
        )

        if not log_rows:
            self.logger.warning("No log rows to write")
            return self._get_metrics()

        self._write_log_rows(log_rows, mode)
        return self._get_metrics()

    def read(self, limit: int | None = None) -> list[LogRow]:
        """
        Read log rows from the table.

        Args:
            limit: Maximum number of rows to read (None for all)

        Returns:
            List of log rows; an empty list (with a logged warning) when the
            table does not exist
        """
        if not self._table_exists():
            self.logger.warning(f"Table {self.table_fqn} does not exist")
            return []

        return self._read_log_table(limit)

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def _write_log_rows(self, log_rows: list[LogRow], mode: WriteMode) -> None:
        """
        Write log rows to the storage system.

        This is an engine-specific operation that must be implemented
        by subclasses. ``write`` calls it with an empty list and
        ``WriteMode.OVERWRITE`` to clear an existing table, so
        implementations should accept an empty ``log_rows``.

        Args:
            log_rows: List of log rows to write
            mode: Write mode
        """
        pass

    @abstractmethod
    def _read_log_table(self, limit: int | None = None) -> list[LogRow]:
        """
        Read log rows from the storage system.

        This is an engine-specific operation that must be implemented
        by subclasses. It is only called after ``_table_exists`` returned
        True.

        Args:
            limit: Maximum number of rows to read (None for all)

        Returns:
            List of log rows
        """
        pass

    @abstractmethod
    def _table_exists(self) -> bool:
        """
        Check if the log table exists.

        This is an engine-specific operation that must be implemented
        by subclasses.

        Returns:
            True if table exists, False otherwise
        """
        pass

    @abstractmethod
    def _create_table(self, sample_rows: list[LogRow]) -> None:
        """
        Create the log table with appropriate schema.

        This is an engine-specific operation that must be implemented
        by subclasses. The base class writes the rows itself afterwards, so
        this hook only needs to create the table.

        Args:
            sample_rows: Sample log rows to infer schema from
        """
        pass

    def _get_metrics(self) -> WriterMetrics:
        """
        Get writer metrics.

        This default implementation returns fixed placeholder values (one
        successful write, everything else zero) and does not track real
        activity. Subclasses can override it to report detailed metrics.

        Returns:
            Writer metrics
        """
        return {
            "total_writes": 1,
            "successful_writes": 1,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }

In [None]:
# Module: pipeline_builder_base.validation.utils (pipeline_builder_base)
#
# Dependencies: None (base module)

from __future__ import annotations


def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide ``numerator`` by ``denominator`` without raising on bad operands.

    Args:
        numerator: The numerator
        denominator: The denominator
        default: Value returned when the division cannot be performed

    Returns:
        ``numerator / denominator``, or ``default`` when either operand is
        None or the denominator equals zero
    """
    if numerator is None or denominator is None:
        return default
    if denominator == 0:
        return default
    quotient = numerator / denominator
    return quotient

In [None]:
# Module: pipeline_builder.logging (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

# Re-export from base for backward compatibility
# from .logging import PipelineLogger  # Removed: defined in notebook cells above

# Kept from the original module. The import above is commented out, so this
# cell only assigns the list; a later cell (pipeline_builder.errors) assigns
# `__all__` again and replaces this value.
__all__ = ["PipelineLogger"]

In [None]:
# Module: pipeline_builder.compat (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

import os
from typing import Type

# Engine preference, read once when this cell runs: lower-cased value of the
# SPARKFORGE_ENGINE environment variable, "auto" when it is unset.
# _select_engine() recognises "pyspark"/"spark"/"real" and "mock"/"mockspark";
# any other value is handled like "auto".
_ENGINE = os.getenv("SPARKFORGE_ENGINE", "auto").lower()


def _try_import_pyspark() -> Optional[
    Tuple[Type[Any], Type[Any], Type[Any], Any, Any, Type[Exception]]
]:
    """
    Import the PySpark pieces the compat layer exposes.

    Returns:
        ``(DataFrame, SparkSession, Column, functions, types,
        AnalysisException)`` when every import succeeds, otherwise None.
        Any exception raised while importing is swallowed.
    """
    try:
        from pyspark.sql import (
            Column as _Column,
            DataFrame as _DataFrame,
            SparkSession as _SparkSession,
            functions as _F,
            types as _types,
        )
        from pyspark.sql.utils import AnalysisException as _AnalysisException
    except Exception:
        return None
    else:
        return _DataFrame, _SparkSession, _Column, _F, _types, _AnalysisException


def _try_import_mockspark() -> Optional[
    Tuple[Type[Any], Type[Any], Type[Any], Any, Any, Type[Exception]]
]:
    """
    Try to import mock-spark modules.

    Returns:
        The 6-tuple ``(DataFrame, SparkSession, Column, functions, types,
        AnalysisException)`` on success, otherwise None.

    NOTE(review): the try-block below contains no import statements, so the
    names on the return line are not bound inside this function. Unless they
    exist as module-level globals, the lookup raises NameError, which the
    broad ``except`` turns into None. In that case ``_select_engine`` can
    never pick mock-spark. The mock-spark imports were presumably dropped
    when this notebook was generated; restore them from the original module.
    """
    try:
        return _DataFrame, _SparkSession, _Column, _F, _types, _AnalysisException
    except Exception:
        # Log the error for debugging but don't fail
        # Note: mock-spark 3.1.0 has Python 3.8 compatibility issues
        # This is a known issue with the mock-spark package
        # NOTE(review): nothing is logged here; the error is discarded silently.
        return None


def _select_engine() -> Tuple[
    str, Tuple[Type[Any], Type[Any], Type[Any], Any, Any, Type[Exception]]
]:
    """
    Pick the Spark engine according to ``_ENGINE`` and what can be imported.

    An explicit engine request must be satisfiable. Any other value of
    ``_ENGINE`` is handled as auto mode: PySpark first, then mock-spark.

    Returns:
        ``(engine_name, bundle)`` where ``engine_name`` is ``"pyspark"`` or
        ``"mock"`` and ``bundle`` is the 6-tuple returned by the matching
        ``_try_import_*`` helper.

    Raises:
        ImportError: If the requested engine, or in auto mode both engines,
            cannot be imported.
    """
    # Explicit request for PySpark.
    if _ENGINE in ("pyspark", "spark", "real"):
        bundle = _try_import_pyspark()
        if bundle is not None:
            return "pyspark", bundle
        raise ImportError(
            "SPARKFORGE_ENGINE=pyspark but pyspark is not importable. "
            "Install with: pip install sparkforge[pyspark]"
        )

    # Explicit request for mock-spark.
    if _ENGINE in ("mock", "mockspark"):
        bundle = _try_import_mockspark()
        if bundle is not None:
            return "mock", bundle
        raise ImportError(
            "SPARKFORGE_ENGINE=mock but mock-spark is not importable. "
            "Install with: pip install sparkforge[mock]"
        )

    # auto mode: prefer PySpark if available, otherwise mock-spark
    candidates = (
        ("pyspark", _try_import_pyspark),
        ("mock", _try_import_mockspark),
    )
    for engine_name, importer in candidates:
        bundle = importer()
        if bundle is not None:
            return engine_name, bundle

    raise ImportError(
        "Neither pyspark nor mock-spark could be imported. "
        "Install with: pip install sparkforge[pyspark] or pip install sparkforge[mock]"
    )


# Resolve the engine once, when this cell runs. This rebinds the notebook-level
# names DataFrame, SparkSession, Column, F and AnalysisException (imported
# from PySpark in the top import cell) to the selected engine's objects, and
# binds `types` to the engine's types module.
# _select_engine() raises ImportError if no engine is usable, which aborts the cell.
_ENGINE_NAME, (DataFrame, SparkSession, Column, F, types, AnalysisException) = (
    _select_engine()
)


def is_mock_spark() -> bool:
    """Report whether the engine selected for this session is mock-spark."""
    return True if _ENGINE_NAME == "mock" else False


def compat_name() -> str:
    """Return the selected engine's name (``"pyspark"`` or ``"mock"``) as a string."""
    engine_label = str(_ENGINE_NAME)
    return engine_label


def require_pyspark(message: str | None = None) -> None:
    """
    Guard for operations that only work on real PySpark.

    Args:
        message: Custom error text; a default message is used when it is
            None or empty

    Raises:
        RuntimeError: If the session is running on mock-spark
    """
    if not is_mock_spark():
        return

    default_text = "This operation requires PySpark and is not supported in mock mode"
    raise RuntimeError(message or default_text)


# Function shims: thin wrappers that delegate to the selected engine's `F` module, with fallbacks for mock mode where needed
def desc(col_name: str) -> Any:
    """
    Build a descending sort expression for ``col_name``.

    Under PySpark this is ``F.desc(col_name)``. For any other engine a
    ``(col_name, False)`` tuple is returned, intended for an ``orderBy``
    implementation that understands it.
    """
    if _ENGINE_NAME != "pyspark":
        # mock-spark: return a tuple understood by orderBy implementation if present
        return (col_name, False)
    return F.desc(col_name)


def col(col_name: str) -> Any:
    """Return the selected engine's column expression for ``col_name`` (``F.col``)."""
    column_factory = F.col
    return column_factory(col_name)


def lit(value: Any) -> Any:
    """Wrap ``value`` in the selected engine's literal column (``F.lit``)."""
    literal_factory = F.lit
    return literal_factory(value)


def current_timestamp() -> Any:
    """
    Return the engine's current-timestamp expression.

    Uses ``F.current_timestamp()`` when the engine provides a callable of
    that name. Otherwise falls back to a literal column holding the local
    time as an ISO-8601 string.
    """
    engine_impl = getattr(F, "current_timestamp", None)
    if not callable(engine_impl):
        # Fallback: literal current timestamp string
        import datetime as _dt

        now_text = _dt.datetime.now().isoformat()
        return lit(now_text)
    return engine_impl()


# Export Window if available.
# After this block the name `Window` is always bound: to PySpark's Window when
# the PySpark engine is active and the import works, otherwise to a minimal
# stand-in class.
if _ENGINE_NAME == "pyspark":
    try:
        from pyspark.sql import Window  # type: ignore[import-untyped]
    except ImportError:
        # Fallback stand-in, used when the PySpark engine was selected but
        # Window itself cannot be imported.
        class Window:
            """Placeholder Window exposing only ``orderBy``."""

            @staticmethod
            def orderBy(*cols: Any) -> Any:
                """Accept any columns and return None (no window spec is built)."""
                return None
else:
    # Mock Window for mock-spark (same stand-in as the fallback above)
    class Window:
        """Placeholder Window exposing only ``orderBy``."""

        @staticmethod
        def orderBy(*cols: Any) -> Any:
            """Accept any columns and return None (no window spec is built)."""
            return None

In [None]:
# Module: pipeline_builder.constants (pipeline_builder)
#
# Dependencies: None (base module)

# Memory and Size Constants (binary multiples: 1 KB = 1024 bytes)
BYTES_PER_KB = 1024
BYTES_PER_MB = BYTES_PER_KB * 1024
BYTES_PER_GB = BYTES_PER_MB * 1024

# Default Memory Limits (megabytes)
DEFAULT_MAX_MEMORY_MB = 1024
DEFAULT_CACHE_MEMORY_MB = 512

# File Size Constants
DEFAULT_MAX_FILE_SIZE_MB = 10
DEFAULT_BACKUP_COUNT = 5

# Performance Constants
DEFAULT_CACHE_PARTITIONS = 200
DEFAULT_SHUFFLE_PARTITIONS = 200

# Validation Constants
# Presumably minimum validation rates in percent, on the same 0-100 scale as
# StageStats.validation_rate. TODO confirm against the code that consumes them.
DEFAULT_BRONZE_THRESHOLD = 95.0
DEFAULT_SILVER_THRESHOLD = 98.0
DEFAULT_GOLD_THRESHOLD = 99.0

# Timeout Constants (in seconds)
DEFAULT_TIMEOUT_SECONDS = 300
DEFAULT_RETRY_TIMEOUT_SECONDS = 60

# Logging Constants
DEFAULT_LOG_LEVEL = "INFO"
DEFAULT_VERBOSE = True

# Schema Constants
DEFAULT_SCHEMA = "default"
TEST_SCHEMA = "test_schema"

# Error Constants
MAX_ERROR_MESSAGE_LENGTH = 1000
MAX_STACK_TRACE_LINES = 50

# Performance Monitoring Constants
DEFAULT_METRICS_INTERVAL_SECONDS = 30
DEFAULT_ALERT_THRESHOLD_PERCENT = 80.0

In [None]:
# Module: pipeline_builder.errors (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

# Re-export from base for backward compatibility
# from .errors import (  # Removed: defined in notebook cells above
# ConfigurationError,
# DataError,
# ErrorCategory,
# ErrorContext,
# ErrorContextValue,
# ErrorSeverity,
# ErrorSuggestions,
# ExecutionError,
# PerformanceError,
# PipelineValidationError,
# ResourceError,
# SparkForgeError,
# SystemError,
# ValidationError,
# )

# Public names of the original errors module. In this notebook the classes
# themselves come from earlier cells (see the commented-out import above).
__all__ = [
    "SparkForgeError",
    "ValidationError",
    "PipelineValidationError",
    "ConfigurationError",
    "ExecutionError",
    "DataError",
    "SystemError",
    "PerformanceError",
    "ResourceError",
    "ErrorSeverity",
    "ErrorCategory",
    "ErrorContext",
    "ErrorContextValue",
    "ErrorSuggestions",
]

# Backward compatibility aliases
# NOTE(review): PipelineConfigurationError and PipelineExecutionError are also
# defined as classes in an earlier cell (subclasses of ValueError and
# RuntimeError). These assignments rebind both names, so code that runs after
# this cell raises and catches ConfigurationError / ExecutionError instead.
# Confirm that handlers relying on ValueError / RuntimeError still match.
PipelineValidationError = ValidationError
PipelineConfigurationError = ConfigurationError
PipelineExecutionError = ExecutionError
TableOperationError = DataError
DependencyError = ValidationError
StepError = ExecutionError
PipelineError = ExecutionError

In [None]:
# Module: pipeline_builder.reporting (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

# TypedDict is available in typing for Python 3.8+
try:
    from typing import TypedDict
except ImportError:
    from typing_extensions import TypedDict

# Re-export from base
# from .reporting import (  # Removed: defined in notebook cells above
# format_duration,
# safe_divide,
# )
# from .models import StageStats  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================


class ValidationReport(TypedDict):
    """Validation report structure.

    Shape of the dictionary returned by ``create_validation_dict``.

    NOTE(review): the pipeline_builder.engine.spark_engine cell later imports
    a different ``ValidationReport`` from ``abstracts.reports.validation``,
    which rebinds this name when the notebook is run top to bottom.
    """

    # Stage and step identifiers copied from the stats object; both are None
    # when no stats object was supplied.
    stage: str | None
    step: str | None
    # Row counts copied from the stats object (all 0 when no stats were supplied).
    total_rows: int
    valid_rows: int
    invalid_rows: int
    # Rounded to 2 decimals by create_validation_dict; 100.0 when no stats
    # were supplied. Presumably a percentage - TODO confirm against StageStats.
    validation_rate: float
    # Rounded to 3 decimals by create_validation_dict; 0.0 when no stats were supplied.
    duration_secs: float
    # Timestamps passed through unchanged from the caller.
    start_at: datetime
    end_at: datetime


class TransformReport(TypedDict):
    """Transform operation report structure.

    Shape of the dictionary returned by ``create_transform_dict``.

    NOTE(review): the pipeline_builder.engine.spark_engine cell later imports
    a different ``TransformReport`` from ``abstracts.reports.transform``,
    which rebinds this name when the notebook is run top to bottom.
    """

    # Row counts, coerced with int() by create_transform_dict.
    input_rows: int
    output_rows: int
    # Rounded to 3 decimals by create_transform_dict.
    duration_secs: float
    # Coerced with bool() by create_transform_dict.
    skipped: bool
    # Timestamps passed through unchanged from the caller.
    start_at: datetime
    end_at: datetime


class WriteReport(TypedDict):
    """Write operation report structure.

    Shape of the dictionary returned by ``create_write_dict``.

    NOTE(review): the pipeline_builder.engine.spark_engine cell later imports
    a different ``WriteReport`` from ``abstracts.reports.write``, which
    rebinds this name when the notebook is run top to bottom.
    """

    # Write mode string, passed through unchanged.
    mode: str
    # Coerced with int() by create_write_dict from its ``rows`` argument.
    rows_written: int
    # Rounded to 3 decimals by create_write_dict.
    duration_secs: float
    # Fully qualified table name, passed through unchanged.
    table_fqn: str
    # Coerced with bool() by create_write_dict.
    skipped: bool
    # Timestamps passed through unchanged from the caller.
    start_at: datetime
    end_at: datetime


class ExecutionSummary(TypedDict):
    """Execution summary nested structure.

    Stored under the ``execution_summary`` key of ``SummaryReport``.
    """

    # Step counts, passed through unchanged by create_summary_report.
    total_steps: int
    successful_steps: int
    failed_steps: int
    # Percentage of successful steps (successful * 100 / total), rounded to
    # 2 decimals; 0.0 when there are no steps.
    success_rate: float
    # Computed as 100 - success_rate (not from failed_steps), rounded to
    # 2 decimals; 0.0 when there are no steps.
    failure_rate: float


class PerformanceMetrics(TypedDict):
    """Performance metrics nested structure.

    Stored under the ``performance_metrics`` key of ``SummaryReport``.
    """

    # Total duration in seconds, rounded to 3 decimals by create_summary_report.
    total_duration_secs: float
    # Result of format_duration() applied to the unrounded total duration.
    formatted_duration: str
    # Average validation rate, rounded to 2 decimals by create_summary_report.
    avg_validation_rate: float


class DataMetrics(TypedDict):
    """Data metrics nested structure.

    Stored under the ``data_metrics`` key of ``SummaryReport``.
    """

    # Row totals, passed through unchanged by create_summary_report.
    total_rows_processed: int
    total_rows_written: int
    # rows written * 100 / rows processed, rounded to 2 decimals; the
    # safe_divide default of 0.0 is passed for the division.
    processing_efficiency: float


class SummaryReport(TypedDict):
    """Complete summary report structure.

    Shape of the dictionary returned by ``create_summary_report``: three
    nested sections, each with its own TypedDict.
    """

    # Step counts and success/failure percentages.
    execution_summary: ExecutionSummary
    # Duration (raw and formatted) and average validation rate.
    performance_metrics: PerformanceMetrics
    # Row totals and the written/processed percentage.
    data_metrics: DataMetrics


def create_validation_dict(
    stats: StageStats | None, *, start_at: datetime, end_at: datetime
) -> ValidationReport:
    """
    Build the validation section of a step report.

    When ``stats`` is None a placeholder report is produced: no stage/step,
    zero row counts, a validation rate of 100.0 and a duration of 0.0.
    Otherwise the values are copied from ``stats``, with the validation rate
    rounded to 2 decimals and the duration rounded to 3 decimals.

    Args:
        stats: Stage statistics, or None if no validation was recorded
        start_at: Start time (passed through unchanged)
        end_at: End time (passed through unchanged)

    Returns:
        Validation dictionary
    """
    # Both variants end with the same two timestamp entries.
    window = {"start_at": start_at, "end_at": end_at}

    if stats is not None:
        return {
            "stage": stats.stage,
            "step": stats.step,
            "total_rows": stats.total_rows,
            "valid_rows": stats.valid_rows,
            "invalid_rows": stats.invalid_rows,
            "validation_rate": round(stats.validation_rate, 2),
            "duration_secs": round(stats.duration_secs, 3),
            **window,
        }

    # No statistics available: report an empty, fully valid run.
    return {
        "stage": None,
        "step": None,
        "total_rows": 0,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "duration_secs": 0.0,
        **window,
    }


def create_transform_dict(
    input_rows: int,
    output_rows: int,
    duration_secs: float,
    skipped: bool,
    *,
    start_at: datetime,
    end_at: datetime,
) -> TransformReport:
    """
    Build the transform section of a step report.

    Row counts are coerced with ``int()``, the skip flag with ``bool()``,
    and the duration is rounded to 3 decimals. Timestamps are stored as given.

    Args:
        input_rows: Number of input rows
        output_rows: Number of output rows
        duration_secs: Duration in seconds
        skipped: Whether operation was skipped
        start_at: Start time
        end_at: End time

    Returns:
        Transform dictionary
    """
    report: TransformReport = dict(
        input_rows=int(input_rows),
        output_rows=int(output_rows),
        duration_secs=round(duration_secs, 3),
        skipped=bool(skipped),
        start_at=start_at,
        end_at=end_at,
    )
    return report


def create_write_dict(
    mode: str,
    rows: int,
    duration_secs: float,
    table_fqn: str,
    skipped: bool,
    *,
    start_at: datetime,
    end_at: datetime,
) -> WriteReport:
    """
    Build the write section of a step report.

    ``rows`` is coerced with ``int()`` and stored as ``rows_written``; the
    skip flag is coerced with ``bool()`` and the duration is rounded to
    3 decimals. Mode, table name and timestamps are stored as given.

    Args:
        mode: Write mode
        rows: Number of rows written
        duration_secs: Duration in seconds
        table_fqn: Fully qualified table name
        skipped: Whether operation was skipped
        start_at: Start time
        end_at: End time

    Returns:
        Write dictionary
    """
    report: WriteReport = dict(
        mode=mode,
        rows_written=int(rows),
        duration_secs=round(duration_secs, 3),
        table_fqn=table_fqn,
        skipped=bool(skipped),
        start_at=start_at,
        end_at=end_at,
    )
    return report


def create_summary_report(
    total_steps: int,
    successful_steps: int,
    failed_steps: int,
    total_duration: float,
    total_rows_processed: int,
    total_rows_written: int,
    avg_validation_rate: float,
) -> SummaryReport:
    """
    Assemble the three-section summary of a pipeline run.

    The success rate is the percentage of successful steps and the failure
    rate is its complement (``100 - success_rate``); both are 0.0 when there
    are no steps. Processing efficiency is rows written as a percentage of
    rows processed.

    Args:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        total_duration: Total duration in seconds
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate

    Returns:
        Summary report dictionary
    """
    has_steps = total_steps != 0
    success_rate = (
        safe_divide(successful_steps * 100.0, total_steps, 0.0) if has_steps else 0.0
    )
    # With no steps both rates are reported as zero rather than 0 / 100.
    failure_rate = 100.0 - success_rate if has_steps else 0.0

    execution_summary = {
        "total_steps": total_steps,
        "successful_steps": successful_steps,
        "failed_steps": failed_steps,
        "success_rate": round(success_rate, 2),
        "failure_rate": round(failure_rate, 2),
    }

    performance_metrics = {
        "total_duration_secs": round(total_duration, 3),
        "formatted_duration": format_duration(total_duration),
        "avg_validation_rate": round(avg_validation_rate, 2),
    }

    efficiency = safe_divide(total_rows_written * 100.0, total_rows_processed, 0.0)
    data_metrics = {
        "total_rows_processed": total_rows_processed,
        "total_rows_written": total_rows_written,
        "processing_efficiency": round(efficiency, 2),
    }

    return {
        "execution_summary": execution_summary,
        "performance_metrics": performance_metrics,
        "data_metrics": data_metrics,
    }

In [None]:
# Module: pipeline_builder.dependencies.graph (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict

# Module-level logger; DependencyGraph.get_execution_groups uses it to warn
# about dependencies that have no computed level.
logger = logging.getLogger(__name__)


class StepType(Enum):
    """Types of pipeline steps.

    Each member's value is the lowercase layer name.
    ``DependencyGraph.get_stats`` keys its per-type counts by these values.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


@dataclass
class StepNode:
    """Represents a single step in the dependency graph.

    Only ``name`` and ``step_type`` are required; every other field has a
    default, and the set/dict fields get a fresh container per instance.
    """

    # Step name; DependencyGraph uses it as the key of its node map.
    name: str
    # Layer of the step (bronze / silver / gold).
    step_type: StepType
    # Names of the steps this step depends on, and of the steps that depend
    # on it. DependencyGraph.add_dependency updates both sets together.
    dependencies: set[str] = field(default_factory=set)
    dependents: set[str] = field(default_factory=set)
    # The remaining fields are not read by DependencyGraph in this cell.
    # NOTE(review): presumably filled in by the analyzer or execution
    # planner - confirm against their callers.
    execution_group: int = 0
    can_run_parallel: bool = True
    estimated_duration: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)


class DependencyGraph:
    """
    Represents the dependency graph of a pipeline.

    This class provides efficient operations for dependency analysis,
    cycle detection, and execution planning.

    Edge direction: ``add_dependency(a, b)`` records that ``a`` depends on
    ``b``. ``_adjacency_list[a]`` holds the steps ``a`` depends on and
    ``_reverse_adjacency_list[b]`` holds the steps that depend on ``b``.

    Neighbours are always visited in sorted order, so the output of
    ``detect_cycles``, ``topological_sort`` and ``get_execution_groups`` is
    reproducible between runs instead of following set iteration order
    (which varies with string-hash randomisation).
    """

    def __init__(self) -> None:
        """Create an empty graph."""
        self.nodes: Dict[str, StepNode] = {}
        self._adjacency_list: Dict[str, set[str]] = defaultdict(set)
        self._reverse_adjacency_list: Dict[str, set[str]] = defaultdict(set)

    def add_node(self, node: StepNode) -> None:
        """
        Add a node to the dependency graph.

        Re-adding a name replaces the stored node and resets that node's own
        edge sets; edges recorded on other nodes are left untouched.
        """
        self.nodes[node.name] = node
        self._adjacency_list[node.name] = set()
        self._reverse_adjacency_list[node.name] = set()

    def add_dependency(self, from_step: str, to_step: str) -> None:
        """
        Record that ``from_step`` depends on ``to_step``.

        Raises:
            ValueError: If either step has not been added to the graph.
        """
        if from_step not in self.nodes or to_step not in self.nodes:
            raise ValueError(f"Steps {from_step} or {to_step} not found in graph")

        self._adjacency_list[from_step].add(to_step)
        self._reverse_adjacency_list[to_step].add(from_step)

        # Keep the per-node views in sync with the adjacency lists.
        self.nodes[from_step].dependencies.add(to_step)
        self.nodes[to_step].dependents.add(from_step)

    def get_dependencies(self, step_name: str) -> set[str]:
        """Return a copy of the step's dependencies (empty set if unknown)."""
        node = self.nodes.get(step_name)
        return node.dependencies.copy() if node is not None else set()

    def get_dependents(self, step_name: str) -> set[str]:
        """Return a copy of the step's dependents (empty set if unknown)."""
        node = self.nodes.get(step_name)
        return node.dependents.copy() if node is not None else set()

    def detect_cycles(self) -> list[list[str]]:
        """
        Detect cycles in the dependency graph using DFS.

        Returns:
            One list per detected cycle; each list starts and ends with the
            same step name (for example ``["a", "b", "a"]``).
        """
        visited: set[str] = set()
        rec_stack: set[str] = set()
        cycles: list[list[str]] = []

        def dfs(node: str, path: list[str]) -> None:
            if node in rec_stack:
                # Back edge: the cycle is the tail of the current path.
                cycle_start = path.index(node)
                cycles.append(path[cycle_start:] + [node])
                return

            if node in visited:
                return

            visited.add(node)
            rec_stack.add(node)
            path.append(node)

            # .get() avoids inserting keys into the defaultdict; sorting
            # keeps the reported cycles deterministic.
            for neighbor in sorted(self._adjacency_list.get(node, ())):
                dfs(neighbor, path)

            rec_stack.remove(node)
            path.pop()

        for node in self.nodes:
            if node not in visited:
                dfs(node, [])

        return cycles

    def topological_sort(self) -> list[str]:
        """
        Perform topological sort of the dependency graph.

        Returns nodes in an order such that dependencies come before dependents.
        Uses reverse adjacency list since add_dependency(A, B) means A depends on B,
        so B must come before A in the sort.

        Nodes without dependencies keep their insertion order; dependents of
        a node are released in sorted order. Nodes that take part in a cycle
        can never reach in-degree zero, so they are left out of the result
        and a warning is logged.
        """
        in_degree = dict.fromkeys(self.nodes, 0)

        # If A depends on B, then a B->A edge exists in the reverse list, so
        # a node's in-degree equals its number of dependencies.
        for node in self.nodes:
            for dependent in self._reverse_adjacency_list.get(node, ()):
                in_degree[dependent] += 1

        # Start from the nodes that have no dependencies.
        queue = deque(node for node, degree in in_degree.items() if degree == 0)
        result: list[str] = []

        while queue:
            node = queue.popleft()
            result.append(node)

            # Release the nodes that depend on this one.
            for dependent in sorted(self._reverse_adjacency_list.get(node, ())):
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)

        if len(result) != len(self.nodes):
            placed = set(result)
            unsorted_nodes = sorted(name for name in self.nodes if name not in placed)
            logger.warning(
                "Topological sort omitted nodes involved in circular "
                f"dependencies: {unsorted_nodes}"
            )

        return result

    def get_execution_groups(self) -> list[list[str]]:
        """
        Get execution groups for parallel execution.

        A node without dependencies is placed at level 0; any other node is
        placed one level above its highest-level dependency. Groups are
        returned in ascending level order.
        """
        sorted_nodes = self.topological_sort()

        levels: Dict[str, int] = {}
        for node in sorted_nodes:
            dependencies = self.nodes[node].dependencies
            if not dependencies:
                levels[node] = 0
                continue

            max_dep_level = 0
            for dep in sorted(dependencies):
                if dep in levels:
                    max_dep_level = max(max_dep_level, levels[dep])
                else:
                    # The dependency has no level, e.g. it is missing from
                    # the graph; it then counts as level 0.
                    logger.warning(
                        f"Dependency {dep} not found in levels for node {node}"
                    )
            levels[node] = max_dep_level + 1

        groups: Dict[int, list[str]] = defaultdict(list)
        for node, level in levels.items():
            groups[level].append(node)

        return [groups[level] for level in sorted(groups)]

    def get_parallel_candidates(self) -> list[list[str]]:
        """Get groups of steps that can run in parallel."""
        return self.get_execution_groups()

    def validate(self) -> list[str]:
        """
        Validate the dependency graph and return any issues.

        Returns:
            Human-readable messages: one per detected cycle and one per
            dependency that names a node absent from the graph.
        """
        issues = [
            f"Circular dependency detected: {' -> '.join(cycle)}"
            for cycle in self.detect_cycles()
        ]

        # Dependencies can be set directly on a StepNode, so they may name
        # nodes that were never added to the graph.
        for node_name, node in self.nodes.items():
            for dep in sorted(node.dependencies):
                if dep not in self.nodes:
                    issues.append(f"Node {node_name} depends on missing node {dep}")

        return issues

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the dependency graph.

        Returns:
            Dict with ``total_nodes``, ``total_edges``, ``type_counts``
            (keyed by step type value), ``average_dependencies`` and
            ``has_cycles``.
        """
        total_nodes = len(self.nodes)
        total_edges = sum(len(deps) for deps in self._adjacency_list.values())

        type_counts: Dict[str, int] = defaultdict(int)
        for node in self.nodes.values():
            type_counts[node.step_type.value] += 1

        avg_dependencies = total_edges / total_nodes if total_nodes > 0 else 0.0

        return {
            "total_nodes": total_nodes,
            "total_edges": total_edges,
            "type_counts": dict(type_counts),
            "average_dependencies": avg_dependencies,
            "has_cycles": len(self.detect_cycles()) > 0,
        }

In [None]:
# Module: pipeline_builder.dependencies.analyzer (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

# Re-export from base - the base analyzer uses protocols so it works with Spark steps
# from .dependencies import (  # Removed: defined in notebook cells above
# AnalysisStrategy,
# DependencyAnalysisResult,
# DependencyAnalyzer,
# DependencyError,
# DependencyGraph,
# StepNode,
# StepType,
# )

# Keep for backward compatibility - the base analyzer works with any step type via protocols
# Keep for backward compatibility - the base analyzer works with any step type via protocols
# These names are defined in other notebook cells (see the removed import
# list above); this cell defines nothing itself.
# NOTE(review): this assignment replaces the ``__all__`` list bound by the
# pipeline_builder.errors cell earlier in the notebook.
__all__ = [
    "DependencyAnalyzer",
    "DependencyAnalysisResult",
    "AnalysisStrategy",
    "DependencyGraph",
    "StepNode",
    "StepType",
    "DependencyError",
]

In [None]:
# Module: pipeline_builder.dependencies.exceptions (pipeline_builder)
#
# Dependencies: None (base module)

from typing import List


class DependencyError(Exception):
    """Root of the dependency-related exception hierarchy.

    Attributes:
        step_name: Name of the step the error relates to, or None when no
            step was given.
    """

    # NOTE(review): the pipeline_builder.errors cell earlier in the notebook
    # binds ``DependencyError`` as an alias of ``ValidationError``; this class
    # definition rebinds the name when cells are run top to bottom.

    def __init__(self, message: str, step_name: Optional[str] = None):
        self.step_name = step_name
        super().__init__(message)


class DependencyAnalysisError(DependencyError):
    """Signals that dependency analysis itself failed.

    Attributes:
        analysis_step: The analysis step that failed, or None. The same
            value is also passed to the base class as ``step_name``.
    """

    def __init__(self, message: str, analysis_step: Optional[str] = None):
        self.analysis_step = analysis_step
        super().__init__(message, step_name=analysis_step)


class CircularDependencyError(DependencyError):
    """Signals that the steps form a circular dependency.

    Attributes:
        cycle: The step names making up the detected cycle, as supplied by
            the caller.
    """

    def __init__(self, message: str, cycle: List[str]):
        self.cycle = cycle
        # No step name is passed on, so ``step_name`` stays None.
        super().__init__(message)


class InvalidDependencyError(DependencyError):
    """Signals that one or more declared dependencies are invalid.

    Attributes:
        invalid_dependencies: The offending dependency names, as supplied
            by the caller.
    """

    def __init__(self, message: str, invalid_dependencies: List[str]):
        self.invalid_dependencies = invalid_dependencies
        # No step name is passed on, so ``step_name`` stays None.
        super().__init__(message)


class DependencyConflictError(DependencyError):
    """Signals that steps have conflicting dependencies.

    Attributes:
        conflicting_steps: The names of the steps in conflict, as supplied
            by the caller.
    """

    def __init__(self, message: str, conflicting_steps: List[str]):
        self.conflicting_steps = conflicting_steps
        # No step name is passed on, so ``step_name`` stays None.
        super().__init__(message)

In [None]:
# Module: pipeline_builder.models.enums (pipeline_builder)
#
# Dependencies: None (base module)

from enum import Enum


class PipelinePhase(Enum):
    """Enumeration of pipeline phases.

    Member names and values are the same as those of ``StepType`` in the
    dependency-graph cell, but this is a separate Enum class, so members of
    the two enums do not compare equal.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


class ExecutionMode(Enum):
    """Enumeration of execution modes.

    NOTE(review): presumably INITIAL is a full load and INCREMENTAL processes
    only new data - confirm against the execution engine, which is not
    visible from this cell.
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"


class WriteMode(Enum):
    """Enumeration of write modes.

    NOTE(review): the values coincide with the mode strings that
    ``SparkEngine.write_target`` passes to ``df.write.mode(...)`` (its
    default is "overwrite"); whether steps store this enum or the raw string
    is not visible from this cell - confirm.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"


class ValidationResult(Enum):
    """Enumeration of validation results.

    Three outcomes, each with a lowercase string value: passed, failed,
    warning.
    """

    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"

In [None]:
# Module: pipeline_builder.models.exceptions (pipeline_builder)
#
# Dependencies: None (base module)


class PipelineConfigurationError(ValueError):
    """Signals an invalid pipeline configuration.

    A plain ``ValueError`` subclass with no extra state.
    """

    # NOTE(review): the pipeline_builder.errors cell earlier in the notebook
    # binds this name as an alias of ``ConfigurationError``; this definition
    # rebinds it when cells are run top to bottom.


class PipelineExecutionError(RuntimeError):
    """Signals a failure while executing a pipeline.

    A plain ``RuntimeError`` subclass with no extra state.
    """

    # NOTE(review): the pipeline_builder.errors cell earlier in the notebook
    # binds this name as an alias of ``ExecutionError``; this definition
    # rebinds it when cells are run top to bottom.

In [None]:
# Module: pipeline_builder.writer.exceptions (pipeline_builder)
#
# Dependencies: None (base module)

from __future__ import annotations

from typing import Dict


class WriterError(Exception):
    """
    Common base class of every writer exception.

    Besides the message, an instance carries an optional context mapping,
    a list of suggestions and the underlying exception. ``str()`` renders
    the message followed by the context and suggestions when present.
    """

    def __init__(
        self,
        message: str,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
        cause: Exception | None = None,
    ) -> None:
        """
        Initialize the writer error.

        Args:
            message: Error message
            context: Additional context information (stored as ``{}`` if omitted)
            suggestions: Suggestions to resolve the error (stored as ``[]`` if omitted)
            cause: The underlying exception that caused this error
        """
        super().__init__(message)
        self.message = message
        self.cause = cause
        # Falsy values (None or empty containers) become fresh empty containers.
        self.context = context if context else {}
        self.suggestions = suggestions if suggestions else []

    def __str__(self) -> str:
        """Return the message, plus context and suggestions lines when set."""
        lines = [self.message]
        if self.context:
            lines.append(f"Context: {self.context}")
        if self.suggestions:
            lines.append(f"Suggestions: {'; '.join(self.suggestions)}")
        return "\n".join(lines)


class WriterValidationError(WriterError):
    """
    Signals that writer validation failed.

    Raised when data validation fails during the writing process, such as
    invalid log rows or schema mismatches.
    """

    def __init__(
        self,
        message: str,
        validation_errors: list[str] | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
    ) -> None:
        """
        Initialize validation error.

        Args:
            message: Error message
            validation_errors: Specific validation errors (stored as ``[]`` if omitted)
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.validation_errors = validation_errors if validation_errors else []


class WriterConfigurationError(WriterError):
    """
    Signals an invalid writer configuration.

    Raised when the WriterConfig contains invalid values or conflicting
    settings.
    """

    def __init__(
        self,
        message: str,
        config_errors: list[str] | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
    ) -> None:
        """
        Initialize configuration error.

        Args:
            message: Error message
            config_errors: Specific configuration errors (stored as ``[]`` if omitted)
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.config_errors = config_errors if config_errors else []


class WriterTableError(WriterError):
    """
    Signals a failed table operation.

    Raised when there are issues with Delta table operations, such as table
    creation, writing, or schema evolution.
    """

    def __init__(
        self,
        message: str,
        table_name: str | None = None,
        operation: str | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
        cause: Exception | None = None,
    ) -> None:
        """
        Initialize table error.

        Args:
            message: Error message
            table_name: Name of the table that caused the error
            operation: The operation that failed
            context: Additional context information
            suggestions: List of suggestions to resolve the error
            cause: The underlying exception that caused this error
        """
        super().__init__(
            message, context=context, suggestions=suggestions, cause=cause
        )
        self.operation = operation
        self.table_name = table_name


class WriterPerformanceError(WriterError):
    """
    Signals that a performance threshold was exceeded.

    Raised when operations take longer than expected or consume more
    resources than configured limits.
    """

    def __init__(
        self,
        message: str,
        actual_duration: float | None = None,
        expected_duration: float | None = None,
        actual_memory: float | None = None,
        expected_memory: float | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
    ) -> None:
        """
        Initialize performance error.

        Args:
            message: Error message
            actual_duration: Actual duration in seconds
            expected_duration: Expected duration in seconds
            actual_memory: Actual memory usage in MB
            expected_memory: Expected memory usage in MB
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        # Measured versus expected values, stored exactly as given.
        self.actual_duration, self.expected_duration = (
            actual_duration,
            expected_duration,
        )
        self.actual_memory, self.expected_memory = actual_memory, expected_memory


class WriterSchemaError(WriterError):
    """
    Signals a failed schema operation.

    Raised when there are issues with schema validation, evolution, or
    compatibility.
    """

    def __init__(
        self,
        message: str,
        schema_errors: list[str] | None = None,
        expected_schema: str | None = None,
        actual_schema: str | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
    ) -> None:
        """
        Initialize schema error.

        Args:
            message: Error message
            schema_errors: Specific schema errors (stored as ``[]`` if omitted)
            expected_schema: Expected schema definition
            actual_schema: Actual schema definition
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.expected_schema = expected_schema
        self.actual_schema = actual_schema
        self.schema_errors = schema_errors if schema_errors else []


class WriterDataQualityError(WriterError):
    """
    Signals a failed data quality check.

    Raised when data quality validation fails, such as when validation
    rates are too low or data anomalies are detected.
    """

    def __init__(
        self,
        message: str,
        quality_issues: list[str] | None = None,
        validation_rate: float | None = None,
        threshold: float | None = None,
        context: Dict[str, Any] | None = None,
        suggestions: list[str] | None = None,
    ) -> None:
        """
        Initialize data quality error.

        Args:
            message: Error message
            quality_issues: Specific quality issues (stored as ``[]`` if omitted)
            validation_rate: Actual validation rate
            threshold: Expected validation threshold
            context: Additional context information
            suggestions: List of suggestions to resolve the error
        """
        super().__init__(message, context=context, suggestions=suggestions)
        self.validation_rate = validation_rate
        self.threshold = threshold
        self.quality_issues = quality_issues if quality_issues else []

In [None]:
# Module: pipeline_builder.engine.spark_engine (pipeline_builder)
#
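# Dependencies: external "abstracts" package (Engine, Source, Step, report types); ExecutionEngine, PipelineLogger, BronzeStep/SilverStep/GoldStep, fqn and apply_column_rules from other notebook cells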

from __future__ import annotations

from abstracts.engine import Engine
from abstracts.reports.transform import TransformReport
from abstracts.reports.validation import ValidationReport
from abstracts.reports.write import WriteReport
from abstracts.source import Source
from abstracts.step import Step

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
# from ..execution import ExecutionEngine  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..table_operations import fqn  # Removed: defined in notebook cells above
# from ..validation import apply_column_rules  # Removed: defined in notebook cells above


class SparkEngine(Engine):
    """
    SparkEngine implements abstracts.Engine using ExecutionEngine.

    This engine adapts between the abstracts interface (Step, Source protocols)
    and the concrete pipeline_builder types (BronzeStep/SilverStep/GoldStep, DataFrame).

    Arguments of the wrong type raise ``TypeError``. Failures that happen
    while applying rules, transforming or writing are captured in the
    returned report's ``error`` field instead of being raised.
    """

    def __init__(
        self,
        spark: SparkSession,
        config: Any,  # PipelineConfig
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
    ):
        """
        Initialize SparkEngine.

        Args:
            spark: SparkSession instance
            config: PipelineConfig instance
            logger: Optional logger instance
            functions: Optional functions protocol for PySpark operations
        """
        self.spark = spark
        self.config = config
        self.logger = logger or PipelineLogger()
        self.functions = functions
        self._execution_engine = ExecutionEngine(spark, config, self.logger, functions)

    @staticmethod
    def _require_dataframe(source: Any) -> DataFrame:
        """Return ``source`` if it is a DataFrame, otherwise raise TypeError."""
        if not isinstance(source, DataFrame):
            raise TypeError(f"Source must be a DataFrame, got {type(source)}")
        return source

    @staticmethod
    def _require_concrete_step(step: Any) -> None:
        """Raise TypeError unless ``step`` is a Bronze/Silver/Gold step."""
        if not isinstance(step, (BronzeStep, SilverStep, GoldStep)):
            raise TypeError(
                f"Step must be BronzeStep, SilverStep, or GoldStep, got {type(step)}"
            )

    @staticmethod
    def _best_effort_count(source: Any) -> int:
        """
        Count rows for an error report without masking the original error.

        Used only inside ``except`` handlers: if counting fails too (for
        example because the same Spark failure repeats), 0 is reported so
        that the report still carries the original exception.
        """
        try:
            return source.count()
        except Exception:
            return 0

    def validate_source(self, step: Step, source: Source) -> ValidationReport:
        """
        Validate a data source according to step rules.

        Args:
            step: Step with validation rules
            source: Source data to validate (DataFrame)

        Returns:
            ValidationReport with validation results. On success it holds
            the valid rows as ``source``; on failure it holds the original
            DataFrame and the exception in ``error``.

        Raises:
            TypeError: If ``source`` is not a DataFrame or ``step`` is not a
                concrete step type.
        """
        df = self._require_dataframe(source)
        self._require_concrete_step(step)

        try:
            # Rules type compatibility - Step Protocol uses Rules, concrete steps use ColumnRules
            # mypy doesn't understand Protocol structural typing here, so we use Any
            rules: Any = step.rules
            valid_df, invalid_df, _stats = apply_column_rules(
                df,
                rules,
                "pipeline",
                step.name,
                functions=self.functions,
            )

            valid_rows = valid_df.count()
            invalid_rows = invalid_df.count()

            return ValidationReport(
                source=valid_df,  # Return validated source
                valid_rows=valid_rows,
                invalid_rows=invalid_rows,
                error=None,
            )
        except Exception as e:
            return ValidationReport(
                source=df,
                valid_rows=0,
                invalid_rows=self._best_effort_count(df),
                error=e,
            )

    def transform_source(self, step: Step, source: Source) -> TransformReport:
        """
        Transform a data source according to step transformation logic.

        Bronze steps return the source unchanged. Silver steps call
        ``step.transform(spark, df, {})``. Gold steps call
        ``step.transform(spark, silvers)``.

        Args:
            step: Step with transformation function
            source: Source data to transform. A DataFrame for bronze and
                silver steps; a dict of silvers for gold steps.

        Returns:
            TransformReport with transformed source, or with the original
            source and the exception in ``error`` if the transform failed.

        Raises:
            TypeError: If ``step`` is not a concrete step type, or ``source``
                is neither a DataFrame nor (for gold steps) a dict.
        """
        if isinstance(step, GoldStep):
            # Gold steps take a dict of silvers, so the DataFrame-only check
            # must not reject it before the gold branch below is reached.
            if not isinstance(source, (dict, DataFrame)):
                raise TypeError(
                    f"Gold step '{step.name}' requires a dict of silvers, got {type(source)}"
                )
        else:
            self._require_dataframe(source)
            self._require_concrete_step(step)

        try:
            # Bronze steps: no transformation, just return source
            if isinstance(step, BronzeStep):
                return TransformReport(source=source, error=None)

            # Silver steps: transform with bronze data and empty silvers dict
            if isinstance(step, SilverStep):
                if step.transform is None:
                    raise ValueError(
                        f"Silver step '{step.name}' requires a transform function"
                    )
                transformed_df = step.transform(self.spark, source, {})
                return TransformReport(source=transformed_df, error=None)

            # Gold steps: transform with silvers dict
            # Note: For gold steps, the "source" parameter is actually a dict of silvers
            # This is a limitation of the abstracts.Engine interface for gold steps
            if isinstance(step, GoldStep):
                if step.transform is None:
                    raise ValueError(
                        f"Gold step '{step.name}' requires a transform function"
                    )
                if not isinstance(source, dict):
                    # A single DataFrame is an error for gold steps; it is
                    # reported through the returned TransformReport.
                    raise TypeError(
                        f"Gold step '{step.name}' requires a dict of silvers, got {type(source)}"
                    )
                transformed_df = step.transform(self.spark, source)
                return TransformReport(source=transformed_df, error=None)

            raise ValueError(f"Unknown step type: {type(step)}")

        except Exception as e:
            return TransformReport(source=source, error=e)

    def write_target(self, step: Step, source: Source) -> WriteReport:
        """
        Write a data source to target table.

        Bronze steps are not written; their row count is reported as
        written. For other steps the schema is created if needed and the
        DataFrame is saved with ``saveAsTable`` using the step's write mode
        ("overwrite" when unset).

        Args:
            step: Step with target configuration
            source: Source data to write (DataFrame)

        Returns:
            WriteReport with write results. A failed write is reported
            through ``error`` with ``written_rows`` set to 0.

        Raises:
            TypeError: If ``source`` is not a DataFrame or ``step`` is not a
                concrete step type.
            ValueError: If the step has no schema to write to.
            RuntimeError: If the target schema cannot be created.
        """
        df = self._require_dataframe(source)
        self._require_concrete_step(step)

        # Bronze steps don't write to tables
        if isinstance(step, BronzeStep):
            rows_written = df.count()
            return WriteReport(
                source=df,
                written_rows=rows_written,
                failed_rows=0,
                error=None,
            )

        # Get table name and schema
        table_name = getattr(step, "table_name", None) or getattr(
            step, "target", step.name
        )
        schema = getattr(step, "schema", None) or getattr(step, "write_schema", None)

        if schema is None:
            raise ValueError(
                f"Step '{step.name}' requires a schema to be specified for writing"
            )

        output_table = fqn(schema, table_name)

        # Determine write mode
        write_mode = getattr(step, "write_mode", "overwrite")
        if write_mode is None:
            write_mode = "overwrite"

        # Create schema if needed
        # NOTE(review): the schema name is interpolated into SQL unquoted;
        # it is assumed to come from trusted pipeline configuration.
        try:
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
        except Exception as e:
            raise RuntimeError(f"Failed to create schema '{schema}': {e}") from e

        # Write to table
        try:
            rows_before = df.count()
            df.write.mode(write_mode).saveAsTable(output_table)
            rows_written = rows_before  # Assuming all rows were written successfully
            return WriteReport(
                source=df,
                written_rows=rows_written,
                failed_rows=0,
                error=None,
            )
        except Exception as e:
            return WriteReport(
                source=df,
                written_rows=0,
                failed_rows=self._best_effort_count(df),
                error=e,
            )

In [None]:
# Module: pipeline_builder_base.dependencies.analyzer (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.logging

from __future__ import annotations

import hashlib
from dataclasses import dataclass
from enum import Enum
from typing import Dict, Protocol

# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from .exceptions import DependencyError  # Removed: defined in notebook cells above
# from .graph import DependencyGraph, StepNode, StepType  # Removed: defined in notebook cells above


class AnalysisStrategy(Enum):
    """Named modes that a dependency analyzer can be configured with."""

    # Assume all dependencies exist
    CONSERVATIVE = "conservative"
    # Assume minimal dependencies
    OPTIMISTIC = "optimistic"
    # Balance between conservative and optimistic
    HYBRID = "hybrid"


@dataclass
class DependencyAnalysisResult:
    """Bundle of everything one dependency-analysis run produces.

    Instances are created by ``DependencyAnalyzer.analyze_dependencies`` and
    are also what that analyzer stores in its cache.
    """

    # The dependency graph that was analysed.
    graph: DependencyGraph
    # Step names grouped as returned by ``graph.get_execution_groups()``.
    execution_groups: list[list[str]]
    # Cycles as returned by ``graph.detect_cycles()``.
    cycles: list[list[str]]
    # Human-readable descriptions of detected dependency conflicts.
    conflicts: list[str]
    # Human-readable optimisation suggestions.
    recommendations: list[str]
    # Statistics as returned by ``graph.get_stats()``.
    stats: Dict[str, Any]
    # Wall-clock time the analysis took (difference of two ``time.time()`` readings).
    analysis_duration: float


# Protocol for step objects that can be analyzed
class StepProtocol(Protocol):
    """Structural type shared by every step the analyzer can inspect."""

    # Name of the step.
    name: str


class BronzeStepProtocol(StepProtocol, Protocol):
    """Structural type for bronze steps: a named step with ``incremental_col``."""

    # Incremental column name, or None when the step declares none.
    incremental_col: str | None


class SilverStepProtocol(StepProtocol, Protocol):
    """Structural type for silver steps: a named step with ``source_bronze``."""

    # Name of the bronze step this silver step reads from.
    source_bronze: str


class GoldStepProtocol(StepProtocol, Protocol):
    """Structural type for gold steps: a named step with ``source_silvers``."""

    # Names of the silver steps this gold step reads from; may be None.
    source_silvers: list[str] | None


class DependencyAnalyzer:
    """
    Unified dependency analyzer for all pipeline step types.

    This analyzer works with any step implementation that follows the step protocols.
    It analyzes dependencies across bronze, silver, and gold steps.

    Features:
    - Single analyzer for all step types (Bronze, Silver, Gold)
    - Multiple analysis strategies
    - Cycle detection and resolution
    - Execution group optimization
    - Performance analysis and recommendations

    Results are cached per analyzer instance. The cache key covers the
    strategy, the step names of each layer, and the dependencies each step
    declares, so changing any of those triggers a fresh analysis.
    """

    def __init__(
        self,
        strategy: AnalysisStrategy = AnalysisStrategy.HYBRID,
        logger: PipelineLogger | None = None,
    ):
        """
        Create an analyzer.

        Args:
            strategy: Analysis strategy; it is logged and mixed into the cache key.
            logger: Logger to report through. A new ``PipelineLogger`` is
                created when omitted.
        """
        self.strategy = strategy
        self.logger = logger if logger is not None else PipelineLogger()
        self._analysis_cache: Dict[str, DependencyAnalysisResult] = {}

    def analyze_dependencies(
        self,
        bronze_steps: Dict[str, BronzeStepProtocol] | None = None,
        silver_steps: Dict[str, SilverStepProtocol] | None = None,
        gold_steps: Dict[str, GoldStepProtocol] | None = None,
        force_refresh: bool = False,
    ) -> DependencyAnalysisResult:
        """
        Analyze dependencies across all step types.

        Args:
            bronze_steps: Dictionary of bronze steps (any object with name and incremental_col)
            silver_steps: Dictionary of silver steps (any object with name and source_bronze)
            gold_steps: Dictionary of gold steps (any object with name and source_silvers)
            force_refresh: Whether to force refresh of cached results

        Returns:
            DependencyAnalysisResult containing analysis results. ``cycles``
            holds the cycles found before any of them were broken.

        Raises:
            DependencyError: If any stage of the analysis raises; the original
                exception is chained as the cause.
        """
        start_time = time.time()

        # Create cache key
        cache_key = self._create_cache_key(bronze_steps, silver_steps, gold_steps)

        if not force_refresh and cache_key in self._analysis_cache:
            self.logger.info(f"Using cached dependency analysis: {cache_key}")
            return self._analysis_cache[cache_key]

        self.logger.info(
            f"Starting dependency analysis with strategy: {self.strategy.value}"
        )

        try:
            # Step 1: Build dependency graph
            graph = self._build_dependency_graph(bronze_steps, silver_steps, gold_steps)

            # Step 2: Detect cycles
            cycles = graph.detect_cycles()
            if cycles:
                self.logger.warning(f"Detected {len(cycles)} circular dependencies")
                graph = self._resolve_cycles(graph, cycles)

            # Step 3: Detect conflicts
            conflicts = self._detect_conflicts(graph)
            if conflicts:
                self.logger.warning(f"Detected {len(conflicts)} dependency conflicts")

            # Step 4: Generate execution groups
            execution_groups = graph.get_execution_groups()

            # Step 5: Generate recommendations
            recommendations = self._generate_recommendations(graph, cycles, conflicts)

            # Step 6: Calculate statistics
            stats = graph.get_stats()

            # Create result
            result = DependencyAnalysisResult(
                graph=graph,
                execution_groups=execution_groups,
                cycles=cycles,
                conflicts=conflicts,
                recommendations=recommendations,
                stats=stats,
                analysis_duration=time.time() - start_time,
            )

            # Cache result
            self._analysis_cache[cache_key] = result

            self.logger.info(
                f"Dependency analysis completed in {result.analysis_duration:.2f}s"
            )
            return result

        except Exception as e:
            self.logger.error(f"Dependency analysis failed: {str(e)}")
            raise DependencyError(f"Dependency analysis failed: {str(e)}") from e

    def _build_dependency_graph(
        self,
        bronze_steps: Dict[str, BronzeStepProtocol] | None,
        silver_steps: Dict[str, SilverStepProtocol] | None,
        gold_steps: Dict[str, GoldStepProtocol] | None,
    ) -> DependencyGraph:
        """
        Build the dependency graph from all step types.

        Nodes are added layer by layer (bronze, silver, gold). A dependency
        edge is only added when its target is already a node of the graph;
        otherwise a warning is logged and the edge is skipped.

        Args:
            bronze_steps: Bronze steps keyed by name, or None.
            silver_steps: Silver steps keyed by name, or None.
            gold_steps: Gold steps keyed by name, or None.

        Returns:
            The populated DependencyGraph.
        """
        graph = DependencyGraph()

        # Add bronze steps
        if bronze_steps:
            for name, step in bronze_steps.items():
                node = StepNode(
                    name=name, step_type=StepType.BRONZE, metadata={"step": step}
                )
                graph.add_node(node)

        # Add silver steps
        if silver_steps:
            # Register every silver node before wiring any edge. Otherwise a
            # ``depends_on`` entry naming a silver step that appears later in
            # the mapping would be reported as non-existent and dropped,
            # making the graph depend on dict insertion order.
            for name, silver_step in silver_steps.items():
                node = StepNode(
                    name=name, step_type=StepType.SILVER, metadata={"step": silver_step}
                )
                graph.add_node(node)

            for name, silver_step in silver_steps.items():
                # SilverStep always has source_bronze attribute
                source_bronze = getattr(silver_step, "source_bronze", None)
                if source_bronze:
                    # Check if the source bronze step exists
                    if source_bronze in graph.nodes:
                        graph.add_dependency(name, source_bronze)
                    else:
                        # Log warning about missing dependency
                        self.logger.warning(
                            f"Silver step {name} references non-existent bronze step {source_bronze}"
                        )

                # Optional extra dependencies; only list/tuple/set values are honoured
                depends_on = getattr(silver_step, "depends_on", None)
                if depends_on and isinstance(depends_on, (list, tuple, set)):
                    for dep in depends_on:
                        if dep in graph.nodes:
                            graph.add_dependency(name, dep)
                        else:
                            self.logger.warning(
                                f"Silver step {name} references non-existent dependency {dep}"
                            )

        # Add gold steps
        if gold_steps:
            for name, gold_step in gold_steps.items():
                node = StepNode(
                    name=name, step_type=StepType.GOLD, metadata={"step": gold_step}
                )
                graph.add_node(node)

                # Add dependencies
                # GoldStep always has source_silvers attribute (can be None)
                source_silvers = getattr(gold_step, "source_silvers", None)
                if source_silvers:
                    for dep in source_silvers:
                        if dep in graph.nodes:
                            graph.add_dependency(name, dep)
                        else:
                            self.logger.warning(
                                f"Gold step {name} references non-existent silver step {dep}"
                            )

        return graph

    def _resolve_cycles(
        self, graph: DependencyGraph, cycles: list[list[str]]
    ) -> DependencyGraph:
        """
        Resolve cycles in the dependency graph.

        For every cycle with more than one entry, the edge from the
        second-to-last entry to the last entry is removed from the graph's
        adjacency structures and from the two nodes involved. The graph is
        modified in place and returned.

        Args:
            graph: Graph to modify.
            cycles: Cycles as returned by ``graph.detect_cycles()``.

        Returns:
            The same graph object, with the selected edges removed.
        """
        # Simple cycle resolution: break cycles by removing the last dependency
        for cycle in cycles:
            if len(cycle) > 1:
                # Remove the last dependency in the cycle
                from_step = cycle[-2]
                to_step = cycle[-1]

                self.logger.warning(
                    f"Breaking cycle by removing dependency: {from_step} -> {to_step}"
                )

                # Remove from adjacency lists; membership is checked first so
                # an edge shared by several cycles is only removed once
                if to_step in graph._adjacency_list[from_step]:
                    graph._adjacency_list[from_step].remove(to_step)
                if from_step in graph._reverse_adjacency_list[to_step]:
                    graph._reverse_adjacency_list[to_step].remove(from_step)

                # Update node dependencies
                if to_step in graph.nodes[from_step].dependencies:
                    graph.nodes[from_step].dependencies.remove(to_step)
                if from_step in graph.nodes[to_step].dependents:
                    graph.nodes[to_step].dependents.remove(from_step)

        return graph

    def _detect_conflicts(self, graph: DependencyGraph) -> list[str]:
        """
        Detect dependency conflicts.

        Args:
            graph: Graph to inspect.

        Returns:
            One message per repeated node name and per dependency whose
            target is not a node of the graph.
        """
        conflicts = []

        # Check for conflicting step names
        # NOTE(review): if ``graph.nodes`` is a plain dict its keys are already
        # unique and this check can never fire - confirm against DependencyGraph.
        step_names = list(graph.nodes.keys())
        seen_names = set()
        for node_name in step_names:
            if node_name in seen_names:
                conflicts.append(f"Conflicting step name: {node_name}")
            seen_names.add(node_name)

        # Check for missing dependencies
        for node_name, node in graph.nodes.items():
            for dep in node.dependencies:
                if dep not in graph.nodes:
                    conflicts.append(f"Node {node_name} depends on missing node {dep}")

        return conflicts

    def _generate_recommendations(
        self, graph: DependencyGraph, cycles: list[list[str]], conflicts: list[str]
    ) -> list[str]:
        """
        Generate optimization recommendations.

        Args:
            graph: Analysed graph; its stats and node count drive the
                performance hints.
            cycles: Detected cycles; any entry adds a refactoring hint.
            conflicts: Detected conflicts; any entry adds a resolution hint.

        Returns:
            Recommendation messages, possibly empty.
        """
        recommendations = []

        # Cycle recommendations
        if cycles:
            recommendations.append(
                "Consider refactoring to eliminate circular dependencies"
            )

        # Conflict recommendations
        if conflicts:
            recommendations.append("Resolve dependency conflicts before execution")

        # Performance recommendations
        stats = graph.get_stats()
        if stats["average_dependencies"] > 3:
            recommendations.append(
                "Consider reducing step dependencies for better parallelization"
            )

        if len(graph.nodes) > 10:
            recommendations.append(
                "Consider breaking large pipelines into smaller, focused pipelines"
            )

        return recommendations

    @staticmethod
    def _dependency_names(value: Any) -> tuple[str, ...]:
        """Normalise a dependency attribute to a sorted tuple of strings."""
        if not value:
            return ()
        if isinstance(value, str):
            return (value,)
        try:
            return tuple(sorted(str(item) for item in value))
        except TypeError:
            # Not iterable: treat the value itself as a single dependency name
            return (str(value),)

    def _create_cache_key(
        self,
        bronze_steps: Dict[str, BronzeStepProtocol] | None,
        silver_steps: Dict[str, SilverStepProtocol] | None,
        gold_steps: Dict[str, GoldStepProtocol] | None,
    ) -> str:
        """
        Create a cache key for the analysis.

        The key is a SHA-256 hex digest over the strategy, the step names of
        each layer kept separate per layer, and the dependencies each silver
        and gold step declares. Keeping the layers separate stops
        ``bronze={a}, silver={b}`` from colliding with ``bronze={a, b}``.
        Including the dependencies stops a stale result being returned after
        a step is rewired under the same name.

        Args:
            bronze_steps: Bronze steps keyed by name, or None.
            silver_steps: Silver steps keyed by name, or None.
            gold_steps: Gold steps keyed by name, or None.

        Returns:
            Hex digest identifying this pipeline shape.
        """
        bronze_part = tuple(sorted(bronze_steps)) if bronze_steps else ()

        silver_part = tuple(
            (
                name,
                self._dependency_names(getattr(step, "source_bronze", None)),
                self._dependency_names(getattr(step, "depends_on", None)),
            )
            for name, step in sorted(
                (silver_steps or {}).items(), key=lambda item: item[0]
            )
        )

        gold_part = tuple(
            (name, self._dependency_names(getattr(step, "source_silvers", None)))
            for name, step in sorted(
                (gold_steps or {}).items(), key=lambda item: item[0]
            )
        )

        # repr() of nested tuples keeps layer and step boundaries explicit, so
        # names containing separator characters cannot blur into each other.
        key_string = repr((self.strategy.value, bronze_part, silver_part, gold_part))
        return hashlib.sha256(key_string.encode()).hexdigest()

    def clear_cache(self) -> None:
        """Clear the analysis cache."""
        self._analysis_cache.clear()
        self.logger.info("Dependency analysis cache cleared")

In [None]:
# Module: pipeline_builder_base.models.pipeline (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors

from __future__ import annotations

from dataclasses import dataclass

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel, ParallelConfig, ValidationThresholds  # Removed: defined in notebook cells above


@dataclass
class PipelineConfig(BaseModel):
    """
    Top-level settings object for a pipeline.

    Attributes:
        schema: Database schema name
        thresholds: Validation thresholds for each phase
        parallel: Parallel execution configuration; a plain bool is accepted
            and replaced by a ParallelConfig during initialisation
        verbose: Whether to enable verbose logging
    """

    schema: str
    thresholds: ValidationThresholds
    parallel: ParallelConfig | bool
    verbose: bool = True

    def __post_init__(self) -> None:
        """Replace a boolean ``parallel`` with the matching ParallelConfig."""
        if not isinstance(self.parallel, bool):
            return
        # Backward compatibility: True -> default parallel config,
        # False -> sequential config.
        if self.parallel:
            replacement = ParallelConfig.create_default()
        else:
            replacement = ParallelConfig.create_sequential()
        object.__setattr__(self, "parallel", replacement)

    @property
    def min_bronze_rate(self) -> float:
        """Bronze validation threshold."""
        return self.thresholds.bronze

    @property
    def min_silver_rate(self) -> float:
        """Silver validation threshold."""
        return self.thresholds.silver

    @property
    def min_gold_rate(self) -> float:
        """Gold validation threshold."""
        return self.thresholds.gold

    @property
    def enable_parallel_silver(self) -> bool:
        """Whether parallel silver execution is enabled."""
        parallel = self.parallel
        # Anything that is not a ParallelConfig (e.g. mock configs in tests)
        # is reduced to its truthiness.
        return (
            parallel.enabled if isinstance(parallel, ParallelConfig) else bool(parallel)
        )

    @property
    def max_parallel_workers(self) -> int:
        """Maximum number of parallel workers."""
        parallel = self.parallel
        # Anything that is not a ParallelConfig (e.g. mock configs in tests)
        # falls back to 4 workers.
        return parallel.max_workers if isinstance(parallel, ParallelConfig) else 4

    @property
    def enable_caching(self) -> bool:
        """Caching setting; True when ``parallel`` has no ``enable_caching``."""
        return getattr(self.parallel, "enable_caching", True)

    @property
    def enable_monitoring(self) -> bool:
        """Monitoring setting; True when ``parallel`` has no ``enable_monitoring``."""
        return getattr(self.parallel, "enable_monitoring", True)

    def validate(self) -> None:
        """
        Check the configuration.

        Raises:
            PipelineValidationError: If ``schema`` is empty or not a string.
                The thresholds and parallel validators may raise as well.
        """
        if not (self.schema and isinstance(self.schema, str)):
            raise PipelineValidationError("Schema name must be a non-empty string")
        self.thresholds.validate()
        # Only a real ParallelConfig knows how to validate itself.
        if isinstance(self.parallel, ParallelConfig):
            self.parallel.validate()

    @classmethod
    def create_default(cls, schema: str) -> PipelineConfig:
        """Build a config with default thresholds and default parallelism."""
        default_thresholds = ValidationThresholds.create_default()
        default_parallel = ParallelConfig.create_default()
        return cls(
            schema=schema,
            thresholds=default_thresholds,
            parallel=default_parallel,
            verbose=True,
        )

    @classmethod
    def create_high_performance(cls, schema: str) -> PipelineConfig:
        """Build a quiet config with strict thresholds and high-performance parallelism."""
        strict_thresholds = ValidationThresholds.create_strict()
        fast_parallel = ParallelConfig.create_high_performance()
        return cls(
            schema=schema,
            thresholds=strict_thresholds,
            parallel=fast_parallel,
            verbose=False,
        )

    @classmethod
    def create_conservative(cls, schema: str) -> PipelineConfig:
        """Build a verbose config with strict thresholds and sequential execution."""
        strict_thresholds = ValidationThresholds.create_strict()
        sequential = ParallelConfig.create_sequential()
        return cls(
            schema=schema,
            thresholds=strict_thresholds,
            parallel=sequential,
            verbose=True,
        )


@dataclass
class PipelineMetrics(BaseModel):
    """
    Aggregate metrics for one pipeline execution.

    Attributes:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        skipped_steps: Number of skipped steps
        total_duration: Total execution duration
        bronze_duration: Bronze layer duration
        silver_duration: Silver layer duration
        gold_duration: Gold layer duration
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate
        parallel_efficiency: Parallel execution efficiency
        cache_hit_rate: Cache hit rate
        error_count: Number of errors
        retry_count: Number of retries
    """

    total_steps: int = 0
    successful_steps: int = 0
    failed_steps: int = 0
    skipped_steps: int = 0
    total_duration: float = 0.0
    bronze_duration: float = 0.0
    silver_duration: float = 0.0
    gold_duration: float = 0.0
    total_rows_processed: int = 0
    total_rows_written: int = 0
    avg_validation_rate: float = 0.0
    parallel_efficiency: float = 0.0
    cache_hit_rate: float = 0.0
    error_count: int = 0
    retry_count: int = 0

    def validate(self) -> None:
        """
        Check that counters are non-negative and the rate is a percentage.

        Raises:
            ValueError: On the first violated constraint.
        """
        non_negative_checks = (
            (self.total_steps, "Total steps cannot be negative"),
            (self.successful_steps, "Successful steps cannot be negative"),
            (self.failed_steps, "Failed steps cannot be negative"),
            (self.skipped_steps, "Skipped steps cannot be negative"),
            (self.total_duration, "Total duration cannot be negative"),
        )
        for value, message in non_negative_checks:
            if value < 0:
                raise ValueError(message)
        if not 0 <= self.avg_validation_rate <= 100:
            raise ValueError("Average validation rate must be between 0 and 100")

    @property
    def success_rate(self) -> float:
        """Percentage of successful steps; 0.0 when there are no steps."""
        if self.total_steps > 0:
            return self.successful_steps / self.total_steps * 100
        return 0.0

    @property
    def failure_rate(self) -> float:
        """Complement of ``success_rate`` (hence 100.0 when there are no steps)."""
        return 100.0 - self.success_rate

    @classmethod
    def from_step_results(cls, step_results: list[Any]) -> PipelineMetrics:
        """
        Aggregate per-step results into pipeline metrics.

        Args:
            step_results: Objects exposing ``success``, ``duration_secs``,
                ``rows_processed``, ``rows_written`` and ``validation_rate``.

        Returns:
            Metrics in which every step that did not succeed counts as failed;
            fields not listed above keep their defaults.
        """
        step_count = len(step_results)
        succeeded = sum(1 for item in step_results if item.success)
        duration = sum(item.duration_secs for item in step_results)
        processed = sum(item.rows_processed for item in step_results)
        written = sum(item.rows_written for item in step_results)

        if step_count > 0:
            mean_rate = sum(item.validation_rate for item in step_results) / step_count
        else:
            mean_rate = 0.0

        return cls(
            total_steps=step_count,
            successful_steps=succeeded,
            failed_steps=step_count - succeeded,
            total_duration=duration,
            total_rows_processed=processed,
            total_rows_written=written,
            avg_validation_rate=mean_rate,
        )

In [None]:
# Module: pipeline_builder_base.models.base (pipeline_builder_base)
#
# Dependencies: pipeline_builder_base.errors

from __future__ import annotations

import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .enums import PipelinePhase  # Removed: defined in notebook cells above
# from .types import ModelValue  # Removed: defined in notebook cells above


@dataclass
class BaseModel(ABC):
    """
    Abstract foundation shared by the pipeline's dataclass models.

    Subclasses must implement ``validate``; in return they inherit
    dictionary conversion, JSON serialization and a readable ``str()``.

    Features:
    - Mandatory ``validate`` hook
    - Dictionary conversion (nested objects exposing ``to_dict`` are expanded)
    - JSON serialization
    - String representation for debugging

    Example:
        >>> @dataclass
        >>> class MyStep(BaseModel):
        ...     name: str
        ...     rules: Dict[str, List[ColumnRule]]
        ...
        ...     def validate(self) -> None:
        ...         if not self.name:
        ...             raise ValueError("Name cannot be empty")
        ...         if not self.rules:
        ...             raise ValueError("Rules cannot be empty")
        >>>
        >>> step = MyStep(name="test", rules={"id": [F.col("id").isNotNull()]})
        >>> step.validate()
        >>> print(step.to_json())
    """

    @abstractmethod
    def validate(self) -> None:
        """Check the model's invariants; subclasses must override."""

    def to_dict(self) -> Dict[str, ModelValue]:
        """Return a mapping of field name to value, one entry per dataclass field."""

        def _export(value):
            # Anything exposing ``to_dict`` is expanded; everything else is
            # passed through untouched.
            return value.to_dict() if hasattr(value, "to_dict") else value

        return {
            spec.name: _export(getattr(self, spec.name))
            for spec in self.__dataclass_fields__.values()
        }

    def to_json(self) -> str:
        """Serialize ``to_dict()`` as indented JSON, stringifying non-JSON values."""
        payload = self.to_dict()
        return json.dumps(payload, default=str, indent=2)

    def __str__(self) -> str:
        """Render as ``ClassName(field=value, ...)`` using ``to_dict()``."""
        rendered = ", ".join(f"{key}={value}" for key, value in self.to_dict().items())
        return f"{self.__class__.__name__}({rendered})"


@dataclass
class ValidationThresholds(BaseModel):
    """
    Per-phase validation thresholds.

    Attributes:
        bronze: Bronze layer validation threshold (0-100)
        silver: Silver layer validation threshold (0-100)
        gold: Gold layer validation threshold (0-100)
    """

    bronze: float
    silver: float
    gold: float

    def validate(self) -> None:
        """
        Ensure every threshold lies within 0..100 inclusive.

        Raises:
            PipelineValidationError: For the first phase (bronze, silver,
                gold order) whose threshold is out of range.
        """
        for phase in ("bronze", "silver", "gold"):
            threshold = getattr(self, phase)
            if not 0 <= threshold <= 100:
                raise PipelineValidationError(
                    f"{phase} threshold must be between 0 and 100, got {threshold}"
                )

    def get_threshold(self, phase: PipelinePhase) -> float:
        """Return the threshold for ``phase``; unknown phases raise KeyError."""
        return {
            PipelinePhase.BRONZE: self.bronze,
            PipelinePhase.SILVER: self.silver,
            PipelinePhase.GOLD: self.gold,
        }[phase]

    @classmethod
    def create_default(cls) -> ValidationThresholds:
        """Thresholds of 95.0 / 98.0 / 99.0 for bronze / silver / gold."""
        return cls(bronze=95.0, silver=98.0, gold=99.0)

    @classmethod
    def create_strict(cls) -> ValidationThresholds:
        """Thresholds of 99.0 / 99.5 / 99.9 for bronze / silver / gold."""
        return cls(bronze=99.0, silver=99.5, gold=99.9)

    @classmethod
    def create_loose(cls) -> ValidationThresholds:
        """Thresholds of 80.0 / 85.0 / 90.0 for bronze / silver / gold."""
        return cls(bronze=80.0, silver=85.0, gold=90.0)


@dataclass
class ParallelConfig(BaseModel):
    """
    Settings that govern parallel execution.

    Attributes:
        enabled: Whether parallel execution is enabled
        max_workers: Maximum number of parallel workers
        timeout_secs: Timeout for parallel operations in seconds
    """

    enabled: bool
    max_workers: int
    timeout_secs: int = 300

    def validate(self) -> None:
        """
        Check worker count and timeout.

        Raises:
            PipelineValidationError: If ``max_workers`` is outside 1..32 or
                ``timeout_secs`` is below 1.
        """
        workers = self.max_workers
        if workers < 1:
            raise PipelineValidationError(
                f"max_workers must be at least 1, got {workers}"
            )
        if workers > 32:
            raise PipelineValidationError(
                f"max_workers should not exceed 32, got {workers}"
            )

        timeout = self.timeout_secs
        if timeout < 1:
            raise PipelineValidationError(
                f"timeout_secs must be at least 1, got {timeout}"
            )

    @classmethod
    def create_default(cls) -> ParallelConfig:
        """Parallel execution with 4 workers and a 300 second timeout."""
        return cls(enabled=True, max_workers=4, timeout_secs=300)

    @classmethod
    def create_sequential(cls) -> ParallelConfig:
        """Parallelism disabled: 1 worker and a 600 second timeout."""
        return cls(enabled=False, max_workers=1, timeout_secs=600)

    @classmethod
    def create_high_performance(cls) -> ParallelConfig:
        """Parallel execution with 16 workers and a 1200 second timeout."""
        return cls(enabled=True, max_workers=16, timeout_secs=1200)

In [None]:
# Module: pipeline_builder.functions (pipeline_builder)
#
# Dependencies: compat

from __future__ import annotations

from typing import Protocol

# from .compat import Column  # Removed: defined in notebook cells above


class FunctionsProtocol(Protocol):
    """Structural type describing the subset of the PySpark functions API used here.

    Every method is a stub; an object satisfies the protocol by providing
    callables with these names and signatures.
    """

    def col(self, col_name: str) -> Column:
        """Return a column reference for ``col_name``."""
        ...

    def expr(self, expr: str) -> Column:
        """Return a column built from the expression string ``expr``."""
        ...

    def lit(self, value: str | int | float | bool | None) -> Column:
        """Return a literal column holding ``value``."""
        ...

    def when(self, condition: Column, value: str | int | float | bool | None) -> Column:
        """Return a conditional expression yielding ``value`` where ``condition`` holds."""
        ...

    def count(self, col: str | Column = "*") -> Column:
        """Return a count aggregation over ``col`` (``"*"`` by default)."""
        ...

    def countDistinct(self, *cols: str | Column) -> Column:
        """Return a distinct-count aggregation over ``cols``."""
        ...

    def sum(self, col: str | Column) -> Column:
        """Return a sum aggregation over ``col``."""
        ...

    def max(self, col: str | Column) -> Column:
        """Return a max aggregation over ``col``."""
        ...

    def min(self, col: str | Column) -> Column:
        """Return a min aggregation over ``col``."""
        ...

    def avg(self, col: str | Column) -> Column:
        """Return an average aggregation over ``col``."""
        ...

    def length(self, col: str | Column) -> Column:
        """Return a length function applied to ``col``."""
        ...

    def date_trunc(self, format: str, col: str | Column) -> Column:
        """Return a date truncation of ``col`` to the unit named by ``format``."""
        ...

    def dayofweek(self, col: str | Column) -> Column:
        """Return a day-of-week function applied to ``col``."""
        ...

    def current_timestamp(self) -> Column:
        """Return a current-timestamp function."""
        ...


def get_default_functions() -> FunctionsProtocol:
    """Return the functions namespace the pipeline uses by default.

    In this notebook that is the module-level ``F`` binding; the package's
    own ``from .compat import F`` was stripped when the notebook was generated.
    """
    # from .compat import F  # Removed: defined in notebook cells above
    default_functions = F
    return default_functions

In [None]:
# Module: pipeline_builder.types (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

from enum import Enum
from typing import Dict, List, Protocol, Union

# from .compat import Column, DataFrame, SparkSession  # Removed: defined in notebook cells above

# ============================================================================
# Basic Type Aliases
# ============================================================================

# String types
# Plain aliases of ``str``: they add no runtime checking and exist only to
# make signatures self-describing.
StepName = str
PipelineId = str
ExecutionId = str
TableName = str
SchemaName = str
ErrorCode = str

# Numeric types
# Plain aliases of ``float`` / ``int``.
QualityRate = float
Duration = float
RowCount = int

# Dictionary types (``OptionalList`` is a list alias grouped with them)
# ``Any`` and ``Optional`` are not imported in this cell; they come from the
# notebook's shared import cell at the top.
StringDict = Dict[str, str]
NumericDict = Dict[str, Union[int, float]]
GenericDict = Dict[str, Any]
OptionalDict = Optional[Dict[str, Any]]
OptionalList = Optional[List[Any]]

# ============================================================================
# Enums
# ============================================================================


class StepType(Enum):
    """Pipeline layer a step belongs to.

    NOTE(review): the dependency-analyzer cell earlier in this notebook also
    uses a ``StepType`` that originally came from the graph module. In the
    flat notebook namespace this definition rebinds that name - confirm the
    two enums are interchangeable.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


class StepStatus(Enum):
    """Lifecycle states a step execution can be in."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"


# PipelineMode moved to pipeline/models.py to avoid duplication


# ============================================================================
# Function Types
# ============================================================================

# Transform function types
# Callable signatures for step transforms. ``Callable``, ``SparkSession`` and
# ``DataFrame`` are not imported in this cell; they come from the notebook's
# shared import cell at the top.
# NOTE(review): the pipeline_builder.models.types cell later in this notebook
# rebinds ``TransformFunction`` to a one-argument form
# (``Callable[[DataFrame], DataFrame]``); in the flat notebook namespace the
# later binding wins.
TransformFunction = Callable[[SparkSession, DataFrame], DataFrame]
BronzeTransformFunction = Callable[[SparkSession, DataFrame], DataFrame]
SilverTransformFunction = Callable[
    [SparkSession, DataFrame, Dict[str, DataFrame]], DataFrame
]
GoldTransformFunction = Callable[[SparkSession, Dict[str, DataFrame]], DataFrame]

# Filter function type
# Takes a DataFrame and returns a DataFrame.
FilterFunction = Callable[[DataFrame], DataFrame]

# ============================================================================
# Data Types
# ============================================================================

# Column rules type
# Maps a string key to a list of rules, each a string or a Column.
ColumnRules = Dict[str, List[Union[str, Column]]]

# Result types
# All four are the same loosely-typed ``Dict[str, Any]``; the distinct names
# only document intent.
StepResult = Dict[str, Any]
PipelineResult = Dict[str, Any]
ExecutionResultDict = Dict[str, Any]
ValidationResultDict = Dict[str, Any]

# Context types
StepContext = Dict[str, Any]
ExecutionContext = Dict[str, Any]

# Configuration types
PipelineConfigDict = Dict[str, Any]
ExecutionConfig = Dict[str, Any]
ValidationConfig = Dict[str, Any]
MonitoringConfig = Dict[str, Any]

# Quality types
# String keys mapped to float thresholds.
QualityThresholds = Dict[str, float]

# Error types
ErrorContext = Dict[str, Any]
ErrorSuggestions = List[str]

# ============================================================================
# Protocols (Simplified)
# ============================================================================


class Validatable(Protocol):
    """Structural type for anything exposing a ``validate()`` method."""

    def validate(self) -> None:
        """Check the object; implementations raise ValidationError when it is invalid."""
        ...


class Serializable(Protocol):
    """Structural type for anything exposing a ``to_dict()`` method."""

    def to_dict(self) -> Dict[str, Any]:
        """Return the object's contents as a dictionary."""
        ...

In [None]:
# Module: pipeline_builder.models.types (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

from typing import Callable, Dict, List, Protocol, TypeVar, Union

# from ..compat import Column, DataFrame, SparkSession  # Removed: defined in notebook cells above

# Specific types for model values instead of Any
ModelValue = Union[str, int, float, bool, List[str], Dict[str, str], None]
# A single validation rule: a PySpark Column expression, a string, or a boolean.
# (Previously declared with DataFrame in place of Column, contradicting both
# this description and ColumnRules below.)
ColumnRule = Union[Column, str, bool]
ResourceValue = Union[str, int, float, bool, List[str], Dict[str, str]]

# Type aliases for better readability
# Maps a string key to a list of rules, each a string or a Column.
ColumnRules = Dict[str, List[Union[str, Column]]]
# Transform callables; Column, DataFrame and SparkSession come from the
# notebook's shared import cell at the top.
TransformFunction = Callable[[DataFrame], DataFrame]
SilverTransformFunction = Callable[
    [SparkSession, DataFrame, Dict[str, DataFrame]], DataFrame
]
GoldTransformFunction = Callable[[SparkSession, Dict[str, DataFrame]], DataFrame]

# Generic type for pipeline results
T = TypeVar("T")


class Validatable(Protocol):
    """Protocol for objects that can be validated.

    NOTE: a protocol with this same name is declared in an earlier cell of
    this notebook; since all cells share one namespace, this later
    declaration is the one bound to ``Validatable`` after a top-to-bottom run.
    """

    def validate(self) -> None:
        """Validate the object and raise ValidationError if invalid."""
        ...


class Serializable(Protocol):
    """Protocol for objects that can be serialized.

    NOTE: a protocol with this same name is declared in an earlier cell of
    this notebook with only ``to_dict``; this later declaration replaces it
    in the shared namespace and additionally requires ``to_json``.
    """

    def to_dict(self) -> Dict[str, ModelValue]:
        """Convert object to dictionary."""
        ...

    def to_json(self) -> str:
        """Convert object to JSON string."""
        ...

In [None]:
# Module: pipeline_builder.writer.query_builder (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

from __future__ import annotations

from typing import Any, Dict

# from ..compat import DataFrame  # Removed: defined in notebook cells above

# Import specific functions for convenience
# from ..compat import F as functions  # Removed: defined in notebook cells above


class QueryBuilder:
    """Builder class for common PySpark DataFrame operations.

    Every method is static. The helpers refer to the columns ``created_at``,
    ``validation_rate``, ``execution_time``, ``rows_written``, ``success``,
    ``phase`` and ``step`` by name, so an input DataFrame must provide
    whichever of them the called method uses.

    Spark SQL functions are reached through ``F`` (``pyspark.sql.functions``),
    the alias bound by the notebook's import cell.
    """

    @staticmethod
    def _named_aggregations(aggs: Dict[str, Any]) -> list:
        """
        Convert a ``{output_name: Column}`` mapping into aliased columns.

        ``DataFrame.agg`` / ``GroupedData.agg`` only accept expressions
        positionally, so the mapping key is applied as the final alias of
        each expression and the result is meant to be splatted with ``*``.

        Args:
            aggs: Mapping of desired output column name to aggregate expression

        Returns:
            List of aggregate expressions, each aliased to its mapping key
        """
        return [expr.alias(name) for name, expr in aggs.items()]

    @staticmethod
    def filter_by_date_range(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Filter DataFrame by date range.

        Keeps rows where ``created_at`` is >= the cutoff, which is the current
        local time minus ``days`` rendered as a ``YYYY-MM-DD`` string literal.
        No upper bound is applied.

        Args:
            df: Input DataFrame (must have a ``created_at`` column)
            days: Number of days to look back

        Returns:
            Filtered DataFrame
        """
        start_date = datetime.now() - timedelta(days=days)
        cutoff = F.lit(start_date.strftime("%Y-%m-%d"))
        return df.filter(F.col("created_at") >= cutoff)

    @staticmethod
    def add_date_column(
        df: DataFrame,
        date_column: str = "created_at",
        output_column: str = "date",
        format: str = "yyyy-MM-dd",
    ) -> DataFrame:
        """
        Add formatted date column to DataFrame.

        Args:
            df: Input DataFrame
            date_column: Source date column name
            output_column: Output column name
            format: Date format string (passed to ``date_format``)

        Returns:
            DataFrame with added date column
        """
        formatted = F.date_format(F.col(date_column), format)
        return df.withColumn(output_column, formatted)

    @staticmethod
    def get_common_aggregations() -> Dict[str, Any]:
        """
        Get common aggregation functions.

        Returns:
            Dictionary mapping a lookup key to an aliased aggregate Column.
            The specialised ``get_*_aggregations`` helpers pick from it.
        """
        validation_rate = F.col("validation_rate")
        succeeded = F.col("success")
        return {
            "count_all": F.count("*").alias("total_executions"),
            "count_rows": F.count("*").alias("execution_count"),
            "avg_validation_rate": F.avg("validation_rate").alias(
                "avg_validation_rate"
            ),
            "min_validation_rate": F.min("validation_rate").alias(
                "min_validation_rate"
            ),
            "max_validation_rate": F.max("validation_rate").alias(
                "max_validation_rate"
            ),
            "stddev_validation_rate": F.stddev("validation_rate").alias(
                "stddev_validation_rate"
            ),
            "avg_execution_time": F.avg("execution_time").alias("avg_execution_time"),
            "min_execution_time": F.min("execution_time").alias("min_execution_time"),
            "max_execution_time": F.max("execution_time").alias("max_execution_time"),
            "stddev_execution_time": F.stddev("execution_time").alias(
                "stddev_execution_time"
            ),
            "sum_rows_written": F.sum("rows_written").alias("total_rows_written"),
            # Conditional counts: sum a 1/0 flag per row.
            "successful_executions": F.sum(F.when(succeeded, 1).otherwise(0)).alias(
                "successful_executions"
            ),
            "failed_executions": F.sum(F.when(~succeeded, 1).otherwise(0)).alias(
                "failed_executions"
            ),
            "high_quality_executions": F.sum(
                F.when(validation_rate >= 95.0, 1).otherwise(0)
            ).alias("high_quality_executions"),
            "low_quality_executions": F.sum(
                F.when(validation_rate < 80.0, 1).otherwise(0)
            ).alias("low_quality_executions"),
        }

    @staticmethod
    def get_quality_aggregations() -> Dict[str, Any]:
        """
        Get quality-specific aggregations.

        Returns:
            Dictionary of quality aggregations keyed by output column name
        """
        aggs = QueryBuilder.get_common_aggregations()
        return {
            "total_executions": aggs["count_all"],
            "avg_validation_rate": aggs["avg_validation_rate"],
            "min_validation_rate": aggs["min_validation_rate"],
            "max_validation_rate": aggs["max_validation_rate"],
            "stddev_validation_rate": aggs["stddev_validation_rate"],
            "high_quality_executions": aggs["high_quality_executions"],
            "low_quality_executions": aggs["low_quality_executions"],
        }

    @staticmethod
    def get_performance_aggregations() -> Dict[str, Any]:
        """
        Get performance-specific aggregations.

        Returns:
            Dictionary of performance aggregations keyed by output column name
        """
        aggs = QueryBuilder.get_common_aggregations()
        return {
            "execution_count": aggs["count_rows"],
            "avg_execution_time": aggs["avg_execution_time"],
            "min_execution_time": aggs["min_execution_time"],
            "max_execution_time": aggs["max_execution_time"],
            "stddev_execution_time": aggs["stddev_execution_time"],
            "avg_validation_rate": aggs["avg_validation_rate"],
            "total_rows_written": aggs["sum_rows_written"],
            "successful_executions": aggs["successful_executions"],
        }

    @staticmethod
    def get_trend_aggregations() -> Dict[str, Any]:
        """
        Get trend-specific aggregations.

        Returns:
            Dictionary of trend aggregations keyed by output column name
        """
        aggs = QueryBuilder.get_common_aggregations()
        return {
            "daily_executions": aggs["count_all"],
            "successful_executions": aggs["successful_executions"],
            "failed_executions": aggs["failed_executions"],
            "avg_execution_time": aggs["avg_execution_time"],
            "total_rows_written": aggs["sum_rows_written"],
        }

    @staticmethod
    def build_daily_trends_query(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Build daily trends query with common aggregations.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with one row per ``date``, ordered by ``date``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        dated_df = QueryBuilder.add_date_column(filtered_df)
        aggs = QueryBuilder.get_trend_aggregations()

        return (
            dated_df.groupBy("date")
            .agg(*QueryBuilder._named_aggregations(aggs))
            .orderBy("date")
        )

    @staticmethod
    def build_phase_trends_query(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Build phase trends query with common aggregations.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with one row per ``phase``, ordered by ``phase``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        aggs = QueryBuilder.get_performance_aggregations()

        return (
            filtered_df.groupBy("phase")
            .agg(*QueryBuilder._named_aggregations(aggs))
            .orderBy("phase")
        )

    @staticmethod
    def build_step_trends_query(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Build step trends query with common aggregations.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with one row per ``step``, slowest average first
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        aggs = QueryBuilder.get_performance_aggregations()

        return (
            filtered_df.groupBy("step")
            .agg(*QueryBuilder._named_aggregations(aggs))
            .orderBy(F.desc("avg_execution_time"))
        )

    @staticmethod
    def build_quality_trends_query(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Build quality trends query with common aggregations.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with one row per ``date``, ordered by ``date``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        dated_df = QueryBuilder.add_date_column(filtered_df)
        aggs = QueryBuilder.get_quality_aggregations()

        return (
            dated_df.groupBy("date")
            .agg(*QueryBuilder._named_aggregations(aggs))
            .orderBy("date")
        )

    @staticmethod
    def build_overall_metrics_query(df: DataFrame, days: int = 30) -> DataFrame:
        """
        Build overall metrics query.

        Args:
            df: Input DataFrame
            days: Number of days to analyze

        Returns:
            DataFrame with overall (ungrouped) quality metrics
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        aggs = QueryBuilder.get_quality_aggregations()

        return filtered_df.agg(*QueryBuilder._named_aggregations(aggs))

    @staticmethod
    def build_anomaly_detection_query(
        df: DataFrame, threshold_column: str, threshold_value: float
    ) -> DataFrame:
        """
        Build anomaly detection query.

        Args:
            df: Input DataFrame
            threshold_column: Column to check against threshold
            threshold_value: Threshold value

        Returns:
            Rows whose ``threshold_column`` is strictly below the threshold
        """
        return df.filter(F.col(threshold_column) < threshold_value)

    @staticmethod
    def build_performance_anomaly_query(
        df: DataFrame, performance_threshold: float
    ) -> DataFrame:
        """
        Build performance anomaly detection query.

        A row is kept when any of these hold: ``execution_time`` exceeds the
        threshold, ``validation_rate`` is below 80.0, or ``success`` is false.

        Args:
            df: Input DataFrame
            performance_threshold: Performance threshold value

        Returns:
            DataFrame with performance anomalies
        """
        too_slow = F.col("execution_time") > performance_threshold
        low_quality = F.col("validation_rate") < 80.0
        failed = ~F.col("success")
        return df.filter(too_slow | low_quality | failed)

    @staticmethod
    def build_quality_anomaly_query(
        df: DataFrame, quality_threshold: float = 90.0
    ) -> DataFrame:
        """
        Build quality anomaly detection query.

        Args:
            df: Input DataFrame
            quality_threshold: Quality threshold value

        Returns:
            Rows whose ``validation_rate`` is strictly below the threshold
        """
        return df.filter(F.col("validation_rate") < quality_threshold)

    @staticmethod
    def build_temporal_anomaly_query(
        df: DataFrame, change_threshold: float = -10.0
    ) -> DataFrame:
        """
        Build temporal anomaly detection query.

        Computes the average ``validation_rate`` per day, then the change
        versus the previous day, and keeps days whose change is strictly
        below ``change_threshold``.

        Args:
            df: Input DataFrame
            change_threshold: Change threshold value

        Returns:
            DataFrame with temporal anomalies, largest drop first
        """
        # First, calculate daily quality metrics
        daily_quality_df = (
            QueryBuilder.add_date_column(df)
            .groupBy("date")
            .agg(F.avg("validation_rate").alias("daily_avg_validation_rate"))
            .orderBy("date")
        )

        # Use window function to calculate lag and quality change.
        # The earliest day has no previous value, so its change is null and
        # it never passes the filter below.
        window_spec = Window.orderBy("date")
        return (
            daily_quality_df.withColumn(
                "prev_avg_validation_rate",
                F.lag("daily_avg_validation_rate", 1).over(window_spec),
            )
            .withColumn(
                "quality_change",
                F.col("daily_avg_validation_rate") - F.col("prev_avg_validation_rate"),
            )
            .filter(F.col("quality_change") < change_threshold)
            .orderBy("quality_change")
        )

    @staticmethod
    def calculate_statistics(df: DataFrame, column: str) -> Dict[str, float]:
        """
        Calculate basic statistics for a column.

        Triggers a Spark action (``collect``) on a single-row aggregate.

        Args:
            df: Input DataFrame
            column: Column name to calculate statistics for

        Returns:
            Dictionary with keys ``avg``, ``stddev``, ``min`` and ``max``
        """
        stats_df = df.agg(
            F.avg(column).alias("avg"),
            F.stddev(column).alias("stddev"),
            F.min(column).alias("min"),
            F.max(column).alias("max"),
        )

        result = stats_df.collect()[0]
        return {key: result[key] for key in ("avg", "stddev", "min", "max")}

    @staticmethod
    def build_recent_performance_query(df: DataFrame, days: int = 7) -> DataFrame:
        """
        Build recent performance query.

        Args:
            df: Input DataFrame
            days: Number of recent days to analyze

        Returns:
            DataFrame with one row per ``date``, ordered by ``date``
        """
        filtered_df = QueryBuilder.filter_by_date_range(df, days)
        dated_df = QueryBuilder.add_date_column(filtered_df)
        aggs = {
            "daily_executions": F.count("*"),
            "avg_execution_time": F.avg("execution_time"),
            "avg_validation_rate": F.avg("validation_rate"),
        }

        return (
            dated_df.groupBy("date")
            .agg(*QueryBuilder._named_aggregations(aggs))
            .orderBy("date")
        )

In [None]:
# Module: pipeline_builder.validation.utils (pipeline_builder)
#
# Dependencies: pipeline_builder.compat

from __future__ import annotations

from typing import Any, Dict

# from ..compat import DataFrame  # Removed: defined in notebook cells above
# Re-export safe_divide from base for backward compatibility
# from .validation import safe_divide  # Removed: defined in notebook cells above


def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide ``numerator`` by ``denominator`` without raising on bad operands.

    Args:
        numerator: The value to be divided
        denominator: The value to divide by
        default: Fallback returned when the division cannot be performed

    Returns:
        ``numerator / denominator``, or ``default`` when either operand is
        None or the denominator equals zero
    """
    operand_missing = denominator is None or numerator is None
    if operand_missing or denominator == 0:
        return default
    return numerator / denominator


def get_dataframe_info(df: DataFrame) -> Dict[str, Any]:
    """
    Summarise a DataFrame as a plain dictionary.

    Calls ``df.count()``, so it runs a Spark action. Any exception raised
    while inspecting the DataFrame is caught and reported through an
    ``"error"`` entry alongside empty placeholder values.

    Args:
        df: DataFrame to analyze

    Returns:
        Dictionary with ``row_count``, ``column_count``, ``columns``,
        ``schema`` (as a string) and ``is_empty``; plus ``error`` on failure
    """
    try:
        n_rows = df.count()
        column_names = df.columns
        info: Dict[str, Any] = {
            "row_count": n_rows,
            "column_count": len(column_names),
            "columns": column_names,
            "schema": str(df.schema),
            "is_empty": n_rows == 0,
        }
    except Exception as exc:
        # Best-effort helper: report the failure instead of propagating it.
        info = {
            "error": str(exc),
            "row_count": 0,
            "column_count": 0,
            "columns": [],
            "schema": "unknown",
            "is_empty": True,
        }
    return info

In [None]:
# Module: pipeline_builder.pipeline.models (pipeline_builder)
#
# Dependencies: models.pipeline

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict

# from .models import PipelineMetrics  # Removed: defined in notebook cells above


class PipelineMode(Enum):
    """Pipeline execution modes.

    Each member's value is the lowercase string form of its name; that
    string is what ``PipelineReport.to_dict`` emits under ``"mode"``.
    """

    INITIAL = "initial"
    INCREMENTAL = "incremental"
    FULL_REFRESH = "full_refresh"
    VALIDATION_ONLY = "validation_only"


class PipelineStatus(Enum):
    """Pipeline execution status.

    Each member's value is the lowercase string form of its name.
    ``PipelineReport.success`` requires ``COMPLETED``; the string value is
    exposed through ``PipelineReport.status_str`` and ``to_dict``.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PAUSED = "paused"


# PipelineMetrics moved to main models.py to avoid duplication


@dataclass
class PipelineReport:
    """Record of one pipeline execution: identity, timing, metrics and results.

    ``metrics`` defaults to a fresh ``PipelineMetrics()``; the per-layer
    result mappings and the message lists default to empty containers.
    """

    pipeline_id: str
    execution_id: str
    mode: PipelineMode
    status: PipelineStatus  # enum member; see status_str for the plain string
    start_time: datetime
    end_time: datetime | None = None
    duration_seconds: float = 0.0
    metrics: PipelineMetrics = field(default_factory=PipelineMetrics)
    bronze_results: Dict[str, Any] = field(default_factory=dict)
    silver_results: Dict[str, Any] = field(default_factory=dict)
    gold_results: Dict[str, Any] = field(default_factory=dict)
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)
    execution_groups_count: int = 0
    max_group_size: int = 0

    @property
    def success(self) -> bool:
        """True only when the status is COMPLETED and no errors were recorded."""
        finished_ok = self.status == PipelineStatus.COMPLETED
        return finished_ok and len(self.errors) == 0

    @property
    def status_str(self) -> str:
        """The status enum's string value."""
        return self.status.value

    @property
    def successful_steps(self) -> int:
        """Shortcut for ``metrics.successful_steps``."""
        return self.metrics.successful_steps

    @property
    def failed_steps(self) -> int:
        """Shortcut for ``metrics.failed_steps``."""
        return self.metrics.failed_steps

    @property
    def parallel_efficiency(self) -> float:
        """Shortcut for ``metrics.parallel_efficiency``."""
        return self.metrics.parallel_efficiency

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the report to a dictionary.

        Enums are emitted as their string values and datetimes as ISO-8601
        strings (``end_time`` is None when unset). ``execution_groups_count``
        and ``max_group_size`` are not part of the output.
        """
        # Metric attributes copied into the nested "metrics" mapping, in order.
        metric_names = (
            "total_steps",
            "successful_steps",
            "failed_steps",
            "skipped_steps",
            "total_duration",
            "bronze_duration",
            "silver_duration",
            "gold_duration",
            "total_rows_processed",
            "total_rows_written",
            "parallel_efficiency",
            "cache_hit_rate",
            "error_count",
            "retry_count",
        )
        metrics_snapshot = {name: getattr(self.metrics, name) for name in metric_names}
        finished_at = self.end_time.isoformat() if self.end_time else None

        return {
            "pipeline_id": self.pipeline_id,
            "execution_id": self.execution_id,
            "mode": self.mode.value,
            "status": self.status.value,
            "start_time": self.start_time.isoformat(),
            "end_time": finished_at,
            "duration_seconds": self.duration_seconds,
            "metrics": metrics_snapshot,
            "bronze_results": self.bronze_results,
            "silver_results": self.silver_results,
            "gold_results": self.gold_results,
            "errors": self.errors,
            "warnings": self.warnings,
            "recommendations": self.recommendations,
        }


# ParallelConfig and PipelineConfig moved to main models.py to avoid duplication


@dataclass
class StepExecutionContext:
    """Bookkeeping for a single step run: its name, type, mode and start time.

    ``dependencies`` and ``metadata`` default to a new empty list / dict per
    instance.
    """

    step_name: str
    step_type: str
    mode: PipelineMode
    start_time: datetime
    dependencies: list[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def duration(self) -> float:
        """Seconds elapsed between ``start_time`` and the moment of the call."""
        elapsed = datetime.now() - self.start_time
        return elapsed.total_seconds()

In [None]:
# Module: pipeline_builder.models.steps (pipeline_builder)
#
# Dependencies: models.base, models.types, pipeline_builder_base.errors

from __future__ import annotations

from dataclasses import dataclass

# from .errors import PipelineValidationError, ValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel  # Removed: defined in notebook cells above
# from .types import ColumnRules, GoldTransformFunction, SilverTransformFunction  # Removed: defined in notebook cells above


@dataclass
class BronzeStep(BaseModel):
    """
    Bronze layer step configuration for raw data validation and ingestion.

    A Bronze step names a raw data source and the validation rules applied
    to its columns, and optionally an incremental column.

    **Checks performed at construction (``__post_init__``):**
        - `name`: non-empty string
        - `rules`: non-empty dictionary
        - `incremental_col`: string when provided

    ``validate()`` repeats these checks (accepting an empty ``rules``
    dictionary) and raises ``PipelineValidationError`` instead.

    Attributes:
        name: Unique identifier for this Bronze step
        rules: Dictionary mapping column names to validation rule lists
               (PySpark Column expressions or strings).
        incremental_col: Column name for incremental processing
                        (e.g., "timestamp"); None means the step reports no
                        incremental capability.
        schema: Optional schema name for reading bronze data

    Raises:
        ValidationError: If a construction-time check fails

    Example:
        >>> from pyspark.sql import functions as F
        >>>
        >>> bronze_step = BronzeStep(
        ...     name="user_events",
        ...     rules={
        ...         "user_id": [F.col("user_id").isNotNull()],
        ...         "timestamp": [F.col("timestamp").isNotNull()],
        ...     },
        ...     incremental_col="timestamp",
        ... )
        >>> bronze_step.validate()
        >>> bronze_step.has_incremental_capability
        True

        >>> # Empty name and rules: the name check fires first
        >>> try:
        ...     BronzeStep(name="", rules={})
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Step name must be a non-empty string"
    """

    name: str
    rules: ColumnRules
    incremental_col: str | None = None
    schema: str | None = None

    def __post_init__(self) -> None:
        """Reject invalid field values as soon as the step is constructed."""
        if not (self.name and isinstance(self.name, str)):
            raise ValidationError("Step name must be a non-empty string")

        if not (isinstance(self.rules, dict) and self.rules):
            raise ValidationError("Rules must be a non-empty dictionary")

        column = self.incremental_col
        if not (column is None or isinstance(column, str)):
            raise ValidationError("Incremental column must be a string")

    def validate(self) -> None:
        """Re-check the configuration, raising PipelineValidationError on failure."""
        if not (self.name and isinstance(self.name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        # Unlike construction, an empty rules dictionary is accepted here.
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")

        column = self.incremental_col
        if not (column is None or isinstance(column, str)):
            raise PipelineValidationError("Incremental column must be a string")

    @property
    def has_incremental_capability(self) -> bool:
        """True when an incremental column has been configured."""
        return self.incremental_col is not None


@dataclass
class SilverStep(BaseModel):
    """
    Silver layer step configuration for data cleaning and enrichment.

    Silver steps represent the second layer of the Medallion Architecture,
    transforming raw Bronze data into clean, business-ready datasets.
    They apply data quality rules, business logic, and data transformations.

    **Validation Requirements:**
        - `name`: Must be a non-empty string
        - `source_bronze`: Must be a non-empty string (except for existing tables)
        - `transform`: Must be callable and cannot be None
        - `rules`: Must be a dictionary (checked by `validate()`)
        - `table_name`: Must be a non-empty string
        - `source_incremental_col`: Must be a string if provided

    Attributes:
        name: Unique identifier for this Silver step
        source_bronze: Name of the Bronze step providing input data
        transform: Transformation function with signature:
                 (spark: SparkSession, bronze_df: DataFrame, prior_silvers: Dict[str, DataFrame]) -> DataFrame
                 Must be callable and cannot be None.
        rules: Dictionary mapping column names to validation rule lists.
               Each rule should be a PySpark Column expression.
        table_name: Target Delta table name where results will be stored
        watermark_col: Column name for watermarking (e.g., "timestamp", "updated_at").
        existing: Whether this represents an existing table (for validation-only steps).
                  When True, `source_bronze` is not required.
        schema: Optional schema name for writing silver data
        source_incremental_col: Optional column name; only type-checked here
                  (presumably the incremental column of the source data -
                  confirm against the execution code).

    Raises:
        ValidationError: If validation requirements are not met during construction

    Example:
        >>> def clean_user_events(spark, bronze_df, prior_silvers):
        ...     return (bronze_df
        ...         .filter(F.col("user_id").isNotNull())
        ...         .withColumn("event_date", F.date_trunc("day", "timestamp"))
        ...         .withColumn("is_weekend", F.dayofweek("timestamp").isin([1, 7]))
        ...     )
        >>>
        >>> # Valid Silver step
        >>> silver_step = SilverStep(
        ...     name="clean_events",
        ...     source_bronze="user_events",
        ...     transform=clean_user_events,
        ...     rules={
        ...         "user_id": [F.col("user_id").isNotNull()],
        ...         "event_date": [F.col("event_date").isNotNull()]
        ...     },
        ...     table_name="clean_user_events",
        ...     watermark_col="timestamp"
        ... )

        >>> # Invalid Silver step (will raise ValidationError)
        >>> try:
        ...     SilverStep(name="clean_events", source_bronze="", transform=None, rules={}, table_name="")
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Source bronze step name must be a non-empty string"
    """

    name: str
    source_bronze: str
    transform: SilverTransformFunction
    rules: ColumnRules
    table_name: str
    watermark_col: str | None = None
    existing: bool = False
    schema: str | None = None
    source_incremental_col: str | None = None

    def __post_init__(self) -> None:
        """Validate required fields after initialization."""
        if not self.name or not isinstance(self.name, str):
            raise ValidationError("Step name must be a non-empty string")
        # Existing-table steps have no bronze source, so skip that check.
        if not self.existing and (
            not self.source_bronze or not isinstance(self.source_bronze, str)
        ):
            raise ValidationError("Source bronze step name must be a non-empty string")
        if self.transform is None or not callable(self.transform):
            raise ValidationError("Transform function is required and must be callable")
        if not self.table_name or not isinstance(self.table_name, str):
            raise ValidationError("Table name must be a non-empty string")
        if self.source_incremental_col is not None and not isinstance(
            self.source_incremental_col, str
        ):
            raise ValidationError("source_incremental_col must be a string")

    def validate(self) -> None:
        """Validate silver step configuration.

        Repeats the construction-time checks, additionally requires ``rules``
        to be a dictionary, and raises ``PipelineValidationError`` on failure.

        Raises:
            PipelineValidationError: If any field fails its check
        """
        if not self.name or not isinstance(self.name, str):
            raise PipelineValidationError("Step name must be a non-empty string")
        # Same exemption as __post_init__: a step over an existing table is
        # constructed without a bronze source and must still validate.
        if not self.existing and (
            not self.source_bronze or not isinstance(self.source_bronze, str)
        ):
            raise PipelineValidationError(
                "Source bronze step name must be a non-empty string"
            )
        if not callable(self.transform):
            raise PipelineValidationError("Transform must be a callable function")
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")
        if not self.table_name or not isinstance(self.table_name, str):
            raise PipelineValidationError("Table name must be a non-empty string")
        if self.source_incremental_col is not None and not isinstance(
            self.source_incremental_col, str
        ):
            raise PipelineValidationError(
                "source_incremental_col must be a string when provided"
            )


@dataclass
class GoldStep(BaseModel):
    """
    Configuration of a single Gold-layer step (analytics / reporting output).

    Gold is the third layer of the Medallion Architecture: a Gold step
    consumes Silver-layer DataFrames and produces a business-ready dataset
    that is stored in a target table.

    **Validation Requirements (checked when the object is constructed):**
        - `name`: non-empty string
        - `transform`: callable; None is rejected
        - `rules`: non-empty dictionary of validation rules
        - `table_name`: non-empty string
        - `source_silvers`: None, or a non-empty list

    Attributes:
        name: Unique identifier for this Gold step.
        transform: Transformation function with signature
                 (spark: SparkSession, silvers: Dict[str, DataFrame]) -> DataFrame
                 - spark: active SparkSession
                 - silvers: Silver DataFrames keyed by step name
        rules: Mapping of column name to a list of validation rules
               (each rule should be a PySpark Column expression).
        table_name: Name of the target Delta table for the results.
        source_silvers: Names of the Silver steps to read from. When None,
                       all available Silver steps are used.
        schema: Optional schema name for writing gold data.

    Raises:
        ValidationError: From ``__post_init__`` when a requirement above is
            violated.

    Example:
        >>> def user_daily_metrics(spark, silvers):
        ...     events_df = silvers["clean_events"]
        ...     return (events_df
        ...         .groupBy("user_id", "event_date")
        ...         .agg(F.count("*").alias("total_events"))
        ...     )
        >>>
        >>> gold_step = GoldStep(
        ...     name="user_metrics",
        ...     transform=user_daily_metrics,
        ...     rules={"user_id": [F.col("user_id").isNotNull()]},
        ...     table_name="user_daily_metrics",
        ...     source_silvers=["clean_events"],
        ... )
        >>>
        >>> # An invalid step is rejected immediately
        >>> try:
        ...     GoldStep(name="", transform=None, rules={}, table_name="", source_silvers=[])
        ... except ValidationError as e:
        ...     print(f"Validation failed: {e}")
        ...     # Output: "Step name must be a non-empty string"
    """

    name: str
    transform: GoldTransformFunction
    rules: ColumnRules
    table_name: str
    source_silvers: list[str] | None = None
    schema: str | None = None

    def __post_init__(self) -> None:
        """Reject invalid field values as soon as the step is built."""
        if not (self.name and isinstance(self.name, str)):
            raise ValidationError("Step name must be a non-empty string")
        # callable(None) is False, so this also covers a missing transform.
        if not callable(self.transform):
            raise ValidationError("Transform function is required and must be callable")
        if not (self.table_name and isinstance(self.table_name, str)):
            raise ValidationError("Table name must be a non-empty string")
        if not (isinstance(self.rules, dict) and self.rules):
            raise ValidationError("Rules must be a non-empty dictionary")
        silvers = self.source_silvers
        if silvers is not None and not (isinstance(silvers, list) and silvers):
            raise ValidationError("Source silvers must be a non-empty list")

    def validate(self) -> None:
        """
        Check the step's current field values.

        Raises:
            PipelineValidationError: On the first field that fails its check.
        """
        if not (self.name and isinstance(self.name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")
        if not callable(self.transform):
            raise PipelineValidationError("Transform must be a callable function")
        if not isinstance(self.rules, dict):
            raise PipelineValidationError("Rules must be a dictionary")
        if not (self.table_name and isinstance(self.table_name, str)):
            raise PipelineValidationError("Table name must be a non-empty string")
        silvers = self.source_silvers
        if not (silvers is None or isinstance(silvers, list)):
            raise PipelineValidationError("Source silvers must be a list or None")

In [None]:
# Module: pipeline_builder.models.execution (pipeline_builder)
#
# Dependencies: models.base, models.exceptions, models.pipeline, pipeline_builder.models.enums

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict

# from .base import BaseModel  # Removed: defined in notebook cells above
# from .enums import ExecutionMode, PipelinePhase  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError  # Removed: defined in notebook cells above
# from .pipeline import PipelineMetrics  # Removed: defined in notebook cells above


@dataclass
class ExecutionContext(BaseModel):
    """
    Context for pipeline execution.

    Several fields exist twice under different names for writer
    compatibility (``started_at``/``start_time``, ``ended_at``/``end_time``,
    ``run_mode``/``mode``). The aliases are filled in by ``__post_init__``
    and ``ended_at`` is refreshed by ``finish()``.

    Attributes:
        mode: Execution mode (initial/incremental)
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Total execution duration
        run_id: Unique run identifier
        execution_id: Unique identifier for this execution
        pipeline_id: Identifier for the pipeline being executed
        schema: Target schema for data storage
        started_at: When execution started (alias for start_time)
        ended_at: When execution ended (alias for end_time)
        run_mode: Mode of execution (alias for mode)
        config: Pipeline configuration as dictionary
    """

    mode: ExecutionMode
    start_time: datetime
    end_time: datetime | None = None
    duration_secs: float | None = None
    run_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Additional fields for writer compatibility
    execution_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    pipeline_id: str = "unknown"
    schema: str = "default"
    started_at: datetime | None = None
    ended_at: datetime | None = None
    run_mode: str = "initial"
    config: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Fill the alias fields that were left at their defaults."""
        if self.started_at is None:
            self.started_at = self.start_time
        if self.ended_at is None:
            self.ended_at = self.end_time
        # "initial" is the field default, so it is treated as "not set" and
        # replaced by a string derived from ``mode``.
        if self.run_mode == "initial":
            if hasattr(self.mode, "value"):
                self.run_mode = self.mode.value
            elif hasattr(self.mode, "name"):
                self.run_mode = self.mode.name.lower()

    def validate(self) -> None:
        """
        Validate the execution context.

        Raises:
            ValueError: If ``run_id`` is empty or ``duration_secs`` is negative.
        """
        if not self.run_id:
            raise ValueError("Run ID cannot be empty")
        if self.duration_secs is not None and self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")

    def finish(self) -> None:
        """
        Mark execution as finished and calculate duration.

        Sets ``end_time`` (and its alias ``ended_at``) to the current UTC
        time and, when ``start_time`` is set, stores the elapsed seconds in
        ``duration_secs``.
        """
        # Imported locally: the notebook's shared import cell only brings in
        # ``datetime`` and ``timedelta`` from the datetime module, so this
        # method must not rely on another cell having imported ``timezone``.
        from datetime import timezone

        self.end_time = datetime.now(timezone.utc)
        # __post_init__ copies end_time into ended_at only once, at
        # construction; refresh the alias so it does not stay None.
        self.ended_at = self.end_time
        if self.start_time:
            started = self.start_time
            if started.tzinfo is None:
                # Subtracting a naive datetime from an aware one raises
                # TypeError. astimezone() on a naive value interprets it as
                # system local time, which is the datetime module's own
                # convention for naive values.
                started = started.astimezone()
            self.duration_secs = (self.end_time - started).total_seconds()

    @property
    def is_finished(self) -> bool:
        """Check if execution is finished (``end_time`` has been set)."""
        return self.end_time is not None

    @property
    def is_running(self) -> bool:
        """Check if execution is currently running (not yet finished)."""
        return not self.is_finished


@dataclass
class StageStats(BaseModel):
    """
    Row-level statistics recorded for one step of one pipeline stage.

    Attributes:
        stage: Stage name (bronze/silver/gold)
        step: Step name
        total_rows: Total number of rows processed
        valid_rows: Number of valid rows
        invalid_rows: Number of invalid rows
        validation_rate: Validation success rate (0-100)
        duration_secs: Processing duration in seconds
        start_time: When processing started
        end_time: When processing ended
    """

    stage: str
    step: str
    total_rows: int
    valid_rows: int
    invalid_rows: int
    validation_rate: float
    duration_secs: float
    start_time: datetime | None = None
    end_time: datetime | None = None

    def validate(self) -> None:
        """
        Check that the numbers are internally consistent.

        Raises:
            PipelineConfigurationError: If valid + invalid rows do not add up
                to the total, the validation rate is outside 0-100, or the
                duration is negative.
        """
        accounted_rows = self.valid_rows + self.invalid_rows
        if accounted_rows != self.total_rows:
            raise PipelineConfigurationError(
                f"Total rows ({self.total_rows}) must equal valid ({self.valid_rows}) + invalid ({self.invalid_rows})"
            )

        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise PipelineConfigurationError(
                f"Validation rate must be between 0 and 100, got {self.validation_rate}"
            )

        if self.duration_secs < 0:
            raise PipelineConfigurationError(
                f"Duration must be non-negative, got {self.duration_secs}"
            )

    @property
    def is_valid(self) -> bool:
        """True when the validation rate reaches the default 95% threshold."""
        default_threshold = 95.0
        return self.validation_rate >= default_threshold

    @property
    def error_rate(self) -> float:
        """Share of invalid rows as a percentage; 0.0 when there are no rows."""
        return (
            0.0
            if self.total_rows == 0
            else (self.invalid_rows / self.total_rows) * 100
        )

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second; 0.0 when the duration is zero."""
        return (
            0.0 if self.duration_secs == 0 else self.total_rows / self.duration_secs
        )


@dataclass
class StepResult(BaseModel):
    """
    Outcome of executing one pipeline step.

    Attributes:
        step_name: Name of the step
        phase: Pipeline phase
        success: Whether the step succeeded
        start_time: When execution started
        end_time: When execution ended
        duration_secs: Execution duration in seconds
        rows_processed: Number of rows processed
        rows_written: Number of rows written
        validation_rate: Validation success rate
        error_message: Error message if failed
        step_type: Type of step (bronze, silver, gold)
        table_fqn: Fully qualified table name if step writes to table
        write_mode: Write mode used (overwrite, append)
        input_rows: Number of input rows processed
    """

    step_name: str
    phase: PipelinePhase
    success: bool
    start_time: datetime
    end_time: datetime
    duration_secs: float
    rows_processed: int
    rows_written: int
    validation_rate: float
    error_message: str | None = None
    step_type: str | None = None
    table_fqn: str | None = None
    write_mode: str | None = None
    input_rows: int | None = None

    def validate(self) -> None:
        """
        Check the result for impossible values.

        Raises:
            ValueError: If the step name is empty, a duration or row count is
                negative, or the validation rate is outside 0-100.
        """
        if not self.step_name:
            raise ValueError("Step name cannot be empty")
        if self.duration_secs < 0:
            raise ValueError("Duration cannot be negative")
        if self.rows_processed < 0:
            raise ValueError("Rows processed cannot be negative")
        if self.rows_written < 0:
            raise ValueError("Rows written cannot be negative")
        rate_in_range = 0 <= self.validation_rate <= 100
        if not rate_in_range:
            raise ValueError("Validation rate must be between 0 and 100")

    @property
    def is_valid(self) -> bool:
        """Succeeded with a validation rate of at least 95%."""
        return self.success and self.validation_rate >= 95.0

    @property
    def is_high_quality(self) -> bool:
        """Succeeded with a validation rate of at least 98%."""
        return self.success and self.validation_rate >= 98.0

    @property
    def throughput_rows_per_sec(self) -> float:
        """Rows processed per second; 0.0 when the duration is zero."""
        return (
            0.0
            if self.duration_secs == 0
            else self.rows_processed / self.duration_secs
        )

    @classmethod
    def create_success(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        rows_processed: int,
        rows_written: int,
        validation_rate: float,
        step_type: str | None = None,
        table_fqn: str | None = None,
        write_mode: str | None = None,
        input_rows: int | None = None,
    ) -> StepResult:
        """
        Build the result of a step that succeeded.

        ``duration_secs`` is derived from ``end_time - start_time``;
        ``error_message`` is set to None.
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=True,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=rows_processed,
            rows_written=rows_written,
            validation_rate=validation_rate,
            error_message=None,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

    @classmethod
    def create_failure(
        cls,
        step_name: str,
        phase: PipelinePhase,
        start_time: datetime,
        end_time: datetime,
        error_message: str,
        step_type: str | None = None,
        table_fqn: str | None = None,
        write_mode: str | None = None,
        input_rows: int | None = None,
    ) -> StepResult:
        """
        Build the result of a step that failed.

        Row counts and the validation rate are zeroed; ``duration_secs`` is
        derived from ``end_time - start_time``.
        """
        elapsed = end_time - start_time
        return cls(
            step_name=step_name,
            phase=phase,
            success=False,
            start_time=start_time,
            end_time=end_time,
            duration_secs=elapsed.total_seconds(),
            rows_processed=0,
            rows_written=0,
            validation_rate=0.0,
            error_message=error_message,
            step_type=step_type,
            table_fqn=table_fqn,
            write_mode=write_mode,
            input_rows=input_rows,
        )

    @property
    def error_rate(self) -> float:
        """Complement of the validation rate; 0.0 when no rows were processed."""
        return 0.0 if self.rows_processed == 0 else 100.0 - self.validation_rate


@dataclass
class ExecutionResult(BaseModel):
    """
    Outcome of a whole pipeline run.

    Attributes:
        context: Execution context
        step_results: Results for each step
        metrics: Overall execution metrics
        success: Whether the entire pipeline succeeded
    """

    context: ExecutionContext
    step_results: list[StepResult]
    metrics: PipelineMetrics
    success: bool

    def validate(self) -> None:
        """
        Check that every field holds a value of the expected type.

        Raises:
            PipelineConfigurationError: For the first field whose type is wrong.
        """
        # (value, required type, error message) — checked in declaration order.
        type_checks = (
            (
                self.context,
                ExecutionContext,
                "Context must be an ExecutionContext instance",
            ),
            (self.step_results, list, "Step results must be a list"),
            (
                self.metrics,
                PipelineMetrics,
                "Metrics must be a PipelineMetrics instance",
            ),
            (self.success, bool, "Success must be a boolean"),
        )
        for value, required_type, message in type_checks:
            if not isinstance(value, required_type):
                raise PipelineConfigurationError(message)

    @classmethod
    def from_context_and_results(
        cls, context: ExecutionContext, step_results: list[StepResult]
    ) -> ExecutionResult:
        """
        Build a result from a context and the per-step results.

        Metrics are aggregated with ``PipelineMetrics.from_step_results`` and
        the run counts as successful when every step result reports success.
        """
        aggregated = PipelineMetrics.from_step_results(step_results)
        every_step_ok = all(step.success for step in step_results)
        return cls(
            context=context,
            step_results=step_results,
            metrics=aggregated,
            success=every_step_ok,
        )

In [None]:
# Module: pipeline_builder.models.pipeline (pipeline_builder)
#
# Dependencies: models.base, pipeline_builder_base.errors

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel, ParallelConfig, ValidationThresholds  # Removed: defined in notebook cells above


@dataclass
class PipelineConfig(BaseModel):
    """
    Top-level configuration of a pipeline.

    Attributes:
        schema: Database schema name
        thresholds: Validation thresholds for each phase
        parallel: Parallel execution configuration (a plain bool is accepted
            and converted to a ParallelConfig on construction)
        verbose: Whether to enable verbose logging
    """

    schema: str
    thresholds: ValidationThresholds
    parallel: ParallelConfig | bool
    verbose: bool = True

    def __post_init__(self) -> None:
        """Turn a boolean ``parallel`` flag into a ParallelConfig (backward compatibility)."""
        if not isinstance(self.parallel, bool):
            return
        # True -> default parallel settings, False -> sequential settings.
        make_config = (
            ParallelConfig.create_default
            if self.parallel
            else ParallelConfig.create_sequential
        )
        object.__setattr__(self, "parallel", make_config())

    @property
    def min_bronze_rate(self) -> float:
        """Validation threshold configured for the bronze phase."""
        return self.thresholds.bronze

    @property
    def min_silver_rate(self) -> float:
        """Validation threshold configured for the silver phase."""
        return self.thresholds.silver

    @property
    def min_gold_rate(self) -> float:
        """Validation threshold configured for the gold phase."""
        return self.thresholds.gold

    @property
    def enable_parallel_silver(self) -> bool:
        """Whether parallel execution is enabled."""
        parallel = self.parallel
        # Anything other than a ParallelConfig (e.g. a mock config in tests)
        # is reduced to its truthiness.
        return (
            parallel.enabled if isinstance(parallel, ParallelConfig) else bool(parallel)
        )

    @property
    def max_parallel_workers(self) -> int:
        """Maximum worker count; 4 when ``parallel`` is not a ParallelConfig."""
        parallel = self.parallel
        return parallel.max_workers if isinstance(parallel, ParallelConfig) else 4

    @property
    def enable_caching(self) -> bool:
        """``parallel.enable_caching`` when that attribute exists, otherwise True."""
        return getattr(self.parallel, "enable_caching", True)

    @property
    def enable_monitoring(self) -> bool:
        """``parallel.enable_monitoring`` when that attribute exists, otherwise True."""
        return getattr(self.parallel, "enable_monitoring", True)

    def validate(self) -> None:
        """
        Validate the schema name, the thresholds and the parallel settings.

        Raises:
            PipelineValidationError: If ``schema`` is not a non-empty string.
                Errors raised by the nested ``validate()`` calls propagate.
        """
        if not (self.schema and isinstance(self.schema, str)):
            raise PipelineValidationError("Schema name must be a non-empty string")
        self.thresholds.validate()
        parallel = self.parallel
        if isinstance(parallel, ParallelConfig):
            parallel.validate()

    @classmethod
    def create_default(cls, schema: str) -> PipelineConfig:
        """Default thresholds, default parallel settings, verbose logging on."""
        thresholds = ValidationThresholds.create_default()
        parallel = ParallelConfig.create_default()
        return cls(schema=schema, thresholds=thresholds, parallel=parallel, verbose=True)

    @classmethod
    def create_high_performance(cls, schema: str) -> PipelineConfig:
        """Strict thresholds, high-performance parallel settings, verbose logging off."""
        thresholds = ValidationThresholds.create_strict()
        parallel = ParallelConfig.create_high_performance()
        return cls(
            schema=schema, thresholds=thresholds, parallel=parallel, verbose=False
        )

    @classmethod
    def create_conservative(cls, schema: str) -> PipelineConfig:
        """Strict thresholds, sequential execution, verbose logging on."""
        thresholds = ValidationThresholds.create_strict()
        parallel = ParallelConfig.create_sequential()
        return cls(schema=schema, thresholds=thresholds, parallel=parallel, verbose=True)


@dataclass
class PipelineMetrics(BaseModel):
    """
    Aggregated metrics for one pipeline execution.

    Attributes:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        skipped_steps: Number of skipped steps
        total_duration: Total execution duration
        bronze_duration: Bronze layer duration
        silver_duration: Silver layer duration
        gold_duration: Gold layer duration
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate
        parallel_efficiency: Parallel execution efficiency
        cache_hit_rate: Cache hit rate
        error_count: Number of errors
        retry_count: Number of retries
    """

    total_steps: int = 0
    successful_steps: int = 0
    failed_steps: int = 0
    skipped_steps: int = 0
    total_duration: float = 0.0
    bronze_duration: float = 0.0
    silver_duration: float = 0.0
    gold_duration: float = 0.0
    total_rows_processed: int = 0
    total_rows_written: int = 0
    avg_validation_rate: float = 0.0
    parallel_efficiency: float = 0.0
    cache_hit_rate: float = 0.0
    error_count: int = 0
    retry_count: int = 0

    def validate(self) -> None:
        """
        Check counters and rates for impossible values.

        Raises:
            ValueError: If a step counter or the total duration is negative,
                or the average validation rate is outside 0-100.
        """
        # Attribute name -> message, checked in this order.
        must_not_be_negative = (
            ("total_steps", "Total steps cannot be negative"),
            ("successful_steps", "Successful steps cannot be negative"),
            ("failed_steps", "Failed steps cannot be negative"),
            ("skipped_steps", "Skipped steps cannot be negative"),
            ("total_duration", "Total duration cannot be negative"),
        )
        for attribute, message in must_not_be_negative:
            if getattr(self, attribute) < 0:
                raise ValueError(message)

        rate_in_range = 0 <= self.avg_validation_rate <= 100
        if not rate_in_range:
            raise ValueError("Average validation rate must be between 0 and 100")

    @property
    def success_rate(self) -> float:
        """Successful steps as a percentage of all steps; 0.0 when there are none."""
        if self.total_steps > 0:
            return self.successful_steps / self.total_steps * 100
        return 0.0

    @property
    def failure_rate(self) -> float:
        """Complement of ``success_rate`` (100 minus the success rate)."""
        return 100.0 - self.success_rate

    @classmethod
    def from_step_results(cls, step_results: list[Any]) -> PipelineMetrics:
        """
        Aggregate per-step results into pipeline metrics.

        Each result is read for ``success``, ``duration_secs``,
        ``rows_processed``, ``rows_written`` and ``validation_rate``. Fields
        not derived here keep their defaults.
        """
        step_count = len(step_results)
        succeeded = len([step for step in step_results if step.success])
        duration_total = sum(step.duration_secs for step in step_results)
        processed_total = sum(step.rows_processed for step in step_results)
        written_total = sum(step.rows_written for step in step_results)

        # Guard the division: an empty result list has no average.
        if step_count > 0:
            mean_validation_rate = (
                sum(step.validation_rate for step in step_results) / step_count
            )
        else:
            mean_validation_rate = 0.0

        return cls(
            total_steps=step_count,
            successful_steps=succeeded,
            failed_steps=step_count - succeeded,
            total_duration=duration_total,
            total_rows_processed=processed_total,
            total_rows_written=written_total,
            avg_validation_rate=mean_validation_rate,
        )

In [None]:
# Module: pipeline_builder.models.base (pipeline_builder)
#
# Dependencies: pipeline_builder.models.enums, pipeline_builder.models.types, pipeline_builder_base.errors

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict

# from ..errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .enums import PipelinePhase  # Removed: defined in notebook cells above
# from .types import ModelValue  # Removed: defined in notebook cells above


@dataclass
class BaseModel(ABC):
    """
    Base class for all pipeline models with common functionality.

    Provides standard validation, serialization, and representation methods
    for all pipeline data models. Subclasses are expected to be dataclasses
    and must implement ``validate``.

    Features:
    - Abstract ``validate`` hook
    - Dictionary conversion (``to_dict``)
    - JSON serialization (``to_json``)
    - String representation for debugging

    Example:
        >>> @dataclass
        ... class MyStep(BaseModel):
        ...     name: str
        ...     rules: Dict[str, List[ColumnRule]]
        ...
        ...     def validate(self) -> None:
        ...         if not self.name:
        ...             raise ValueError("Name cannot be empty")
        ...         if not self.rules:
        ...             raise ValueError("Rules cannot be empty")
        >>>
        >>> step = MyStep(name="test", rules={"id": [F.col("id").isNotNull()]})
        >>> step.validate()
        >>> print(step.to_json())
    """

    @abstractmethod
    def validate(self) -> None:
        """Validate the model. Override in subclasses."""
        pass

    def to_dict(self) -> Dict[str, ModelValue]:
        """
        Convert model to dictionary.

        Returns:
            A dict keyed by dataclass field name. A field value that itself
            has a ``to_dict`` method is converted by calling it; every other
            value (including lists or dicts of models) is stored as-is.
        """
        result: Dict[str, ModelValue] = {}
        for field_info in self.__dataclass_fields__.values():
            value = getattr(self, field_info.name)
            if hasattr(value, "to_dict"):
                result[field_info.name] = value.to_dict()
            else:
                result[field_info.name] = value
        return result

    def to_json(self) -> str:
        """
        Convert model to JSON string.

        Returns:
            ``to_dict()`` rendered as JSON with a two-space indent. Values
            the JSON encoder cannot handle are rendered with ``str()``.
        """
        # Imported locally: neither the notebook's shared import cell nor
        # this cell imports ``json``, so the method must not depend on some
        # other cell having done so.
        import json

        return json.dumps(self.to_dict(), default=str, indent=2)

    def __str__(self) -> str:
        """String representation of the model: ``ClassName(field=value, ...)``."""
        return f"{self.__class__.__name__}({', '.join(f'{k}={v}' for k, v in self.to_dict().items())})"


@dataclass
class ValidationThresholds(BaseModel):
    """
    Minimum validation rates required by each pipeline phase.

    Attributes:
        bronze: Bronze layer validation threshold (0-100)
        silver: Silver layer validation threshold (0-100)
        gold: Gold layer validation threshold (0-100)
    """

    bronze: float
    silver: float
    gold: float

    def validate(self) -> None:
        """
        Check that every threshold lies within 0-100.

        Raises:
            PipelineValidationError: For the first threshold out of range
                (checked in the order bronze, silver, gold).
        """
        bronze, silver, gold = self.bronze, self.silver, self.gold
        for phase, threshold in zip(("bronze", "silver", "gold"), (bronze, silver, gold)):
            in_range = 0 <= threshold <= 100
            if not in_range:
                raise PipelineValidationError(
                    f"{phase} threshold must be between 0 and 100, got {threshold}"
                )

    def get_threshold(self, phase: PipelinePhase) -> float:
        """
        Return the threshold configured for ``phase``.

        Raises:
            KeyError: If ``phase`` is not one of the BRONZE, SILVER or GOLD
                members of ``PipelinePhase``.
        """
        return {
            PipelinePhase.BRONZE: self.bronze,
            PipelinePhase.SILVER: self.silver,
            PipelinePhase.GOLD: self.gold,
        }[phase]

    @classmethod
    def create_default(cls) -> ValidationThresholds:
        """Default thresholds: bronze 95.0, silver 98.0, gold 99.0."""
        bronze, silver, gold = 95.0, 98.0, 99.0
        return cls(bronze=bronze, silver=silver, gold=gold)

    @classmethod
    def create_strict(cls) -> ValidationThresholds:
        """Strict thresholds: bronze 99.0, silver 99.5, gold 99.9."""
        bronze, silver, gold = 99.0, 99.5, 99.9
        return cls(bronze=bronze, silver=silver, gold=gold)

    @classmethod
    def create_loose(cls) -> ValidationThresholds:
        """Loose thresholds: bronze 80.0, silver 85.0, gold 90.0."""
        bronze, silver, gold = 80.0, 85.0, 90.0
        return cls(bronze=bronze, silver=silver, gold=gold)


@dataclass
class ParallelConfig(BaseModel):
    """
    Settings that control parallel execution.

    Attributes:
        enabled: Whether parallel execution is enabled
        max_workers: Maximum number of parallel workers
        timeout_secs: Timeout for parallel operations in seconds
    """

    enabled: bool
    max_workers: int
    timeout_secs: int = 300

    def validate(self) -> None:
        """
        Check the worker count and the timeout.

        Raises:
            PipelineValidationError: If ``max_workers`` is below 1 or above
                32, or ``timeout_secs`` is below 1.
        """
        if self.max_workers < 1:
            raise PipelineValidationError(
                f"max_workers must be at least 1, got {self.max_workers}"
            )
        elif self.max_workers > 32:
            raise PipelineValidationError(
                f"max_workers should not exceed 32, got {self.max_workers}"
            )

        if self.timeout_secs < 1:
            raise PipelineValidationError(
                f"timeout_secs must be at least 1, got {self.timeout_secs}"
            )

    @classmethod
    def create_default(cls) -> ParallelConfig:
        """Parallel execution enabled with 4 workers and a 300 second timeout."""
        settings = {"enabled": True, "max_workers": 4, "timeout_secs": 300}
        return cls(**settings)

    @classmethod
    def create_sequential(cls) -> ParallelConfig:
        """Parallel execution disabled: 1 worker and a 600 second timeout."""
        settings = {"enabled": False, "max_workers": 1, "timeout_secs": 600}
        return cls(**settings)

    @classmethod
    def create_high_performance(cls) -> ParallelConfig:
        """Parallel execution enabled with 16 workers and a 1200 second timeout."""
        settings = {"enabled": True, "max_workers": 16, "timeout_secs": 1200}
        return cls(**settings)

In [None]:
# Module: pipeline_builder.writer.analytics (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.writer.exceptions, pipeline_builder.writer.query_builder, pipeline_builder_base.logging

from __future__ import annotations

from typing import Dict, Literal, TypedDict, Union

# from ..compat import DataFrame, F, SparkSession  # Removed: defined in notebook cells above
# from ..logging import PipelineLogger  # Removed: defined in notebook cells above
# from .exceptions import WriterError  # Removed: defined in notebook cells above
# from .query_builder import QueryBuilder  # Removed: defined in notebook cells above

# Alias for convenience: binds the module-level name ``col`` to ``F.col``.
col = F.col


# ============================================================================
# TypedDict Definitions
# ============================================================================


class AnalysisPeriod(TypedDict):
    """Analysis period structure.

    The definition only fixes key names and value types; the descriptions
    below are inferred from the key names.

    Keys:
        start_date: Presumably the first day of the analysed window (string;
            format not specified here).
        end_date: Presumably the last day of the analysed window (string;
            format not specified here).
        days_analyzed: Presumably the number of days covered (int).
    """

    start_date: str
    end_date: str
    days_analyzed: int


class DailyQualityTrend(TypedDict):
    """Daily quality trend data point.

    The definition only fixes key names and value types; the descriptions
    below are inferred from the key names.

    Keys:
        date: The day the data point refers to (string).
        total_executions: Execution count for that day (int).
        avg_validation_rate / min_validation_rate / max_validation_rate /
            stddev_validation_rate: Summary statistics of the validation
            rate (floats).
        high_quality_executions: Count of executions classed as high quality
            (int; the cut-off is not defined here).
        low_quality_executions: Count of executions classed as low quality
            (int; the cut-off is not defined here).
        quality_score: A string label or grade (allowed values not defined here).
    """

    date: str
    total_executions: int
    avg_validation_rate: float
    min_validation_rate: float
    max_validation_rate: float
    stddev_validation_rate: float
    high_quality_executions: int
    low_quality_executions: int
    quality_score: str


class OverallQualityMetrics(TypedDict):
    """Overall quality metrics.

    The definition only fixes key names and value types; the descriptions
    below are inferred from the key names.

    Keys:
        total_executions: Execution count over the whole analysis (int).
        avg_validation_rate / min_validation_rate / max_validation_rate /
            stddev_validation_rate: Summary statistics of the validation
            rate (floats).
    """

    total_executions: int
    avg_validation_rate: float
    min_validation_rate: float
    max_validation_rate: float
    stddev_validation_rate: float


class DegradationAlert(TypedDict):
    """Quality degradation alert.

    Keys:
        type: Alert category (string; the set of categories is not defined here).
        message: Alert text (string).
        severity: One of the literal strings "high", "medium" or "low".
    """

    type: str
    message: str
    severity: Literal["high", "medium", "low"]


class QualityTrends(TypedDict):
    """Quality trends analysis structure.

    Keys:
        analysis_period: An ``AnalysisPeriod`` dict.
        daily_trends: List of ``DailyQualityTrend`` dicts.
        overall_metrics: An ``OverallQualityMetrics`` dict.
        degradation_alerts: List of ``DegradationAlert`` dicts.
        quality_grade: A string grade (allowed values not defined here).
    """

    analysis_period: AnalysisPeriod
    daily_trends: list[DailyQualityTrend]
    overall_metrics: OverallQualityMetrics
    degradation_alerts: list[DegradationAlert]
    quality_grade: str


class ValidationAnomaly(TypedDict):
    """Validation anomaly data point.

    The definition only fixes key names and value types; the descriptions
    below are inferred from the key names.

    Keys:
        step: Step name (string).
        phase: Pipeline phase name (string).
        validation_rate: Validation rate of the flagged execution (float).
        valid_rows: Count of valid rows (int).
        invalid_rows: Count of invalid rows (int).
        timestamp: When the execution happened (string; format not
            specified here).
    """

    step: str
    phase: str
    validation_rate: float
    valid_rows: int
    invalid_rows: int
    timestamp: str


class StepAnomaly(TypedDict):
    """Per-step aggregate flagged as anomalous.

    ``DataQualityAnalyzer.detect_quality_anomalies`` keeps steps whose average
    validation rate is below 90.0 or whose standard deviation exceeds 10.0.
    """

    step: str
    execution_count: int
    avg_validation_rate: float  # rounded to two decimals
    min_validation_rate: float  # rounded to two decimals
    stddev_validation_rate: float  # rounded to two decimals
    anomaly_score: float  # (100 - avg) + 2 * stddev, capped at 100.0


class TemporalAnomaly(TypedDict):
    """One row of the temporal anomaly query, with values rounded to two decimals.

    Produced by ``DataQualityAnalyzer.detect_quality_anomalies``.
    """

    date: str  # formatted "%Y-%m-%d"
    daily_avg_validation_rate: float
    prev_avg_validation_rate: float
    # Copied from the query's ``quality_change`` column; presumably the
    # difference between the two rates above -- verify against QueryBuilder.
    quality_change: float


class AnomalySummary(TypedDict):
    """Counts of detected anomalies plus a combined score.

    The three totals are the lengths of the corresponding lists in
    ``QualityAnomalies``.
    """

    total_validation_anomalies: int
    total_step_anomalies: int
    total_temporal_anomalies: int
    # Weighted sum of the three counts (1.0 / 0.8 / 1.2), capped at 100.0.
    overall_anomaly_score: float


class QualityAnomalies(TypedDict):
    """Result returned by ``DataQualityAnalyzer.detect_quality_anomalies``."""

    validation_anomalies: list[ValidationAnomaly]  # individual flagged log rows
    step_anomalies: list[StepAnomaly]  # steps with low or unstable validation rates
    temporal_anomalies: list[TemporalAnomaly]  # rows of the temporal anomaly query
    anomaly_summary: AnomalySummary  # counts and combined score


class VolumeTrendPoint(TypedDict):
    """One day's execution volume as reported by ``TrendAnalyzer``."""

    date: str  # formatted "%Y-%m-%d"
    daily_executions: int
    successful_executions: int
    failed_executions: int
    # successful_executions / daily_executions * 100, rounded to two decimals;
    # zero when the day has no executions.
    success_rate: float
    avg_execution_time: float  # rounded to two decimals
    total_rows_written: int


class PhaseTrendPoint(TypedDict):
    """Per-phase aggregate as reported by ``TrendAnalyzer``."""

    phase: str
    execution_count: int
    avg_execution_time: float  # rounded to two decimals
    avg_validation_rate: float  # rounded to two decimals
    total_rows_written: int
    # successful_executions / execution_count * 100, rounded to two decimals.
    success_rate: float


class StepTrendPoint(TypedDict):
    """Per-step aggregate as reported by ``TrendAnalyzer``.

    Numeric values are rounded to two decimals.
    """

    step: str
    execution_count: int
    avg_execution_time: float
    avg_validation_rate: float
    stddev_execution_time: float
    min_execution_time: float
    max_execution_time: float
    # Letter "A".."D" computed from avg_execution_time and stddev_execution_time.
    performance_grade: str


class TrendIndicators(TypedDict):
    """Comparison of the most recent day against the days before it.

    "Recent" is the last row of the volume trends; "historical" is the mean
    over all earlier rows. Every field is zero or "insufficient_data" when
    fewer than two rows are available.
    """

    # "increasing" / "decreasing" when recent volume is more than 10% above /
    # below the historical average, otherwise "stable".
    execution_volume_trend: str
    # "improving" / "declining" when the recent success rate is more than two
    # points above / below the historical rate, otherwise "stable".
    success_rate_trend: str
    recent_executions: int
    historical_avg_executions: float  # rounded to two decimals
    recent_success_rate: float  # rounded to two decimals
    historical_success_rate: float  # rounded to two decimals


class ExecutionTrends(TypedDict):
    """Result returned by ``TrendAnalyzer.analyze_execution_trends``."""

    analysis_period: AnalysisPeriod  # window the analysis covers
    volume_trends: list[VolumeTrendPoint]  # one entry per row of the daily trends query
    phase_trends: list[PhaseTrendPoint]  # one entry per row of the phase trends query
    step_trends: list[StepTrendPoint]  # one entry per row of the step trends query
    trend_indicators: TrendIndicators  # latest day compared with earlier days


class DataQualityAnalyzer:
    """Analyzes data quality metrics and trends."""

    def __init__(self, spark: SparkSession, logger: PipelineLogger | None = None):
        """Initialize the data quality analyzer."""
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("DataQualityAnalyzer")
        else:
            self.logger = logger

    def analyze_quality_trends(self, df: DataFrame, days: int = 30) -> QualityTrends:
        """
        Analyze data quality trends over time.

        Args:
            df: DataFrame containing log data
            days: Number of days to analyze

        Returns:
            Dictionary containing quality trend analysis
        """
        try:
            self.logger.info(f"Analyzing data quality trends for last {days} days")

            # Use query builder for quality trends
            quality_trends_df = QueryBuilder.build_quality_trends_query(df, days)
            quality_trends = quality_trends_df.collect()

            # Use query builder for overall metrics
            overall_metrics_df = QueryBuilder.build_overall_metrics_query(df, days)
            overall_metrics = overall_metrics_df.collect()[0]

            # Detect quality degradation
            degradation_alerts = []
            if len(quality_trends) > 1:
                recent_avg = quality_trends[-1]["avg_validation_rate"]
                historical_avg = sum(
                    row["avg_validation_rate"] for row in quality_trends[:-1]
                ) / len(quality_trends[:-1])

                if recent_avg < historical_avg - 5.0:  # 5% degradation threshold
                    degradation_alerts.append(
                        {
                            "type": "quality_degradation",
                            "message": f"Recent validation rate ({recent_avg:.1f}%) is significantly lower than historical average ({historical_avg:.1f}%)",
                            "severity": (
                                "high"
                                if recent_avg < historical_avg - 10.0
                                else "medium"
                            ),
                        }
                    )

            # Get date range for analysis period
            end_date = datetime.now()
            start_date = end_date - timedelta(days=days)

            analysis_result = {
                "analysis_period": {
                    "start_date": start_date.strftime("%Y-%m-%d"),
                    "end_date": end_date.strftime("%Y-%m-%d"),
                    "days_analyzed": days,
                },
                "daily_trends": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "total_executions": row["total_executions"],
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "min_validation_rate": round(row["min_validation_rate"], 2),
                        "max_validation_rate": round(row["max_validation_rate"], 2),
                        "stddev_validation_rate": round(
                            row["stddev_validation_rate"], 2
                        ),
                        "high_quality_executions": row["high_quality_executions"],
                        "low_quality_executions": row["low_quality_executions"],
                        "quality_score": self._calculate_quality_score(row.asDict()),
                    }
                    for row in quality_trends
                ],
                "overall_metrics": {
                    "total_executions": overall_metrics["total_executions"],
                    "avg_validation_rate": round(
                        overall_metrics["overall_avg_validation_rate"], 2
                    ),
                    "min_validation_rate": round(
                        overall_metrics["overall_min_validation_rate"], 2
                    ),
                    "max_validation_rate": round(
                        overall_metrics["overall_max_validation_rate"], 2
                    ),
                    "stddev_validation_rate": round(
                        overall_metrics["overall_stddev_validation_rate"], 2
                    ),
                },
                "degradation_alerts": degradation_alerts,
                "quality_grade": self._calculate_quality_grade(
                    overall_metrics["overall_avg_validation_rate"]
                ),
            }

            self.logger.info("Data quality trends analysis completed")
            return cast(QualityTrends, analysis_result)

        except Exception as e:
            self.logger.error(f"Failed to analyze quality trends: {e}")
            raise WriterError(f"Failed to analyze quality trends: {e}") from e

    def detect_quality_anomalies(self, df: DataFrame) -> QualityAnomalies:
        """
        Detect data quality anomalies.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing anomaly detection results
        """
        try:
            self.logger.info("Detecting data quality anomalies")

            # Calculate overall statistics for anomaly detection
            overall_stats = QueryBuilder.calculate_statistics(df, "validation_rate")
            threshold = overall_stats["avg"] - (2 * overall_stats["stddev"])

            # Detect validation rate anomalies using query builder
            validation_anomalies_df = (
                QueryBuilder.build_anomaly_detection_query(
                    df, "validation_rate", threshold
                )
                .select(
                    "step",
                    "phase",
                    "validation_rate",
                    "valid_rows",
                    "invalid_rows",
                    "created_at",
                )
                .orderBy("validation_rate")
            )

            validation_anomalies = validation_anomalies_df.collect()

            # Detect step-specific anomalies using query builder
            step_anomalies_df = (
                df.groupBy("step")
                .agg(**QueryBuilder.get_performance_aggregations())
                .filter(
                    (col("avg_validation_rate") < 90.0)
                    | (col("stddev_validation_rate") > 10.0)
                )
                .orderBy("avg_validation_rate")
            )

            step_anomalies = step_anomalies_df.collect()

            # Detect temporal anomalies using query builder
            temporal_anomalies_df = QueryBuilder.build_temporal_anomaly_query(df)
            temporal_anomalies = temporal_anomalies_df.collect()

            anomaly_result = {
                "validation_anomalies": [
                    {
                        "step": row["step"],
                        "phase": row["phase"],
                        "validation_rate": round(row["validation_rate"], 2),
                        "valid_rows": row["valid_rows"],
                        "invalid_rows": row["invalid_rows"],
                        "timestamp": row["created_at"].strftime("%Y-%m-%d %H:%M:%S"),
                    }
                    for row in validation_anomalies
                ],
                "step_anomalies": [
                    {
                        "step": row["step"],
                        "execution_count": row["execution_count"],
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "min_validation_rate": round(row["min_validation_rate"], 2),
                        "stddev_validation_rate": round(
                            row["stddev_validation_rate"], 2
                        ),
                        "anomaly_score": self._calculate_anomaly_score(row.asDict()),
                    }
                    for row in step_anomalies
                ],
                "temporal_anomalies": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_avg_validation_rate": round(
                            row["daily_avg_validation_rate"], 2
                        ),
                        "prev_avg_validation_rate": round(
                            row["prev_avg_validation_rate"], 2
                        ),
                        "quality_change": round(row["quality_change"], 2),
                    }
                    for row in temporal_anomalies
                ],
                "anomaly_summary": {
                    "total_validation_anomalies": len(validation_anomalies),
                    "total_step_anomalies": len(step_anomalies),
                    "total_temporal_anomalies": len(temporal_anomalies),
                    "overall_anomaly_score": self._calculate_overall_anomaly_score(
                        len(validation_anomalies),
                        len(step_anomalies),
                        len(temporal_anomalies),
                    ),
                },
            }

            self.logger.info(
                f"Quality anomaly detection completed: {len(validation_anomalies)} validation anomalies found"
            )
            return cast(QualityAnomalies, anomaly_result)

        except Exception as e:
            self.logger.error(f"Failed to detect quality anomalies: {e}")
            raise WriterError(f"Failed to detect quality anomalies: {e}") from e

    def _calculate_quality_score(self, row: Dict[str, Union[int, float]]) -> str:
        """Calculate quality score for a row."""
        avg_rate = row["avg_validation_rate"]
        if avg_rate >= 95.0:
            return "A"
        elif avg_rate >= 90.0:
            return "B"
        elif avg_rate >= 80.0:
            return "C"
        else:
            return "D"

    def _calculate_quality_grade(self, avg_validation_rate: float) -> str:
        """Calculate overall quality grade."""
        if avg_validation_rate >= 95.0:
            return "A"
        elif avg_validation_rate >= 90.0:
            return "B"
        elif avg_validation_rate >= 80.0:
            return "C"
        else:
            return "D"

    def _calculate_anomaly_score(self, row: Dict[str, Union[int, float]]) -> float:
        """Calculate anomaly score for a step."""
        avg_rate = row["avg_validation_rate"]
        stddev_rate = row["stddev_validation_rate"]

        # Lower average rate and higher standard deviation = higher anomaly score
        anomaly_score = (100 - avg_rate) + (stddev_rate * 2)
        return float(round(min(anomaly_score, 100.0), 2))

    def _calculate_overall_anomaly_score(
        self, validation_anomalies: int, step_anomalies: int, temporal_anomalies: int
    ) -> float:
        """Calculate overall anomaly score."""
        total_anomalies = validation_anomalies + step_anomalies + temporal_anomalies

        if total_anomalies == 0:
            return 0.0

        # Weight different types of anomalies
        weighted_score = (
            (validation_anomalies * 1.0)
            + (step_anomalies * 0.8)
            + (temporal_anomalies * 1.2)
        )
        return round(min(weighted_score, 100.0), 2)


class TrendAnalyzer:
    """Analyzes execution trends and patterns."""

    def __init__(self, spark: SparkSession, logger: PipelineLogger | None = None):
        """Initialize the trend analyzer."""
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("TrendAnalyzer")
        else:
            self.logger = logger

    def analyze_execution_trends(
        self, df: DataFrame, days: int = 30
    ) -> ExecutionTrends:
        """
        Analyze execution trends over time.

        Args:
            df: DataFrame containing log data
            days: Number of days to analyze

        Returns:
            Dictionary containing trend analysis
        """
        try:
            self.logger.info(f"Analyzing execution trends for last {days} days")

            # Use query builder for all trend analyses
            volume_trends_df = QueryBuilder.build_daily_trends_query(df, days)
            volume_trends = volume_trends_df.collect()

            phase_trends_df = QueryBuilder.build_phase_trends_query(df, days)
            phase_trends = phase_trends_df.collect()

            step_trends_df = QueryBuilder.build_step_trends_query(df, days)
            step_trends = step_trends_df.collect()

            # Calculate trend indicators
            trend_indicators = self._calculate_trend_indicators(
                [row.asDict() for row in volume_trends]
            )

            # Get date range for analysis period
            end_date = datetime.now()
            start_date = end_date - timedelta(days=days)

            analysis_result = {
                "analysis_period": {
                    "start_date": start_date.strftime("%Y-%m-%d"),
                    "end_date": end_date.strftime("%Y-%m-%d"),
                    "days_analyzed": days,
                },
                "volume_trends": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_executions": row["daily_executions"],
                        "successful_executions": row["successful_executions"],
                        "failed_executions": row["failed_executions"],
                        "success_rate": (
                            round(
                                (row["successful_executions"] / row["daily_executions"])
                                * 100,
                                2,
                            )
                            if row["daily_executions"] > 0
                            else 0
                        ),
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "total_rows_written": row["total_rows_written"],
                    }
                    for row in volume_trends
                ],
                "phase_trends": [
                    {
                        "phase": row["phase"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "total_rows_written": row["total_rows_written"],
                        "success_rate": round(
                            (row["successful_executions"] / row["execution_count"])
                            * 100,
                            2,
                        ),
                    }
                    for row in phase_trends
                ],
                "step_trends": [
                    {
                        "step": row["step"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": round(row["avg_execution_time"], 2),
                        "avg_validation_rate": round(row["avg_validation_rate"], 2),
                        "stddev_execution_time": round(row["stddev_execution_time"], 2),
                        "min_execution_time": round(row["min_execution_time"], 2),
                        "max_execution_time": round(row["max_execution_time"], 2),
                        "performance_grade": self._calculate_performance_grade(
                            row.asDict()
                        ),
                    }
                    for row in step_trends
                ],
                "trend_indicators": trend_indicators,
            }

            self.logger.info("Execution trends analysis completed")
            return cast(ExecutionTrends, analysis_result)

        except Exception as e:
            self.logger.error(f"Failed to analyze execution trends: {e}")
            raise WriterError(f"Failed to analyze execution trends: {e}") from e

    def _calculate_trend_indicators(
        self, volume_trends: list[Dict[str, Union[int, float]]]
    ) -> TrendIndicators:
        """Calculate trend indicators from volume trends."""
        if len(volume_trends) < 2:
            return {
                "execution_volume_trend": "insufficient_data",
                "success_rate_trend": "insufficient_data",
                "recent_executions": 0,
                "historical_avg_executions": 0.0,
                "recent_success_rate": 0.0,
                "historical_success_rate": 0.0,
            }

        # Calculate execution volume trend
        recent_executions = volume_trends[-1]["daily_executions"]
        historical_avg = sum(
            row["daily_executions"] for row in volume_trends[:-1]
        ) / len(volume_trends[:-1])

        execution_trend = (
            "increasing"
            if recent_executions > historical_avg * 1.1
            else "decreasing"
            if recent_executions < historical_avg * 0.9
            else "stable"
        )

        # Calculate success rate trend
        recent_success_rate = (
            (
                volume_trends[-1]["successful_executions"]
                / volume_trends[-1]["daily_executions"]
            )
            * 100
            if volume_trends[-1]["daily_executions"] > 0
            else 0
        )
        historical_success_rate = sum(
            (row["successful_executions"] / row["daily_executions"]) * 100
            for row in volume_trends[:-1]
            if row["daily_executions"] > 0
        ) / len([row for row in volume_trends[:-1] if row["daily_executions"] > 0])

        success_trend = (
            "improving"
            if recent_success_rate > historical_success_rate + 2
            else (
                "declining"
                if recent_success_rate < historical_success_rate - 2
                else "stable"
            )
        )

        return {
            "execution_volume_trend": execution_trend,
            "success_rate_trend": success_trend,
            "recent_executions": int(recent_executions),
            "historical_avg_executions": round(historical_avg, 2),
            "recent_success_rate": round(recent_success_rate, 2),
            "historical_success_rate": round(historical_success_rate, 2),
        }

    def _calculate_performance_grade(self, row: Dict[str, Union[int, float]]) -> str:
        """Calculate performance grade for a step."""
        avg_time = row["avg_execution_time"]
        stddev_time = row["stddev_execution_time"]

        # Consider both average time and consistency (low stddev)
        if avg_time < 60 and stddev_time < 30:  # Fast and consistent
            return "A"
        elif avg_time < 120 and stddev_time < 60:  # Reasonable and somewhat consistent
            return "B"
        elif avg_time < 300:  # Acceptable
            return "C"
        else:  # Slow
            return "D"

In [None]:
# Module: pipeline_builder.pipeline.monitor (pipeline_builder)
#
# Dependencies: models.pipeline, pipeline.models, pipeline_builder_base.logging

from __future__ import annotations

from typing import Any, Dict

# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import PipelineMetrics  # Removed: defined in notebook cells above
# from .models import PipelineMode, PipelineReport, PipelineStatus  # Removed: defined in notebook cells above


class SimplePipelineMonitor:
    """
    Lightweight tracker for a single pipeline run.

    Holds one in-progress ``PipelineReport`` and updates its step counters,
    error list, timing and final status; no further metrics are gathered.
    """

    def __init__(self, logger: PipelineLogger | None = None):
        """Keep the given logger (or a default ``PipelineLogger``) and start with no active report."""
        self.logger = logger if logger else PipelineLogger()
        self._current_report: PipelineReport | None = None

    def start_execution(
        self,
        pipeline_id: str,
        mode: PipelineMode,
        bronze_steps: Dict[str, Any],
        silver_steps: Dict[str, Any],
        gold_steps: Dict[str, Any],
    ) -> PipelineReport:
        """Create a fresh RUNNING report for ``pipeline_id`` and make it the active one.

        The report's ``total_steps`` is the combined size of the three step
        mappings; all other counters start at zero.
        """
        started_at = datetime.now()
        step_total = len(bronze_steps) + len(silver_steps) + len(gold_steps)

        report = PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=f"exec_{pipeline_id}",
            status=PipelineStatus.RUNNING,
            mode=mode,
            start_time=started_at,
            end_time=None,
            duration_seconds=0.0,
            metrics=PipelineMetrics(
                total_steps=step_total,
                successful_steps=0,
                failed_steps=0,
                total_duration=0.0,
            ),
            errors=[],
            warnings=[],
        )
        self._current_report = report

        self.logger.info(f"Started monitoring pipeline: {pipeline_id}")
        return report

    def update_step_execution(
        self,
        step_name: str,
        step_type: str,
        success: bool,
        duration: float,
        error_message: str | None = None,
        rows_processed: int = 0,
        rows_written: int = 0,
    ) -> None:
        """Record one step outcome on the active report.

        Does nothing when no execution is active. A success bumps
        ``successful_steps``; a failure bumps ``failed_steps`` and, when an
        error message is supplied, appends ``"<step_name>: <message>"`` to the
        report's errors.
        """
        report = self._current_report
        if not report:
            return

        counters = report.metrics
        if not success:
            counters.failed_steps += 1
            if error_message:
                report.errors.append(f"{step_name}: {error_message}")
        else:
            counters.successful_steps += 1

        self.logger.debug(
            f"Updated step {step_name}: success={success}, duration={duration:.2f}s"
        )

    def finish_execution(self, success: bool) -> PipelineReport:
        """Close the active report and return it.

        Stamps the end time, the elapsed seconds since ``start_time`` and the
        final status (COMPLETED or FAILED).

        Raises:
            RuntimeError: If no execution is active.
        """
        report = self._current_report
        if not report:
            raise RuntimeError("No active execution to finish")

        finished_at = datetime.now()
        elapsed = (finished_at - report.start_time).total_seconds()

        # Update final metrics
        report.end_time = finished_at
        report.duration_seconds = elapsed
        if success:
            report.status = PipelineStatus.COMPLETED
        else:
            report.status = PipelineStatus.FAILED
        report.metrics.total_duration = elapsed

        self.logger.info(f"Finished monitoring pipeline: {report.pipeline_id}")
        return report


# Alias for backward compatibility: `PipelineMonitor` refers to the same class
# object as `SimplePipelineMonitor`.
PipelineMonitor = SimplePipelineMonitor

In [None]:
# Module: pipeline_builder.models.factory (pipeline_builder)
#
# Dependencies: models.base, models.exceptions, models.pipeline, pipeline_builder.models.enums, pipeline_builder.models.execution, pipeline_builder.models.steps

from __future__ import annotations

# from .base import ParallelConfig, ValidationThresholds  # Removed: defined in notebook cells above
# from .enums import ExecutionMode  # Removed: defined in notebook cells above
# from .exceptions import PipelineConfigurationError, PipelineExecutionError  # Removed: defined in notebook cells above
# from .execution import ExecutionContext  # Removed: defined in notebook cells above
# from .pipeline import PipelineConfig  # Removed: defined in notebook cells above
# from .steps import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above


def create_pipeline_config(
    schema: str,
    bronze_threshold: float = 95.0,
    silver_threshold: float = 98.0,
    gold_threshold: float = 99.0,
    enable_parallel: bool = True,
    max_workers: int = 4,
    verbose: bool = True,
) -> PipelineConfig:
    """Build a ``PipelineConfig`` from flat keyword settings.

    Args:
        schema: Schema name passed straight to the config.
        bronze_threshold: Validation threshold for the bronze layer.
        silver_threshold: Validation threshold for the silver layer.
        gold_threshold: Validation threshold for the gold layer.
        enable_parallel: Whether parallel execution is enabled.
        max_workers: Worker count for the parallel configuration.
        verbose: Verbosity flag passed straight to the config.

    Returns:
        The assembled pipeline configuration.
    """
    return PipelineConfig(
        schema=schema,
        thresholds=ValidationThresholds(
            bronze=bronze_threshold,
            silver=silver_threshold,
            gold=gold_threshold,
        ),
        parallel=ParallelConfig(
            enabled=enable_parallel,
            max_workers=max_workers,
        ),
        verbose=verbose,
    )


def create_execution_context(mode: ExecutionMode) -> ExecutionContext:
    """Factory function to create execution context.

    Args:
        mode: Execution mode stored on the context.

    Returns:
        An ``ExecutionContext`` whose ``start_time`` is the current
        timezone-aware UTC time.
    """
    # The notebook's shared import cell only brings in `datetime` and
    # `timedelta`; import `timezone` here so this function does not depend on
    # another cell having imported it.
    from datetime import timezone

    return ExecutionContext(mode=mode, start_time=datetime.now(timezone.utc))


def validate_pipeline_config(config: PipelineConfig) -> None:
    """Run ``config.validate()`` and re-raise execution errors as configuration errors.

    Raises:
        PipelineConfigurationError: When validation raises
            ``PipelineExecutionError``; the original error is chained.
    """
    try:
        config.validate()
    except PipelineExecutionError as exc:
        message = f"Invalid pipeline configuration: {exc}"
        raise PipelineConfigurationError(message) from exc


def validate_step_config(step: BronzeStep | SilverStep | GoldStep) -> None:
    """Run ``step.validate()`` and re-raise execution errors as configuration errors.

    Raises:
        PipelineConfigurationError: When validation raises
            ``PipelineExecutionError``; the original error is chained.
    """
    try:
        step.validate()
    except PipelineExecutionError as exc:
        message = f"Invalid step configuration: {exc}"
        raise PipelineConfigurationError(message) from exc


def serialize_pipeline_config(config: PipelineConfig) -> str:
    """Return the JSON text produced by the configuration's own ``to_json()``."""
    serialized = config.to_json()
    return serialized


def deserialize_pipeline_config(json_str: str) -> PipelineConfig:
    """Deserialize pipeline configuration from JSON.

    Args:
        json_str: JSON object with ``schema``, ``thresholds``
            (``bronze``/``silver``/``gold``) and ``parallel``
            (``enabled``/``max_workers``) entries. ``parallel.timeout_secs``
            and ``verbose`` are optional and default to 300 and ``True``.

    Returns:
        The reconstructed ``PipelineConfig``.

    Raises:
        json.JSONDecodeError: If ``json_str`` is not valid JSON.
        KeyError: If a required entry is missing.
    """
    # `json` is not imported by the notebook's shared import cell or by this
    # cell; import it here so the function does not depend on another cell.
    import json

    data = json.loads(json_str)
    thresholds_data = data["thresholds"]
    parallel_data = data["parallel"]
    return PipelineConfig(
        schema=data["schema"],
        thresholds=ValidationThresholds(
            bronze=thresholds_data["bronze"],
            silver=thresholds_data["silver"],
            gold=thresholds_data["gold"],
        ),
        parallel=ParallelConfig(
            enabled=parallel_data["enabled"],
            max_workers=parallel_data["max_workers"],
            timeout_secs=parallel_data.get("timeout_secs", 300),
        ),
        verbose=data.get("verbose", True),
    )

In [None]:
# Module: pipeline_builder.writer.models (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.models.execution

from __future__ import annotations

import os
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, TypedDict

# from ..compat import types  # Removed: defined in notebook cells above
# from ..pipeline.models import PipelineReport  # Removed: defined in notebook cells above
# from .models import ExecutionContext, ExecutionResult, StepResult  # Removed: defined in notebook cells above

# Bind the Spark type classes from the `types` namespace (provided by the
# compat cell above) to bare module-level names for convenience.
BooleanType = types.BooleanType
FloatType = types.FloatType
IntegerType = types.IntegerType
StringType = types.StringType
# The previous SPARKFORGE_ENGINE check assigned `types.StructField` in both the
# "mock" and the default branch, so a single assignment is equivalent.
StructField = types.StructField
StructType = types.StructType
TimestampType = types.TimestampType

# ============================================================================
# Enums
# ============================================================================


class WriteMode(Enum):
    """Write mode for log operations.

    Each member's value is the lowercase mode name. ``WriterConfig.write_mode``
    defaults to ``APPEND``.
    """

    OVERWRITE = "overwrite"
    APPEND = "append"
    MERGE = "merge"
    IGNORE = "ignore"


class LogLevel(Enum):
    """Log level for writer operations.

    Each member's value is its own upper-case name. ``WriterConfig.log_level``
    defaults to ``INFO``.
    """

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


# ============================================================================
# TypedDict Definitions
# ============================================================================


class LogRow(TypedDict):
    """
    Enhanced log row with full type safety and framework integration.

    This replaces the previous MinimalLogRow with proper integration
    with framework models and enhanced type safety.

    One row describes a single step (or the pipeline as a whole, see
    ``phase``) within one run. Fields typed ``... | None`` may be absent for a
    given row.

    NOTE(review): ``Literal`` is not imported in this cell; the annotations
    below are only safe because annotations are deferred -- confirm an earlier
    cell imports it if type hints are ever resolved at runtime.
    """

    # Run-level information
    run_id: str
    run_mode: Literal["initial", "incremental", "full_refresh", "validation_only"]
    run_started_at: datetime | None
    run_ended_at: datetime | None

    # Execution context
    execution_id: str
    pipeline_id: str
    schema: str

    # Step-level information
    phase: Literal["bronze", "silver", "gold", "pipeline"]
    step_name: str
    step_type: str

    # Timing information
    start_time: datetime | None
    end_time: datetime | None
    duration_secs: float

    # Table information
    table_fqn: str | None
    # NOTE(review): only "overwrite" and "append" are allowed here although
    # WriteMode also defines "merge" and "ignore" -- confirm this is intended.
    write_mode: Literal["overwrite", "append"] | None

    # Data metrics
    input_rows: int | None
    output_rows: int | None
    rows_written: int | None
    rows_processed: int
    table_total_rows: int | None  # Total rows in table after this write

    # Validation metrics
    valid_rows: int
    invalid_rows: int
    validation_rate: float

    # Execution status
    success: bool
    error_message: str | None

    # Performance metrics
    memory_usage_mb: float | None
    cpu_usage_percent: float | None

    # Metadata
    metadata: Dict[str, Any]


class WriterMetrics(TypedDict):
    """Metrics for writer operations.

    The code that fills these values is not in this cell; the unit notes below
    are read off the field names and should be confirmed against the writer.
    """

    total_writes: int
    successful_writes: int
    failed_writes: int
    total_duration_secs: float  # presumably seconds
    avg_write_duration_secs: float  # presumably seconds per write
    total_rows_written: int
    memory_usage_peak_mb: float  # presumably megabytes


# ============================================================================
# Configuration Models
# ============================================================================


@dataclass
class WriterConfig:
    """
    Settings that control how the LogWriter stores log data.

    The fields are grouped by concern: target table, naming templates, layout,
    schema handling, performance, feature switches, logging, retries and
    data-quality thresholds. ``validate()`` checks the value ranges and
    ``generate_table_name()`` expands the optional naming templates.
    """

    # --- Target table -------------------------------------------------------
    table_schema: str
    table_name: str
    write_mode: WriteMode = WriteMode.APPEND

    # --- Naming templates (expanded with str.format) ------------------------
    table_name_pattern: str | None = None  # e.g., "{schema}.{pipeline_id}_{timestamp}"
    table_suffix_pattern: str | None = None  # e.g., "_{run_mode}_{date}"

    # --- Partitioning and compression ---------------------------------------
    partition_columns: list[str] | None = None
    partition_count: int | None = None
    compression: str = "snappy"

    # --- Schema handling ----------------------------------------------------
    enable_schema_evolution: bool = True
    schema_validation_mode: str = "strict"  # strict, lenient, ignore
    auto_optimize_schema: bool = True

    # --- Performance --------------------------------------------------------
    batch_size: int = 1000
    max_file_size_mb: int = 128
    enable_optimization: bool = True
    parallel_write_threads: int = 4
    memory_fraction: float = 0.6

    # --- Feature switches ---------------------------------------------------
    enable_performance_monitoring: bool = True
    enable_data_quality_checks: bool = True
    enable_validation: bool = True
    enable_metrics_collection: bool = True
    enable_audit_trail: bool = True
    enable_backup_before_write: bool = False

    # --- Logging ------------------------------------------------------------
    log_level: LogLevel = LogLevel.INFO
    enable_detailed_logging: bool = False
    log_performance_metrics: bool = True
    log_data_quality_results: bool = True

    # --- Retry / failure policy ---------------------------------------------
    max_retries: int = 3
    retry_delay_secs: float = 1.0
    fail_fast: bool = False
    retry_exponential_backoff: bool = True

    # --- Data-quality thresholds (percentages) ------------------------------
    min_validation_rate: float = 95.0
    max_invalid_rows_percent: float = 5.0
    enable_anomaly_detection: bool = False

    def validate(self) -> None:
        """
        Check every constrained setting and fail on the first violation.

        Raises:
            ValueError: If a setting is empty, out of range, or not one of the
                allowed choices. Checks run in declaration order below, so the
                first offending setting determines the message.
        """

        def reject_if(violated: bool, message: str) -> None:
            # Each call is evaluated in sequence, so later conditions are never
            # computed once an earlier one has raised.
            if violated:
                raise ValueError(message)

        reject_if(not self.table_schema, "Table schema cannot be empty")
        reject_if(not self.table_name, "Table name cannot be empty")
        reject_if(self.batch_size <= 0, "Batch size must be positive")
        reject_if(self.max_file_size_mb <= 0, "Max file size must be positive")
        reject_if(self.max_retries < 0, "Max retries cannot be negative")
        reject_if(self.retry_delay_secs < 0, "Retry delay cannot be negative")
        reject_if(
            self.parallel_write_threads <= 0,
            "Parallel write threads must be positive",
        )
        reject_if(
            not 0 < self.memory_fraction <= 1,
            "Memory fraction must be between 0 and 1",
        )
        reject_if(
            self.schema_validation_mode not in ("strict", "lenient", "ignore"),
            "Schema validation mode must be 'strict', 'lenient', or 'ignore'",
        )
        reject_if(
            not 0 <= self.min_validation_rate <= 100,
            "Min validation rate must be between 0 and 100",
        )
        reject_if(
            not 0 <= self.max_invalid_rows_percent <= 100,
            "Max invalid rows percent must be between 0 and 100",
        )

    def generate_table_name(
        self,
        pipeline_id: str | None = None,
        run_mode: str | None = None,
        timestamp: str | None = None,
    ) -> str:
        """
        Expand the configured naming templates into a table name.

        The suffix template (if set) is appended to ``table_name`` first; the
        full-name template (if set) is then formatted with that suffixed name
        available as ``{table_name}``. With neither template set, the plain
        ``table_name`` is returned.

        Args:
            pipeline_id: Pipeline identifier; required when
                ``table_name_pattern`` is set.
            run_mode: Run mode (initial, incremental, etc.); required when
                either template is set.
            timestamp: Value substituted for both ``{timestamp}`` and
                ``{date}``; required when either template is set.

        Returns:
            The generated table name.

        Raises:
            ValueError: If a value required by an active template is None.
        """
        resolved_name = self.table_name

        if self.table_suffix_pattern:
            # Explicit None checks: a missing value must fail loudly, not be
            # silently rendered into the name.
            for supplied, complaint in (
                (run_mode, "run_mode cannot be None when using table_suffix_pattern"),
                (timestamp, "timestamp cannot be None when using table_suffix_pattern"),
            ):
                if supplied is None:
                    raise ValueError(complaint)

            suffix = self.table_suffix_pattern.format(
                run_mode=run_mode,
                date=timestamp,
                timestamp=timestamp,
            )
            resolved_name = f"{resolved_name}{suffix}"

        if not self.table_name_pattern:
            return resolved_name

        for supplied, complaint in (
            (pipeline_id, "pipeline_id cannot be None when using table_name_pattern"),
            (run_mode, "run_mode cannot be None when using table_name_pattern"),
            (timestamp, "timestamp cannot be None when using table_name_pattern"),
        ):
            if supplied is None:
                raise ValueError(complaint)

        return self.table_name_pattern.format(
            schema=self.table_schema,
            table_name=resolved_name,
            pipeline_id=pipeline_id,
            run_mode=run_mode,
            date=timestamp,
            timestamp=timestamp,
        )


# ============================================================================
# Spark Schema Definitions
# ============================================================================

# from ..compat import types  # noqa: E402  # Removed: defined in notebook cells above


def create_log_schema() -> types.StructType:
    """
    Build the Spark schema used for log tables.

    The columns follow the LogRow fields in order, with ``metadata`` declared
    as a string column, followed by two string tracking columns
    (``created_at`` / ``updated_at``).

    Returns:
        StructType: Spark schema for log tables with proper types
    """
    # (column name, Spark type class, nullable), listed in table column order.
    column_specs = (
        # Run-level fields
        ("run_id", StringType, False),
        ("run_mode", StringType, False),
        ("run_started_at", TimestampType, True),
        ("run_ended_at", TimestampType, True),
        # Execution context
        ("execution_id", StringType, False),
        ("pipeline_id", StringType, False),
        ("schema", StringType, False),
        # Step-level fields
        ("phase", StringType, False),
        ("step_name", StringType, False),
        ("step_type", StringType, False),
        # Timing fields
        ("start_time", TimestampType, True),
        ("end_time", TimestampType, True),
        ("duration_secs", FloatType, False),
        # Table fields
        ("table_fqn", StringType, True),
        ("write_mode", StringType, True),
        # Data metrics
        ("input_rows", IntegerType, True),
        ("output_rows", IntegerType, True),
        ("rows_written", IntegerType, True),
        ("rows_processed", IntegerType, False),
        ("table_total_rows", IntegerType, True),
        # Validation metrics
        ("valid_rows", IntegerType, False),
        ("invalid_rows", IntegerType, False),
        ("validation_rate", FloatType, False),
        # Execution status
        ("success", BooleanType, False),
        ("error_message", StringType, True),
        # Performance metrics
        ("memory_usage_mb", FloatType, True),
        ("cpu_usage_percent", FloatType, True),
        # Metadata (stored as JSON string)
        ("metadata", StringType, True),
        # Timestamp fields for tracking
        ("created_at", StringType, True),
        ("updated_at", StringType, True),
    )

    # A fresh type instance is created for every field, one field at a time.
    return types.StructType(
        [
            StructField(column_name, spark_type(), nullable)
            for column_name, spark_type, nullable in column_specs
        ]
    )


# ============================================================================
# Factory Functions
# ============================================================================


def create_log_row_from_step_result(
    step_result: StepResult,
    execution_context: ExecutionContext,
    run_id: str,
    run_mode: str,
    metadata: Dict[str, Any] | None = None,
) -> LogRow:
    """
    Create a LogRow from a StepResult and ExecutionContext.

    Run-level timestamps and identifiers come from ``execution_context``;
    everything step-specific comes from ``step_result``. ``output_rows``
    mirrors ``rows_processed``, and ``table_total_rows`` and the performance
    metrics are left as ``None``.

    Args:
        step_result: The step result to convert
        execution_context: The execution context
        run_id: Unique run identifier
        run_mode: Mode of the run (initial, incremental, etc.)
        metadata: Additional metadata; an empty dict is used when omitted

    Returns:
        LogRow: Log row with all fields populated
    """
    rows_processed = step_result.rows_processed
    validation_rate = step_result.validation_rate

    # valid_rows is the truncated share of processed rows; invalid_rows is the
    # remainder. Truncating both shares independently can lose a row (10 rows at
    # 95.5% -> 9 valid + 0 invalid), which breaks the
    # "valid + invalid == rows_processed" check in validate_log_row.
    valid_rows = int(rows_processed * validation_rate / 100)
    invalid_rows = rows_processed - valid_rows

    return LogRow(
        # Run-level information
        run_id=run_id,
        run_mode=run_mode,  # type: ignore[typeddict-item]
        run_started_at=execution_context.started_at,
        run_ended_at=execution_context.ended_at,
        # Execution context
        execution_id=execution_context.execution_id,
        pipeline_id=execution_context.pipeline_id,
        schema=execution_context.schema,
        # Step-level information
        phase=step_result.phase.value,
        step_name=step_result.step_name,
        step_type=(
            step_result.step_type if step_result.step_type is not None else "unknown"
        ),
        # Timing information
        start_time=step_result.start_time,
        end_time=step_result.end_time,
        duration_secs=step_result.duration_secs,
        # Table information
        table_fqn=step_result.table_fqn,
        write_mode=step_result.write_mode,  # type: ignore[typeddict-item]
        # Data metrics
        input_rows=step_result.input_rows,
        output_rows=rows_processed,
        rows_written=step_result.rows_written,
        rows_processed=rows_processed,
        table_total_rows=None,
        # Validation metrics
        valid_rows=valid_rows,
        invalid_rows=invalid_rows,
        validation_rate=validation_rate,
        # Execution status
        success=step_result.success,
        error_message=step_result.error_message,
        # Performance metrics
        memory_usage_mb=None,  # TODO: Add memory metrics to StepResult
        cpu_usage_percent=None,  # TODO: Add CPU metrics to StepResult
        # Metadata
        metadata=metadata or {},
    )


def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: str,
    metadata: Dict[str, Any] | None = None,
) -> list[LogRow]:
    """
    Convert every step result of an ExecutionResult into a LogRow.

    Each step result is paired with the execution result's context and passed
    to ``create_log_row_from_step_result``; the rows come back in the same
    order as ``execution_result.step_results``.

    Args:
        execution_result: The execution result to convert
        run_id: Unique run identifier
        run_mode: Mode of the run
        metadata: Additional metadata, forwarded unchanged for every row

    Returns:
        List[LogRow]: One log row per step result
    """
    return [
        create_log_row_from_step_result(
            step_result=single_result,
            execution_context=execution_result.context,
            run_id=run_id,
            run_mode=run_mode,
            metadata=metadata,
        )
        for single_result in execution_result.step_results
    ]


def _pipeline_report_log_row(
    pipeline_report: PipelineReport,
    run_id: str,
    run_mode: str,
    metadata: Dict[str, Any] | None,
    *,
    phase: str,
    step_name: str,
    step_type: str,
    duration_secs: float,
    success: bool,
    error_message: str | None,
) -> LogRow:
    """Build one LogRow from report-level data plus the per-row fields given by keyword."""
    row: LogRow = {
        "run_id": run_id,
        "run_mode": run_mode,  # type: ignore[typeddict-item]
        "run_started_at": pipeline_report.start_time,
        "run_ended_at": pipeline_report.end_time,
        "execution_id": pipeline_report.execution_id,
        "pipeline_id": pipeline_report.pipeline_id,
        "schema": "default",  # PipelineReport doesn't have schema
        "phase": phase,  # type: ignore[typeddict-item]
        "step_name": step_name,
        "step_type": step_type,
        # Only report-level timing is available, so every row carries it.
        "start_time": pipeline_report.start_time,
        "end_time": pipeline_report.end_time,
        "duration_secs": duration_secs,
        "table_fqn": None,
        "write_mode": None,
        # Row counts are not read from the report; zeros keep the row
        # consistent with validate_log_row (0 valid + 0 invalid == 0 processed).
        "input_rows": 0,
        "output_rows": 0,
        "rows_written": 0,
        "rows_processed": 0,
        "table_total_rows": None,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "success": success,
        "error_message": error_message,
        "memory_usage_mb": None,
        "cpu_usage_percent": None,
        # Per-row copy so editing one row's metadata cannot leak into the
        # other rows or into the caller's dict.
        "metadata": dict(metadata) if metadata else {},
    }
    return row


def create_log_rows_from_pipeline_report(
    pipeline_report: PipelineReport,
    run_id: str,
    run_mode: str,
    metadata: Dict[str, Any] | None = None,
) -> list[LogRow]:
    """
    Create multiple LogRows from a PipelineReport.

    The first row summarises the whole run (``phase="pipeline"``,
    ``step_name="pipeline_execution"``) and carries the report's duration,
    success flag and first error message. It is followed by one simplified row
    per step name found in the bronze, silver and gold results, in that order.

    Each layer is walked separately, so a step name that occurs in more than
    one layer yields one row per layer with the correct phase, instead of being
    collapsed into a single row.

    Step rows are placeholders: only the step names are read from the report,
    so they use ``duration_secs=0.0`` and ``success=True``.

    Args:
        pipeline_report: The pipeline report to convert
        run_id: Unique run identifier
        run_mode: Mode of the run
        metadata: Additional metadata; shallow-copied into every row

    Returns:
        List[LogRow]: The pipeline summary row followed by one row per step
    """
    rows: list[LogRow] = [
        _pipeline_report_log_row(
            pipeline_report,
            run_id,
            run_mode,
            metadata,
            phase="pipeline",
            step_name="pipeline_execution",
            step_type="pipeline",
            duration_secs=pipeline_report.duration_seconds,
            success=pipeline_report.success,
            error_message=(
                pipeline_report.errors[0] if pipeline_report.errors else None
            ),
        )
    ]

    layers = (
        ("bronze", pipeline_report.bronze_results),
        ("silver", pipeline_report.silver_results),
        ("gold", pipeline_report.gold_results),
    )
    for phase, layer_results in layers:
        # dict(...) accepts the same inputs dict.update() does; a missing
        # (None) layer is treated as having no steps.
        for step_name in dict(layer_results or {}):
            rows.append(
                _pipeline_report_log_row(
                    pipeline_report,
                    run_id,
                    run_mode,
                    metadata,
                    phase=phase,
                    step_name=step_name,
                    step_type="transform",
                    duration_secs=0.0,  # Not available in PipelineReport
                    success=True,  # Assume success if in results
                    error_message=None,
                )
            )

    return rows


# ============================================================================
# Validation Functions
# ============================================================================


def validate_log_row(row: LogRow) -> None:
    """
    Check a single log row and raise on the first problem found.

    Checks run in a fixed order: identifier fields must be non-empty, counters
    must be non-negative, the validation rate must be a percentage, and the
    valid/invalid counts must add up to the processed count.

    Args:
        row: The log row to validate

    Raises:
        ValueError: If the log row is invalid
    """
    # Identifier fields that must not be empty.
    required_fields = (
        ("run_id", "Run ID cannot be empty"),
        ("execution_id", "Execution ID cannot be empty"),
        ("pipeline_id", "Pipeline ID cannot be empty"),
        ("step_name", "Step name cannot be empty"),
    )
    for field_name, complaint in required_fields:
        if not row[field_name]:
            raise ValueError(complaint)

    # Numeric fields that must not be negative.
    non_negative_fields = (
        ("duration_secs", "Duration cannot be negative"),
        ("rows_processed", "Rows processed cannot be negative"),
        ("valid_rows", "Valid rows cannot be negative"),
        ("invalid_rows", "Invalid rows cannot be negative"),
    )
    for field_name, complaint in non_negative_fields:
        if row[field_name] < 0:
            raise ValueError(complaint)

    if not 0 <= row["validation_rate"] <= 100:
        raise ValueError("Validation rate must be between 0 and 100")

    # Logical consistency: the two row counts must account for every processed row.
    if row["valid_rows"] + row["invalid_rows"] != row["rows_processed"]:
        raise ValueError("Valid + invalid rows must equal rows processed")


def validate_log_data(rows: list[LogRow]) -> Dict[str, Any]:
    """
    Run ``validate_log_row`` over every row and collect the failures.

    Unlike the single-row check, this never raises for an invalid row: each
    ``ValueError`` becomes a message that records the row's index.

    Args:
        rows: List of log rows to validate

    Returns:
        Dictionary with ``is_valid`` (True when no row failed) and ``errors``
        (the failure messages in row order)
    """

    def failure_message(position: int, entry: LogRow) -> str | None:
        # None means the row passed; otherwise the formatted failure text.
        try:
            validate_log_row(entry)
        except ValueError as exc:
            return f"Invalid log row at index {position}: {exc}"
        return None

    outcomes = (failure_message(position, entry) for position, entry in enumerate(rows))
    failures = [message for message in outcomes if message is not None]
    return {"is_valid": not failures, "errors": failures}

In [None]:
# Module: pipeline_builder.validation.data_validation (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.models.execution, pipeline_builder.models.types, pipeline_builder_base.errors, pipeline_builder_base.logging

from __future__ import annotations

from typing import Any, Dict

# from ..compat import Column, DataFrame  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..models import ColumnRules  # Removed: defined in notebook cells above
# from .errors import ValidationError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import StageStats  # Removed: defined in notebook cells above

# Module-level logger for the data-validation helpers in this cell;
# apply_column_rules() reports its completion message through it.
logger = PipelineLogger("DataValidation")


def _convert_rule_to_expression(
    rule: str | list, column_name: str, functions: FunctionsProtocol | None = None
) -> Column:
    """Convert a string rule to a PySpark Column expression."""
    if functions is None:
        functions = get_default_functions()

    # Handle list-based rules like ["gt", 0]
    if isinstance(rule, list):
        if len(rule) == 0:
            return functions.lit(True)  # Empty rule means no validation
        elif len(rule) == 1:
            return _convert_rule_to_expression(rule[0], column_name, functions)
        elif len(rule) == 2:
            op, value = rule
            if op == "gt":
                return functions.col(column_name) > value
            elif op == "gte":
                return functions.col(column_name) >= value
            elif op == "lt":
                return functions.col(column_name) < value
            elif op == "lte":
                return functions.col(column_name) <= value
            elif op == "eq":
                return functions.col(column_name) == value
            elif op == "ne":
                return functions.col(column_name) != value
            else:
                # For unknown operators, assume it's a valid PySpark expression
                return functions.expr(f"{column_name} {op} {value}")
        elif len(rule) == 3:
            op, min_val, max_val = rule
            if op == "between":
                return functions.col(column_name).between(min_val, max_val)
            else:
                # For unknown operators, assume it's a valid PySpark expression
                return functions.expr(f"{column_name} {op} {min_val} {max_val}")
        else:
            # For complex rules, assume it's a valid PySpark expression
            return functions.expr(str(rule))

    # Handle string-based rules
    if rule == "not_null":
        return functions.col(column_name).isNotNull()
    elif rule == "positive":
        return functions.col(column_name) > 0
    elif rule == "non_negative":
        return functions.col(column_name) >= 0
    elif rule == "non_zero":
        return functions.col(column_name) != 0
    else:
        # For unknown rules, assume it's a valid PySpark expression
        return functions.expr(rule)


def _convert_rules_to_expressions(
    rules: ColumnRules,
    functions: FunctionsProtocol | None = None,
) -> Dict[str, list[str | Column]]:
    """Convert string rules to PySpark Column expressions."""
    if functions is None:
        functions = get_default_functions()

    converted_rules: Dict[str, list[str | Column]] = {}
    for column_name, rule_list in rules.items():
        converted_rule_list: list[str | Column] = []
        for rule in rule_list:
            if isinstance(rule, (str, list)):
                converted_rule_list.append(
                    _convert_rule_to_expression(rule, column_name, functions)
                )
            else:
                converted_rule_list.append(rule)
        converted_rules[column_name] = converted_rule_list
    return converted_rules


def and_all_rules(
    rules: ColumnRules, functions: FunctionsProtocol | None = None
) -> Column | bool:
    """
    Fold every rule of every column into one predicate joined with ``&``.

    Args:
        rules: Mapping of column name to that column's list of rules
        functions: Functions provider; the default provider is used when None

    Returns:
        ``True`` when there is nothing to check (no rules, or no usable
        expressions after conversion); otherwise the combined predicate.
    """
    if not rules:
        return True

    if functions is None:
        functions = get_default_functions()

    per_column = _convert_rules_to_expressions(rules, functions)
    candidates = [item for column_items in per_column.values() for item in column_items]
    if not candidates:
        return True

    # Keep column-like objects as they are, turn leftover strings into
    # expressions, and silently skip anything else.
    predicates = []
    for candidate in candidates:
        behaves_like_column = (
            hasattr(candidate, "__and__")
            and hasattr(candidate, "__invert__")
            and not isinstance(candidate, str)
        )
        if behaves_like_column or isinstance(candidate, Column):
            predicates.append(candidate)
        elif isinstance(candidate, str):
            predicates.append(functions.expr(candidate))

    if not predicates:
        return True

    remaining = iter(predicates)
    combined = next(remaining)
    for following in remaining:
        combined = combined & following
    return combined


def apply_column_rules(
    df: DataFrame,
    rules: ColumnRules,
    stage: str,
    step: str,
    filter_columns_by_rules: bool = True,
    functions: FunctionsProtocol | None = None,
) -> tuple[DataFrame, DataFrame, StageStats]:
    """
    Apply validation rules to a DataFrame and return valid/invalid DataFrames with statistics.

    Every row of ``df`` ends up in exactly one of the two outputs: rows for
    which the combined predicate is true are valid; rows for which it is false
    or evaluates to NULL (for example a comparison against a NULL value) are
    invalid. Consequently ``valid_rows + invalid_rows == total_rows``.

    Args:
        df: DataFrame to validate
        rules: Dictionary mapping column names to validation rules
        stage: Pipeline stage name
        step: Step name within the stage
        filter_columns_by_rules: If True, output DataFrames only contain columns with rules
            (plus ``_failed_rules`` on the invalid side when that column exists).
            Not applied when ``rules`` is empty.
        functions: Functions provider forwarded to ``and_all_rules``

    Returns:
        Tuple of (valid_df, invalid_df, stats)

    Raises:
        ValidationError: If ``rules`` is None or references columns missing from ``df``
    """
    if rules is None:
        raise ValidationError("Validation rules cannot be None")

    # Handle empty rules - return all rows as valid
    if not rules:
        # Time the count itself so the reported duration is a real,
        # non-negative measurement.
        count_started = time.time()
        total_rows = df.count()
        stats = StageStats(
            stage=stage,
            step=step,
            total_rows=total_rows,
            valid_rows=total_rows,
            invalid_rows=0,
            validation_rate=100.0,
            duration_secs=time.time() - count_started,
        )
        # Original df as valid, empty df (same schema) as invalid
        return df, df.limit(0), stats

    # Validate that all columns referenced in rules exist in the DataFrame
    missing_columns = set(rules.keys()) - set(df.columns)
    if missing_columns:
        available_columns = sorted(set(df.columns))
        missing_columns_list = sorted(missing_columns)
        raise ValidationError(
            f"Columns referenced in validation rules do not exist in DataFrame. "
            f"Missing columns: {missing_columns_list}. "
            f"Available columns: {available_columns}. "
            f"Stage: {stage}, Step: {step}"
        )

    start_time = time.time()

    # Create validation predicate
    validation_predicate = and_all_rules(rules, functions)

    # Apply validation
    if validation_predicate is True:
        # No usable validation rules, return all data as valid
        valid_df = df
        invalid_df = df.limit(0)  # Empty DataFrame with same schema
        total_rows = df.count()
        valid_rows = total_rows
        invalid_rows = 0
    elif isinstance(validation_predicate, Column) or (
        hasattr(validation_predicate, "__and__")
        and hasattr(validation_predicate, "__invert__")
        and not isinstance(validation_predicate, bool)
    ):
        # Handle PySpark Column expressions
        valid_df = df.filter(validation_predicate)
        # filter() keeps only rows where the predicate is TRUE. Negating a NULL
        # result is still NULL, so a plain ~predicate would drop those rows
        # from both outputs; they are explicitly routed to the invalid side.
        invalid_df = df.filter((~validation_predicate) | validation_predicate.isNull())
        total_rows = df.count()
        valid_rows = valid_df.count()
        # The two outputs partition df, so the remainder is the invalid count
        # (this also avoids a third count job).
        invalid_rows = total_rows - valid_rows
    else:
        # Handle boolean False case (shouldn't happen with current logic)
        valid_df = df.limit(0)
        invalid_df = df
        total_rows = df.count()
        valid_rows = 0
        invalid_rows = total_rows

    # Apply column filtering if requested
    if filter_columns_by_rules:
        # Only keep columns that have validation rules
        rule_columns_list: list[str] = list(rules.keys())
        valid_df = valid_df.select(*rule_columns_list)
        # For invalid_df, also include the _failed_rules column if it exists
        invalid_columns: list[str] = rule_columns_list.copy()
        if "_failed_rules" in invalid_df.columns:
            invalid_columns.append("_failed_rules")
        invalid_df = invalid_df.select(*invalid_columns)

    # An empty input counts as fully valid rather than dividing by zero
    validation_rate = (valid_rows / total_rows * 100) if total_rows > 0 else 100.0

    # Create statistics
    duration = time.time() - start_time
    stats = StageStats(
        stage=stage,
        step=step,
        total_rows=total_rows,
        valid_rows=valid_rows,
        invalid_rows=invalid_rows,
        validation_rate=validation_rate,
        duration_secs=duration,
    )

    logger.info(
        f"Validation completed for {stage}.{step}: {validation_rate:.1f}% valid"
    )

    return valid_df, invalid_df, stats


def validate_dataframe_schema(df: DataFrame, expected_columns: list[str]) -> bool:
    """
    Report whether ``df`` contains every expected column.

    Extra columns in the DataFrame and the order of columns are ignored.

    Args:
        df: DataFrame whose ``columns`` are inspected
        expected_columns: Column names that must all be present

    Returns:
        True when no expected column is missing, otherwise False
    """
    return set(expected_columns) <= set(df.columns)


def assess_data_quality(
    df: DataFrame,
    rules: ColumnRules | None = None,
    functions: FunctionsProtocol | None = None,
) -> Dict[str, Any]:
    """
    Assess data quality of a DataFrame.

    An empty DataFrame is reported as 100% quality with ``is_empty=True``
    without looking at the rules. Without rules every row counts as valid.
    With rules, the counts come from ``apply_column_rules``.

    Args:
        df: DataFrame to assess
        rules: Optional validation rules
        functions: Functions provider forwarded to ``apply_column_rules``

    Returns:
        Dictionary with ``total_rows``, ``valid_rows``, ``invalid_rows``,
        ``quality_rate`` and ``is_empty``

    Raises:
        ValidationError: Propagated unchanged from rule application, or
            wrapping any other exception (the original is kept as the cause)
    """
    try:
        total_rows = df.count()

        if total_rows == 0:
            return {
                "total_rows": 0,
                "valid_rows": 0,
                "invalid_rows": 0,
                "quality_rate": 100.0,
                "is_empty": True,
            }

        if not rules:
            return {
                "total_rows": total_rows,
                "valid_rows": total_rows,
                "invalid_rows": 0,
                "quality_rate": 100.0,
                "is_empty": False,
            }

        # Only the statistics are needed here; the split DataFrames are discarded.
        _, _, stats = apply_column_rules(df, rules, "test", "test", functions=functions)
        return {
            "total_rows": stats.total_rows,
            "valid_rows": stats.valid_rows,
            "invalid_rows": stats.invalid_rows,
            "quality_rate": stats.validation_rate,
            "is_empty": False,
        }
    except ValidationError:
        # Validation errors are specific and actionable: let them through as-is.
        raise
    except Exception as e:
        # Log the unexpected error and re-raise with context. The standard
        # library logger is used without binding a local named ``logger``,
        # which would shadow the module-level logger inside this function.
        logging.getLogger(__name__).error(
            f"Unexpected error in assess_data_quality: {e}"
        )
        raise ValidationError(
            f"Data quality assessment failed: {e}",
            context={"function": "assess_data_quality", "original_error": str(e)},
        ) from e

In [None]:
# Module: pipeline_builder.validation.pipeline_validation (pipeline_builder)
#
# Dependencies: pipeline_builder.models.execution, pipeline_builder.models.pipeline, pipeline_builder.models.steps, pipeline_builder_base.logging

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict

# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionContext, PipelineConfig  # Removed: defined in notebook cells above

# Type alias for step names: a plain ``str`` used as the key type of the
# bronze/silver/gold step dictionaries accepted by
# UnifiedValidator.validate_pipeline.
StepName = str


class StepValidator:
    """
    Base class for custom step validators.

    This is an ordinary class rather than a ``typing.Protocol``: custom
    validators override ``validate``, and the default implementation reports
    no errors. Instances are registered through
    ``UnifiedValidator.add_validator``.
    """

    def validate(self, step: Any, context: ExecutionContext) -> list[str]:
        """
        Validate a step and return any validation errors.

        Args:
            step: The step to inspect (untyped here)
            context: The execution context passed alongside the step

        Returns:
            A list of error messages; the base implementation always returns
            an empty list.
        """
        return []


@dataclass
class ValidationResult:
    """
    Result of validation.

    Bundles the overall verdict with the messages collected along the way.
    Truthiness mirrors ``is_valid``, so ``if result:`` reads as
    "validation passed".
    """

    is_valid: bool  # overall verdict; also what bool(result) returns
    errors: list[str]  # error messages
    warnings: list[str]  # warning messages
    recommendations: list[str]  # suggestion messages

    def __bool__(self) -> bool:
        """Return whether validation passed."""
        return self.is_valid


class UnifiedValidator:
    """
    Unified validation system for both data and pipeline validation.

    This class provides a single interface for all validation needs,
    combining data validation and pipeline validation functionality.

    ``validate_pipeline`` runs the built-in configuration, bronze, silver,
    gold and dependency checks.  ``validate_step`` runs only the custom
    validators registered through ``add_validator``.
    """

    def __init__(self, logger: PipelineLogger | None = None):
        """Initialize the unified validator.

        Args:
            logger: Logger to use; a default ``PipelineLogger`` is created
                when omitted.
        """
        if logger is None:
            self.logger = PipelineLogger()
        else:
            self.logger = logger
        self.custom_validators: list[StepValidator] = []

    def add_validator(self, validator: StepValidator) -> None:
        """Register a custom step validator to be run by ``validate_step``."""
        self.custom_validators.append(validator)
        self.logger.info(f"Added custom validator: {validator.__class__.__name__}")

    def validate_pipeline(
        self,
        config: PipelineConfig,
        bronze_steps: Dict[StepName, BronzeStep],
        silver_steps: Dict[StepName, SilverStep],
        gold_steps: Dict[StepName, GoldStep],
    ) -> ValidationResult:
        """Validate the entire pipeline configuration.

        Args:
            config: Pipeline configuration to check.
            bronze_steps: Bronze steps keyed by step name.
            silver_steps: Silver steps keyed by step name.
            gold_steps: Gold steps keyed by step name.

        Returns:
            A ``ValidationResult`` whose ``is_valid`` flag is True only when
            no check produced an error.  ``recommendations`` is always empty.
        """
        errors: list[str] = []
        warnings: list[str] = []
        recommendations: list[str] = []

        # Validate configuration
        errors.extend(self._validate_config(config))

        # Validate each layer, then cross-step dependencies, in that order so
        # messages appear config -> bronze -> silver -> gold -> dependencies.
        for layer_errors, layer_warnings in (
            self._validate_bronze_steps(bronze_steps),
            self._validate_silver_steps(silver_steps, bronze_steps),
            self._validate_gold_steps(gold_steps, silver_steps),
            self._validate_dependencies(bronze_steps, silver_steps, gold_steps),
        ):
            errors.extend(layer_errors)
            warnings.extend(layer_warnings)

        # Logging is handled by the builder to avoid duplicate messages
        return ValidationResult(
            is_valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
            recommendations=recommendations,
        )

    def validate_step(
        self, step: Any, step_type: str, context: ExecutionContext
    ) -> ValidationResult:
        """Validate a single step with every registered custom validator.

        Args:
            step: The step object handed to each validator.
            step_type: Accepted for interface compatibility; not used here.
            context: Execution context handed to each validator.

        Returns:
            A ``ValidationResult`` holding the errors reported by the
            validators.  A validator that raises is reported as an error
            instead of aborting the remaining validators.
        """
        errors: list[str] = []
        warnings: list[str] = []

        # Run custom validators
        for validator in self.custom_validators:
            try:
                reported = validator.validate(step, context)
                if isinstance(reported, str):
                    # extend() would split a bare string into single characters;
                    # treat it as one message instead.
                    errors.append(reported)
                else:
                    errors.extend(reported)
            except Exception as e:
                errors.append(
                    f"Custom validator {validator.__class__.__name__} failed: {e}"
                )

        return ValidationResult(
            is_valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
            recommendations=[],
        )

    def _validate_config(self, config: PipelineConfig) -> list[str]:
        """Validate pipeline configuration; only ``schema`` is required."""
        errors = []

        if not config.schema:
            errors.append("Pipeline schema is required")

        # Table prefix is optional in simplified config
        # if not config.table_prefix:
        #     errors.append("Table prefix is required")

        return errors

    def _validate_bronze_steps(
        self, bronze_steps: Dict[StepName, BronzeStep]
    ) -> tuple[list[str], list[str]]:
        """Validate bronze steps.

        Returns:
            ``(errors, warnings)``; warnings is always empty.
        """
        errors = []
        warnings: list[str] = []

        for step_name, step in bronze_steps.items():
            # Simplified validation - just check that step has required basic attributes
            if not step.name:
                errors.append(f"Bronze step {step_name} missing name")

            if not step.rules:
                errors.append(f"Bronze step {step_name} missing validation rules")

        return errors, warnings

    def _validate_silver_steps(
        self,
        silver_steps: Dict[StepName, SilverStep],
        bronze_steps: Dict[StepName, BronzeStep],
    ) -> tuple[list[str], list[str]]:
        """Validate silver steps against the known bronze steps.

        Returns:
            ``(errors, warnings)``; warnings is always empty.
        """
        errors = []
        warnings: list[str] = []

        for step_name, step in silver_steps.items():
            if not step.source_bronze:
                errors.append(f"Silver step {step_name} missing source_bronze")
            elif step.source_bronze not in bronze_steps:
                # Only look the source up when one was given; otherwise the
                # same problem would be reported a second time as a dependency
                # on bronze step "None".
                errors.append(
                    f"Silver step {step_name} depends on non-existent bronze step {step.source_bronze}"
                )

        return errors, warnings

    def _validate_gold_steps(
        self,
        gold_steps: Dict[StepName, GoldStep],
        silver_steps: Dict[StepName, SilverStep],
    ) -> tuple[list[str], list[str]]:
        """Validate gold steps against the known silver steps.

        Returns:
            ``(errors, warnings)``; warnings is always empty.
        """
        errors = []
        warnings: list[str] = []

        for step_name, step in gold_steps.items():
            # Check source_silvers exist (if specified)
            if step.source_silvers:
                for silver_name in step.source_silvers:
                    if silver_name not in silver_steps:
                        errors.append(
                            f"Gold step {step_name} depends on non-existent silver step {silver_name}"
                        )

        return errors, warnings

    def _validate_dependencies(
        self,
        bronze_steps: Dict[StepName, BronzeStep],
        silver_steps: Dict[StepName, SilverStep],
        gold_steps: Dict[StepName, GoldStep],
    ) -> tuple[list[str], list[str]]:
        """Detect steps that list themselves as a dependency.

        Only the optional ``dependencies`` attribute is inspected, and only
        direct self-references are detected (longer cycles are not).  A
        dependency may be a plain step name or an object exposing
        ``step_name``.

        Returns:
            ``(errors, warnings)``; warnings is always empty.
        """
        errors = []
        warnings: list[str] = []

        # Later layers win on a name clash because the dicts are merged in order.
        all_steps = {**bronze_steps, **silver_steps, **gold_steps}

        for step_name, step in all_steps.items():
            # This is only for custom step types that might have a dependencies field
            dependencies = getattr(step, "dependencies", None)
            if not dependencies or not isinstance(dependencies, (list, tuple, set)):
                continue
            for dep in dependencies:
                if isinstance(dep, str):
                    refers_to_self = dep == step_name
                else:
                    refers_to_self = (
                        hasattr(dep, "step_name") and dep.step_name == step_name
                    )
                if refers_to_self:
                    errors.append(f"Step {step_name} has circular dependency on itself")

        return errors, warnings

In [None]:
# Module: pipeline_builder.models.dependencies (pipeline_builder)
#
# Dependencies: pipeline_builder.models.base, pipeline_builder_base.errors

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict

# from .errors import PipelineValidationError  # Removed: defined in notebook cells above
# from .base import BaseModel  # Removed: defined in notebook cells above


@dataclass
class SilverDependencyInfo(BaseModel):
    """
    Dependency record describing one Silver step.

    Attributes:
        step_name: Name of the silver step
        source_bronze: Name of the bronze step it reads from
        depends_on_silvers: Names of other silver steps it depends on
        can_run_parallel: Whether the step may run in parallel
        execution_group: Execution group used for parallel processing
    """

    step_name: str
    source_bronze: str
    depends_on_silvers: set[str]
    can_run_parallel: bool
    execution_group: int

    def validate(self) -> None:
        """Raise ``PipelineValidationError`` on the first invalid field."""
        # Each rule is a lazily evaluated predicate, so a later rule is only
        # evaluated once every earlier rule has passed.
        rules = (
            (
                lambda: not self.step_name or not isinstance(self.step_name, str),
                "Step name must be a non-empty string",
            ),
            (
                lambda: not self.source_bronze
                or not isinstance(self.source_bronze, str),
                "Source bronze step name must be a non-empty string",
            ),
            (
                lambda: not isinstance(self.depends_on_silvers, set),
                "Depends on silvers must be a set",
            ),
            (
                lambda: self.execution_group < 0,
                "Execution group must be non-negative",
            ),
        )
        for is_violated, message in rules:
            if is_violated():
                raise PipelineValidationError(message)


@dataclass
class CrossLayerDependency(BaseModel):
    """
    A dependency edge between two steps that live in different layers.

    Attributes:
        source_step: Name of the source step
        target_step: Name of the target step
        dependency_type: Type of dependency (data, validation, etc.)
        is_required: Whether this dependency is required for execution
    """

    source_step: str
    target_step: str
    dependency_type: str = "data"
    is_required: bool = True

    def validate(self) -> None:
        """Check both endpoints are non-empty strings and are not the same step."""
        source, target = self.source_step, self.target_step

        if not (source and isinstance(source, str)):
            raise PipelineValidationError("Source step must be a non-empty string")

        if not (target and isinstance(target, str)):
            raise PipelineValidationError("Target step must be a non-empty string")

        if source == target:
            raise PipelineValidationError("Source and target steps cannot be the same")


@dataclass
class UnifiedStepConfig(BaseModel):
    """
    Layer-independent description of a single pipeline step.

    Attributes:
        step_name: Name of the step
        step_type: Type of step (bronze/silver/gold)
        dependencies: List of step dependencies
        config: Step-specific configuration
    """

    step_name: str
    step_type: str
    dependencies: list[str]
    config: Dict[str, Any]

    def validate(self) -> None:
        """Raise ``PipelineValidationError`` on the first invalid field."""
        name = self.step_name
        if not (name and isinstance(name, str)):
            raise PipelineValidationError("Step name must be a non-empty string")

        allowed_types = ("bronze", "silver", "gold")
        if self.step_type not in allowed_types:
            raise PipelineValidationError("Step type must be bronze, silver, or gold")

        # Container-type checks, in declaration order.
        for value, expected_type, message in (
            (self.dependencies, list, "Dependencies must be a list"),
            (self.config, dict, "Config must be a dictionary"),
        ):
            if not isinstance(value, expected_type):
                raise PipelineValidationError(message)


@dataclass
class UnifiedExecutionPlan(BaseModel):
    """
    Execution plan covering the steps of a pipeline.

    Attributes:
        steps: List of unified step configurations
        execution_order: Ordered list of step names for execution
        parallel_groups: Groups of steps that can run in parallel
    """

    steps: list[UnifiedStepConfig]
    execution_order: list[str]
    parallel_groups: list[list[str]]

    def validate(self) -> None:
        """Check the three containers are lists and the order names known steps."""
        for value, message in (
            (self.steps, "Steps must be a list"),
            (self.execution_order, "Execution order must be a list"),
            (self.parallel_groups, "Parallel groups must be a list"),
        ):
            if not isinstance(value, list):
                raise PipelineValidationError(message)

        # Every name in the execution order must belong to a configured step.
        known_names = set()
        for configured in self.steps:
            known_names.add(configured.step_name)

        for ordered_name in self.execution_order:
            if ordered_name in known_names:
                continue
            raise PipelineValidationError(f"Step {ordered_name} not found in steps")

In [None]:
# Module: pipeline_builder.writer.monitoring (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.writer.models, pipeline_builder.writer.query_builder, pipeline_builder_base.logging, writer.exceptions

from __future__ import annotations

from typing import Dict, TypedDict

# psutil is an optional dependency.  HAS_PSUTIL records whether the import
# succeeded; when it did not, ``psutil`` is bound to None so the name still
# exists and PerformanceMonitor.get_memory_usage can return placeholder values.
try:
    import psutil

    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False
    psutil = None  # type: ignore[assignment, unused-ignore]

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .exceptions import WriterError  # Removed: defined in notebook cells above
# from .models import WriterMetrics  # Removed: defined in notebook cells above
# from .query_builder import QueryBuilder  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================


class OperationMetrics(TypedDict):
    """Metrics for a single operation.

    This is the shape returned by ``PerformanceMonitor.end_operation``.
    """

    # Identifier the operation was ended under.
    operation_id: str
    # Outcome flag; False is also used for operations that were never started.
    success: bool
    # Seconds elapsed between start_operation and end_operation.
    duration_secs: float
    # Row count reported by the caller.
    rows_written: int
    # "used_mb" value read from get_memory_usage() when the operation ended.
    memory_usage_mb: float
    # Error text supplied by the caller, or None.
    error_message: str | None
    # datetime.now().isoformat() taken when the record was built.
    timestamp: str


class SparkMemoryInfo(TypedDict, total=False):
    """Spark memory configuration.

    Declared with ``total=False`` so both keys are optional; an empty dict is
    used when the Spark configuration cannot be read.
    """

    # Value of the "spark.executor.memory" setting, "N/A" when it is not set.
    executor_memory: str
    # Value of the "spark.driver.memory" setting, "N/A" when it is not set.
    driver_memory: str


class MemoryUsageInfo(TypedDict):
    """Memory usage information structure.

    Returned by ``PerformanceMonitor.get_memory_usage``.  The numeric fields
    are all 0.0 when psutil is not available.
    """

    # psutil.virtual_memory().total converted from bytes (divided by 1024 * 1024).
    total_mb: float
    # psutil.virtual_memory().available, same conversion.
    available_mb: float
    # psutil.virtual_memory().used, same conversion.
    used_mb: float
    # psutil.virtual_memory().percent, passed through unchanged.
    percentage: float
    # Spark memory settings; empty when they could not be read.
    spark_memory: SparkMemoryInfo
    # False when the values above are placeholders because psutil is missing.
    psutil_available: bool


class SuccessRateTrend(TypedDict):
    """Success rate trend data point.

    One entry is produced per row of the daily trends query in
    ``AnalyticsEngine.analyze_execution_trends``.
    """

    # The row's "date" value, copied as-is.
    # NOTE(review): annotated as str but not converted by the producer - confirm
    # the query already yields strings.
    date: str
    # successful_executions / daily_executions * 100.
    success_rate: float
    # The row's avg_validation_rate, with 0 as the fallback value.
    avg_validation_rate: float
    # The row's avg_execution_time.
    avg_execution_time: float


class PerformanceByPhase(TypedDict):
    """Performance metrics by phase.

    Each field is copied from the same-named column of a row returned by
    ``QueryBuilder.build_phase_trends_query``.
    """

    # Phase the row describes.
    phase: str
    avg_execution_time: float
    avg_validation_rate: float
    execution_count: int


class DataQualityTrend(TypedDict):
    """Data quality trend data point.

    Each field is copied from the same-named column of a row returned by
    ``QueryBuilder.build_quality_trends_query``.
    """

    # The row's "date" value, copied as-is.
    date: str
    avg_validation_rate: float
    min_validation_rate: float
    max_validation_rate: float


class PerformanceTrends(TypedDict):
    """Execution trends analysis structure.

    Returned by ``AnalyticsEngine.analyze_execution_trends``; each list is
    built from one QueryBuilder query over the same log DataFrame.
    """

    # Built from QueryBuilder.build_daily_trends_query.
    success_rate_trend: list[SuccessRateTrend]
    # Built from QueryBuilder.build_phase_trends_query.
    performance_by_phase: list[PerformanceByPhase]
    # Built from QueryBuilder.build_quality_trends_query.
    data_quality_trend: list[DataQualityTrend]


class PerformanceAnomaly(TypedDict):
    """Performance anomaly data point.

    One entry per row returned by ``QueryBuilder.build_performance_anomaly_query``,
    which is called with a threshold of average plus two standard deviations
    of ``execution_time``.
    """

    # Step the row belongs to.
    step: str
    execution_time: float
    validation_rate: float
    success: bool


class QualityAnomaly(TypedDict):
    """Quality anomaly data point.

    One entry per row returned by ``QueryBuilder.build_quality_anomaly_query``
    called with a threshold of 90.0; rows are ordered by ``validation_rate``.
    """

    # Step the row belongs to.
    step: str
    validation_rate: float
    valid_rows: int
    invalid_rows: int


class AnomalyReport(TypedDict):
    """Anomaly detection results structure.

    Returned by ``AnalyticsEngine.detect_anomalies``.
    """

    performance_anomalies: list[PerformanceAnomaly]
    quality_anomalies: list[QualityAnomaly]
    # total_anomalies / total_executions * 100, rounded to two decimals
    # (0 when there are no executions).
    anomaly_score: float
    # Number of performance anomalies plus number of quality anomalies.
    total_anomalies: int
    # Row count of the analysed DataFrame.
    total_executions: int


class OverallStatistics(TypedDict):
    """Overall performance statistics.

    Filled by ``AnalyticsEngine.generate_performance_report`` from the single
    row produced by aggregating the whole log DataFrame.
    """

    total_executions: int
    successful_executions: int
    # successful_executions / total_executions * 100, or 0 when
    # total_executions is not positive.
    success_rate: float
    avg_execution_time: float
    avg_validation_rate: float
    total_rows_written: int


class PhaseStatistics(TypedDict):
    """Phase-wise performance statistics.

    Each field is copied from the same-named column of a row returned by
    ``QueryBuilder.build_phase_trends_query``.
    """

    # Phase the row describes.
    phase: str
    execution_count: int
    avg_execution_time: float
    avg_validation_rate: float
    total_rows_written: int


class RecentPerformance(TypedDict):
    """Recent performance data point.

    One entry per row returned by ``QueryBuilder.build_recent_performance_query``.
    """

    # The row's date formatted with strftime("%Y-%m-%d").
    date: str
    daily_executions: int
    avg_execution_time: float
    avg_validation_rate: float


class PerformanceReport(TypedDict):
    """Comprehensive performance report structure.

    Returned by ``AnalyticsEngine.generate_performance_report``.
    """

    overall_statistics: OverallStatistics
    phase_statistics: list[PhaseStatistics]
    recent_performance: list[RecentPerformance]
    # datetime.now().isoformat() taken when the report was assembled.
    generated_at: str


class PerformanceMonitor:
    """Handles performance monitoring and metrics collection.

    Operations are bracketed with ``start_operation`` / ``end_operation``.
    Ending an operation folds its duration, row count and a memory reading
    into the cumulative ``metrics`` dictionary.
    """

    def __init__(self, spark: SparkSession, logger: PipelineLogger | None = None):
        """Initialize the performance monitor.

        Args:
            spark: Spark session, used only to read memory settings.
            logger: Logger to use; a ``PipelineLogger`` named
                "PerformanceMonitor" is created when omitted.
        """
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("PerformanceMonitor")
        else:
            self.logger = logger
        self.metrics: WriterMetrics = self._new_metrics()
        # operation_id -> time.time() captured by start_operation
        self.operation_start_times: Dict[str, float] = {}

    @staticmethod
    def _new_metrics() -> WriterMetrics:
        """Return a fresh, zeroed metrics dictionary (shared by init and reset)."""
        return {
            "total_writes": 0,
            "successful_writes": 0,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }

    def start_operation(self, operation_id: str, operation_type: str) -> None:
        """
        Start monitoring an operation.

        Starting an id that is already being monitored restarts its clock.

        Args:
            operation_id: Unique identifier for the operation
            operation_type: Type of operation being monitored

        Raises:
            WriterError: If the start time cannot be recorded or logged.
        """
        try:
            self.operation_start_times[operation_id] = time.time()
            self.logger.info(
                f"Started monitoring {operation_type} operation: {operation_id}"
            )

        except Exception as e:
            self.logger.error(
                f"Failed to start monitoring operation {operation_id}: {e}"
            )
            raise WriterError(
                f"Failed to start monitoring operation {operation_id}: {e}"
            ) from e

    def end_operation(
        self,
        operation_id: str,
        success: bool,
        rows_written: int = 0,
        error_message: str | None = None,
    ) -> OperationMetrics:
        """
        End monitoring an operation and update metrics.

        Args:
            operation_id: Unique identifier for the operation
            success: Whether the operation was successful
            rows_written: Number of rows written
            error_message: Error message if operation failed

        Returns:
            Dictionary containing operation metrics.  For an id that was
            never started, a placeholder record with ``success=False`` is
            returned and the cumulative metrics are left untouched.

        Raises:
            WriterError: If the metrics cannot be computed.  When the memory
                reading fails, nothing has been recorded yet and the
                operation stays registered, so the call can be retried.
        """
        try:
            if operation_id not in self.operation_start_times:
                self.logger.warning(f"Operation {operation_id} was not being monitored")
                # Return empty metrics matching the TypedDict
                return {
                    "operation_id": operation_id,
                    "success": False,
                    "duration_secs": 0.0,
                    "rows_written": 0,
                    "memory_usage_mb": 0.0,
                    "error_message": "Operation was not being monitored",
                    "timestamp": datetime.now().isoformat(),
                }

            # Calculate duration
            start_time = self.operation_start_times[operation_id]
            duration = time.time() - start_time

            # Sample memory BEFORE touching any counter: this call can raise,
            # and failing after the counters were bumped would leave them
            # inflated while the operation is still registered.
            current_memory = self.get_memory_usage()["used_mb"]

            # Update metrics
            self.metrics["total_writes"] += 1
            if success:
                self.metrics["successful_writes"] += 1
            else:
                self.metrics["failed_writes"] += 1

            self.metrics["total_duration_secs"] += duration
            self.metrics["total_rows_written"] += rows_written

            # total_writes was just incremented, so it is at least 1 here.
            self.metrics["avg_write_duration_secs"] = (
                self.metrics["total_duration_secs"] / self.metrics["total_writes"]
            )

            # Update peak memory usage
            if current_memory > self.metrics["memory_usage_peak_mb"]:
                self.metrics["memory_usage_peak_mb"] = current_memory

            # Create operation metrics
            operation_metrics: OperationMetrics = {
                "operation_id": operation_id,
                "success": success,
                "duration_secs": duration,
                "rows_written": rows_written,
                "memory_usage_mb": current_memory,
                "error_message": error_message,
                "timestamp": datetime.now().isoformat(),
            }

            # Clean up
            del self.operation_start_times[operation_id]

            self.logger.info(
                f"Completed monitoring {operation_id}: {duration:.2f}s, {rows_written} rows"
            )
            return operation_metrics

        except Exception as e:
            self.logger.error(f"Failed to end monitoring operation {operation_id}: {e}")
            raise WriterError(
                f"Failed to end monitoring operation {operation_id}: {e}"
            ) from e

    def get_metrics(self) -> WriterMetrics:
        """Get a shallow copy of the current performance metrics."""
        return self.metrics.copy()

    def reset_metrics(self) -> None:
        """Reset performance metrics to their initial zero values."""
        self.metrics = self._new_metrics()
        self.logger.info("Performance metrics reset")

    def get_memory_usage(self) -> MemoryUsageInfo:
        """
        Get current memory usage information.

        Returns:
            Dictionary containing memory usage details.  When psutil is not
            available every numeric field is 0.0 and ``psutil_available`` is
            False.

        Raises:
            WriterError: If reading memory information through psutil fails.
        """
        # Check if psutil is available at all
        if not HAS_PSUTIL or psutil is None:
            self.logger.warning("psutil not available, returning basic memory info")
            return {
                "total_mb": 0.0,
                "available_mb": 0.0,
                "used_mb": 0.0,
                "percentage": 0.0,
                "spark_memory": {},
                "psutil_available": False,
            }

        try:
            # Get system memory info
            memory = psutil.virtual_memory()

            # Get Spark memory info if available
            spark_memory = {}
            try:
                spark_conf = self.spark.sparkContext.getConf()
                spark_memory = {
                    "executor_memory": spark_conf.get("spark.executor.memory", "N/A"),
                    "driver_memory": spark_conf.get("spark.driver.memory", "N/A"),
                }
            except Exception:
                # Deliberately best-effort: the Spark settings are optional
                # extras, so an unreadable configuration leaves them empty.
                pass

            bytes_per_mb = 1024 * 1024
            memory_info = {
                "total_mb": round(memory.total / bytes_per_mb, 2),
                "available_mb": round(memory.available / bytes_per_mb, 2),
                "used_mb": round(memory.used / bytes_per_mb, 2),
                "percentage": memory.percent,
                "spark_memory": spark_memory,
                "psutil_available": True,
            }

            return cast(MemoryUsageInfo, memory_info)

        except Exception as e:
            self.logger.error(f"Failed to get memory usage: {e}")
            raise WriterError(f"Failed to get memory usage: {e}") from e

    def check_performance_thresholds(
        self, operation_metrics: OperationMetrics
    ) -> list[str]:
        """
        Check if performance thresholds are exceeded.

        The duration and memory checks look at ``operation_metrics``; the
        success-rate check looks at the cumulative ``metrics``.

        Args:
            operation_metrics: Metrics for the operation

        Returns:
            List of threshold violations

        Raises:
            WriterError: If the thresholds cannot be evaluated.
        """
        violations = []

        try:
            # Check duration threshold (5 minutes)
            if operation_metrics.get("duration_secs", 0) > 300:
                violations.append("Operation duration exceeded 5 minutes")

            # Check memory usage threshold (8GB)
            if operation_metrics.get("memory_usage_mb", 0) > 8192:
                violations.append("Memory usage exceeded 8GB")

            # Check success rate threshold (95%)
            if self.metrics["total_writes"] > 0:
                success_rate = (
                    self.metrics["successful_writes"] / self.metrics["total_writes"]
                ) * 100
                if success_rate < 95.0:
                    violations.append(f"Success rate below 95%: {success_rate:.1f}%")

            return violations

        except Exception as e:
            self.logger.error(f"Failed to check performance thresholds: {e}")
            raise WriterError(f"Failed to check performance thresholds: {e}") from e


class AnalyticsEngine:
    """Handles analytics and trend analysis for writer operations."""

    def __init__(self, spark: SparkSession, logger: PipelineLogger | None = None):
        """Initialize the analytics engine."""
        self.spark = spark
        if logger is None:
            self.logger = PipelineLogger("AnalyticsEngine")
        else:
            self.logger = logger

    def analyze_execution_trends(self, df: DataFrame) -> PerformanceTrends:
        """
        Analyze execution trends from log data.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing trend analysis
        """
        try:
            self.logger.info("Analyzing execution trends")

            # Use query builder for all trend analyses
            trends = {}

            # Success rate trend using query builder
            success_trend_df = QueryBuilder.build_daily_trends_query(df, 30)
            success_trend = success_trend_df.collect()

            trends["success_rate_trend"] = [
                {
                    "date": row["date"],
                    "success_rate": (
                        row["successful_executions"] / row["daily_executions"]
                    )
                    * 100,
                    "avg_validation_rate": row.get("avg_validation_rate", 0),
                    "avg_execution_time": row["avg_execution_time"],
                }
                for row in success_trend
            ]

            # Performance trends using query builder
            performance_trend_df = QueryBuilder.build_phase_trends_query(df, 30)
            performance_trend = performance_trend_df.collect()

            trends["performance_by_phase"] = [
                {
                    "phase": row["phase"],
                    "avg_execution_time": row["avg_execution_time"],
                    "avg_validation_rate": row["avg_validation_rate"],
                    "execution_count": row["execution_count"],
                }
                for row in performance_trend
            ]

            # Data quality trends using query builder
            quality_trend_df = QueryBuilder.build_quality_trends_query(df, 30)
            quality_trend = quality_trend_df.collect()

            trends["data_quality_trend"] = [
                {
                    "date": row["date"],
                    "avg_validation_rate": row["avg_validation_rate"],
                    "min_validation_rate": row["min_validation_rate"],
                    "max_validation_rate": row["max_validation_rate"],
                }
                for row in quality_trend
            ]

            self.logger.info("Execution trends analysis completed")
            return cast(PerformanceTrends, trends)

        except Exception as e:
            self.logger.error(f"Failed to analyze execution trends: {e}")
            raise WriterError(f"Failed to analyze execution trends: {e}") from e

    def detect_anomalies(self, df: DataFrame) -> AnomalyReport:
        """
        Detect anomalies in execution data.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing anomaly detection results
        """
        try:
            self.logger.info("Detecting anomalies in execution data")

            anomalies: AnomalyReport = {
                "performance_anomalies": [],
                "quality_anomalies": [],
                "anomaly_score": 0.0,
                "total_anomalies": 0,
                "total_executions": 0,
            }

            # Calculate performance thresholds using query builder
            performance_stats = QueryBuilder.calculate_statistics(df, "execution_time")
            performance_threshold = performance_stats["avg"] + (
                2 * performance_stats["stddev"]
            )

            # Detect performance anomalies using query builder
            performance_anomalies_df = QueryBuilder.build_performance_anomaly_query(
                df, performance_threshold
            ).select("step", "execution_time", "validation_rate", "success")

            performance_anomalies = performance_anomalies_df.collect()

            anomalies["performance_anomalies"] = [
                {
                    "step": row["step"],
                    "execution_time": row["execution_time"],
                    "validation_rate": row["validation_rate"],
                    "success": row["success"],
                }
                for row in performance_anomalies
            ]

            # Detect data quality anomalies using query builder
            quality_anomalies_df = (
                QueryBuilder.build_quality_anomaly_query(df, 90.0)
                .select("step", "validation_rate", "valid_rows", "invalid_rows")
                .orderBy("validation_rate")
            )

            quality_anomalies = quality_anomalies_df.collect()

            anomalies["quality_anomalies"] = [
                {
                    "step": row["step"],
                    "validation_rate": row["validation_rate"],
                    "valid_rows": row["valid_rows"],
                    "invalid_rows": row["invalid_rows"],
                }
                for row in quality_anomalies
            ]

            # Calculate anomaly score
            total_executions = df.count()
            anomaly_count = len(performance_anomalies) + len(quality_anomalies)
            anomaly_score = (
                (anomaly_count / total_executions) * 100 if total_executions > 0 else 0
            )

            anomalies["anomaly_score"] = float(round(anomaly_score, 2))
            anomalies["total_anomalies"] = int(anomaly_count)
            anomalies["total_executions"] = int(total_executions)

            self.logger.info(
                f"Anomaly detection completed: {anomaly_count} anomalies found"
            )
            return anomalies

        except Exception as e:
            self.logger.error(f"Failed to detect anomalies: {e}")
            raise WriterError(f"Failed to detect anomalies: {e}") from e

    def generate_performance_report(self, df: DataFrame) -> PerformanceReport:
        """
        Generate comprehensive performance report.

        Args:
            df: DataFrame containing log data

        Returns:
            Dictionary containing performance report
        """
        try:
            self.logger.info("Generating performance report")

            # Overall statistics using query builder
            overall_stats_df = df.agg(**QueryBuilder.get_common_aggregations())
            overall_stats = overall_stats_df.collect()[0]

            # Phase-wise statistics using query builder
            phase_stats_df = QueryBuilder.build_phase_trends_query(df, 30)
            phase_stats = phase_stats_df.collect()

            # Recent performance using query builder
            recent_performance_df = QueryBuilder.build_recent_performance_query(df, 7)
            recent_performance = recent_performance_df.collect()

            report = {
                "overall_statistics": {
                    "total_executions": overall_stats["total_executions"],
                    "successful_executions": overall_stats["successful_executions"],
                    "success_rate": (
                        (
                            overall_stats["successful_executions"]
                            / overall_stats["total_executions"]
                        )
                        * 100
                        if overall_stats["total_executions"] > 0
                        else 0
                    ),
                    "avg_execution_time": overall_stats["avg_execution_time"],
                    "avg_validation_rate": overall_stats["avg_validation_rate"],
                    "total_rows_written": overall_stats["total_rows_written"],
                },
                "phase_statistics": [
                    {
                        "phase": row["phase"],
                        "execution_count": row["execution_count"],
                        "avg_execution_time": row["avg_execution_time"],
                        "avg_validation_rate": row["avg_validation_rate"],
                        "total_rows_written": row["total_rows_written"],
                    }
                    for row in phase_stats
                ],
                "recent_performance": [
                    {
                        "date": row["date"].strftime("%Y-%m-%d"),
                        "daily_executions": row["daily_executions"],
                        "avg_execution_time": row["avg_execution_time"],
                        "avg_validation_rate": row["avg_validation_rate"],
                    }
                    for row in recent_performance
                ],
                "generated_at": datetime.now().isoformat(),
            }

            self.logger.info("Performance report generated successfully")
            return cast(PerformanceReport, report)

        except Exception as e:
            self.logger.error(f"Failed to generate performance report: {e}")
            raise WriterError(f"Failed to generate performance report: {e}") from e

In [None]:
# Module: pipeline_builder.writer.operations (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.models.execution, pipeline_builder.writer.models, pipeline_builder_base.logging, validation.utils, writer.exceptions

from __future__ import annotations

from typing import Callable, Dict, TypedDict, Union

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionResult, StepResult  # Removed: defined in notebook cells above
# from ..validation import get_dataframe_info  # Removed: defined in notebook cells above
# from .exceptions import WriterValidationError  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
# LogRow,
# create_log_rows_from_execution_result,
# create_log_schema,
# validate_log_data,
# )

# ============================================================================
# TypedDict Definitions
# ============================================================================


class DataQualityReport(TypedDict):
    """Data quality validation report.

    Shape of the dictionary returned by ``DataProcessor.validate_data_quality``.
    """

    # True only when there are no validation issues and no failed executions.
    is_valid: bool
    # Row count reported by ``get_dataframe_info`` for the validated DataFrame.
    total_rows: int
    # Null count per critical column that is present in the DataFrame.
    null_counts: Dict[str, int]
    # Human-readable descriptions of detected issues (e.g. low validation rates).
    validation_issues: list[str]
    # Number of rows whose ``success`` column is false.
    failed_executions: int
    # Score in the 0-100 range from ``DataProcessor._calculate_quality_score``.
    data_quality_score: float


class DataProcessor:
    """Handles data processing and transformation operations."""

    def __init__(
        self,
        spark: SparkSession,
        functions: FunctionsProtocol | None = None,
        logger: PipelineLogger | None = None,
    ):
        """Initialize the data processor."""
        self.spark = spark
        self.functions = functions if functions is not None else get_default_functions()
        self.logger = logger or PipelineLogger("DataProcessor")

    def process_execution_result(
        self,
        execution_result: ExecutionResult,
        run_id: str,
        run_mode: str = "initial",
        metadata: Union[Dict[str, Union[str, int, float, bool]], None] = None,
        table_total_rows_provider: Callable[[str | None], int | None] | None = None,
    ) -> list[LogRow]:
        """
        Process execution result into log rows.

        Args:
            execution_result: The execution result to process
            run_id: Unique run identifier
            run_mode: Mode of the run
            metadata: Additional metadata
            table_total_rows_provider: Optional callback to supply table row counts

        Returns:
            List of processed log rows

        Raises:
            WriterValidationError: If validation fails
        """
        try:
            self.logger.info(f"Processing execution result for run {run_id}")

            # Create log rows from execution result
            log_rows = create_log_rows_from_execution_result(
                execution_result, run_id, run_mode, metadata
            )

            # Validate log data
            validation_result = validate_log_data(log_rows)
            if not validation_result["is_valid"]:
                raise WriterValidationError(
                    f"Log data validation failed: {validation_result['errors']}",
                    validation_errors=validation_result["errors"],
                    context={"run_id": run_id, "log_rows_count": len(log_rows)},
                    suggestions=[
                        "Check data quality in source execution result",
                        "Verify all required fields are present",
                        "Ensure data types are correct",
                    ],
                )

            # Populate table_total_rows when possible
            if table_total_rows_provider is not None:
                for row in log_rows:
                    if row.get("table_total_rows") is None:
                        row["table_total_rows"] = table_total_rows_provider(
                            row.get("table_fqn")
                        )

            self.logger.info(f"Successfully processed {len(log_rows)} log rows")
            return log_rows

        except Exception as e:
            self.logger.error(f"Failed to process execution result: {e}")
            raise

    def process_step_results(
        self,
        step_results: Dict[str, StepResult],
        run_id: str,
        run_mode: str = "initial",
        metadata: Union[Dict[str, Union[str, int, float, bool]], None] = None,
    ) -> list[LogRow]:
        """
        Process step results into log rows.

        Args:
            step_results: Dictionary of step results
            run_id: Unique run identifier
            run_mode: Mode of the run
            metadata: Additional metadata

        Returns:
            List of processed log rows
        """
        try:
            self.logger.info(
                f"Processing {len(step_results)} step results for run {run_id}"
            )

            log_rows = []
            for step_name, step_result in step_results.items():
                # Create log row for each step
                log_row = LogRow(
                    run_id=run_id,
                    run_mode=run_mode,  # type: ignore[typeddict-item]
                    run_started_at=datetime.now(),
                    run_ended_at=datetime.now(),
                    execution_id=run_id,
                    pipeline_id=run_id,
                    schema="default",
                    phase=step_result.phase.value,
                    step_name=step_name,
                    step_type=step_result.phase.value,
                    start_time=step_result.start_time,
                    end_time=step_result.end_time,
                    duration_secs=step_result.duration_secs,
                    table_fqn=f"{step_result.phase.value}_{step_name}",
                    write_mode="append",
                    input_rows=step_result.rows_processed,
                    output_rows=step_result.rows_written,
                    rows_written=step_result.rows_written,
                    valid_rows=int(
                        step_result.rows_processed * step_result.validation_rate / 100
                    ),
                    invalid_rows=int(
                        step_result.rows_processed
                        * (100 - step_result.validation_rate)
                        / 100
                    ),
                    validation_rate=step_result.validation_rate,
                    success=step_result.success,
                    error_message=step_result.error_message,
                    metadata=metadata or {},
                    rows_processed=step_result.rows_processed,
                    table_total_rows=None,
                    memory_usage_mb=0.0,
                    cpu_usage_percent=0.0,
                )
                log_rows.append(log_row)

            self.logger.info(f"Successfully processed {len(log_rows)} step log rows")
            return log_rows

        except Exception as e:
            self.logger.error(f"Failed to process step results: {e}")
            raise

    def create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> DataFrame:
        """
        Create DataFrame from log rows.

        Args:
            log_rows: List of log rows to convert

        Returns:
            DataFrame containing the log rows
        """
        try:
            self.logger.info(f"Creating DataFrame from {len(log_rows)} log rows")

            # Convert log rows to dictionaries
            log_data = []
            for row in log_rows:
                row_dict = {
                    "run_id": row["run_id"],
                    "run_mode": row["run_mode"],
                    "run_started_at": row["run_started_at"],
                    "run_ended_at": row["run_ended_at"],
                    "execution_id": row["execution_id"],
                    "pipeline_id": row["pipeline_id"],
                    "schema": row["schema"],
                    "phase": row["phase"],
                    "step_name": row["step_name"],
                    "step_type": row["step_type"],
                    "start_time": row["start_time"],
                    "end_time": row["end_time"],
                    "duration_secs": row["duration_secs"],
                    "table_fqn": row["table_fqn"],
                    "write_mode": row["write_mode"],
                    "input_rows": row["input_rows"],
                    "output_rows": row["output_rows"],
                    "rows_written": row["rows_written"],
                    "rows_processed": row["rows_processed"],
                    "valid_rows": row["valid_rows"],
                    "invalid_rows": row["invalid_rows"],
                    "validation_rate": row["validation_rate"],
                    "success": row["success"],
                    "error_message": row["error_message"],
                    "memory_usage_mb": row["memory_usage_mb"],
                    "cpu_usage_percent": row["cpu_usage_percent"],
                    "metadata": (
                        json.dumps(row["metadata"]) if row["metadata"] else None
                    ),
                    "created_at": datetime.now().isoformat(),  # Include timestamp directly as string
                }
                log_data.append(row_dict)

            # Create DataFrame with explicit schema for type safety and None value handling
            schema = create_log_schema()
            df = self.spark.createDataFrame(log_data, schema)  # type: ignore[attr-defined]

            self.logger.info("Successfully created DataFrame from log rows")
            return df

        except Exception as e:
            self.logger.error(f"Failed to create DataFrame from log rows: {e}")
            raise

    def validate_data_quality(self, df: DataFrame) -> DataQualityReport:
        """
        Validate data quality of the DataFrame.

        Args:
            df: DataFrame to validate

        Returns:
            Dictionary containing validation results
        """
        try:
            self.logger.info("Validating data quality")

            # Get DataFrame info
            df_info = get_dataframe_info(df)

            # Check for null values in critical columns
            critical_columns = ["run_id", "phase", "step", "success"]
            null_counts = {}

            for col_name in critical_columns:
                if col_name in df.columns:
                    null_count = df.filter(
                        self.functions.col(col_name).isNull()
                    ).count()
                    null_counts[col_name] = null_count

            # Check validation rates
            validation_issues = []
            if "validation_rate" in df.columns:
                low_validation = df.filter(
                    self.functions.col("validation_rate") < 95.0
                ).count()
                if low_validation > 0:
                    validation_issues.append(
                        f"{low_validation} records with validation rate < 95%"
                    )

            # Check for failed executions
            failed_executions = 0
            if "success" in df.columns:
                failed_executions = df.filter(~self.functions.col("success")).count()

            validation_result = {
                "is_valid": len(validation_issues) == 0 and failed_executions == 0,
                "total_rows": df_info["row_count"],
                "null_counts": null_counts,
                "validation_issues": validation_issues,
                "failed_executions": failed_executions,
                "data_quality_score": self._calculate_quality_score(
                    df_info, null_counts, validation_issues, failed_executions
                ),
            }

            self.logger.info(
                f"Data quality validation completed: {validation_result['is_valid']}"
            )
            return cast(DataQualityReport, validation_result)

        except Exception as e:
            self.logger.error(f"Failed to validate data quality: {e}")
            raise

    def _calculate_quality_score(
        self,
        df_info: Dict[str, Union[int, str]],
        null_counts: Dict[str, int],
        validation_issues: list[str],
        failed_executions: int,
    ) -> float:
        """Calculate data quality score."""
        try:
            total_rows = df_info["row_count"]
            if total_rows == 0:
                return 0.0

            # Ensure total_rows is an integer for division
            if not isinstance(total_rows, int):
                total_rows = int(total_rows) if total_rows else 0
            if total_rows == 0:
                return 0.0

            # Calculate null penalty
            null_penalty = sum(null_counts.values()) / total_rows

            # Calculate validation penalty
            validation_penalty = len(validation_issues) * 0.1

            # Calculate failure penalty
            failure_penalty = failed_executions / total_rows

            # Calculate quality score (0-100)
            quality_score = max(
                0.0, 100.0 - (null_penalty + validation_penalty + failure_penalty) * 100
            )

            return float(round(quality_score, 2))

        except Exception:
            return 0.0

    def apply_data_transformations(self, df: DataFrame) -> DataFrame:
        """
        Apply data transformations to the DataFrame.

        Args:
            df: DataFrame to transform

        Returns:
            Transformed DataFrame
        """
        try:
            self.logger.info("Applying data transformations")

            # Add computed columns
            df_transformed = df.withColumn(
                "processing_efficiency",
                self.functions.when(
                    self.functions.col("input_rows") > 0,
                    self.functions.col("output_rows")
                    / self.functions.col("input_rows")
                    * 100,
                ).otherwise(0),
            ).withColumn(
                "data_quality_score",
                self.functions.when(
                    self.functions.col("validation_rate") >= 95.0, "High"
                )
                .when(self.functions.col("validation_rate") >= 80.0, "Medium")
                .otherwise("Low"),
            )

            self.logger.info("Data transformations applied successfully")
            return df_transformed

        except Exception as e:
            self.logger.error(f"Failed to apply data transformations: {e}")
            raise

In [None]:
# Module: pipeline_builder_base.reporting (pipeline_builder_base)
#
# Dependencies: performance

from __future__ import annotations

from typing import TypedDict

# from .models import StageStats  # Removed: defined in notebook cells above
# from .validation import safe_divide  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================


class ValidationReport(TypedDict):
    """Validation report structure.

    Shape of the dictionary returned by ``create_validation_dict``.
    """

    stage: str | None  # None when no stage stats were supplied
    step: str | None  # None when no stage stats were supplied
    total_rows: int  # 0 when no stage stats were supplied
    valid_rows: int  # 0 when no stage stats were supplied
    invalid_rows: int  # 0 when no stage stats were supplied
    validation_rate: float  # 0.0 when no stage stats were supplied
    # Taken from the stage stats, or end_at - start_at (in seconds) without stats.
    duration_secs: float
    start_at: datetime
    end_at: datetime


class TransformReport(TypedDict):
    """Transform operation report structure.

    Shape of the dictionary returned by ``create_transform_dict``.
    """

    input_rows: int
    output_rows: int
    duration_secs: float  # (end_at - start_at) expressed in seconds
    skipped: bool  # whether the transform was skipped
    start_at: datetime
    end_at: datetime


class WriteReport(TypedDict):
    """Write operation report structure.

    Shape of the dictionary returned by ``create_write_dict``.
    """

    mode: str  # write mode as passed by the caller
    rows_written: int
    duration_secs: float  # (end_at - start_at) expressed in seconds
    table_fqn: str  # fully qualified table name
    skipped: bool  # whether the write was skipped
    start_at: datetime
    end_at: datetime


class ExecutionSummary(TypedDict):
    """Execution summary nested structure.

    Stored under ``execution_summary`` in the report built by
    ``create_summary_report``.
    """

    total_steps: int
    successful_steps: int
    failed_steps: int
    success_rate: float  # percentage: successful_steps / total_steps * 100
    failure_rate: float  # percentage: 100 - success_rate


class PerformanceMetrics(TypedDict):
    """Performance metrics nested structure.

    Stored under ``performance_metrics`` in the report built by
    ``create_summary_report``.
    """

    total_duration_secs: float
    formatted_duration: str  # total_duration_secs rendered by format_duration
    avg_validation_rate: float


class DataMetrics(TypedDict):
    """Data metrics nested structure.

    Stored under ``data_metrics`` in the report built by
    ``create_summary_report``.
    """

    total_rows_processed: int
    total_rows_written: int
    # Percentage: total_rows_written / total_rows_processed * 100.
    processing_efficiency: float


class SummaryReport(TypedDict):
    """Complete summary report structure.

    Shape of the dictionary returned by ``create_summary_report``; each key
    holds one of the nested TypedDicts.
    """

    execution_summary: ExecutionSummary  # step counts and success/failure rates
    performance_metrics: PerformanceMetrics  # duration and validation rate
    data_metrics: DataMetrics  # row counts and processing efficiency


def create_validation_dict(
    stats: StageStats | None, *, start_at: datetime, end_at: datetime
) -> ValidationReport:
    """
    Build the validation report for one stage.

    With stage statistics, their values are copied into the report. Without
    them, an empty report is produced (no stage/step, zero counts) whose
    duration is the wall-clock span between ``start_at`` and ``end_at``.

    Args:
        stats: Stage statistics, or None when none were collected
        start_at: Start time
        end_at: End time

    Returns:
        Validation report dictionary
    """
    if stats is not None:
        stage, step = stats.stage, stats.step
        total, valid, invalid = stats.total_rows, stats.valid_rows, stats.invalid_rows
        rate = stats.validation_rate
        duration = stats.duration_secs
    else:
        stage = step = None
        total = valid = invalid = 0
        rate = 0.0
        # No recorded duration, so fall back to the supplied time window.
        duration = (end_at - start_at).total_seconds()

    return {
        "stage": stage,
        "step": step,
        "total_rows": total,
        "valid_rows": valid,
        "invalid_rows": invalid,
        "validation_rate": rate,
        "duration_secs": duration,
        "start_at": start_at,
        "end_at": end_at,
    }


def create_transform_dict(
    *,
    input_rows: int,
    output_rows: int,
    start_at: datetime,
    end_at: datetime,
    skipped: bool = False,
) -> TransformReport:
    """
    Build the report dictionary for a transform operation.

    Args:
        input_rows: Number of input rows
        output_rows: Number of output rows
        start_at: Start time
        end_at: End time
        skipped: Whether the transform was skipped

    Returns:
        Transform report dictionary; ``duration_secs`` is the span between
        ``start_at`` and ``end_at`` in seconds
    """
    elapsed = end_at - start_at
    return dict(
        input_rows=input_rows,
        output_rows=output_rows,
        duration_secs=elapsed.total_seconds(),
        skipped=skipped,
        start_at=start_at,
        end_at=end_at,
    )


def create_write_dict(
    *,
    mode: str,
    rows_written: int,
    table_fqn: str,
    start_at: datetime,
    end_at: datetime,
    skipped: bool = False,
) -> WriteReport:
    """
    Build the report dictionary for a write operation.

    Args:
        mode: Write mode
        rows_written: Number of rows written
        table_fqn: Fully qualified table name
        start_at: Start time
        end_at: End time
        skipped: Whether the write was skipped

    Returns:
        Write report dictionary; ``duration_secs`` is the span between
        ``start_at`` and ``end_at`` in seconds
    """
    elapsed = end_at - start_at
    return dict(
        mode=mode,
        rows_written=rows_written,
        duration_secs=elapsed.total_seconds(),
        table_fqn=table_fqn,
        skipped=skipped,
        start_at=start_at,
        end_at=end_at,
    )


def format_duration(seconds: float) -> str:
    """
    Render a duration as a compound, human-readable string.

    Durations under a minute are shown as seconds ("5.00s"), under an hour as
    minutes and seconds ("1m 30.00s"), otherwise as hours, minutes and
    seconds ("1h 1m 1.50s"). Seconds always carry two decimals.

    Args:
        seconds: Duration in seconds

    Returns:
        Formatted duration string
    """
    if seconds < 60:
        return f"{seconds:.2f}s"

    # The seconds component is the same for the minute and the hour form.
    leftover_secs = seconds % 60
    if seconds < 3600:
        return f"{int(seconds // 60)}m {leftover_secs:.2f}s"

    whole_hours, within_hour = divmod(seconds, 3600)
    return f"{int(whole_hours)}h {int(within_hour // 60)}m {leftover_secs:.2f}s"


def _format_report_duration(seconds: float) -> str:
    """Render seconds as "12.34s", "1m 30.00s" or "1h 1m 1.50s" for summary reports."""
    if seconds < 60:
        return f"{seconds:.2f}s"
    secs = seconds % 60
    if seconds < 3600:
        return f"{int(seconds // 60)}m {secs:.2f}s"
    return f"{int(seconds // 3600)}m".replace("m", "h") + (
        f" {int((seconds % 3600) // 60)}m {secs:.2f}s"
    )


def create_summary_report(
    *,
    total_steps: int,
    successful_steps: int,
    failed_steps: int,
    total_duration_secs: float,
    total_rows_processed: int,
    total_rows_written: int,
    avg_validation_rate: float,
) -> SummaryReport:
    """
    Create a complete summary report.

    Rates are percentages in the 0-100 range. A zero denominator yields 0.0
    (via ``safe_divide``), so with no steps the success rate is 0.0 and the
    failure rate, computed as its complement, is 100.0.

    The duration is formatted with a private helper rather than the global
    ``format_duration``: the notebook defines that name a second time in the
    performance cell with a different output format, and in the notebook's
    flat namespace the later definition would otherwise be picked up here.

    Args:
        total_steps: Total number of steps
        successful_steps: Number of successful steps
        failed_steps: Number of failed steps
        total_duration_secs: Total duration in seconds
        total_rows_processed: Total rows processed
        total_rows_written: Total rows written
        avg_validation_rate: Average validation rate

    Returns:
        Complete summary report
    """
    success_rate = safe_divide(successful_steps, total_steps, 0.0) * 100
    failure_rate = 100.0 - success_rate
    processing_efficiency = (
        safe_divide(total_rows_written, total_rows_processed, 0.0) * 100
    )

    return {
        "execution_summary": {
            "total_steps": total_steps,
            "successful_steps": successful_steps,
            "failed_steps": failed_steps,
            "success_rate": success_rate,
            "failure_rate": failure_rate,
        },
        "performance_metrics": {
            "total_duration_secs": total_duration_secs,
            "formatted_duration": _format_report_duration(total_duration_secs),
            "avg_validation_rate": avg_validation_rate,
        },
        "data_metrics": {
            "total_rows_processed": total_rows_processed,
            "total_rows_written": total_rows_written,
            "processing_efficiency": processing_efficiency,
        },
    }

In [None]:
# Module: pipeline_builder.performance (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, table_operations

from __future__ import annotations

import logging
from contextlib import contextmanager
from typing import Any, Callable

# from .compat import DataFrame  # Removed: defined in notebook cells above

logger = logging.getLogger(__name__)


def now_dt() -> datetime:
    """
    Get current UTC datetime.

    Returns:
        Timezone-aware ``datetime`` for the current instant, in UTC
    """
    # Function-scope import: the notebook's import cell brings in ``datetime``
    # and ``timedelta`` only, and this cell does not import ``timezone``.
    from datetime import timezone

    return datetime.now(timezone.utc)


def format_duration(seconds: float) -> str:
    """
    Render a duration in its largest fitting unit, with two decimals.

    Under a minute the value is shown in seconds ("30.00s"), under an hour in
    fractional minutes ("1.50m"), otherwise in fractional hours ("2.00h").

    NOTE(review): the reporting cell defines another ``format_duration`` with
    a compound "1m 30.00s" format; in a flat notebook namespace the definition
    executed last is the one every caller sees.

    Args:
        seconds: Duration in seconds

    Returns:
        Formatted duration string
    """
    # (exclusive upper bound in seconds, seconds per unit, unit suffix)
    for upper_bound, unit_secs, suffix in ((60, 1, "s"), (3600, 60, "m")):
        if seconds < upper_bound:
            return f"{seconds / unit_secs:.2f}{suffix}"
    return f"{seconds / 3600:.2f}h"


def time_operation(operation_name: str = "operation") -> Callable[[Callable], Callable]:
    """
    Build a decorator that logs how long each call of a function takes.

    A start message is logged before the call. On success the elapsed time is
    logged at info level and the result returned; on failure the elapsed time
    and the error are logged at error level and the exception is re-raised.

    Args:
        operation_name: Label used in the log messages

    Returns:
        Decorator that wraps a function with timing and logging
    """

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            began = time.time()
            logger.info(f"Starting {operation_name}...")

            try:
                outcome = func(*args, **kwargs)
                elapsed = time.time() - began
                logger.info(f"Completed {operation_name} in {elapsed:.3f}s")
                return outcome
            except Exception as e:
                # Measure up to the failure point, then propagate unchanged.
                elapsed = time.time() - began
                logger.error(f"Failed {operation_name} after {elapsed:.3f}s: {e}")
                raise

        return wrapper

    return decorator


@contextmanager
def performance_monitor(
    operation_name: str, max_duration: float | None = None
) -> Generator[None, None, None]:
    """
    Context manager that logs the duration of the enclosed block.

    A start message is logged on entry. When the block completes, the elapsed
    time is logged, plus a warning if a (truthy) ``max_duration`` was given
    and exceeded. If the block raises, the failure is logged with the elapsed
    time and the exception is re-raised.

    Args:
        operation_name: Label used in the log messages
        max_duration: Optional threshold in seconds for the warning
    """
    began = time.time()
    logger.info(f"Starting {operation_name}...")

    try:
        yield
        elapsed = time.time() - began
        logger.info(f"Completed {operation_name} in {elapsed:.3f}s")

        # A threshold of None (or 0) disables the slow-operation warning.
        over_budget = bool(max_duration) and elapsed > max_duration
        if over_budget:
            logger.warning(
                f"{operation_name} took {elapsed:.3f}s, exceeding threshold of {max_duration}s"
            )

    except Exception as e:
        elapsed = time.time() - began
        logger.error(f"Failed {operation_name} after {elapsed:.3f}s: {e}")
        raise


@time_operation("write operation")
def time_write_operation(
    mode: str, df: DataFrame, fqn: str, **options: Any
) -> tuple[int, float, datetime, datetime]:
    """
    Write a DataFrame to a table and report how long the write took.

    The write is delegated to ``write_overwrite_table`` or
    ``write_append_table`` depending on ``mode``. Failures (including an
    unknown mode) are logged with the elapsed time and re-raised.

    Args:
        mode: Write mode (overwrite/append)
        df: DataFrame to write
        fqn: Fully qualified table name
        **options: Additional write options

    Returns:
        Tuple of (rows_written, duration_secs, start_time, end_time), with the
        duration rounded to milliseconds

    Raises:
        ValueError: If mode is invalid
        TableOperationError: If write operation fails
    """
    # from .table_operations import write_append_table, write_overwrite_table  # Removed: defined in notebook cells above

    started_at = now_dt()
    tick = time.time()

    try:
        # Reject unknown modes inside the try so they are logged like any failure.
        if mode not in ("overwrite", "append"):
            raise ValueError(
                f"Unknown write mode '{mode}'. Supported modes: overwrite, append"
            )

        if mode == "overwrite":
            rows = write_overwrite_table(df, fqn, **options)
        else:
            rows = write_append_table(df, fqn, **options)

        tock = time.time()
        ended_at = now_dt()
        elapsed = round(tock - tick, 3)

        logger.info(f"Write operation completed: {rows} rows in {elapsed}s to {fqn}")
        return rows, elapsed, started_at, ended_at

    except Exception as e:
        tock = time.time()
        ended_at = now_dt()
        elapsed = round(tock - tick, 3)
        logger.error(f"Write operation failed after {elapsed}s: {e}")
        raise


def monitor_performance(
    operation_name: str, max_duration: float | None = None
) -> Callable:
    """
    Decorator form of ``performance_monitor``.

    Every call of the decorated function runs inside a
    ``performance_monitor(operation_name, max_duration)`` context.

    Args:
        operation_name: Name of the operation
        max_duration: Maximum allowed duration in seconds

    Returns:
        Decorator function
    """

    def decorate(func: Callable) -> Callable:
        @wraps(func)
        def monitored(*args: Any, **kwargs: Any) -> Any:
            # A fresh monitor per call, so each invocation is timed on its own.
            with performance_monitor(operation_name, max_duration):
                outcome = func(*args, **kwargs)
            return outcome

        return monitored

    return decorate

In [None]:
# Module: pipeline_builder.table_operations (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.performance, pipeline_builder_base.errors

from __future__ import annotations

import logging

# from .compat import AnalysisException, DataFrame, SparkSession  # Removed: defined in notebook cells above
# from .errors import TableOperationError  # Removed: defined in notebook cells above
# from .performance import time_operation  # Removed: defined in notebook cells above

logger = logging.getLogger(__name__)


def fqn(schema: str, table: str) -> str:
    """
    Join a schema and a table name into ``schema.table``.

    Args:
        schema: Database schema name
        table: Table name

    Returns:
        Fully qualified table name

    Raises:
        ValueError: If schema or table is empty
    """
    if schema and table:
        return f"{schema}.{table}"
    raise ValueError("Schema and table names cannot be empty")


@time_operation("table write (overwrite)")
def write_overwrite_table(
    df: DataFrame, fqn: str, **options: str | int | float | bool
) -> int:
    """
    Write DataFrame to table in overwrite mode.

    The DataFrame is cached so that counting the rows and writing them share
    one evaluation. A cache created here is released again once the write has
    finished or failed; a DataFrame that was already cached by the caller is
    left cached.

    Args:
        df: DataFrame to write
        fqn: Fully qualified table name
        **options: Additional write options

    Returns:
        Number of rows written

    Raises:
        TableOperationError: If write operation fails
    """
    cached_here = False
    try:
        # Remember whether the caller had already cached this DataFrame so
        # that only a cache created by this function is dropped afterwards.
        level = getattr(df, "storageLevel", None)
        already_cached = bool(
            getattr(df, "is_cached", False)
            or (level is not None and (level.useMemory or level.useDisk))
        )

        # Cache DataFrame so count() and the write do not recompute it
        df.cache()
        cached_here = not already_cached
        cnt: int = df.count()
        writer = (
            df.write.format("parquet")
            .mode("overwrite")
            .option("overwriteSchema", "true")
        )

        # Apply additional options
        for key, value in options.items():
            writer = writer.option(key, value)

        writer.saveAsTable(fqn)
        logger.info(f"Successfully wrote {cnt} rows to {fqn} in overwrite mode")
        return cnt

    except Exception as e:
        raise TableOperationError(f"Failed to write table {fqn}: {e}") from e

    finally:
        # Release the cache taken above; cleanup is best effort and must not
        # mask the outcome of the write itself.
        if cached_here:
            try:
                df.unpersist()
            except Exception as cleanup_error:
                logger.debug(
                    f"Could not unpersist DataFrame after writing {fqn}: {cleanup_error}"
                )


@time_operation("table write (append)")
def write_append_table(
    df: DataFrame, fqn: str, **options: str | int | float | bool
) -> int:
    """
    Write DataFrame to table in append mode.

    The DataFrame is cached so that counting the rows and writing them share
    one evaluation. A cache created here is released again once the write has
    finished or failed; a DataFrame that was already cached by the caller is
    left cached.

    Args:
        df: DataFrame to write
        fqn: Fully qualified table name
        **options: Additional write options

    Returns:
        Number of rows written

    Raises:
        TableOperationError: If write operation fails
    """
    cached_here = False
    try:
        # Remember whether the caller had already cached this DataFrame so
        # that only a cache created by this function is dropped afterwards.
        level = getattr(df, "storageLevel", None)
        already_cached = bool(
            getattr(df, "is_cached", False)
            or (level is not None and (level.useMemory or level.useDisk))
        )

        # Cache DataFrame so count() and the write do not recompute it
        df.cache()
        cached_here = not already_cached
        cnt: int = df.count()
        writer = df.write.format("parquet").mode("append")

        # Apply additional options
        for key, value in options.items():
            writer = writer.option(key, value)

        writer.saveAsTable(fqn)
        logger.info(f"Successfully wrote {cnt} rows to {fqn} in append mode")
        return cnt

    except Exception as e:
        raise TableOperationError(f"Failed to write table {fqn}: {e}") from e

    finally:
        # Release the cache taken above; cleanup is best effort and must not
        # mask the outcome of the write itself.
        if cached_here:
            try:
                df.unpersist()
            except Exception as cleanup_error:
                logger.debug(
                    f"Could not unpersist DataFrame after writing {fqn}: {cleanup_error}"
                )


def read_table(spark: SparkSession, fqn: str) -> DataFrame:
    """
    Load a table as a DataFrame.

    Args:
        spark: Spark session
        fqn: Fully qualified table name

    Returns:
        DataFrame with table data

    Raises:
        TableOperationError: If the table lookup raises ``AnalysisException``
            (reported as "does not exist") or any other error occurs
    """
    try:
        table_df = spark.table(fqn)
        logger.info(f"Successfully read table {fqn}")
        return table_df
    except AnalysisException as missing:
        # Analysis failures are reported to the caller as a missing table.
        raise TableOperationError(f"Table {fqn} does not exist: {missing}") from missing
    except Exception as failure:
        raise TableOperationError(f"Failed to read table {fqn}: {failure}") from failure


def table_exists(spark: SparkSession, fqn: str) -> bool:
    """
    Report whether a table can be read.

    The check resolves the table and counts its rows; any failure is treated
    as "does not exist" and logged rather than raised.

    Args:
        spark: Spark session
        fqn: Fully qualified table name

    Returns:
        True if table exists, False otherwise
    """
    try:
        spark.table(fqn).count()
    except AnalysisException:
        logger.debug(f"Table {fqn} does not exist (AnalysisException)")
        return False
    except Exception as e:
        # Unexpected failures are surfaced in the log but still mean "no".
        logger.warning(f"Error checking if table {fqn} exists: {e}")
        return False
    return True


def drop_table(spark: SparkSession, fqn: str) -> bool:
    """
    Drop a table through Spark's external catalog, if the table exists.

    A name without a dot is looked up in the "default" database; otherwise
    the part before the first dot is the database and the rest the table.
    Failures are logged as warnings and reported as False.

    Args:
        spark: Spark session
        fqn: Fully qualified table name

    Returns:
        True if table was dropped, False if it didn't exist or the drop failed
    """
    try:
        if not table_exists(spark, fqn):
            return False

        # Use Java SparkSession to access external catalog
        external_catalog = spark._jsparkSession.sharedState().externalCatalog()

        # Split on the first dot only; fall back to the default database.
        database_name, dot, table_name = fqn.partition(".")
        if not dot:
            database_name, table_name = "default", fqn

        # Parameters: db, table, ignoreIfNotExists, purge
        external_catalog.dropTable(database_name, table_name, True, True)
        logger.info(f"Dropped table {fqn}")
        return True
    except Exception as e:
        logger.warning(f"Failed to drop table {fqn}: {e}")
        return False

In [None]:
# Module: pipeline_builder.writer.storage (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.table_operations, pipeline_builder.writer.models, pipeline_builder_base.logging, writer.exceptions

from __future__ import annotations

from typing import Dict, TypedDict, Union

# from ..compat import DataFrame, SparkSession, types  # Removed: defined in notebook cells above

# Handle optional Delta Lake dependency
try:
    from delta.tables import DeltaTable

    HAS_DELTA = True
except ImportError:
    DeltaTable = None  # type: ignore[misc, assignment]
    HAS_DELTA = False

# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from ..table_operations import table_exists  # Removed: defined in notebook cells above
# from .exceptions import WriterTableError  # Removed: defined in notebook cells above
# from .models import LogRow, WriteMode, WriterConfig, create_log_schema  # Removed: defined in notebook cells above

# ============================================================================
# TypedDict Definitions
# ============================================================================


class WriteResult(TypedDict):
    """Summary dictionary returned by ``StorageManager.write_dataframe``/``write_batch``."""

    # Fully qualified target table name ("schema.table")
    table_name: str
    # String value of the WriteMode that was used for the write
    write_mode: str
    # Row count of the prepared DataFrame, counted after the write completed
    rows_written: int
    # ISO-8601 string produced by datetime.now() when the result was built
    timestamp: str
    # True in results built by write_dataframe (failures raise instead of returning)
    success: bool

class OptimizeResultSkipped(TypedDict):
    """Result of ``StorageManager.optimize_table`` when the operation was skipped."""

    # Fully qualified table name ("schema.table")
    table_name: str
    # Always False for a skipped optimize
    optimization_completed: bool
    # Always True for a skipped optimize
    skipped: bool
    # Human-readable explanation of why the optimize was skipped
    reason: str
    # ISO-8601 string produced by datetime.now() when the result was built
    timestamp: str

class TableInfo(TypedDict, total=False):
    """
    Table information returned by ``StorageManager.get_table_info``.

    Declared with ``total=False``: every key is optional, and the set of keys
    actually present depends on whether Delta Lake is available.
    """

    # Fully qualified table name ("schema.table")
    table_name: str
    # Number of rows currently in the table
    row_count: int
    # Rows of the Delta table detail output converted to dictionaries
    details: list[dict[str, str | int | float | bool | None]]
    # Number of entries in the Delta table history
    history_count: int
    # "timestamp" value of the first history row, or None when there is no history
    last_modified: str | None
    # History entries as dictionaries
    history: list[dict[str, str | int | float | bool | None]]
    # ISO-8601 string recording when the info was gathered
    timestamp: str


class OptimizeResultCompleted(TypedDict):
    """Result of ``StorageManager.optimize_table`` when the optimize ran."""

    # Fully qualified table name ("schema.table")
    table_name: str
    # Always True for a completed optimize
    optimization_completed: bool
    # ISO-8601 string produced by datetime.now() when the result was built
    timestamp: str
    # Table information gathered via get_table_info() after the optimize
    table_info: TableInfo


# Return type of StorageManager.optimize_table: either the "skipped" shape
# (Delta Lake unavailable) or the "completed" shape.
OptimizeResult = Union[OptimizeResultSkipped, OptimizeResultCompleted]


class VacuumResultSkipped(TypedDict):
    """Result of ``StorageManager.vacuum_table`` when the operation was skipped."""

    # Fully qualified table name ("schema.table")
    table_name: str
    # Always False for a skipped vacuum
    vacuum_completed: bool
    # Always True for a skipped vacuum
    skipped: bool
    # Human-readable explanation of why the vacuum was skipped
    reason: str
    # Retention period (hours) that was requested by the caller
    retention_hours: int
    # ISO-8601 string produced by datetime.now() when the result was built
    timestamp: str


class VacuumResultCompleted(TypedDict):
    """Result of ``StorageManager.vacuum_table`` when the vacuum ran."""

    # Fully qualified table name ("schema.table")
    table_name: str
    # Always True for a completed vacuum
    vacuum_completed: bool
    # Retention period (hours) passed to the Delta vacuum call
    retention_hours: int
    # ISO-8601 string produced by datetime.now() when the result was built
    timestamp: str


# Return type of StorageManager.vacuum_table: either the "skipped" shape
# (Delta Lake unavailable) or the "completed" shape.
VacuumResult = Union[VacuumResultSkipped, VacuumResultCompleted]


class StorageManager:
    """Handles storage operations for the writer."""

    def __init__(
        self,
        spark: SparkSession,
        config: WriterConfig,
        functions: FunctionsProtocol | None = None,
        logger: PipelineLogger | None = None,
    ):
        """
        Set up the storage manager for a single log table.

        Args:
            spark: Spark session used for every table operation
            config: Writer configuration; supplies ``table_schema`` and ``table_name``
            functions: Functions object to use; ``get_default_functions()`` is
                called when this is None
            logger: Logger to use; a ``PipelineLogger("StorageManager")`` is
                created when this is None
        """
        self.spark = spark
        self.config = config

        # Optional collaborators: build the default only when none was supplied.
        if functions is None:
            functions = get_default_functions()
        self.functions = functions
        self.logger = PipelineLogger("StorageManager") if logger is None else logger

        # "schema.table" name used by every operation on this manager.
        self.table_fqn = f"{config.table_schema}.{config.table_name}"

    def create_table_if_not_exists(self, schema: types.StructType) -> None:
        """
        Create the log table if it doesn't exist.

        The schema (database) part of ``table_fqn`` is created first on a
        best-effort basis; then, if the table is not already present, an empty
        Delta table with the given Spark schema is written.

        Args:
            schema: Spark schema for the table

        Raises:
            WriterTableError: If table creation fails
        """
        try:
            self.logger.info(f"Creating table if not exists: {self.table_fqn}")

            # Extract schema name from table_fqn (format: "schema.table")
            schema_name = (
                self.table_fqn.split(".")[0] if "." in self.table_fqn else None
            )

            # CRITICAL: Ensure schema exists before creating table (required in mock-spark due to DuckDB threading)
            # This is especially important for LogWriter which creates tables in different schemas
            if schema_name:
                self._ensure_schema_exists(schema_name)

            if not table_exists(self.spark, self.table_fqn):
                # Create empty DataFrame with schema
                empty_df = self.spark.createDataFrame([], schema)

                # CRITICAL: Ensure schema exists RIGHT BEFORE saveAsTable (mock-spark DuckDB threading fix)
                # DuckDB connections in worker threads don't see schemas created earlier.
                # Only the SQL statement is re-issued here, as before.
                if schema_name:
                    self._ensure_schema_exists(
                        schema_name, use_storage_fallback=False
                    )

                # Write to Delta table
                (
                    empty_df.write.format("delta")
                    .mode("overwrite")
                    .option("overwriteSchema", "true")
                    .saveAsTable(self.table_fqn)
                )

                self.logger.info(f"Table created successfully: {self.table_fqn}")
            else:
                self.logger.info(f"Table already exists: {self.table_fqn}")

        except Exception as e:
            raise WriterTableError(
                f"Failed to create table {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="create_table",
                context={"schema": str(schema)},
                suggestions=[
                    "Check table permissions",
                    "Verify schema configuration",
                    "Ensure Delta Lake is properly configured",
                ],
            ) from e

    def _ensure_schema_exists(
        self, schema_name: str, use_storage_fallback: bool = True
    ) -> None:
        """
        Best-effort creation of a schema; failures are logged, never raised.

        Args:
            schema_name: Name of the schema to create
            use_storage_fallback: When True and the SQL statement fails, try
                ``spark.storage.create_schema`` if the session exposes it
        """
        try:
            # Use SQL to ensure schema exists (more reliable than storage API in some contexts)
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
            return
        except Exception as sql_error:
            if not use_storage_fallback:
                # Previously swallowed silently; keep going but leave a trace.
                self.logger.debug(
                    f"Could not create schema '{schema_name}': {sql_error}"
                )
                return

            try:
                storage = getattr(self.spark, "storage", None)
                if storage is not None and hasattr(storage, "create_schema"):
                    storage.create_schema(schema_name)
                else:
                    # No fallback available; continue (schema might already exist)
                    self.logger.debug(
                        f"Could not create schema '{schema_name}': {sql_error}"
                    )
            except Exception as storage_error:
                # Both attempts failed; report both causes and continue
                self.logger.debug(
                    f"Could not create schema '{schema_name}': {sql_error} "
                    f"(storage API fallback also failed: {storage_error})"
                )

    def write_dataframe(
        self,
        df: DataFrame,
        write_mode: WriteMode = WriteMode.APPEND,
        partition_columns: list[str] | None = None,
    ) -> WriteResult:
        """
        Persist a DataFrame into the log table as Delta.

        The frame is first passed through ``_prepare_dataframe_for_write``;
        OVERWRITE writes set ``overwriteSchema`` and APPEND writes set
        ``mergeSchema``. The row count reported is taken from the prepared
        frame after the write has finished.

        Args:
            df: DataFrame to write
            write_mode: Write mode for the operation
            partition_columns: Columns to partition by

        Returns:
            Dictionary containing write results

        Raises:
            WriterTableError: If write operation fails
        """
        try:
            self.logger.info(
                f"Writing DataFrame to {self.table_fqn} with mode {write_mode.value}"
            )

            prepared = self._prepare_dataframe_for_write(df)
            delta_writer = prepared.write.format("delta").mode(write_mode.value)

            # Optional partitioning
            if partition_columns:
                delta_writer = delta_writer.partitionBy(*partition_columns)

            # Mode-specific schema handling
            if write_mode == WriteMode.OVERWRITE:
                delta_writer = delta_writer.option("overwriteSchema", "true")
            elif write_mode == WriteMode.APPEND:
                # Schema evolution lets appended frames carry new columns
                delta_writer = delta_writer.option("mergeSchema", "true")

            delta_writer.saveAsTable(self.table_fqn)

            # Statistics are gathered only once the write has succeeded
            row_count = prepared.count()
            outcome = {
                "table_name": self.table_fqn,
                "write_mode": write_mode.value,
                "rows_written": row_count,
                "timestamp": datetime.now().isoformat(),
                "success": True,
            }

            self.logger.info(f"Successfully wrote {row_count} rows to {self.table_fqn}")
            return cast(WriteResult, outcome)

        except Exception as e:
            # Row count is context only; never let it mask the real failure
            try:
                row_count = 0 if not hasattr(df, "count") else df.count()
            except Exception:
                row_count = 0

            failure_context = {"write_mode": write_mode.value, "row_count": row_count}
            raise WriterTableError(
                f"Failed to write DataFrame to {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="write_dataframe",
                context=failure_context,
                suggestions=[
                    "Check table permissions",
                    "Verify DataFrame schema matches table schema",
                    "Ensure sufficient storage space",
                    "Check for schema evolution conflicts",
                ],
            ) from e

    def write_batch(
        self, log_rows: list[LogRow], write_mode: WriteMode = WriteMode.APPEND
    ) -> WriteResult:
        """
        Convert log rows to a DataFrame and write them to the table.

        Any failure is logged and then re-raised unchanged.

        Args:
            log_rows: List of log rows to write
            write_mode: Write mode for the operation

        Returns:
            Dictionary containing write results
        """
        try:
            self.logger.info(f"Writing batch of {len(log_rows)} log rows")

            # Build the frame, then hand it to the regular DataFrame write path
            batch_df = self._create_dataframe_from_log_rows(log_rows)
            outcome = self.write_dataframe(batch_df, write_mode)
        except Exception as e:
            self.logger.error(f"Failed to write batch: {e}")
            raise
        else:
            return outcome

    def optimize_table(self) -> OptimizeResult:
        """
        Optimize the Delta table for better performance.

        When Delta Lake is not importable the operation is skipped and a
        "skipped" result is returned. Otherwise the table is compacted and
        the result carries fresh table information from ``get_table_info``.

        Returns:
            Dictionary containing optimization results

        Raises:
            WriterTableError: If the optimize (or the follow-up table info
                lookup) fails
        """
        if not HAS_DELTA:
            self.logger.warning(
                f"Delta Lake not available, optimize operation skipped for {self.table_fqn}"
            )
            return {
                "table_name": self.table_fqn,
                "optimization_completed": False,
                "skipped": True,
                "reason": "Delta Lake not available",
                "timestamp": datetime.now().isoformat(),
            }

        try:
            self.logger.info(f"Optimizing table: {self.table_fqn}")

            # Run OPTIMIZE command using Delta Lake Python API
            delta_table = DeltaTable.forName(self.spark, self.table_fqn)
            # Note: optimize() method may not be available in all Delta Lake versions
            if hasattr(delta_table, "optimize"):
                # DeltaTable.optimize() only returns a builder; nothing is
                # rewritten until executeCompaction() is called on it.
                optimize_builder = delta_table.optimize()
                # Guarded so stand-in implementations whose optimize() already
                # does the work (and returns something else) keep working.
                if hasattr(optimize_builder, "executeCompaction"):
                    optimize_builder.executeCompaction()
            else:
                # Fallback: use SQL command
                self.spark.sql(f"OPTIMIZE {self.table_fqn}")

            # Get table statistics
            table_info = self.get_table_info()

            optimization_result = {
                "table_name": self.table_fqn,
                "optimization_completed": True,
                "timestamp": datetime.now().isoformat(),
                "table_info": table_info,
            }

            self.logger.info(f"Table optimization completed: {self.table_fqn}")
            return cast(OptimizeResult, optimization_result)

        except Exception as e:
            self.logger.error(f"Failed to optimize table {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to optimize table {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="optimize_table",
                suggestions=[
                    "Check table permissions",
                    "Verify table exists",
                    "Ensure sufficient resources for optimization",
                ],
            ) from e

    def vacuum_table(self, retention_hours: int = 168) -> VacuumResult:
        """
        Remove old files from the Delta table via ``DeltaTable.vacuum``.

        Without Delta Lake the operation is skipped and a "skipped" result is
        returned instead.

        Args:
            retention_hours: Hours of retention for old files

        Returns:
            Dictionary containing vacuum results

        Raises:
            WriterTableError: If the vacuum fails
        """
        if HAS_DELTA:
            try:
                self.logger.info(
                    f"Vacuuming table: {self.table_fqn} (retention: {retention_hours}h)"
                )

                # Delta Lake API call; retention is forwarded unchanged
                DeltaTable.forName(self.spark, self.table_fqn).vacuum(
                    retentionHours=retention_hours
                )

                completed = {
                    "table_name": self.table_fqn,
                    "vacuum_completed": True,
                    "retention_hours": retention_hours,
                    "timestamp": datetime.now().isoformat(),
                }

                self.logger.info(f"Table vacuum completed: {self.table_fqn}")
                return cast(VacuumResult, completed)

            except Exception as e:
                self.logger.error(f"Failed to vacuum table {self.table_fqn}: {e}")
                raise WriterTableError(
                    f"Failed to vacuum table {self.table_fqn}: {e}",
                    table_name=self.table_fqn,
                    operation="vacuum_table",
                    suggestions=[
                        "Check table permissions",
                        "Verify retention period is valid",
                        "Ensure table exists",
                    ],
                ) from e

        # Delta Lake is not importable: report the skip rather than failing
        self.logger.warning(
            f"Delta Lake not available, vacuum operation skipped for {self.table_fqn}"
        )
        return {
            "table_name": self.table_fqn,
            "vacuum_completed": False,
            "skipped": True,
            "reason": "Delta Lake not available",
            "retention_hours": retention_hours,
            "timestamp": datetime.now().isoformat(),
        }

    def get_table_info(self) -> TableInfo:
        """
        Get information about the log table.

        With Delta Lake available the result contains ``table_name``,
        ``row_count``, ``details``, ``history_count``, ``last_modified`` and
        ``timestamp``. Without it, a basic result is returned with
        ``table_name``, ``row_count``, empty ``details``/``history`` lists and
        ``timestamp``.

        Returns:
            Dictionary containing table information

        Raises:
            WriterTableError: If gathering the info fails on the Delta path.
                The non-Delta fallback is not wrapped and propagates whatever
                the Spark session raises.
        """
        if not HAS_DELTA:
            self.logger.warning(
                f"Delta Lake not available, using basic table info for {self.table_fqn}"
            )
            # Get basic info without Delta Lake
            row_count = self.spark.table(self.table_fqn).count()
            return {
                "table_name": self.table_fqn,
                "row_count": row_count,
                "details": [],
                "history": [],
                "timestamp": datetime.now().isoformat(),
            }

        try:
            self.logger.info(f"Getting table info for: {self.table_fqn}")

            # Get table details using Delta Lake API
            delta_table = DeltaTable.forName(self.spark, self.table_fqn)

            # Get table details using Delta Lake Python API
            # Note: detail() method may not be available in all Delta Lake versions
            if hasattr(delta_table, "detail"):
                table_details = delta_table.detail().collect()
            else:
                # Fallback: use SQL command
                table_details = self.spark.sql(
                    f"DESCRIBE DETAIL {self.table_fqn}"
                ).collect()

            # Get table history
            table_history = delta_table.history().collect()

            # Get row count
            row_count = self.spark.table(self.table_fqn).count()

            table_info = {
                "table_name": self.table_fqn,
                "row_count": row_count,
                "details": [dict(row.asDict()) for row in table_details],
                "history_count": len(table_history),
                # First history row is used as "last modified".
                # NOTE(review): presumes history() returns newest first, and the
                # value is passed through as-is (may not be a str) - confirm.
                "last_modified": (
                    table_history[0]["timestamp"] if table_history else None
                ),
                # Same key the non-Delta branch sets, so callers can rely on it
                "timestamp": datetime.now().isoformat(),
            }

            self.logger.info(f"Table info retrieved: {row_count} rows")
            return cast(TableInfo, table_info)

        except Exception as e:
            self.logger.error(f"Failed to get table info for {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to get table info for {self.table_fqn}: {e}",
                table_name=self.table_fqn,
                operation="get_table_info",
            ) from e

    def query_logs(
        self,
        limit: int | None = None,
        filters: Union[Dict[str, Union[str, int, float, bool]], None] = None,
    ) -> DataFrame:
        """
        Query logs from the table, newest first.

        Args:
            limit: Maximum number of rows to return; None means no limit and
                0 returns an empty result
            filters: Mapping of column name to the exact value the column
                must equal; all filters are combined

        Returns:
            DataFrame containing query results, ordered by ``created_at``
            descending

        Raises:
            WriterTableError: If the query fails
        """
        try:
            self.logger.info(f"Querying logs from: {self.table_fqn}")

            # Start with the base table
            result_df = self.spark.table(self.table_fqn)

            # Apply filters if provided using PySpark functions
            if filters:
                for column, value in filters.items():
                    if isinstance(value, str):
                        result_df = result_df.filter(
                            self.functions.col(column) == self.functions.lit(value)
                        )
                    else:
                        result_df = result_df.filter(
                            self.functions.col(column) == value
                        )

            # Add ordering using PySpark functions
            # from ..compat import desc  # Removed: defined in notebook cells above

            result_df = result_df.orderBy(desc("created_at"))

            # Apply limit if specified. Compare against None explicitly:
            # a truthiness test would treat limit=0 as "no limit" and
            # return every row instead of none.
            if limit is not None:
                result_df = result_df.limit(limit)

            self.logger.info(f"Query executed successfully: {result_df.count()} rows")
            return result_df

        except Exception as e:
            self.logger.error(f"Failed to query logs from {self.table_fqn}: {e}")
            raise WriterTableError(
                f"Failed to query logs: {e}",
                table_name=self.table_fqn,
                operation="query_logs",
                suggestions=[
                    "Check table exists",
                    "Verify query syntax",
                    "Check column names in filters",
                ],
            ) from e

    def _prepare_dataframe_for_write(self, df: DataFrame) -> DataFrame:
        """
        Return ``df`` with ``created_at``/``updated_at`` columns guaranteed.

        Each column that is missing is added as a literal holding the current
        time as an ISO-8601 string; existing columns are left untouched.
        Failures are logged and re-raised.
        """
        try:
            from datetime import datetime

            # One timestamp shared by both columns
            stamp = datetime.now().isoformat()

            for audit_column in ("created_at", "updated_at"):
                if audit_column not in df.columns:
                    df = df.withColumn(audit_column, self.functions.lit(stamp))

            return df

        except Exception as e:
            self.logger.error(f"Failed to prepare DataFrame for write: {e}")
            raise

    def _create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> DataFrame:
        """
        Build a DataFrame from log rows using the schema from ``create_log_schema``.

        Every listed field is read from each row by key (a missing key raises),
        except ``table_total_rows`` which is optional and defaults to None.
        ``created_at`` is stamped with the current time as an ISO-8601 string.
        Failures are logged and re-raised.
        """
        # Required fields, in output order, on either side of the optional
        # table_total_rows metric.
        leading_fields = (
            "run_id",
            "run_mode",
            "run_started_at",
            "run_ended_at",
            "execution_id",
            "pipeline_id",
            "schema",
            "phase",
            "step_name",
            "step_type",
            "start_time",
            "end_time",
            "duration_secs",
            "table_fqn",
            "write_mode",
            "input_rows",
            "output_rows",
            "rows_written",
            "rows_processed",
        )
        trailing_fields = (
            "valid_rows",
            "invalid_rows",
            "validation_rate",
            "success",
            "error_message",
            "memory_usage_mb",
            "cpu_usage_percent",
            "metadata",
        )

        try:
            from datetime import datetime

            stamp = datetime.now().isoformat()

            records = []
            for entry in log_rows:
                record = {key: entry[key] for key in leading_fields}
                record["table_total_rows"] = entry.get("table_total_rows")
                record.update((key, entry[key]) for key in trailing_fields)
                # Timestamp is included directly as a string
                record["created_at"] = stamp
                records.append(record)

            # Explicit schema gives type safety and handles None values
            log_schema = create_log_schema()
            return self.spark.createDataFrame(records, log_schema)  # type: ignore[attr-defined]

        except Exception as e:
            self.logger.error(f"Failed to create DataFrame from log rows: {e}")
            raise

    @property
    def table_schema(self) -> str:
        """Schema (database) name taken from the writer configuration."""
        writer_config = self.config
        return writer_config.table_schema

    @property
    def table_name(self) -> str:
        """Unqualified table name taken from the writer configuration."""
        writer_config = self.config
        return writer_config.table_name

In [None]:
# Module: pipeline_builder.pipeline.runner (pipeline_builder)
#
# Dependencies: models.pipeline, models.steps, pipeline.models, pipeline_builder.compat, pipeline_builder.execution, pipeline_builder.functions, pipeline_builder_base.logging

from __future__ import annotations

from typing import Any, Dict, Optional

from abstracts.reports.run import Report
from abstracts.runner import Runner

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
# from ..execution import ExecutionEngine, ExecutionResult  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from ..models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
# ExecutionMode,
# PipelineConfig,
# PipelineMetrics,
# )
# from .models import PipelineMode, PipelineReport, PipelineStatus  # Removed: defined in notebook cells above


class SimplePipelineRunner(Runner):
    """
    Simplified pipeline runner that delegates to the execution engine.

    This runner focuses on orchestration and reporting, delegating
    actual execution to the simplified ExecutionEngine.

    Implements abstracts.Runner interface while maintaining backward compatibility
    with additional methods (run_full_refresh, run_validation_only).
    """

    def __init__(
        self,
        spark: SparkSession,
        config: PipelineConfig,
        bronze_steps: Optional[Dict[str, BronzeStep]] = None,
        silver_steps: Optional[Dict[str, SilverStep]] = None,
        gold_steps: Optional[Dict[str, GoldStep]] = None,
        logger: Optional[PipelineLogger] = None,
        functions: Optional[FunctionsProtocol] = None,
        # Abstracts.Runner compatibility - these will be set if using abstracts interface
        steps: Optional[list[BronzeStep | SilverStep | GoldStep]] = None,
        engine: Optional[
            Any
        ] = None,  # Engine from abstracts, but we use ExecutionEngine
    ):
        """
        Initialize the simplified pipeline runner.

        Args:
            spark: Active SparkSession instance
            config: Pipeline configuration
            bronze_steps: Bronze steps dictionary
            silver_steps: Silver steps dictionary
            gold_steps: Gold steps dictionary
            logger: Optional logger instance
            functions: Optional functions object for PySpark operations
            steps: Optional list of steps (for abstracts.Runner compatibility);
                each Bronze/Silver/Gold step is merged into the matching
                dictionary under its ``name``, other objects are ignored
            engine: Optional engine (for abstracts.Runner compatibility); it is
                only forwarded to the base class, execution always goes through
                this runner's own ``ExecutionEngine``
        """
        # Initialize abstracts.Runner with empty lists (we'll use our own step storage)
        # This satisfies the abstract base class requirement
        # Use Any for engine type to avoid type checking issues with _DummyEngine

        dummy_engine: Any = _DummyEngine()
        super().__init__(steps=[], engine=engine or dummy_engine)

        self.spark = spark
        self.config = config
        self.bronze_steps = bronze_steps or {}
        self.silver_steps = silver_steps or {}
        self.gold_steps = gold_steps or {}
        self.logger = logger or PipelineLogger()
        self.functions = functions
        self.execution_engine = ExecutionEngine(spark, config, self.logger, functions)

        # If steps provided (from abstracts interface), convert to step dictionaries
        if steps:
            # Merge into copies: ``x or {}`` above aliases a non-empty dict
            # passed by the caller, and writing into it would silently
            # modify the caller's own step dictionaries.
            self.bronze_steps = dict(self.bronze_steps)
            self.silver_steps = dict(self.silver_steps)
            self.gold_steps = dict(self.gold_steps)

            for step in steps:
                if isinstance(step, BronzeStep):
                    self.bronze_steps[step.name] = step
                elif isinstance(step, SilverStep):
                    self.silver_steps[step.name] = step
                elif isinstance(step, GoldStep):
                    self.gold_steps[step.name] = step

    def run_pipeline(
        self,
        steps: list[BronzeStep | SilverStep | GoldStep],
        mode: PipelineMode = PipelineMode.INITIAL,
        bronze_sources: Dict[str, DataFrame] | None = None,
    ) -> PipelineReport:
        """
        Execute the given steps through the execution engine and report on it.

        Failures do not propagate: any exception raised while executing is
        logged and turned into an error report.

        Args:
            steps: List of pipeline steps to execute
            mode: Pipeline execution mode
            bronze_sources: Optional bronze source data, keyed by bronze step name

        Returns:
            PipelineReport with execution results
        """
        start_time = datetime.now()
        pipeline_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # The engine works with ExecutionMode rather than PipelineMode
        execution_mode = self._convert_mode(mode)

        try:
            self.logger.info(f"Starting pipeline execution: {pipeline_id}")

            # Execution context: source data for every bronze step that has one
            context = {}
            if bronze_sources:
                context = {
                    step.name: bronze_sources[step.name]
                    for step in steps
                    if isinstance(step, BronzeStep) and step.name in bronze_sources
                }

            execution_result = self.execution_engine.execute_pipeline(
                steps, execution_mode, context=context
            )

            # Translate the engine's result into the pipeline-level report
            report = self._create_pipeline_report(
                pipeline_id=pipeline_id,
                mode=mode,
                start_time=start_time,
                execution_result=execution_result,
            )

            self.logger.info(f"Completed pipeline execution: {pipeline_id}")
            return report

        except Exception as e:
            self.logger.error(f"Pipeline execution failed: {e}")
            return self._create_error_report(
                pipeline_id=pipeline_id, mode=mode, start_time=start_time, error=str(e)
            )

    def run_initial_load(
        self,
        bronze_sources: Optional[Dict[str, Source]] = None,
        steps: Optional[
            list
        ] = None,  # Backward compatibility: old signature accepted steps as first arg
    ) -> Report:  # PipelineReport satisfies Report Protocol
        """
        Run the pipeline in INITIAL mode.

        Implements abstracts.Runner.run_initial_load interface.
        Also supports the backward-compatible call form where a list of steps
        is passed as the first positional argument.

        Args:
            bronze_sources: Dictionary mapping bronze step names to Source (DataFrame), or None
            steps: Optional list of steps (for backward compatibility with old signature);
                when omitted, all stored bronze, silver and gold steps are run

        Returns:
            Report (PipelineReport) with execution results

        Raises:
            TypeError: If a bronze source lacks ``columns``, ``count`` or ``filter``
        """
        # Old signature: run_initial_load([steps]) - the list is really the steps
        if isinstance(bronze_sources, list):
            steps, bronze_sources = bronze_sources, None

        # Source Protocol is satisfied by DataFrame, so any DataFrame-like
        # object (PySpark or mock_spark) is accepted after a duck-type check.
        bronze_sources_df: Optional[Dict[str, DataFrame]] = None
        if bronze_sources:
            bronze_sources_df = {}
            for name, source in bronze_sources.items():
                dataframe_like = all(
                    hasattr(source, attribute)
                    for attribute in ("columns", "count", "filter")
                )
                if not dataframe_like:
                    raise TypeError(
                        f"bronze_sources must contain DataFrame-like objects, got {type(source)}"
                    )
                bronze_sources_df[name] = source

        # Fall back to every stored step, in bronze -> silver -> gold order
        if steps is None:
            steps = [
                *self.bronze_steps.values(),
                *self.silver_steps.values(),
                *self.gold_steps.values(),
            ]

        # PipelineReport satisfies Report Protocol structurally
        return self.run_pipeline(steps, PipelineMode.INITIAL, bronze_sources_df)  # type: ignore[return-value]

    def run_incremental(
        self,
        bronze_sources: Optional[Dict[str, Source]] = None,
        steps: Optional[
            list
        ] = None,  # Backward compatibility: old signature accepted steps as first arg
    ) -> Report:  # PipelineReport satisfies Report Protocol
        """
        Run the pipeline in INCREMENTAL mode.

        Implements abstracts.Runner.run_incremental interface.
        Also supports the backward-compatible call form where a list of steps
        is passed as the first positional argument.

        Args:
            bronze_sources: Optional dictionary mapping bronze step names to Source (DataFrame), or None
            steps: Optional list of steps (for backward compatibility with old signature);
                when omitted, all stored bronze, silver and gold steps are run

        Returns:
            Report (PipelineReport) with execution results

        Raises:
            TypeError: If a bronze source lacks ``columns``, ``count`` or ``filter``
        """
        # Old signature: run_incremental([steps]) - the list is really the steps
        if isinstance(bronze_sources, list):
            steps, bronze_sources = bronze_sources, None

        # Source Protocol is satisfied by DataFrame, so any DataFrame-like
        # object (PySpark or mock_spark) is accepted after a duck-type check.
        bronze_sources_df: Optional[Dict[str, DataFrame]] = None
        if bronze_sources:
            bronze_sources_df = {}
            for name, source in bronze_sources.items():
                dataframe_like = all(
                    hasattr(source, attribute)
                    for attribute in ("columns", "count", "filter")
                )
                if not dataframe_like:
                    raise TypeError(
                        f"bronze_sources must contain DataFrame-like objects, got {type(source)}"
                    )
                bronze_sources_df[name] = source

        # Fall back to every stored step, in bronze -> silver -> gold order
        if steps is None:
            steps = [
                *self.bronze_steps.values(),
                *self.silver_steps.values(),
                *self.gold_steps.values(),
            ]

        # PipelineReport satisfies Report Protocol structurally
        return self.run_pipeline(steps, PipelineMode.INCREMENTAL, bronze_sources_df)  # type: ignore[return-value]

    def run_full_refresh(
        self,
        bronze_sources: Dict[str, DataFrame] | None = None,
    ) -> PipelineReport:
        """
        Run every stored step in FULL_REFRESH mode.

        Steps are executed in bronze, silver, gold order of the stored
        dictionaries.

        Args:
            bronze_sources: Optional dictionary mapping bronze step names to DataFrames

        Returns:
            PipelineReport with execution results
        """
        all_steps = [
            *self.bronze_steps.values(),
            *self.silver_steps.values(),
            *self.gold_steps.values(),
        ]
        return self.run_pipeline(all_steps, PipelineMode.FULL_REFRESH, bronze_sources)

    def run_validation_only(
        self,
        bronze_sources: Dict[str, DataFrame] | None = None,
    ) -> PipelineReport:
        """
        Execute every stored step in validation-only mode.

        Collects the runner's bronze, silver and gold steps (in that order)
        and hands them to ``run_pipeline`` with
        ``PipelineMode.VALIDATION_ONLY``.

        Args:
            bronze_sources: Optional mapping of bronze step name to the
                DataFrame for that step; forwarded to ``run_pipeline`` unchanged.

        Returns:
            The PipelineReport returned by ``run_pipeline``.
        """
        ordered_steps = []
        for layer_steps in (self.bronze_steps, self.silver_steps, self.gold_steps):
            ordered_steps.extend(layer_steps.values())
        return self.run_pipeline(
            ordered_steps, PipelineMode.VALIDATION_ONLY, bronze_sources
        )

    def _convert_mode(self, mode: PipelineMode) -> ExecutionMode:
        """
        Translate a PipelineMode into the corresponding ExecutionMode.

        INITIAL, INCREMENTAL, FULL_REFRESH and VALIDATION_ONLY map to the
        ExecutionMode member of the same name. Any value not in that table
        falls back to ``ExecutionMode.INITIAL``.
        """
        translation = dict(
            [
                (PipelineMode.INITIAL, ExecutionMode.INITIAL),
                (PipelineMode.INCREMENTAL, ExecutionMode.INCREMENTAL),
                (PipelineMode.FULL_REFRESH, ExecutionMode.FULL_REFRESH),
                (PipelineMode.VALIDATION_ONLY, ExecutionMode.VALIDATION_ONLY),
            ]
        )
        if mode in translation:
            return translation[mode]
        # Unrecognised modes default to INITIAL.
        return ExecutionMode.INITIAL

    def _create_pipeline_report(
        self,
        pipeline_id: str,
        mode: PipelineMode,
        start_time: datetime,
        execution_result: ExecutionResult,
    ) -> PipelineReport:
        """
        Build a PipelineReport from an engine ExecutionResult.

        Per-step details are grouped into bronze/silver/gold dictionaries keyed
        by step name, and aggregate metrics (step counts, row counts, per-layer
        durations) are computed from the step results.

        Args:
            pipeline_id: Identifier stored on the report.
            mode: Pipeline mode stored on the report.
            start_time: Start of the run; the report duration is measured
                from this instant.
            execution_result: Engine result to summarise. When it has no
                ``end_time`` the current time is used instead.

        Returns:
            PipelineReport whose status is COMPLETED only when
            ``execution_result.status`` equals ``"completed"``, otherwise FAILED.
        """
        finished_at = execution_result.end_time or datetime.now()
        elapsed_seconds = (finished_at - start_time).total_seconds()

        step_results = execution_result.steps or []

        # One bucket per layer; steps whose type value matches none are left out.
        layer_buckets = {"bronze": {}, "silver": {}, "gold": {}}

        for step_result in step_results:
            if step_result.end_time:
                finished_iso = step_result.end_time.isoformat()
            else:
                finished_iso = None

            details = {
                "status": step_result.status.value,
                "duration": step_result.duration,
                "rows_processed": step_result.rows_processed,
                "output_table": step_result.output_table,
                "start_time": step_result.start_time.isoformat(),
                "end_time": finished_iso,
                "write_mode": step_result.write_mode,
                "validation_rate": step_result.validation_rate,
                "rows_written": step_result.rows_written,
                "input_rows": step_result.input_rows,
            }

            # The error entry is only present for steps that recorded one.
            if step_result.error:
                details["error"] = step_result.error

            # Expose the step's output DataFrame when the execution result
            # carries a dict-like ``context`` containing it.
            run_context = getattr(execution_result, "context", None)
            if (
                run_context
                and isinstance(run_context, dict)
                and step_result.step_name in run_context
            ):
                details["dataframe"] = run_context[step_result.step_name]

            bucket = layer_buckets.get(step_result.step_type.value)
            if bucket is not None:
                bucket[step_result.step_name] = details

        completed_count = len(
            [s for s in step_results if s.status.value == "completed"]
        )
        failed = [s for s in step_results if s.status.value == "failed"]

        rows_processed_total = sum(s.rows_processed or 0 for s in step_results)
        # Rows written: only steps that produced an output table contribute.
        rows_written_total = sum(
            s.rows_processed or 0 for s in step_results if s.output_table is not None
        )

        def layer_seconds(layer):
            # Total recorded duration of all steps belonging to ``layer``.
            return sum(s.duration or 0 for s in step_results if s.step_type == layer)

        if execution_result.status == "completed":
            report_status = PipelineStatus.COMPLETED
        else:
            report_status = PipelineStatus.FAILED

        metrics = PipelineMetrics(
            total_steps=len(step_results),
            successful_steps=completed_count,
            failed_steps=len(failed),
            total_duration=elapsed_seconds,
            bronze_duration=layer_seconds(StepType.BRONZE),
            silver_duration=layer_seconds(StepType.SILVER),
            gold_duration=layer_seconds(StepType.GOLD),
            total_rows_processed=rows_processed_total,
            total_rows_written=rows_written_total,
            parallel_efficiency=execution_result.parallel_efficiency,
        )

        return PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=execution_result.execution_id,
            status=report_status,
            mode=mode,
            start_time=start_time,
            end_time=finished_at,
            duration_seconds=elapsed_seconds,
            metrics=metrics,
            bronze_results=layer_buckets["bronze"],
            silver_results=layer_buckets["silver"],
            gold_results=layer_buckets["gold"],
            errors=[s.error for s in failed if s.error],
            warnings=[],
            execution_groups_count=execution_result.execution_groups_count,
            max_group_size=execution_result.max_group_size,
        )

    def _create_error_report(
        self, pipeline_id: str, mode: PipelineMode, start_time: datetime, error: str
    ) -> PipelineReport:
        """
        Build a FAILED PipelineReport for a run that produced no step results.

        Args:
            pipeline_id: Identifier stored on the report; also used to derive
                the ``error_<pipeline_id>`` execution id.
            mode: Pipeline mode stored on the report.
            start_time: Start of the run; the duration is measured from here
                to the moment this method is called.
            error: Error message, stored as the only entry of ``errors``.

        Returns:
            PipelineReport with FAILED status and zeroed step counters.
        """
        finished_at = datetime.now()
        elapsed_seconds = (finished_at - start_time).total_seconds()

        # No step ran, so every counter is zero; only the duration is known.
        zeroed_metrics = PipelineMetrics(
            total_steps=0,
            successful_steps=0,
            failed_steps=0,
            total_duration=elapsed_seconds,
        )

        return PipelineReport(
            pipeline_id=pipeline_id,
            execution_id=f"error_{pipeline_id}",
            status=PipelineStatus.FAILED,
            mode=mode,
            start_time=start_time,
            end_time=finished_at,
            duration_seconds=elapsed_seconds,
            metrics=zeroed_metrics,
            errors=[error],
            warnings=[],
        )


class _DummyEngine:
    """Dummy engine for Runner.__init__ compatibility."""

    pass


# Backward-compatible alias: both names refer to the same class object.
PipelineRunner = SimplePipelineRunner

# Reset the class's recorded set of abstract methods to empty so that the class
# can be instantiated as a concrete type. The guard skips the reset when the
# class exposes no `__abstractmethods__` attribute.
# Stated rationale: Python's ABC mechanism sometimes doesn't recognize
# implementations with positional-only args.
# NOTE(review): this override also hides any abstract method that is genuinely
# unimplemented — confirm SimplePipelineRunner implements the full interface.
if hasattr(SimplePipelineRunner, "__abstractmethods__"):
    SimplePipelineRunner.__abstractmethods__ = frozenset()

In [None]:
# Module: pipeline_builder.execution (pipeline_builder)
#
# Dependencies: compat, dependencies, models.pipeline, models.steps, pipeline_builder.functions, pipeline_builder_base.errors, pipeline_builder_base.logging, table_operations, validation.data_validation

from __future__ import annotations

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict

# from .compat import DataFrame, F, SparkSession, is_mock_spark  # Removed: defined in notebook cells above
# from .functions import FunctionsProtocol  # Removed: defined in notebook cells above
# from .models import BronzeStep, GoldStep, SilverStep  # Removed: defined in notebook cells above
# from .table_operations import fqn  # Removed: defined in notebook cells above
# from .validation import apply_column_rules  # Removed: defined in notebook cells above
# from .dependencies import DependencyAnalyzer  # Removed: defined in notebook cells above
# from .errors import ExecutionError  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
# ExecutionContext,
# ExecutionMode,
# PipelineConfig,
# PipelineMetrics,
# StepResult,
# )


class StepStatus(Enum):
    """
    Execution status recorded on a step result.

    Member values are plain lowercase strings. Report-building code compares
    against them directly (``status.value == "completed"`` / ``"failed"``),
    so the string values are part of the contract.
    """

    PENDING = "pending"
    # Assigned to a step result when ExecutionEngine.execute_step starts.
    RUNNING = "running"
    # Assigned once the step finishes without raising.
    COMPLETED = "completed"
    # Assigned when the step raises during execution.
    FAILED = "failed"
    SKIPPED = "skipped"


class StepType(Enum):
    """
    Layer a pipeline step belongs to (bronze, silver or gold).

    ExecutionEngine.execute_step derives the member from the step's class
    (BronzeStep, SilverStep, GoldStep). The lowercase string values are used
    in log messages and to group step results by layer in pipeline reports.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"


@dataclass
class StepExecutionResult:
    """
    Outcome record for a single executed pipeline step.

    Only the first four fields are required. Everything else defaults to
    ``None`` (or ``100.0`` for ``validation_rate``) and is filled in as the
    step runs. If ``end_time`` is supplied at construction, ``duration`` is
    derived from ``end_time - start_time`` and replaces any ``duration`` value
    that was passed in.
    """

    # Required identification and timing
    step_name: str
    step_type: StepType
    status: StepStatus
    start_time: datetime

    # Completion details
    end_time: datetime | None = None
    duration: float | None = None  # seconds
    error: str | None = None

    # Row counts and write details
    rows_processed: int | None = None
    output_table: str | None = None
    write_mode: str | None = None
    validation_rate: float = 100.0
    rows_written: int | None = None
    input_rows: int | None = None

    def __post_init__(self) -> None:
        # Without both timestamps there is nothing to derive.
        if not (self.end_time and self.start_time):
            return
        elapsed = self.end_time - self.start_time
        self.duration = elapsed.total_seconds()


@dataclass
class ExecutionResult:
    """
    Outcome record for a whole pipeline execution.

    ``steps`` always ends up as a list: a missing value is replaced by a fresh
    empty list per instance. If ``end_time`` is supplied at construction,
    ``duration`` is derived from ``end_time - start_time``.
    """

    # Required identification and timing
    execution_id: str
    mode: ExecutionMode
    start_time: datetime

    # Completion details
    end_time: datetime | None = None
    duration: float | None = None  # seconds
    status: str = "running"
    steps: list[StepExecutionResult] | None = None
    error: str | None = None

    # Parallel-execution metrics
    parallel_efficiency: float = 0.0
    execution_groups_count: int = 0
    max_group_size: int = 0

    def __post_init__(self) -> None:
        # Give every instance its own list rather than sharing a default.
        if self.steps is None:
            self.steps = []

        finished, started = self.end_time, self.start_time
        if finished and started:
            self.duration = (finished - started).total_seconds()


class ExecutionEngine:
    """
    Simplified execution engine for the framework pipelines.

    This engine handles both individual step execution and full pipeline execution
    with a clean, unified interface.
    """

    def __init__(
        self,
        spark: SparkSession,
        config: PipelineConfig,
        logger: PipelineLogger | None = None,
        functions: FunctionsProtocol | None = None,
    ):
        """
        Set up the engine with its Spark session, configuration and helpers.

        Args:
            spark: Active SparkSession instance
            config: Pipeline configuration
            logger: Logger to use; a new ``PipelineLogger()`` is created when
                omitted
            functions: Functions object for PySpark operations; the result of
                ``get_default_functions()`` is used when omitted
        """
        self.spark = spark
        self.config = config

        # Fall back to a default logger only when none was supplied.
        self.logger = PipelineLogger() if logger is None else logger

        # Functions object is kept for use during validation.
        self.functions = get_default_functions() if functions is None else functions

    def _ensure_schema_exists(self, schema: str) -> None:
        """
        Make sure ``schema`` is present in the Spark catalog.

        Order of attempts:
          1. Return immediately when the catalog already lists the schema
             (a failing catalog lookup is ignored).
          2. When the session exposes ``storage.create_schema``, call it and
             return once the catalog lists the schema. Any failure on this
             path is logged at debug level and the SQL path is tried instead.
          3. Run ``CREATE SCHEMA IF NOT EXISTS`` and verify against the catalog.

        Args:
            schema: Schema name to create

        Raises:
            ExecutionError: If the schema is still missing from the catalog
                after the SQL attempt, or if any other error occurs while
                creating it (the original error is chained).
        """

        def catalog_schema_names():
            # Names of all databases currently visible in the catalog.
            return [db.name for db in self.spark.catalog.listDatabases()]

        # Fast path: nothing to do when the schema is already registered.
        try:
            if schema in catalog_schema_names():
                return
        except Exception:
            # A failed lookup is not fatal; go on and attempt creation.
            pass

        try:
            # mock-spark sessions expose a storage API; prefer it when present.
            storage_api_available = hasattr(self.spark, "storage") and hasattr(
                self.spark.storage, "create_schema"
            )
            if storage_api_available:
                try:
                    self.spark.storage.create_schema(schema)
                    visible = catalog_schema_names()
                    if schema not in visible:
                        raise ExecutionError(
                            f"Schema '{schema}' creation via storage API failed - schema not in catalog. "
                            f"Available databases: {visible}"
                        )
                    return
                except Exception as storage_error:
                    # Includes the ExecutionError raised just above: every
                    # storage-path failure falls through to the SQL attempt.
                    self.logger.debug(
                        f"Storage API schema creation failed: {storage_error}, trying SQL"
                    )

            # SQL path: real Spark, or storage API missing/unsuccessful.
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            visible = catalog_schema_names()
            if schema not in visible:
                raise ExecutionError(
                    f"Schema '{schema}' creation via SQL failed - schema not in catalog. "
                    f"Available databases: {visible}"
                )
        except Exception as exc:
            # Our own ExecutionError passes through untouched; anything else
            # is wrapped so callers only need to handle ExecutionError.
            if isinstance(exc, ExecutionError):
                raise
            raise ExecutionError(
                f"Failed to create schema '{schema}': {str(exc)}"
            ) from exc

    def _ensure_materialized_for_validation(
        self, df: DataFrame, rules: Dict[str, Any]
    ) -> DataFrame:
        """
        Materialize ``df`` before validation when running under mock-spark.

        Mock-spark's CTE optimization can fail when validation rules reference
        columns added by transforms (via withColumn). Collecting the rows and
        rebuilding the DataFrame makes every column available to validation.
        This only happens when ``is_mock_spark()`` is true and ``rules`` is
        non-empty; otherwise ``df`` is returned untouched.

        Args:
            df: DataFrame to potentially materialize
            rules: Validation rules dictionary

        Returns:
            The rebuilt DataFrame, or a cached DataFrame if rebuilding failed,
            or the original DataFrame if nothing could be done. This method
            never raises on ordinary errors (``Exception``).
        """
        try:
            # Only mock-spark runs with actual rules need the workaround.
            if not (is_mock_spark() and rules):
                return df

            try:
                # Capture the schema before collecting so it can be reapplied.
                original_schema = df.schema

                # collect() forces evaluation and bypasses CTE optimization.
                rows = df.collect()

                # Rows are turned into dicts to keep column names: mock-spark's
                # Polars backend can lose them on Row objects after filters.
                if not rows:
                    payload = rows
                elif hasattr(rows[0], "asDict"):
                    payload = [row.asDict() for row in rows]
                else:
                    try:
                        payload = [dict(row) for row in rows]
                    except (TypeError, ValueError):
                        # Not convertible; hand the rows over unchanged.
                        payload = rows

                df = self.spark.createDataFrame(payload, original_schema)
            except Exception as e:
                # Second-best option: cache (when supported) and force a count.
                try:
                    if hasattr(df, "cache"):
                        df = df.cache()
                    _ = df.count()
                except Exception:
                    # Nothing worked; validation proceeds on whatever df is now.
                    self.logger.debug(f"Could not materialize DataFrame: {e}")
        except Exception:
            # mock-spark detection or the fallback itself failed; return df as is.
            pass

        return df

    def execute_step(
        self,
        step: BronzeStep | SilverStep | GoldStep,
        context: Dict[str, DataFrame],
        mode: ExecutionMode = ExecutionMode.INITIAL,
    ) -> StepExecutionResult:
        """
        Execute a single pipeline step.

        The step is run through the matching ``_execute_*_step`` helper, its
        output is validated against ``step.rules`` (skipped in VALIDATION_ONLY
        mode or when the step has no rules), and Silver/Gold output is written
        with ``saveAsTable`` unless the mode is VALIDATION_ONLY. Bronze steps
        never write a table.

        Write mode for Silver/Gold steps:
            - Gold: always "overwrite"
            - Silver in INCREMENTAL mode: "append"
            - otherwise: "overwrite"

        Args:
            step: The step to execute
            context: Execution context with available DataFrames
            mode: Execution mode

        Returns:
            StepExecutionResult with status COMPLETED, timing, row counts,
            validation rate and (when a table was written) the output table
            name and write mode.

        Raises:
            ValueError: If ``step`` is not a BronzeStep, SilverStep or GoldStep
                (raised before execution starts, so it is not wrapped).
            ExecutionError: If anything fails during execution, validation or
                writing; the original exception is chained as the cause.
        """
        start_time = datetime.now()
        # Determine step type based on class
        if isinstance(step, BronzeStep):
            step_type = StepType.BRONZE
        elif isinstance(step, SilverStep):
            step_type = StepType.SILVER
        elif isinstance(step, GoldStep):
            step_type = StepType.GOLD
        else:
            raise ValueError(f"Unknown step type: {type(step)}")

        # Result starts as RUNNING and is updated in place below.
        result = StepExecutionResult(
            step_name=step.name,
            step_type=step_type,
            status=StepStatus.RUNNING,
            start_time=start_time,
        )

        try:
            # Use logger's step_start method for consistent formatting with emoji and uppercase
            self.logger.step_start(step_type.value, step.name)

            # Execute the step based on type
            # (only Silver steps receive the execution mode)
            if isinstance(step, BronzeStep):
                output_df = self._execute_bronze_step(step, context)
            elif isinstance(step, SilverStep):
                output_df = self._execute_silver_step(step, context, mode)
            elif isinstance(step, GoldStep):
                output_df = self._execute_gold_step(step, context)

            # Apply validation if not in validation-only mode
            # Defaults used when no rules are applied.
            validation_rate = 100.0
            invalid_rows = 0
            if mode != ExecutionMode.VALIDATION_ONLY:
                # All step types (Bronze, Silver, Gold) have rules attribute
                if step.rules:
                    # CRITICAL: Force materialization before validation to avoid CTE optimization issues
                    # When transforms create new columns with withColumn(), mock-spark's CTE optimization
                    # can fail because those columns aren't visible in CTE context during validation.
                    # Materializing ensures all columns are available.
                    output_df = self._ensure_materialized_for_validation(
                        output_df, step.rules
                    )
                    # The first element of the returned tuple replaces output_df;
                    # the second element is ignored.
                    output_df, _, validation_stats = apply_column_rules(
                        output_df,
                        step.rules,
                        "pipeline",
                        step.name,
                        functions=self.functions,
                    )
                    # Capture validation stats for logging (handle different return types for test mocking)
                    if validation_stats is not None:
                        validation_rate = getattr(
                            validation_stats, "validation_rate", 100.0
                        )
                        invalid_rows = getattr(validation_stats, "invalid_rows", 0)

            # Write output if not in validation-only mode
            # Note: Bronze steps only validate data, they don't write to tables
            if mode != ExecutionMode.VALIDATION_ONLY and not isinstance(
                step, BronzeStep
            ):
                # Use table_name attribute for SilverStep and GoldStep
                # (falls back to the step name when the attribute is missing)
                table_name = getattr(step, "table_name", step.name)
                schema = getattr(step, "schema", None)

                # Validate schema is provided
                if schema is None:
                    raise ExecutionError(
                        f"Step '{step.name}' requires a schema to be specified. "
                        f"Silver and Gold steps must have a valid schema for table operations. "
                        f"Please provide a schema when creating the step."
                    )

                output_table = fqn(schema, table_name)

                # CRITICAL FIX for mock-spark threading issue:
                # DuckDB connections in worker threads don't see schemas created in other threads.
                # This issue persists in mock-spark 3.1.0 and earlier versions.
                # We MUST create schema in THIS thread's connection right before saveAsTable.
                # Try multiple methods to ensure schema is visible to this DuckDB connection.
                try:
                    # Method 1: Try SQL (most reliable for DuckDB in mock-spark)
                    # A failure here is the only one that aborts the step;
                    # methods 2 and 3 are best-effort.
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
                    # Method 2: Also try storage API if available (redundancy for mock-spark)
                    if hasattr(self.spark, "storage") and hasattr(
                        self.spark.storage, "create_schema"
                    ):
                        try:
                            self.spark.storage.create_schema(schema)
                        except Exception:
                            pass  # SQL might be enough, continue
                    # Method 3: Try catalog API as well
                    try:
                        self.spark.catalog.createDatabase(schema, ignoreIfExists=True)
                    except Exception:
                        pass  # SQL might be enough, continue
                except Exception as e:
                    # Reached when the SQL statement (method 1) fails - schema creation is critical
                    raise ExecutionError(
                        f"Failed to create schema '{schema}' before table creation: {e}"
                    ) from e

                # Determine write mode
                # - Gold steps always use overwrite to prevent duplicate aggregates
                # - Silver steps append during incremental runs to preserve history
                # - All other modes overwrite
                if isinstance(step, GoldStep):
                    write_mode_str = "overwrite"
                elif mode == ExecutionMode.INCREMENTAL:
                    write_mode_str = "append"
                else:  # INITIAL or FULL_REFRESH
                    write_mode_str = "overwrite"

                output_df.write.mode(write_mode_str).saveAsTable(output_table)
                result.output_table = output_table
                # Row count is taken from the DataFrame after the write.
                result.rows_processed = output_df.count()

                # Set write mode in result for tracking
                result.write_mode = write_mode_str
            elif isinstance(step, BronzeStep):
                # Bronze steps only validate data, don't write to tables
                result.rows_processed = output_df.count()
                result.write_mode = None
            else:  # VALIDATION_ONLY mode
                # Validation-only mode doesn't write to tables
                result.rows_processed = output_df.count()
                result.write_mode = None

            result.status = StepStatus.COMPLETED
            result.end_time = datetime.now()
            result.duration = (result.end_time - result.start_time).total_seconds()

            # Populate result fields
            rows_processed = result.rows_processed or 0
            # For Silver/Gold steps, rows_written equals rows_processed (since we write the output)
            # For Bronze steps, rows_written is None (they don't write to tables)
            # NOTE(review): in VALIDATION_ONLY mode no table is written above, yet a
            # Silver/Gold step still gets rows_written == rows_processed here — confirm
            # this is intended.
            rows_written = rows_processed if not isinstance(step, BronzeStep) else None

            result.rows_written = rows_written
            # NOTE(review): input_rows is set from the count of output_df (after
            # apply_column_rules, when rules ran), not from a count taken before
            # validation — confirm.
            result.input_rows = rows_processed
            result.validation_rate = (
                validation_rate if validation_rate is not None else 100.0
            )

            # Use logger's step_complete method for consistent formatting with emoji and uppercase
            self.logger.step_complete(
                step_type.value,
                step.name,
                result.duration,
                rows_processed=rows_processed,
                rows_written=rows_written,
                invalid_rows=invalid_rows,
                validation_rate=validation_rate,
            )

        except Exception as e:
            # The FAILED result is populated for logging only; it is never
            # returned because the error is re-raised below.
            result.status = StepStatus.FAILED
            result.error = str(e)
            result.end_time = datetime.now()
            result.duration = (result.end_time - result.start_time).total_seconds()

            # Log step failure
            self.logger.error(
                f"❌ Failed {step_type.value.upper()} step: {step.name} ({result.duration:.2f}s) - {str(e)}"
            )
            raise ExecutionError(f"Step execution failed: {e}") from e

        return result

    def execute_pipeline(
        self,
        steps: list[BronzeStep | SilverStep | GoldStep],
        mode: ExecutionMode = ExecutionMode.INITIAL,
        max_workers: int = 4,
        context: Dict[str, DataFrame] | None = None,
    ) -> ExecutionResult:
        """
        Execute a complete pipeline with smart dependency-aware parallel execution.

        This method automatically analyzes step dependencies and executes independent
        steps in parallel across all layers (Bronze, Silver, Gold). Steps within each
        execution group run concurrently using ThreadPoolExecutor, while groups are
        executed sequentially to respect dependencies.

        Parallelism is controlled by the PipelineConfig.parallel settings:
        - If parallel.enabled is True, uses parallel.max_workers (default: 4)
        - If parallel.enabled is False, executes sequentially (max_workers=1)
        - The max_workers parameter is ignored; config settings take precedence

        Args:
            steps: List of steps to execute
            mode: Execution mode (INITIAL, INCREMENTAL, FULL_REFRESH, VALIDATION_ONLY)
            max_workers: Deprecated - use PipelineConfig.parallel.max_workers instead
            context: Optional initial execution context with DataFrames

        Returns:
            ExecutionResult with execution details and parallel execution metrics

        Example:
            >>> # Default config enables parallel execution with 4 workers
            >>> config = PipelineConfig.create_default(schema="my_schema")
            >>> engine = ExecutionEngine(spark, config)
            >>> result = engine.execute_pipeline(steps=[bronze, silver1, silver2, gold])
            >>> print(f"Parallel efficiency: {result.parallel_efficiency:.2f}")
            >>> print(f"Execution groups: {result.execution_groups_count}")
        """
        execution_id = str(uuid.uuid4())
        start_time = datetime.now()

        result = ExecutionResult(
            execution_id=execution_id,
            mode=mode,
            start_time=start_time,
            status="running",
        )

        try:
            # Logging is handled by the runner to avoid duplicate messages
            # Ensure all required schemas exist before parallel execution (required in mock-spark due to DuckDB threading)
            # This MUST happen in the main thread before any worker threads start
            # Collect unique schemas from all steps
            required_schemas = set()
            for step in steps:
                if hasattr(step, "schema") and step.schema:
                    schema_value = step.schema
                    # Handle both string schemas and Mock objects (for tests)
                    if isinstance(schema_value, str):
                        required_schemas.add(schema_value)
            # Create all required schemas upfront - always try to create, don't rely on catalog checks
            # This is critical for mock-spark where DuckDB connections in worker threads
            # don't see schemas created via catalog API, so we must create them in main thread first
            for schema in required_schemas:
                try:
                    # Always try to create schema - CREATE SCHEMA IF NOT EXISTS is idempotent
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
                    # Also use _ensure_schema_exists as backup (tries multiple methods)
                    self._ensure_schema_exists(schema)
                except Exception as e:
                    # Log but don't fail - schema might already exist or creation might work later
                    self.logger.debug(f"Schema '{schema}' pre-creation attempt: {e}")

            # Validate context parameter
            if context is None:
                context = {}
            elif not isinstance(context, dict):
                raise TypeError(f"context must be a dictionary, got {type(context)}")

            # Group steps by type for dependency analysis
            bronze_steps = [s for s in steps if isinstance(s, BronzeStep)]
            silver_steps = [s for s in steps if isinstance(s, SilverStep)]
            gold_steps = [s for s in steps if isinstance(s, GoldStep)]

            # Build dependency graph and get execution groups
            analyzer = DependencyAnalyzer()
            analysis = analyzer.analyze_dependencies(
                bronze_steps={s.name: s for s in bronze_steps},
                silver_steps={s.name: s for s in silver_steps},
                gold_steps={s.name: s for s in gold_steps},
            )

            execution_groups = analysis.execution_groups
            result.execution_groups_count = len(execution_groups)
            result.max_group_size = (
                max(len(group) for group in execution_groups) if execution_groups else 0
            )

            # Log dependency analysis results
            self.logger.info(
                f"Dependency analysis complete: {len(execution_groups)} execution groups, "
                f"max group size: {result.max_group_size}"
            )

            # Determine worker count from config
            # After PipelineConfig.__post_init__, parallel is always ParallelConfig
            # But handle mocked configs gracefully
            # from .models import ParallelConfig  # Removed: defined in notebook cells above

            if isinstance(self.config.parallel, ParallelConfig):
                if self.config.parallel.enabled:
                    workers = self.config.parallel.max_workers
                    self.logger.info(
                        f"Parallel execution enabled with {workers} workers"
                    )
                else:
                    workers = 1
                    self.logger.info("Sequential execution mode")
            elif hasattr(self.config.parallel, "enabled"):
                # Handle Mock or other types with enabled attribute
                enabled = getattr(self.config.parallel, "enabled", True)
                if enabled:
                    workers = getattr(self.config.parallel, "max_workers", 4)
                else:
                    workers = 1
            else:
                # Fallback for tests with mock configs
                workers = 1
                self.logger.info("Sequential execution mode (default)")

            # Thread-safe context management
            context_lock = threading.Lock()

            # Create a mapping of step names to step objects
            step_map = {s.name: s for s in steps}

            # Track timing for parallel efficiency calculation
            group_timings = []

            # Execute each group in parallel
            for group_idx, group in enumerate(execution_groups):
                group_start = datetime.now()
                self.logger.info(
                    f"Executing group {group_idx + 1}/{len(execution_groups)}: "
                    f"{len(group)} steps - {', '.join(group)}"
                )

                if workers > 1:
                    # Parallel execution
                    with ThreadPoolExecutor(max_workers=workers) as executor:
                        futures = {}
                        for step_name in group:
                            if step_name not in step_map:
                                self.logger.warning(
                                    f"Step {step_name} in execution group but not found in step list"
                                )
                                continue

                            step = step_map[step_name]
                            future = executor.submit(
                                self._execute_step_safe,
                                step,
                                context,
                                mode,
                                context_lock,
                            )
                            futures[future] = step_name

                        # Wait for all steps in group to complete
                        for future in as_completed(futures):
                            step_name = futures[future]
                            try:
                                step_result = future.result()
                                if result.steps is not None:
                                    result.steps.append(step_result)

                                if step_result.status == StepStatus.FAILED:
                                    self.logger.error(
                                        f"Step {step_name} failed: {step_result.error}"
                                    )
                            except Exception as e:
                                self.logger.error(
                                    f"Exception executing step {step_name}: {e}"
                                )
                                # Determine correct step type
                                step_obj = step_map.get(step_name)
                                if step_obj is not None and isinstance(
                                    step_obj, BronzeStep
                                ):
                                    step_type_enum = StepType.BRONZE
                                elif step_obj is not None and isinstance(
                                    step_obj, SilverStep
                                ):
                                    step_type_enum = StepType.SILVER
                                elif step_obj is not None and isinstance(
                                    step_obj, GoldStep
                                ):
                                    step_type_enum = StepType.GOLD
                                else:
                                    step_type_enum = StepType.BRONZE  # fallback

                                # Create failed step result
                                step_result = StepExecutionResult(
                                    step_name=step_name,
                                    step_type=step_type_enum,
                                    status=StepStatus.FAILED,
                                    error=str(e),
                                    start_time=datetime.now(),
                                    end_time=datetime.now(),
                                    duration=0.0,
                                )
                                if result.steps is not None:
                                    result.steps.append(step_result)
                else:
                    # Sequential execution (workers == 1)
                    for step_name in group:
                        if step_name not in step_map:
                            self.logger.warning(
                                f"Step {step_name} in execution group but not found in step list"
                            )
                            continue

                        step = step_map[step_name]
                        try:
                            step_result = self._execute_step_safe(
                                step, context, mode, context_lock
                            )
                            if result.steps is not None:
                                result.steps.append(step_result)

                            if step_result.status == StepStatus.FAILED:
                                self.logger.error(
                                    f"Step {step_name} failed: {step_result.error}"
                                )
                        except Exception as e:
                            self.logger.error(
                                f"Exception executing step {step_name}: {e}"
                            )
                            # Determine correct step type
                            step_obj = step_map.get(step_name)
                            if step_obj is not None and isinstance(
                                step_obj, BronzeStep
                            ):
                                step_type_enum = StepType.BRONZE
                            elif step_obj is not None and isinstance(
                                step_obj, SilverStep
                            ):
                                step_type_enum = StepType.SILVER
                            elif step_obj is not None and isinstance(
                                step_obj, GoldStep
                            ):
                                step_type_enum = StepType.GOLD
                            else:
                                step_type_enum = StepType.BRONZE  # fallback

                            step_result = StepExecutionResult(
                                step_name=step_name,
                                step_type=step_type_enum,
                                status=StepStatus.FAILED,
                                error=str(e),
                                start_time=datetime.now(),
                                end_time=datetime.now(),
                                duration=0.0,
                            )
                            if result.steps is not None:
                                result.steps.append(step_result)

                group_end = datetime.now()
                group_duration = (group_end - group_start).total_seconds()
                group_timings.append((len(group), group_duration))
                self.logger.info(
                    f"Group {group_idx + 1} completed in {group_duration:.2f}s"
                )

            # Calculate parallel efficiency
            if result.steps:
                total_step_time = sum(
                    s.duration for s in result.steps if s.duration is not None
                )
                total_wall_time = (datetime.now() - start_time).total_seconds()

                if total_wall_time > 0 and workers > 1:
                    # Efficiency = (total sequential time / total parallel time) / workers
                    # This gives a ratio of how well we utilized parallelism
                    ideal_parallel_time = total_step_time / workers
                    result.parallel_efficiency = min(
                        (ideal_parallel_time / total_wall_time) * 100, 100.0
                    )
                else:
                    result.parallel_efficiency = (
                        100.0  # Sequential execution is 100% efficient
                    )

            # Determine overall pipeline status based on step results
            if result.steps is None:
                result.steps = []
            step_results: list[StepExecutionResult] = result.steps
            failed_steps = [s for s in step_results if s.status == StepStatus.FAILED]

            if failed_steps:
                result.status = "failed"
                self.logger.error(
                    f"Pipeline execution failed: {len(failed_steps)} steps failed"
                )
            else:
                result.status = "completed"
                self.logger.info(
                    f"Completed pipeline execution: {execution_id} - "
                    f"Parallel efficiency: {result.parallel_efficiency:.1f}%"
                )

            result.end_time = datetime.now()

        except Exception as e:
            result.status = "failed"
            result.error = str(e)
            result.end_time = datetime.now()
            self.logger.error(f"Pipeline execution failed: {e}")
            raise ExecutionError(f"Pipeline execution failed: {e}") from e

        return result

    def _execute_step_safe(
        self,
        step: BronzeStep | SilverStep | GoldStep,
        context: Dict[str, DataFrame],
        mode: ExecutionMode,
        context_lock: threading.Lock,
    ) -> StepExecutionResult:
        """
        Run one step against a private snapshot of the shared context.

        The shared ``context`` dictionary is only read or written while
        ``context_lock`` is held. The call to ``execute_step()`` itself happens
        outside the lock, on a shallow copy of the context.

        Args:
            step: The step to execute
            context: Shared execution context (step name -> DataFrame)
            mode: Execution mode, forwarded to ``execute_step()``
            context_lock: Lock guarding every access to ``context``

        Returns:
            The StepExecutionResult produced by ``execute_step()``
        """
        # Best-effort schema creation, serialized through the lock. Per the
        # original note this guards against mock-spark/DuckDB not seeing schemas
        # created in another thread; a failure here is only logged at debug level.
        target_schema = step.schema if hasattr(step, "schema") else None
        if target_schema:
            with context_lock:
                try:
                    self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_schema}")
                except Exception as e:
                    self.logger.debug(
                        f"Schema '{target_schema}' creation in worker thread (non-critical): {e}"
                    )

        # Shallow copy taken under the lock so the step sees a stable view.
        with context_lock:
            snapshot = dict(context)

        # The step itself runs without holding the lock.
        outcome = self.execute_step(step, snapshot, mode)

        # Only completed non-Bronze steps publish a table back into the context.
        publishes_table = outcome.status == StepStatus.COMPLETED and not isinstance(
            step, BronzeStep
        )
        if not publishes_table:
            return outcome

        with context_lock:
            table_name = getattr(step, "table_name", step.name)
            schema = getattr(step, "schema", None)

            if schema is None:
                self.logger.warning(
                    f"Step '{step.name}' completed but has no schema. "
                    f"Cannot add to context for downstream steps."
                )
            else:
                # Register the step's output table under the step name.
                context[step.name] = self.spark.table(fqn(schema, table_name))

        return outcome

    def _execute_bronze_step(
        self, step: BronzeStep, context: Dict[str, DataFrame]
    ) -> DataFrame:
        """Execute a bronze step."""
        # Bronze steps require data to be provided in context
        # This is the expected behavior - bronze steps validate existing data
        if step.name not in context:
            raise ExecutionError(
                f"Bronze step '{step.name}' requires data to be provided in context. "
                f"Bronze steps are for validating existing data, not creating it. "
                f"Please provide data using bronze_sources parameter or context dictionary. "
                f"Available context keys: {list(context.keys())}"
            )

        df = context[step.name]

        # Validate that the DataFrame is not empty (optional check)
        if df.count() == 0:
            self.logger.warning(
                f"Bronze step '{step.name}' received empty DataFrame. "
                f"This may indicate missing or invalid data source."
            )

        return df

    def _execute_silver_step(
        self,
        step: SilverStep,
        context: Dict[str, DataFrame],
        mode: ExecutionMode,
    ) -> DataFrame:
        """
        Run a silver step's transform on its source bronze DataFrame.

        In incremental mode the bronze input first goes through
        ``_filter_incremental_bronze_input``. The transform always receives an
        empty dict as its ``silvers`` argument.

        Args:
            step: Silver step to execute
            context: Mapping of step name to DataFrame
            mode: Execution mode; only INCREMENTAL changes the input

        Returns:
            Whatever ``step.transform`` returns

        Raises:
            ExecutionError: If ``step.source_bronze`` is not a key of ``context``
        """
        source_name = step.source_bronze
        if source_name not in context:
            raise ExecutionError(
                f"Source bronze step {source_name} not found in context"
            )

        bronze_input = context[source_name]

        incremental_run = mode == ExecutionMode.INCREMENTAL
        transform_input = (
            self._filter_incremental_bronze_input(step, bronze_input)
            if incremental_run
            else bronze_input
        )

        # Third argument is the (always empty) dict of prior silver outputs.
        return step.transform(self.spark, transform_input, {})

    def _filter_incremental_bronze_input(
        self, step: SilverStep, bronze_df: DataFrame
    ) -> DataFrame:
        """
        Filter bronze input rows that were already processed in previous incremental runs.

        Uses the source bronze step's incremental column and the silver step's watermark
        column to eliminate rows whose incremental value is less than or equal to the
        last processed watermark.
        """

        incremental_col = getattr(step, "source_incremental_col", None)
        watermark_col = getattr(step, "watermark_col", None)
        schema = getattr(step, "schema", None)
        table_name = getattr(step, "table_name", step.name)

        if not incremental_col or not watermark_col or schema is None:
            return bronze_df

        if incremental_col not in getattr(bronze_df, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: incremental column '{incremental_col}' "
                f"not present in bronze DataFrame; skipping incremental filter"
            )
            return bronze_df

        output_table = fqn(schema, table_name)

        try:
            existing_table = self.spark.table(output_table)
        except Exception as exc:
            self.logger.debug(
                f"Silver step {step.name}: unable to read existing table {output_table} "
                f"for incremental filter: {exc}"
            )
            return bronze_df

        if watermark_col not in getattr(existing_table, "columns", []):
            self.logger.debug(
                f"Silver step {step.name}: watermark column '{watermark_col}' "
                f"not present in existing table {output_table}; skipping incremental filter"
            )
            return bronze_df

        try:
            watermark_rows = existing_table.select(watermark_col).collect()
        except Exception as exc:
            self.logger.warning(
                f"Silver step {step.name}: failed to collect watermark values "
                f"from {output_table}: {exc}"
            )
            return bronze_df

        if not watermark_rows:
            return bronze_df

        cutoff_value = None
        for row in watermark_rows:
            value = None
            if hasattr(row, "__getitem__"):
                try:
                    value = row[watermark_col]
                except Exception:
                    try:
                        value = row[0]
                    except Exception:
                        value = None
            if value is None and hasattr(row, "asDict"):
                value = row.asDict().get(watermark_col)
            if value is None:
                continue
            cutoff_value = value if cutoff_value is None else max(cutoff_value, value)

        if cutoff_value is None:
            return bronze_df

        try:
            filtered_df = bronze_df.filter(F.col(incremental_col) > F.lit(cutoff_value))
        except Exception as exc:
            if self._using_mock_spark():
                mock_df = self._filter_bronze_rows_mock(
                    bronze_df, incremental_col, cutoff_value
                )
                if mock_df is not None:
                    self.logger.debug(
                        f"Silver step {step.name}: applied mock fallback filter "
                        f"for {incremental_col} > {cutoff_value}"
                    )
                    filtered_df = mock_df
                else:
                    self.logger.warning(
                        f"Silver step {step.name}: failed to filter bronze rows using "
                        f"{incremental_col} > {cutoff_value}: {exc!r}"
                    )
                    return bronze_df
            else:
                self.logger.warning(
                    f"Silver step {step.name}: failed to filter bronze rows using "
                    f"{incremental_col} > {cutoff_value}: {exc!r}"
                )
                return bronze_df

        self.logger.info(
            f"Silver step {step.name}: filtering bronze rows where "
            f"{incremental_col} <= {cutoff_value}"
        )
        return filtered_df

    def _using_mock_spark(self) -> bool:
        """
        Report whether the current session appears to be a mock-spark session.

        True when the module-level ``is_mock_spark()`` check is truthy, or when
        the module name of ``self.spark``'s type contains ``"mock_spark"``.
        """
        # Reading the module name must never fail the caller; fall back to "".
        try:
            session_module = type(self.spark).__module__
        except Exception:
            session_module = ""

        mock_detected = is_mock_spark()
        return mock_detected or "mock_spark" in session_module

    def _filter_bronze_rows_mock(
        self, bronze_df: DataFrame, incremental_col: str, cutoff_value: object
    ) -> DataFrame | None:
        """
        Mock-spark fallback: collect rows and filter in-memory when column operations fail.
        """

        try:
            rows = bronze_df.collect()
            schema = bronze_df.schema
        except Exception:
            return None

        filtered_rows = []
        for row in rows:
            value = self._extract_row_value(row, incremental_col)
            if value is None:
                continue
            try:
                if value > cutoff_value:
                    filtered_rows.append(row)
            except Exception:
                continue

        if not filtered_rows:
            try:
                return bronze_df.limit(0)
            except Exception:
                pass
            return self.spark.createDataFrame([], schema)

        try:
            column_order: list[str] = []
            if hasattr(schema, "__iter__"):
                column_order = [getattr(field, "name", field) for field in schema]
            if not column_order and hasattr(schema, "fieldNames"):
                column_order = list(schema.fieldNames())
            if not column_order and hasattr(schema, "names"):
                column_order = list(schema.names)
            if not column_order:
                column_order = list(getattr(bronze_df, "columns", []))
            structured_rows = []
            for row in filtered_rows:
                if hasattr(row, "asDict"):
                    row_dict = row.asDict()
                    structured_rows.append(
                        tuple(row_dict.get(col) for col in column_order)
                    )
                else:
                    structured_rows.append(tuple(row))
            return self.spark.createDataFrame(
                structured_rows, schema, verifySchema=False
            )
        except Exception:
            return None

    @staticmethod
    def _extract_row_value(row: Any, column: str) -> object | None:
        """Safely extract a column value from a Row-like object."""
        if hasattr(row, "__getitem__"):
            try:
                return row[column]
            except Exception:
                try:
                    return row[0]
                except Exception:
                    pass
        if hasattr(row, "asDict"):
            try:
                return row.asDict().get(column)
            except Exception:
                return None
        return None

    def _execute_gold_step(
        self, step: GoldStep, context: Dict[str, DataFrame]
    ) -> DataFrame:
        """Execute a gold step."""

        # Build silvers dict from source_silvers
        silvers = {}
        if step.source_silvers is not None:
            for silver_name in step.source_silvers:
                if silver_name not in context:
                    raise ExecutionError(
                        f"Source silver {silver_name} not found in context"
                    )
                silvers[silver_name] = context[silver_name]

        return step.transform(self.spark, silvers)


# Backward compatibility aliases: each "Unified*" name is bound to the same
# object as its un-prefixed counterpart, so either name can be used
# interchangeably (presumably for code written against the older names).
UnifiedExecutionEngine = ExecutionEngine
UnifiedStepExecutionResult = StepExecutionResult

In [None]:
# Module: pipeline_builder.pipeline.builder (pipeline_builder)
#
# Dependencies: models.base, models.pipeline, models.steps, pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.pipeline.runner, pipeline_builder.types, pipeline_builder_base.errors, pipeline_builder_base.logging, validation.data_validation, validation.pipeline_validation

from __future__ import annotations

from typing import Any, Dict

from abstracts.builder import PipelineBuilder as AbstractsPipelineBuilder

# from ..compat import DataFrame, SparkSession  # Removed: defined in notebook cells above
# from ..engine import SparkEngine  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from ..models import (  # Removed: defined in notebook cells above
# BronzeStep,
# GoldStep,
# SilverStep,
# )
# from .errors import (  # Removed: defined in notebook cells above
# ConfigurationError as PipelineConfigurationError,
# ExecutionError as StepError,
# )
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import (  # Removed: defined in notebook cells above
# ParallelConfig,
# PipelineConfig,
# ValidationThresholds,
# )
# from ..types import (  # Removed: defined in notebook cells above
# ColumnRules,
# GoldTransformFunction,
# SilverTransformFunction,
# StepName,
# TableName,
# )
# from ..validation import UnifiedValidator as UnifiedValidator  # Removed: defined in notebook cells above
# from ..validation import _convert_rules_to_expressions  # Removed: defined in notebook cells above
# from .runner import PipelineRunner  # Removed: defined in notebook cells above


class PipelineBuilder:
    """
    Production-ready builder for creating data pipelines with Bronze → Silver → Gold architecture.

    The PipelineBuilder provides a fluent API for constructing robust data pipelines with
    comprehensive validation, automatic dependency management, and enterprise-grade features.

    Key Features:
    - **Fluent API**: Chain methods for intuitive pipeline construction
    - **Robust Validation**: Early error detection with clear validation messages
    - **Auto-inference**: Automatic dependency detection and validation
    - **String Rules**: Convert human-readable rules to PySpark expressions
    - **Multi-schema Support**: Cross-schema data flows for enterprise environments
    - **Comprehensive Error Handling**: Detailed error messages with suggestions

    Validation Requirements:
        All pipeline steps must have validation rules. Invalid configurations are rejected
        during construction with clear error messages.

    Example:
        from the framework import PipelineBuilder
        from pyspark.sql import functions as F

        # Initialize builder
        builder = PipelineBuilder(spark=spark, schema="analytics")

        # Bronze: Raw data validation (required)
        builder.with_bronze_rules(
            name="events",
            rules={"user_id": ["not_null"], "timestamp": ["not_null"]},  # String rules
            incremental_col="timestamp"
        )

        # Silver: Data transformation (required)
        builder.add_silver_transform(
            name="clean_events",
            source_bronze="events",
            transform=lambda spark, df, silvers: df.filter(F.col("value") > 0),
            rules={"value": ["gt", 0]},  # String rules
            table_name="clean_events"
        )

        # Gold: Business analytics (required)
        builder.add_gold_transform(
            name="daily_metrics",
            transform=lambda spark, silvers: silvers["clean_events"].groupBy("date").agg(F.count("*").alias("count")),
            rules={"count": ["gt", 0]},  # String rules
            table_name="daily_metrics",
            source_silvers=["clean_events"]
        )

        # Build and execute pipeline
        pipeline = builder.to_pipeline()
        result = pipeline.run_initial_load(bronze_sources={"events": source_df})

    String Rules Support:
        You can use human-readable string rules that are automatically converted to PySpark expressions:

        - "not_null" → F.col("column").isNotNull()
        - "gt", value → F.col("column") > value
        - "lt", value → F.col("column") < value
        - "eq", value → F.col("column") == value
        - "in", [values] → F.col("column").isin(values)
        - "between", min, max → F.col("column").between(min, max)

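    Args:
        spark: Active SparkSession instance
        schema: Target schema name for pipeline tables
        min_bronze_rate: Minimum data quality rate for the Bronze layer (default: 95.0)
        min_silver_rate: Minimum data quality rate for the Silver layer (default: 98.0)
        min_gold_rate: Minimum data quality rate for the Gold layer (default: 99.0)
        verbose: Enable verbose logging output (default: True)
        functions: Optional functions provider; when omitted, get_default_functions() is used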

    Raises:
        ValidationError: If validation rules are invalid or missing
        ConfigurationError: If configuration parameters are invalid
        StepError: If step dependencies cannot be resolved

    Example:
        >>> from the framework import PipelineBuilder
        >>> from pyspark.sql import SparkSession, functions as F
        >>>
        >>> spark = SparkSession.builder.appName("My Pipeline").getOrCreate()
        >>> builder = PipelineBuilder(spark=spark, schema="my_schema")
        >>>
        >>> # Bronze layer - raw data validation
        >>> builder.with_bronze_rules(
        ...     name="events",
        ...     rules={"user_id": [F.col("user_id").isNotNull()]},
        ...     incremental_col="timestamp"
        ... )
        >>>
        >>> # Silver layer - data transformation
        >>> builder.add_silver_transform(
        ...     name="clean_events",
        ...     source_bronze="events",
        ...     transform=lambda spark, df, silvers: df.filter(F.col("status") == "active"),
        ...     rules={"status": [F.col("status").isNotNull()]},
        ...     table_name="clean_events",
        ...     watermark_col="timestamp"
        ... )
        >>>
        >>> # Gold layer - business analytics
        >>> builder.add_gold_transform(
        ...     name="user_analytics",
        ...     transform=lambda spark, silvers: silvers["clean_events"].groupBy("user_id").count(),
        ...     rules={"user_id": [F.col("user_id").isNotNull()]},
        ...     table_name="user_analytics",
        ...     source_silvers=["clean_events"]
        ... )
        >>>
        >>> # Build and execute pipeline
        >>> pipeline = builder.to_pipeline()
        >>> result = pipeline.initial_load(bronze_sources={"events": source_df})
    """

    def __init__(
        self,
        *,
        spark: SparkSession,
        schema: str,
        min_bronze_rate: float = 95.0,
        min_silver_rate: float = 98.0,
        min_gold_rate: float = 99.0,
        verbose: bool = True,
        functions: FunctionsProtocol | None = None,
    ) -> None:
        """
        Set up a builder bound to one Spark session and one target schema.

        Args:
            spark: Active SparkSession instance for data processing
            schema: Database schema name where tables will be created
            min_bronze_rate: Minimum data quality rate for the Bronze layer
                (documented range 0-100; not range-checked in this method)
            min_silver_rate: Minimum data quality rate for the Silver layer
                (documented range 0-100; not range-checked in this method)
            min_gold_rate: Minimum data quality rate for the Gold layer
                (documented range 0-100; not range-checked in this method)
            verbose: Enable verbose logging output
            functions: Functions provider to use; when None, the result of
                ``get_default_functions()`` is used

        Raises:
            PipelineConfigurationError: If ``spark`` or ``schema`` is falsy
        """
        # NOTE(review): the import that aliased ConfigurationError as
        # PipelineConfigurationError is commented out in this cell's header;
        # confirm the name is defined by an earlier cell.
        required_inputs = (
            (
                spark,
                "Spark session is required",
                [
                    "Ensure SparkSession is properly initialized",
                    "Check Spark configuration",
                ],
            ),
            (
                schema,
                "Schema name cannot be empty",
                [
                    "Provide a valid schema name",
                    "Check database configuration",
                ],
            ),
        )
        for supplied, message, suggestions in required_inputs:
            if not supplied:
                raise PipelineConfigurationError(message, suggestions=suggestions)

        # Configuration: per-layer thresholds plus the default parallel settings
        # (described by the original comment as "enabled with 4 workers").
        self.config = PipelineConfig(
            schema=schema,
            thresholds=ValidationThresholds(
                bronze=min_bronze_rate, silver=min_silver_rate, gold=min_gold_rate
            ),
            parallel=ParallelConfig.create_default(),
            verbose=verbose,
        )

        # Core collaborators
        self.spark = spark
        self.logger = PipelineLogger(verbose=verbose)
        self.validator = UnifiedValidator(self.logger)
        self.functions = get_default_functions() if functions is None else functions

        # Attributes kept for backward compatibility
        self.schema = schema
        created_at = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.pipeline_id = f"pipeline_{schema}_{created_at}"
        self.validators = self.validator.custom_validators

        # Step registries, keyed by step name
        self.bronze_steps: Dict[str, BronzeStep] = {}
        self.silver_steps: Dict[str, SilverStep] = {}
        self.gold_steps: Dict[str, GoldStep] = {}

        # Engine handed to the abstracts-layer builder below
        self.spark_engine = SparkEngine(
            spark=self.spark,
            config=self.config,
            logger=self.logger,
            functions=self.functions,
        )

        # Abstracts-layer builder, wired to PipelineRunner and the engine above
        self._abstracts_builder = AbstractsPipelineBuilder(
            runner_cls=PipelineRunner,
            engine=self.spark_engine,
        )

        self.logger.info(f"🔧 PipelineBuilder initialized (schema: {schema})")

    def with_bronze_rules(
        self,
        *,
        name: StepName,
        rules: ColumnRules,
        incremental_col: str | None = None,
        description: str | None = None,
        schema: str | None = None,
    ) -> PipelineBuilder:
        """
        Register a Bronze (raw ingestion) step together with its validation rules.

        The rules are passed through ``_convert_rules_to_expressions`` and the
        resulting ``BronzeStep`` is stored in ``self.bronze_steps`` under ``name``.

        Args:
            name: Identifier of the Bronze step. Must be non-empty and not already
                  registered as a Bronze step on this builder.
            rules: Dictionary mapping column names to validation rule lists.
                   Both PySpark Column expressions and string rules are accepted:
                   - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                   - String: {"user_id": ["not_null"], "age": ["gt", 0]}
            incremental_col: Column name for incremental processing
                             (e.g., "timestamp", "updated_at"); stored on the step.
            description: Optional description of this Bronze step. Accepted for API
                         compatibility; this method does not forward it to the step.
            schema: Optional schema name for reading bronze data. When given, it is
                    checked with ``_validate_schema`` first; when omitted, ``None``
                    is stored on the step.

        Returns:
            Self for method chaining

        Raises:
            StepError: If ``name`` is empty, if a Bronze step with that name already
                exists, or if ``schema`` is given and fails ``_validate_schema``.
                Rule conversion and ``BronzeStep`` construction may raise their own
                errors (not visible from this method).

        Example:
            >>> # Using PySpark Column expressions
            >>> builder.with_bronze_rules(
            ...     name="events",
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     incremental_col="timestamp"
            ... )

            >>> # Using string rules (automatically converted)
            >>> builder.with_bronze_rules(
            ...     name="users",
            ...     rules={"user_id": ["not_null"], "age": ["gt", 0], "status": ["in", ["active", "inactive"]]},
            ...     incremental_col="updated_at"
            ... )

            >>> # Reading the bronze data from a different schema
            >>> builder.with_bronze_rules(
            ...     name="user_events",
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     incremental_col="timestamp",
            ...     schema="raw_data"
            ... )

        String Rules Support:
            - "not_null" → F.col("column").isNotNull()
            - "gt", value → F.col("column") > value
            - "lt", value → F.col("column") < value
            - "eq", value → F.col("column") == value
            - "in", [values] → F.col("column").isin(values)
            - "between", min, max → F.col("column").between(min, max)
        """
        # Guard: a falsy name can never identify a step.
        if not name:
            raise StepError(
                "Bronze step name cannot be empty",
                context={"step_name": "unknown", "step_type": "bronze"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Guard: Bronze step names are unique within one builder.
        already_registered = name in self.bronze_steps
        if already_registered:
            raise StepError(
                f"Bronze step '{name}' already exists",
                context={"step_name": name, "step_type": "bronze"},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            )

        # Only an explicitly requested schema is checked; None is stored as-is.
        if schema is not None:
            self._validate_schema(schema)

        # String rules become Column expressions before the step is constructed.
        step = BronzeStep(
            name=name,
            rules=_convert_rules_to_expressions(rules, self.functions),
            incremental_col=incremental_col,
            schema=schema,
        )
        self.bronze_steps[name] = step

        self.logger.info(f"✅ Added Bronze step: {name}")
        return self

    def with_silver_rules(
        self,
        *,
        name: StepName,
        table_name: TableName,
        rules: ColumnRules,
        watermark_col: str | None = None,
        description: str | None = None,
        schema: str | None = None,
    ) -> PipelineBuilder:
        """
        Register an already-existing Silver table for validation and monitoring.

        Use this when the Silver table exists and no transformation is needed. The
        step is created with ``existing=True``, an empty ``source_bronze`` and a
        pass-through transform that returns its ``bronze_df`` argument unchanged.

        Args:
            name: Unique identifier for this Silver step
            table_name: Existing Delta table name
            rules: Dictionary mapping column names to validation rule lists
                   (string rules are converted to Column expressions)
            watermark_col: Column name for watermarking (optional)
            description: Optional description of this Silver step. Accepted for API
                         compatibility; this method does not forward it to the step.
            schema: Optional schema name for reading silver data. When given, it is
                    checked with ``_validate_schema`` first; when omitted, ``None``
                    is stored on the step.

        Returns:
            Self for method chaining

        Raises:
            StepError: If ``name`` is empty, if a Silver step with that name already
                exists, or if ``schema`` is given and fails ``_validate_schema``.

        Example:
            >>> builder.with_silver_rules(
            ...     name="existing_clean_events",
            ...     table_name="clean_events",
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     watermark_col="updated_at",
            ...     schema="staging"  # Read from different schema
            ... )
        """
        # Guard: a falsy name can never identify a step.
        if not name:
            raise StepError(
                "Silver step name cannot be empty",
                context={"step_name": "unknown", "step_type": "silver"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Guard: Silver step names are unique within one builder.
        already_registered = name in self.silver_steps
        if already_registered:
            raise StepError(
                f"Silver step '{name}' already exists",
                context={"step_name": name, "step_type": "silver"},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            )

        # Only an explicitly requested schema is checked; None is stored as-is.
        if schema is not None:
            self._validate_schema(schema)

        # Existing tables need no transformation, so the step carries an identity
        # function with the Silver transform signature.
        def dummy_transform_func(
            spark: SparkSession,
            bronze_df: DataFrame,
            prior_silvers: Dict[str, DataFrame],
        ) -> DataFrame:
            return bronze_df

        step = SilverStep(
            name=name,
            source_bronze="",  # No source for existing tables
            transform=dummy_transform_func,
            rules=_convert_rules_to_expressions(rules, self.functions),
            table_name=table_name,
            watermark_col=watermark_col,
            existing=True,
            schema=schema,
            source_incremental_col=None,
        )
        self.silver_steps[name] = step

        self.logger.info(f"✅ Added existing Silver step: {name}")
        return self

    def add_validator(self, validator: Any) -> PipelineBuilder:
        """
        Add a custom step validator to the pipeline.

        Custom validators allow you to add additional validation logic
        beyond the built-in validation rules.

        Args:
            validator: Custom validator implementing StepValidator protocol

        Returns:
            Self for method chaining

        Example:
            >>> class CustomValidator(StepValidator):
            ...     def validate(self, step, context):
            ...         if step.name == "special_step":
            ...             return ["Special validation failed"]
            ...         return []
            >>>
            >>> builder.add_validator(CustomValidator())
        """
        self.validator.add_validator(validator)
        return self

    def add_silver_transform(
        self,
        *,
        name: StepName,
        source_bronze: StepName | None = None,
        transform: SilverTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        watermark_col: str | None = None,
        description: str | None = None,
        depends_on: list[StepName] | None = None,
        schema: str | None = None,
    ) -> PipelineBuilder:
        """
        Register a Silver (cleaning / enrichment) transformation step.

        The step reads from one Bronze step, applies ``transform`` and is validated
        against ``rules``. It is stored in ``self.silver_steps`` under ``name`` and
        inherits ``incremental_col`` from its source Bronze step as
        ``source_incremental_col``.

        Args:
            name: Unique identifier for this Silver step
            source_bronze: Name of the Bronze step this Silver step reads from.
                          If omitted, the most recently added Bronze step is used;
                          if no Bronze steps exist, an error is raised.
            transform: Transformation function with signature:
                     (spark: SparkSession, bronze_df: DataFrame, prior_silvers: Dict[str, DataFrame]) -> DataFrame
            rules: Dictionary mapping column names to validation rule lists.
                   Both PySpark Column expressions and string rules are accepted:
                   - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                   - String: {"user_id": ["not_null"], "age": ["gt", 0]}
            table_name: Target Delta table name where results will be stored
            watermark_col: Column name for watermarking (e.g., "timestamp", "updated_at");
                          stored on the step.
            description: Optional description of this Silver step. Accepted for API
                         compatibility; this method does not forward it to the step.
            depends_on: Other Silver step names this step depends on. Accepted for API
                        compatibility; this method does not forward it to the step.
            schema: Optional schema name for writing silver data. When given, it is
                    checked with ``_validate_schema``; when omitted, the builder's
                    default schema (``self.config.schema``) is used.

        Returns:
            Self for method chaining

        Raises:
            StepError: If ``name`` is empty or already used by a Silver step, if
                ``source_bronze`` cannot be inferred or is not a registered Bronze
                step, or if ``schema`` is given and fails ``_validate_schema``.
                Rule conversion and ``SilverStep`` construction may raise their own
                errors (not visible from this method).

        Example:
            >>> def clean_user_events(spark, bronze_df, prior_silvers):
            ...     return (bronze_df
            ...         .filter(F.col("user_id").isNotNull())
            ...         .withColumn("event_date", F.date_trunc("day", "timestamp"))
            ...     )
            >>>
            >>> # Using PySpark Column expressions
            >>> builder.add_silver_transform(
            ...     name="clean_events",
            ...     source_bronze="user_events",
            ...     transform=clean_user_events,
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     table_name="clean_events"
            ... )

            >>> # Using string rules (automatically converted)
            >>> builder.add_silver_transform(
            ...     name="enriched_events",
            ...     source_bronze="user_events",
            ...     transform=lambda spark, df, silvers: df.withColumn("processed_at", F.current_timestamp()),
            ...     rules={"user_id": ["not_null"], "processed_at": ["not_null"]},
            ...     table_name="enriched_events",
            ...     watermark_col="processed_at"
            ... )

            >>> # Auto-infer source_bronze from most recent with_bronze_rules()
            >>> builder.add_silver_transform(
            ...     name="enriched_events",
            ...     transform=enrich_user_events,
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     table_name="enriched_user_events",
            ...     schema="processing"  # Write to different schema
            ... )

        String Rules Support:
            - "not_null" → F.col("column").isNotNull()
            - "gt", value → F.col("column") > value
            - "lt", value → F.col("column") < value
            - "eq", value → F.col("column") == value
            - "in", [values] → F.col("column").isin(values)
            - "between", min, max → F.col("column").between(min, max)
        """
        # Guard: a falsy name can never identify a step.
        if not name:
            raise StepError(
                "Silver step name cannot be empty",
                context={"step_name": "unknown", "step_type": "silver"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Guard: Silver step names are unique within one builder.
        already_registered = name in self.silver_steps
        if already_registered:
            raise StepError(
                f"Silver step '{name}' already exists",
                context={"step_name": name, "step_type": "silver"},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            )

        step_context = {"step_name": name, "step_type": "silver"}

        # Without an explicit source, fall back to the last Bronze step registered.
        if source_bronze is None:
            if not self.bronze_steps:
                raise StepError(
                    "No bronze steps available for auto-inference",
                    context=step_context,
                    suggestions=[
                        "Add a bronze step first using with_bronze_rules()",
                        "Explicitly specify source_bronze parameter",
                    ],
                )

            # Iterating the dict yields keys in insertion order; keep the last one.
            *_, latest_bronze = self.bronze_steps
            source_bronze = latest_bronze
            self.logger.info(f"🔍 Auto-inferred source_bronze: {source_bronze}")

        # The source (explicit or inferred) must be a registered Bronze step.
        if source_bronze not in self.bronze_steps:
            raise StepError(
                f"Bronze step '{source_bronze}' not found",
                context=step_context,
                suggestions=[
                    f"Available bronze steps: {list(self.bronze_steps)}",
                    "Add the bronze step first using with_bronze_rules()",
                ],
            )

        # Note: Dependency validation is deferred to validate_pipeline()
        # This allows for more flexible pipeline construction

        # An explicit schema is validated; otherwise the builder default applies.
        if schema is not None:
            self._validate_schema(schema)
        else:
            schema = self.config.schema

        # Rules are converted first, then the source step's incremental column is
        # read, matching the keyword evaluation order below.
        step = SilverStep(
            name=name,
            source_bronze=source_bronze,
            transform=transform,
            rules=_convert_rules_to_expressions(rules, self.functions),
            table_name=table_name,
            watermark_col=watermark_col,
            schema=schema,
            source_incremental_col=self.bronze_steps[source_bronze].incremental_col,
        )
        self.silver_steps[name] = step

        self.logger.info(f"✅ Added Silver step: {name} (source: {source_bronze})")
        return self

    def add_gold_transform(
        self,
        *,
        name: StepName,
        transform: GoldTransformFunction,
        rules: ColumnRules,
        table_name: TableName,
        source_silvers: list[StepName] | None = None,
        description: str | None = None,
        schema: str | None = None,
    ) -> PipelineBuilder:
        """
        Register a Gold (analytics / aggregation) transformation step.

        The step consumes one or more Silver steps, applies ``transform`` and is
        validated against ``rules``. It is stored in ``self.gold_steps`` under
        ``name``.

        Args:
            name: Unique identifier for this Gold step
            transform: Transformation function with signature:
                     (spark: SparkSession, silvers: Dict[str, DataFrame]) -> DataFrame
            rules: Dictionary mapping column names to validation rule lists.
                   Both PySpark Column expressions and string rules are accepted:
                   - PySpark: {"user_id": [F.col("user_id").isNotNull()]}
                   - String: {"user_id": ["not_null"], "count": ["gt", 0]}
            table_name: Target Delta table name where results will be stored
            source_silvers: List of Silver step names this Gold step depends on.
                           If omitted, all currently registered Silver steps are used;
                           if no Silver steps exist, an error is raised.
            description: Optional description of this Gold step. Accepted for API
                         compatibility; this method does not forward it to the step.
            schema: Optional schema name for writing gold data. When given, it is
                    checked with ``_validate_schema``; when omitted, the builder's
                    default schema (``self.config.schema``) is used.

        Returns:
            Self for method chaining

        Raises:
            StepError: If ``name`` is empty or already used by a Gold step, if
                ``source_silvers`` cannot be inferred or names an unregistered
                Silver step, or if ``schema`` is given and fails
                ``_validate_schema``. Rule conversion and ``GoldStep`` construction
                may raise their own errors (not visible from this method).

        Example:
            >>> def user_daily_metrics(spark, silvers):
            ...     events_df = silvers["clean_events"]
            ...     return (events_df
            ...         .groupBy("user_id", "event_date")
            ...         .agg(F.count("*").alias("event_count"))
            ...     )
            >>>
            >>> # Using PySpark Column expressions
            >>> builder.add_gold_transform(
            ...     name="user_metrics",
            ...     transform=user_daily_metrics,
            ...     rules={"user_id": [F.col("user_id").isNotNull()]},
            ...     table_name="user_daily_metrics",
            ...     source_silvers=["clean_events"]
            ... )

            >>> # Using string rules (automatically converted)
            >>> builder.add_gold_transform(
            ...     name="daily_analytics",
            ...     transform=lambda spark, silvers: silvers["clean_events"].groupBy("date").agg(F.count("*").alias("count")),
            ...     rules={"date": ["not_null"], "count": ["gt", 0]},
            ...     table_name="daily_analytics",
            ...     source_silvers=["clean_events"]
            ... )

            >>> # Auto-infer source_silvers from all available Silver steps
            >>> builder.add_gold_transform(
            ...     name="daily_analytics",
            ...     transform=daily_analytics,
            ...     rules={"event_date": [F.col("event_date").isNotNull()]},
            ...     table_name="daily_analytics",
            ...     schema="analytics"  # Write to different schema
            ... )

        String Rules Support:
            - "not_null" → F.col("column").isNotNull()
            - "gt", value → F.col("column") > value
            - "lt", value → F.col("column") < value
            - "eq", value → F.col("column") == value
            - "in", [values] → F.col("column").isin(values)
            - "between", min, max → F.col("column").between(min, max)
        """
        # Guard: a falsy name can never identify a step.
        if not name:
            raise StepError(
                "Gold step name cannot be empty",
                context={"step_name": "unknown", "step_type": "gold"},
                suggestions=[
                    "Provide a valid step name",
                    "Check step naming conventions",
                ],
            )

        # Guard: Gold step names are unique within one builder.
        already_registered = name in self.gold_steps
        if already_registered:
            raise StepError(
                f"Gold step '{name}' already exists",
                context={"step_name": name, "step_type": "gold"},
                suggestions=[
                    "Use a different step name",
                    "Remove the existing step first",
                ],
            )

        step_context = {"step_name": name, "step_type": "gold"}

        # Without explicit sources, depend on every Silver step registered so far.
        if source_silvers is None:
            if not self.silver_steps:
                raise StepError(
                    "No silver steps available for auto-inference",
                    context=step_context,
                    suggestions=[
                        "Add a silver step first using add_silver_transform()",
                        "Explicitly specify source_silvers parameter",
                    ],
                )

            source_silvers = list(self.silver_steps)
            self.logger.info(f"🔍 Auto-inferred source_silvers: {source_silvers}")

        # Collect every requested source that is not a registered Silver step.
        missing_silvers = []
        for silver_name in source_silvers:
            if silver_name not in self.silver_steps:
                missing_silvers.append(silver_name)

        if missing_silvers:
            raise StepError(
                f"Silver steps not found: {missing_silvers}",
                context=step_context,
                suggestions=[
                    f"Available silver steps: {list(self.silver_steps)}",
                    "Add the missing silver steps first using add_silver_transform()",
                ],
            )

        # Note: Dependency validation is deferred to validate_pipeline()
        # This allows for more flexible pipeline construction

        # An explicit schema is validated; otherwise the builder default applies.
        if schema is not None:
            self._validate_schema(schema)
        else:
            schema = self.config.schema

        # String rules become Column expressions before the step is constructed.
        step = GoldStep(
            name=name,
            transform=transform,
            rules=_convert_rules_to_expressions(rules, self.functions),
            table_name=table_name,
            source_silvers=source_silvers,
            schema=schema,
        )
        self.gold_steps[name] = step

        self.logger.info(f"✅ Added Gold step: {name} (sources: {source_silvers})")
        return self

    def validate_pipeline(self) -> list[str]:
        """
        Validate the entire pipeline configuration.

        Returns:
            List of validation errors (empty if valid)
        """
        validation_result = self.validator.validate_pipeline(
            self.config, self.bronze_steps, self.silver_steps, self.gold_steps
        )

        if validation_result.errors:
            self.logger.error(
                f"Pipeline validation failed with {len(validation_result.errors)} errors"
            )
            for error in validation_result.errors:
                self.logger.error(f"  - {error}")
        else:
            self.logger.info("✅ Pipeline validation passed")

        return validation_result.errors

    # ============================================================================
    # PRESET CONFIGURATIONS AND HELPER METHODS
    # ============================================================================

    @classmethod
    def for_development(
        cls,
        spark: SparkSession,
        schema: str,
        functions: FunctionsProtocol | None = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """
        Create a PipelineBuilder optimized for development with relaxed validation.

        Args:
            spark: Active SparkSession instance
            schema: Database schema name
            **kwargs: Additional configuration parameters

        Returns:
            PipelineBuilder instance with development-optimized settings

        Example:
            >>> builder = PipelineBuilder.for_development(
            ...     spark=spark,
            ...     schema="dev_schema"
            ... )
        """
        return cls(
            spark=spark,
            schema=schema,
            min_bronze_rate=80.0,  # Relaxed validation
            min_silver_rate=85.0,
            min_gold_rate=90.0,
            verbose=True,
            functions=functions,
            **kwargs,
        )

    @classmethod
    def for_production(
        cls,
        spark: SparkSession,
        schema: str,
        functions: FunctionsProtocol | None = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """
        Create a PipelineBuilder optimized for production with strict validation.

        Args:
            spark: Active SparkSession instance
            schema: Database schema name
            **kwargs: Additional configuration parameters

        Returns:
            PipelineBuilder instance with production-optimized settings

        Example:
            >>> builder = PipelineBuilder.for_production(
            ...     spark=spark,
            ...     schema="prod_schema"
            ... )
        """
        return cls(
            spark=spark,
            schema=schema,
            min_bronze_rate=95.0,  # Strict validation
            min_silver_rate=98.0,
            min_gold_rate=99.0,
            verbose=False,
            functions=functions,
            **kwargs,
        )

    @classmethod
    def for_testing(
        cls,
        spark: SparkSession,
        schema: str,
        functions: FunctionsProtocol | None = None,
        **kwargs: Any,
    ) -> PipelineBuilder:
        """
        Create a PipelineBuilder optimized for testing with minimal validation.

        Args:
            spark: Active SparkSession instance
            schema: Database schema name
            **kwargs: Additional configuration parameters

        Returns:
            PipelineBuilder instance with testing-optimized settings

        Example:
            >>> builder = PipelineBuilder.for_testing(
            ...     spark=spark,
            ...     schema="my_schema"
            ... )
        """
        return cls(
            spark=spark,
            schema=schema,
            min_bronze_rate=70.0,  # Very relaxed validation
            min_silver_rate=75.0,
            min_gold_rate=80.0,
            verbose=True,
            functions=functions,
            **kwargs,
        )

    # ============================================================================
    # VALIDATION HELPER METHODS
    # ============================================================================

    @staticmethod
    def not_null_rules(
        columns: list[str], functions: FunctionsProtocol | None = None
    ) -> ColumnRules:
        """
        Create validation rules for non-null constraints on multiple columns.

        Args:
            columns: List of column names to validate for non-null
            functions: Optional functions object for column operations

        Returns:
            Dictionary of validation rules

        Example:
            >>> rules = PipelineBuilder.not_null_rules(["user_id", "timestamp", "value"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "user_id": [F.col("user_id").isNotNull()],
            >>> #     "timestamp": [F.col("timestamp").isNotNull()],
            >>> #     "value": [F.col("value").isNotNull()]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {col: [functions.col(col).isNotNull()] for col in columns}

    @staticmethod
    def positive_number_rules(
        columns: list[str], functions: FunctionsProtocol | None = None
    ) -> ColumnRules:
        """
        Create validation rules for positive number constraints on multiple columns.

        Args:
            columns: List of column names to validate for positive numbers
            functions: Optional functions object for column operations

        Returns:
            Dictionary of validation rules

        Example:
            >>> rules = PipelineBuilder.positive_number_rules(["value", "count"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "value": [F.col("value").isNotNull(), F.col("value") > 0],
            >>> #     "count": [F.col("count").isNotNull(), F.col("count") > 0]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {
            col: [functions.col(col).isNotNull(), functions.col(col) > 0]  # type: ignore[list-item]
            for col in columns
        }

    @staticmethod
    def string_not_empty_rules(
        columns: list[str], functions: FunctionsProtocol | None = None
    ) -> ColumnRules:
        """
        Create validation rules for non-empty string constraints on multiple columns.

        Args:
            columns: List of column names to validate for non-empty strings
            functions: Optional functions object for column operations

        Returns:
            Dictionary of validation rules

        Example:
            >>> rules = PipelineBuilder.string_not_empty_rules(["name", "category"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "name": [F.col("name").isNotNull(), F.length(F.col("name")) > 0],
            >>> #     "category": [F.col("category").isNotNull(), F.length(F.col("category")) > 0]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {
            col: [
                functions.col(col).isNotNull(),
                functions.length(functions.col(col)) > 0,  # type: ignore[list-item]
            ]
            for col in columns
        }

    @staticmethod
    def timestamp_rules(
        columns: list[str], functions: FunctionsProtocol | None = None
    ) -> ColumnRules:
        """
        Create validation rules for timestamp constraints on multiple columns.

        Args:
            columns: List of column names to validate as timestamps
            functions: Optional functions object for column operations

        Returns:
            Dictionary of validation rules

        Example:
            >>> rules = PipelineBuilder.timestamp_rules(["created_at", "updated_at"])
            >>> # Equivalent to:
            >>> # {
            >>> #     "created_at": [F.col("created_at").isNotNull(), F.col("created_at").isNotNull()],
            >>> #     "updated_at": [F.col("updated_at").isNotNull(), F.col("updated_at").isNotNull()]
            >>> # }
        """
        if functions is None:
            functions = get_default_functions()
        return {
            col: [functions.col(col).isNotNull(), functions.col(col).isNotNull()]
            for col in columns
        }

    @staticmethod
    def detect_timestamp_columns(df_schema: Any) -> list[str]:
        """
        Detect timestamp columns from a DataFrame schema.

        Args:
            df_schema: DataFrame schema or list of column names with types

        Returns:
            List of column names that appear to be timestamps

        Example:
            >>> timestamp_cols = PipelineBuilder.detect_timestamp_columns(df.schema)
            >>> # Returns columns like ["timestamp", "created_at", "updated_at"]
        """
        timestamp_keywords = [
            "timestamp",
            "created_at",
            "updated_at",
            "event_time",
            "process_time",
            "ingestion_time",
            "load_time",
            "modified_at",
            "date_time",
            "ts",
        ]

        if hasattr(df_schema, "fields"):
            # DataFrame schema
            columns = [field.name.lower() for field in df_schema.fields]
        else:
            # List of column names
            columns = [col.lower() for col in df_schema]

        # Find columns that match timestamp patterns
        timestamp_cols = []
        for col in columns:
            if any(keyword in col for keyword in timestamp_keywords):
                timestamp_cols.append(col)

        return timestamp_cols

    def _validate_schema(self, schema: str) -> None:
        """
        Validate that a schema exists and is accessible.

        Args:
            schema: Schema name to validate

        Raises:
            StepError: If schema doesn't exist or is not accessible
        """
        try:
            # Check if schema exists using catalog API
            databases = [db.name for db in self.spark.catalog.listDatabases()]
            if schema not in databases:
                raise StepError(
                    f"Schema '{schema}' does not exist",
                    context={
                        "step_name": "schema_validation",
                        "step_type": "validation",
                    },
                    suggestions=[
                        f"Create the schema first: CREATE SCHEMA IF NOT EXISTS {schema}",
                        "Check schema permissions",
                        "Verify schema name spelling",
                    ],
                )
            self.logger.debug(f"✅ Schema '{schema}' is accessible")
        except StepError:
            # Re-raise StepError as-is
            raise
        except Exception as e:
            raise StepError(
                f"Schema '{schema}' is not accessible: {str(e)}",
                context={"step_name": "schema_validation", "step_type": "validation"},
                suggestions=[
                    f"Create the schema first: CREATE SCHEMA IF NOT EXISTS {schema}",
                    "Check schema permissions",
                    "Verify schema name spelling",
                ],
            ) from e

    def _create_schema_if_not_exists(self, schema: str) -> None:
        """
        Create a schema if it doesn't exist.

        Args:
            schema: Schema name to create
        """
        try:
            # Use SQL to create schema
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            self.logger.info(f"✅ Schema '{schema}' created or already exists")
        except Exception as e:
            raise StepError(
                f"Failed to create schema '{schema}': {str(e)}",
                context={"step_name": "schema_creation", "step_type": "validation"},
                suggestions=[
                    "Check schema permissions",
                    "Verify schema name is valid",
                    "Check for naming conflicts",
                ],
            ) from e

    def _get_effective_schema(self, step_schema: str | None) -> str:
        """
        Get the effective schema for a step, falling back to the builder's default schema.

        Args:
            step_schema: Schema specified for the step

        Returns:
            The effective schema name
        """
        return step_schema if step_schema is not None else self.schema

    def to_pipeline(self) -> PipelineRunner:
        """
        Build and return a PipelineRunner for executing this pipeline.

        The pipeline is first checked with ``validate_pipeline()``. The bronze,
        silver and gold steps are then combined (in that order) and handed to
        the abstracts builder for step validation before the runner is
        constructed.

        Returns:
            PipelineRunner instance ready for execution (implements abstracts.Runner)

        Raises:
            ValueError: If pipeline validation or step validation fails
        """
        # Validate pipeline before building
        validation_errors = self.validate_pipeline()
        if validation_errors:
            raise ValueError(
                f"Pipeline validation failed with {len(validation_errors)} errors: {', '.join(validation_errors)}"
            )

        # Combined step list, bronze first, used for validation and for the runner
        all_steps: list[Any] = [
            *self.bronze_steps.values(),
            *self.silver_steps.values(),
            *self.gold_steps.values(),
        ]

        # Validate through the abstracts builder so step checks follow the
        # abstracts interface. The step objects are passed as-is: the abstracts
        # Step protocol was only ever needed for a local type annotation, so it
        # is not imported at runtime (a missing `abstracts` package must not
        # break pipeline construction).
        try:
            self._abstracts_builder.validate_steps(all_steps)
        except ValueError as e:
            raise ValueError(f"Step validation failed: {e}") from e

        # Create PipelineRunner with proper configuration.
        # steps and engine are optional on the runner (abstracts compatibility),
        # but are passed so the runner is fully initialized.
        runner = PipelineRunner(
            spark=self.spark,
            config=self.config,
            bronze_steps=self.bronze_steps,
            silver_steps=self.silver_steps,
            gold_steps=self.gold_steps,
            logger=self.logger,
            functions=self.functions,
            steps=all_steps if all_steps else None,  # None when there are no steps
            engine=self.spark_engine,
        )

        self.logger.info(
            f"🚀 Pipeline built successfully with {len(self.bronze_steps)} bronze, {len(self.silver_steps)} silver, {len(self.gold_steps)} gold steps"
        )

        return runner

In [None]:
# Module: pipeline_builder.writer.core (pipeline_builder)
#
# Dependencies: pipeline_builder.compat, pipeline_builder.functions, pipeline_builder.models.execution, pipeline_builder.writer.models, pipeline_builder.writer.monitoring, pipeline_builder_base.logging, validation.utils, writer.analytics, writer.exceptions, writer.operations, writer.storage

from __future__ import annotations

from typing import Any, Dict

# from ..compat import SparkSession  # Removed: defined in notebook cells above
# from ..functions import FunctionsProtocol, get_default_functions  # Removed: defined in notebook cells above
# from .logging import PipelineLogger  # Removed: defined in notebook cells above
# from .models import ExecutionResult, StepResult  # Removed: defined in notebook cells above
# from ..pipeline.models import PipelineReport  # Removed: defined in notebook cells above
# from ..table_operations import table_exists  # Removed: defined in notebook cells above
# from .analytics import (  # Removed: defined in notebook cells above
# DataQualityAnalyzer,
# ExecutionTrends,
# QualityAnomalies,
# QualityTrends,
# TrendAnalyzer,
# )
# from .exceptions import WriterConfigurationError, WriterError  # Removed: defined in notebook cells above
# from .models import LogRow, WriteMode, WriterConfig, WriterMetrics, create_log_schema  # Removed: defined in notebook cells above
# from .monitoring import (  # Removed: defined in notebook cells above
# AnalyticsEngine,
# AnomalyReport,
# MemoryUsageInfo,
# PerformanceMonitor,
# PerformanceReport,
# )
# from .operations import DataProcessor, DataQualityReport  # Removed: defined in notebook cells above
# from .storage import (  # Removed: defined in notebook cells above
# OptimizeResult,
# StorageManager,
# TableInfo,
# VacuumResult,
# WriteResult,
# )


def time_write_operation(
    operation_func: Any, *args: Any, **kwargs: Any
) -> tuple[int, float, Any, Any]:
    """
    Time a write operation and return metrics.

    The operation is run on a best-effort basis: if it raises, the exception
    is logged through the standard ``logging`` module and reported as zero
    rows written rather than propagated.

    Args:
        operation_func: Function to time
        *args: Arguments for the function
        **kwargs: Keyword arguments for the function

    Returns:
        Tuple of (rows_written, duration_secs, start_time, end_time).
        ``rows_written`` is taken from the ``"rows_written"`` key when the
        operation returns a dict, and is 0 otherwise. ``start_time`` and
        ``end_time`` are ``datetime.now()`` readings; ``duration_secs`` is
        measured separately on a monotonic clock.
    """
    import logging
    import time
    from datetime import datetime

    start_time = datetime.now()
    # perf_counter is monotonic, so the duration cannot go negative or jump
    # when the system wall clock is adjusted mid-operation.
    started = time.perf_counter()

    try:
        result = operation_func(*args, **kwargs)
        rows_written = result.get("rows_written", 0) if isinstance(result, dict) else 0
    except Exception as exc:
        # Callers rely on always getting metrics back, so do not re-raise;
        # leave a trace of the failure instead of discarding it silently.
        logging.getLogger(__name__).warning("Timed write operation failed: %s", exc)
        rows_written = 0

    end_time = datetime.now()
    duration_secs = time.perf_counter() - started

    return rows_written, duration_secs, start_time, end_time


def validate_log_data(log_rows: list[LogRow]) -> None:
    """
    Check that every log row carries the fields needed to identify it.

    An empty (or otherwise falsy) ``log_rows`` is accepted without checks.
    Validation stops at the first offending row.

    Args:
        log_rows: List of log rows to validate

    Raises:
        WriterValidationError: If a row lacks ``run_id``, ``phase`` or
            ``step_name``
    """
    if log_rows:
        # Fields every row must provide
        required_fields = {"run_id", "phase", "step_name"}

        for row_index, log_row in enumerate(log_rows):
            absent = required_fields - set(log_row.keys())
            if not absent:
                continue

            # WriterValidationError is defined in an earlier notebook cell
            raise WriterValidationError(
                f"Log row {row_index} missing required fields: {absent}",
                validation_errors=[f"Missing fields: {absent}"],
                context={"row_index": row_index, "row": log_row},
            )


def create_log_rows_from_execution_result(
    execution_result: ExecutionResult,
    run_id: str,
    run_mode: str = "initial",
    metadata: Dict[str, Any] | None = None,
) -> list[LogRow]:
    """
    Create log rows from an execution result.

    One summary row is always produced for the execution as a whole. When
    ``execution_result.steps`` is a non-empty list or tuple, one additional
    row is produced per step, in order. Attributes missing from the result
    or from a step fall back to neutral defaults.

    Args:
        execution_result: The execution result
        run_id: Run identifier
        run_mode: Mode of the run
        metadata: Additional metadata; each row receives its own shallow copy
            (an empty dict when omitted)

    Returns:
        List of log rows (summary row first, then one row per step)
    """

    # Run-level values are identical on every row, so read them once.
    run_fields: Dict[str, Any] = {
        "run_id": run_id,
        "run_mode": run_mode,
        "run_started_at": getattr(execution_result, "start_time", None),
        "run_ended_at": getattr(execution_result, "end_time", None),
        "execution_id": getattr(execution_result, "execution_id", run_id),
        "pipeline_id": getattr(execution_result, "pipeline_id", "unknown"),
        "schema": getattr(execution_result, "schema", "default"),
    }

    def row_metadata() -> Dict[str, Any]:
        # Separate copy per row so editing one row never affects another
        return dict(metadata) if metadata else {}

    # Summary row for the execution as a whole
    main_row: LogRow = {
        **run_fields,  # type: ignore[typeddict-item]
        "phase": "bronze",
        "step_name": "pipeline_execution",
        "step_type": "pipeline",
        "start_time": getattr(execution_result, "start_time", None),
        "end_time": getattr(execution_result, "end_time", None),
        "duration_secs": getattr(execution_result, "duration", 0.0) or 0.0,
        "table_fqn": None,
        "write_mode": None,
        "input_rows": 0,
        "output_rows": 0,
        "rows_written": 0,
        "rows_processed": 0,
        "table_total_rows": None,
        "valid_rows": 0,
        "invalid_rows": 0,
        "validation_rate": 100.0,
        "success": getattr(execution_result, "status", "unknown") == "completed",
        "error_message": getattr(execution_result, "error", None),
        "memory_usage_mb": 0.0,
        "cpu_usage_percent": 0.0,
        "metadata": row_metadata(),
    }
    log_rows = [main_row]

    # Step rows are only produced for a non-empty list/tuple of steps
    steps = getattr(execution_result, "steps", None)
    if steps and isinstance(steps, (list, tuple)):
        for step in steps:
            # step_type may be absent, None, a plain string or an Enum member;
            # derive the phase label without assuming it has str methods.
            step_type = getattr(step, "step_type", None)
            phase_label = getattr(step_type, "value", step_type)
            phase = "bronze" if phase_label is None else str(phase_label).lower()

            step_row: LogRow = {
                **run_fields,  # type: ignore[typeddict-item]
                "phase": phase,  # type: ignore[typeddict-item]
                "step_name": getattr(step, "step_name", "unknown"),
                "step_type": getattr(step, "step_type", "unknown"),
                "start_time": getattr(step, "start_time", None),
                "end_time": getattr(step, "end_time", None),
                # A None duration becomes 0.0, matching the summary row
                "duration_secs": getattr(step, "duration", 0.0) or 0.0,
                "table_fqn": getattr(step, "output_table", None),
                "write_mode": getattr(step, "write_mode", None),
                "input_rows": getattr(step, "input_rows", 0),
                "output_rows": getattr(step, "rows_processed", 0),
                "rows_written": getattr(step, "rows_written", 0),
                "rows_processed": getattr(step, "rows_processed", 0),
                "table_total_rows": None,
                "valid_rows": 0,
                "invalid_rows": 0,
                "validation_rate": 100.0,
                "success": getattr(step, "status", "unknown") == "completed",
                "error_message": getattr(step, "error", None),
                "memory_usage_mb": 0.0,
                "cpu_usage_percent": 0.0,
                "metadata": row_metadata(),
            }
            log_rows.append(step_row)

    return log_rows


class LogWriter:
    """
    Refactored LogWriter with modular architecture.

    This class orchestrates the various writer components to provide
    comprehensive logging functionality for pipeline execution results.

    Components:
    - DataProcessor: Handles data processing and transformations
    - StorageManager: Manages Delta Lake storage operations
    - PerformanceMonitor: Tracks performance metrics
    - AnalyticsEngine: Provides analytics and trend analysis
    - DataQualityAnalyzer: Analyzes data quality metrics
    - TrendAnalyzer: Analyzes execution trends
    """

    def __init__(
        self,
        spark: SparkSession,
        schema: str | None = None,
        table_name: str | None = None,
        config: WriterConfig | None = None,
        functions: FunctionsProtocol | None = None,
        logger: PipelineLogger | None = None,
    ) -> None:
        """
        Initialize the LogWriter with modular components.

        Args:
            spark: Spark session
            schema: Database schema name (simplified API)
            table_name: Table name (simplified API)
            config: Writer configuration (deprecated, use schema and table_name instead)
            functions: Functions protocol (optional, uses default if not provided)
            logger: Pipeline logger (optional)

        Raises:
            WriterConfigurationError: If configuration is invalid

        Example (new simplified API):
            >>> writer = LogWriter(spark, schema="analytics", table_name="pipeline_logs")

        Example (old API, deprecated):
            >>> config = WriterConfig(table_schema="analytics", table_name="pipeline_logs")
            >>> writer = LogWriter(spark, config=config)
        """
        self.spark = spark

        # Handle both old and new API
        if config is not None:
            # Old API: config provided
            import warnings

            warnings.warn(
                "Passing WriterConfig is deprecated. Use LogWriter(spark, schema='...', table_name='...') instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            self.config = config
        elif schema is not None and table_name is not None:
            # New API: schema and table_name provided
            self.config = WriterConfig(
                table_schema=schema, table_name=table_name, write_mode=WriteMode.APPEND
            )
        else:
            raise WriterConfigurationError(
                "Must provide either (schema and table_name) or config parameter",
                config_errors=["Missing required parameters"],
                suggestions=[
                    "Use: LogWriter(spark, schema='my_schema', table_name='my_table')",
                    "Or: LogWriter(spark, config=WriterConfig(...))",
                ],
            )

        self.functions = functions if functions is not None else get_default_functions()
        if logger is None:
            self.logger = PipelineLogger("LogWriter")
        else:
            self.logger = logger

        # Validate configuration
        try:
            self.config.validate()
        except ValueError as e:
            raise WriterConfigurationError(
                f"Invalid writer configuration: {e}",
                config_errors=[str(e)],
                context={"config": self.config.__dict__},
                suggestions=[
                    "Check configuration values",
                    "Ensure all required fields are provided",
                    "Verify numeric values are positive",
                ],
            ) from e

        # Initialize components
        self._initialize_components()

        # Initialize metrics
        self.metrics: WriterMetrics = {
            "total_writes": 0,
            "successful_writes": 0,
            "failed_writes": 0,
            "total_duration_secs": 0.0,
            "avg_write_duration_secs": 0.0,
            "total_rows_written": 0,
            "memory_usage_peak_mb": 0.0,
        }

        # Initialize schema
        self.schema = create_log_schema()

        # Set table FQN for compatibility
        self.table_fqn = f"{self.config.table_schema}.{self.config.table_name}"

        # Cache table row counts to avoid repeated counts within a single write operation
        self._table_total_rows_cache: dict[str, int | None] = {}

        self.logger.info(f"LogWriter initialized for table: {self.table_fqn}")

    def _initialize_components(self) -> None:
        """Initialize all writer components."""
        # Data processing component
        self.data_processor = DataProcessor(self.spark, self.functions, self.logger)

        # Storage management component
        self.storage_manager = StorageManager(
            self.spark, self.config, self.functions, self.logger
        )

        # Performance monitoring component
        self.performance_monitor = PerformanceMonitor(self.spark, self.logger)

        # Analytics components
        self.analytics_engine = AnalyticsEngine(self.spark, self.logger)
        self.quality_analyzer = DataQualityAnalyzer(self.spark, self.logger)
        self.trend_analyzer = TrendAnalyzer(self.spark, self.logger)

    def write_execution_result(
        self,
        execution_result: ExecutionResult,
        run_id: str | None = None,
        run_mode: str = "initial",
        metadata: Dict[str, Any] | None = None,
    ) -> Dict[str, Any]:
        """
        Write execution result to log table.

        Args:
            execution_result: The execution result to write
            run_id: Unique run identifier (generated if not provided)
            run_mode: Mode of the run (initial, incremental, etc.)
            metadata: Additional metadata

        Returns:
            Dict containing write results and metrics

        Raises:
            WriterValidationError: If validation fails
            WriterTableError: If table operations fail
            WriterPerformanceError: If performance thresholds exceeded
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Reset per-operation cache
            self._reset_table_total_rows_cache()

            # Start performance monitoring
            self.performance_monitor.start_operation(
                operation_id, "write_execution_result"
            )

            # Log operation start
            self.logger.info(f"Writing execution result for run {run_id}")

            # Process execution result
            log_rows = self.data_processor.process_execution_result(
                execution_result,
                run_id,
                run_mode,
                metadata,
                table_total_rows_provider=self._get_table_total_rows,
            )

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            # Check performance thresholds
            threshold_violations = (
                self.performance_monitor.check_performance_thresholds(operation_metrics)
            )
            if threshold_violations:
                self.logger.warning(
                    f"Performance threshold violations: {threshold_violations}"
                )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
                "threshold_violations": threshold_violations,
            }

            self.logger.info(f"Successfully wrote execution result for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write execution result for run {run_id}: {e}")
            raise

    def write_step_results(
        self,
        step_results: Dict[str, StepResult],
        run_id: str | None = None,
        run_mode: str = "initial",
        metadata: Dict[str, Any] | None = None,
    ) -> Dict[str, Any]:
        """
        Write step results to log table.

        Args:
            step_results: Dictionary of step results
            run_id: Unique run identifier (generated if not provided)
            run_mode: Mode of the run
            metadata: Additional metadata

        Returns:
            Dict containing write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(operation_id, "write_step_results")

            # Log operation start
            self.logger.info(
                f"Writing {len(step_results)} step results for run {run_id}"
            )

            # Process step results
            log_rows = self.data_processor.process_step_results(
                step_results, run_id, run_mode, metadata
            )

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(f"Successfully wrote step results for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write step results for run {run_id}: {e}")
            raise

    def write_log_rows(
        self,
        log_rows: list[LogRow],
        run_id: str | None = None,
    ) -> Dict[str, Any]:
        """
        Write log rows directly to the table.

        Args:
            log_rows: List of log rows to write
            run_id: Unique run identifier (generated if not provided)

        Returns:
            Dict containing write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_id is None:
            run_id = str(uuid.uuid4())

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(operation_id, "write_log_rows")

            # Log operation start
            self.logger.info(f"Writing {len(log_rows)} log rows for run {run_id}")

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(f"Successfully wrote log rows for run {run_id}")
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write log rows for run {run_id}: {e}")
            raise

    def write_execution_result_batch(
        self,
        execution_results: list[ExecutionResult],
        run_ids: list[str] | None = None,
        run_mode: str = "initial",
        metadata: Dict[str, Any] | None = None,
    ) -> Dict[str, Any]:
        """
        Write multiple execution results in batch.

        Args:
            execution_results: List of execution results to write
            run_ids: List of run identifiers (generated if not provided)
            run_mode: Mode of the runs
            metadata: Additional metadata

        Returns:
            Dict containing batch write results and metrics
        """
        operation_id = str(uuid.uuid4())
        if run_ids is None:
            run_ids = [str(uuid.uuid4()) for _ in execution_results]

        try:
            # Start performance monitoring
            self.performance_monitor.start_operation(
                operation_id, "write_execution_result_batch"
            )

            # Log operation start
            self.logger.info(
                f"Writing batch of {len(execution_results)} execution results"
            )

            # Process all execution results
            all_log_rows = []
            self._reset_table_total_rows_cache()
            for i, execution_result in enumerate(execution_results):
                run_id = run_ids[i] if i < len(run_ids) else str(uuid.uuid4())
                log_rows = self.data_processor.process_execution_result(
                    execution_result,
                    run_id,
                    run_mode,
                    metadata,
                    table_total_rows_provider=self._get_table_total_rows,
                )
                all_log_rows.extend(log_rows)

            # Create table if not exists
            self.storage_manager.create_table_if_not_exists(self.schema)

            # Write to storage
            write_result = self.storage_manager.write_batch(
                all_log_rows, self.config.write_mode
            )

            # Update metrics
            self._update_metrics(write_result, True)

            # End performance monitoring
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, write_result.get("rows_written", 0)
            )

            result = {
                "success": True,
                "operation_id": operation_id,
                "execution_results_count": len(execution_results),
                "total_rows_written": write_result.get("rows_written", 0),
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(
                f"Successfully wrote batch of {len(execution_results)} execution results"
            )
            return result

        except Exception as e:
            # End performance monitoring with failure
            self.performance_monitor.end_operation(operation_id, False, 0, str(e))
            # Create empty WriteResult for error case
            empty_result: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(empty_result, False)

            self.logger.error(f"Failed to write execution result batch: {e}")
            raise

    def show_logs(self, limit: int | None = None) -> None:
        """
        Display logs from the table.

        Args:
            limit: Maximum number of rows to display
        """
        try:
            self.logger.info(
                f"Displaying logs from {self.config.table_schema}.{self.config.table_name}"
            )

            # Query logs using spark.table for compatibility
            df = self.spark.table(
                f"{self.config.table_schema}.{self.config.table_name}"
            )

            # Show DataFrame
            if limit is not None:
                df.show(limit)
            else:
                df.show()

            self.logger.info("Logs displayed successfully")

        except Exception as e:
            self.logger.error(f"Failed to display logs: {e}")
            raise

    def get_table_info(self) -> TableInfo:
        """
        Get information about the log table.

        Returns:
            Dictionary containing table information
        """
        try:
            return self.storage_manager.get_table_info()
        except Exception as e:
            self.logger.error(f"Failed to get table info: {e}")
            raise WriterError(f"Failed to get table info: {e}") from e

    def _reset_table_total_rows_cache(self) -> None:
        """Clear cached table counts so subsequent operations refresh totals."""
        self._table_total_rows_cache.clear()

    def _get_table_total_rows(self, table_fqn: str | None) -> int | None:
        """
        Determine the total number of rows for a given table.

        Args:
            table_fqn: Fully qualified table name.

        Returns:
            Row count if available, otherwise None.
        """
        if not table_fqn:
            return None

        if table_fqn in self._table_total_rows_cache:
            return self._table_total_rows_cache[table_fqn]

        try:
            table_accessor = getattr(self.spark, "table", None)
            if not callable(table_accessor):
                self.logger.debug(
                    "table_total_rows: spark session does not expose table(); skipping count"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            if not table_exists(self.spark, table_fqn):
                self.logger.debug(
                    f"table_total_rows: table {table_fqn} does not exist; leaving value as None"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            table_df = table_accessor(table_fqn)
            count_method = getattr(table_df, "count", None)
            if not callable(count_method):
                self.logger.debug(
                    f"table_total_rows: object for {table_fqn} lacks count(); skipping"
                )
                self._table_total_rows_cache[table_fqn] = None
                return None

            raw_count = count_method()
            if isinstance(raw_count, (int, float)):
                row_count = int(raw_count)
            else:
                row_count = None

            self._table_total_rows_cache[table_fqn] = row_count
            return row_count
        except Exception as exc:  # pragma: no cover - defensive logging path
            self.logger.warning(
                f"table_total_rows: unable to compute row count for {table_fqn}: {exc}"
            )
            self._table_total_rows_cache[table_fqn] = None
            return None

    def optimize_table(self) -> OptimizeResult:
        """
        Ask the storage manager to optimize the Delta log table.

        Returns:
            Whatever ``storage_manager.optimize_table()`` returns.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            self.logger.info("Optimizing Delta table")
            outcome = self.storage_manager.optimize_table()
        except Exception as exc:
            self.logger.error(f"Failed to optimize table: {exc}")
            raise
        return outcome

    def vacuum_table(self, retention_hours: int = 168) -> VacuumResult:
        """
        Ask the storage manager to vacuum the Delta log table.

        Args:
            retention_hours: Retention window in hours, forwarded unchanged to
                ``storage_manager.vacuum_table``.

        Returns:
            Whatever ``storage_manager.vacuum_table(retention_hours)`` returns.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            self.logger.info(f"Vacuuming Delta table (retention: {retention_hours}h)")
            outcome = self.storage_manager.vacuum_table(retention_hours)
        except Exception as exc:
            self.logger.error(f"Failed to vacuum table: {exc}")
            raise
        return outcome

    def analyze_quality_trends(self, days: int = 30) -> QualityTrends:
        """
        Run the quality analyzer's trend analysis over the stored logs.

        Args:
            days: Number of days to analyze; passed to the quality analyzer
                together with the queried logs.

        Returns:
            The result of ``quality_analyzer.analyze_quality_trends``.

        Raises:
            WriterError: Wraps any exception raised while querying or
                analyzing; the original exception is chained as the cause.
        """
        try:
            self.logger.info(f"Analyzing quality trends for last {days} days")
            stored_logs = self.storage_manager.query_logs()
            trends = self.quality_analyzer.analyze_quality_trends(stored_logs, days)
        except Exception as exc:
            failure = f"Failed to analyze quality trends: {exc}"
            self.logger.error(failure)
            raise WriterError(failure) from exc
        return trends

    def analyze_execution_trends(self, days: int = 30) -> ExecutionTrends:
        """
        Run the trend analyzer's execution analysis over the stored logs.

        Args:
            days: Number of days to analyze; passed to the trend analyzer
                together with the queried logs.

        Returns:
            The result of ``trend_analyzer.analyze_execution_trends``.

        Raises:
            WriterError: Wraps any exception raised while querying or
                analyzing; the original exception is chained as the cause.
        """
        try:
            self.logger.info(f"Analyzing execution trends for last {days} days")
            stored_logs = self.storage_manager.query_logs()
            trends = self.trend_analyzer.analyze_execution_trends(stored_logs, days)
        except Exception as exc:
            failure = f"Failed to analyze execution trends: {exc}"
            self.logger.error(failure)
            raise WriterError(failure) from exc
        return trends

    def detect_quality_anomalies(self) -> QualityAnomalies:
        """
        Run the quality analyzer's anomaly detection over the stored logs.

        Returns:
            The result of ``quality_analyzer.detect_quality_anomalies``.

        Raises:
            WriterError: Wraps any exception raised while querying or
                analyzing; the original exception is chained as the cause.
        """
        try:
            self.logger.info("Detecting quality anomalies")
            stored_logs = self.storage_manager.query_logs()
            anomalies = self.quality_analyzer.detect_quality_anomalies(stored_logs)
        except Exception as exc:
            failure = f"Failed to detect quality anomalies: {exc}"
            self.logger.error(failure)
            raise WriterError(failure) from exc
        return anomalies

    def generate_performance_report(self) -> PerformanceReport:
        """
        Build a performance report from the stored logs via the analytics engine.

        Returns:
            The result of ``analytics_engine.generate_performance_report``.

        Raises:
            WriterError: Wraps any exception raised while querying or
                generating; the original exception is chained as the cause.
        """
        try:
            self.logger.info("Generating performance report")
            stored_logs = self.storage_manager.query_logs()
            performance_report = self.analytics_engine.generate_performance_report(
                stored_logs
            )
        except Exception as exc:
            failure = f"Failed to generate performance report: {exc}"
            self.logger.error(failure)
            raise WriterError(failure) from exc
        return performance_report

    def get_metrics(self) -> WriterMetrics:
        """Return the writer metrics as reported by the performance monitor."""
        monitor = self.performance_monitor
        return monitor.get_metrics()

    def reset_metrics(self) -> None:
        """
        Zero this writer's counters and reset the performance monitor.

        ``self.metrics`` is replaced by a fresh dict (the previous dict object
        is not mutated), then ``performance_monitor.reset_metrics()`` is called.
        """
        # Integer counters start at 0, duration/memory gauges at 0.0
        self.metrics = dict(
            total_writes=0,
            successful_writes=0,
            failed_writes=0,
            total_duration_secs=0.0,
            avg_write_duration_secs=0.0,
            total_rows_written=0,
            memory_usage_peak_mb=0.0,
        )
        self.performance_monitor.reset_metrics()

    def get_memory_usage(self) -> MemoryUsageInfo:
        """Return memory usage information as reported by the performance monitor."""
        monitor = self.performance_monitor
        return monitor.get_memory_usage()

    def _update_metrics(self, write_result: WriteResult, success: bool) -> None:
        """Update writer metrics."""
        try:
            self.metrics["total_writes"] += 1
            if success:
                self.metrics["successful_writes"] += 1
            else:
                self.metrics["failed_writes"] += 1

            if "rows_written" in write_result:
                self.metrics["total_rows_written"] += write_result["rows_written"]

            # Update performance monitor metrics
            self.performance_monitor.metrics.update(self.metrics)

        except Exception as e:
            self.logger.error(f"Failed to update metrics: {e}")

    # Backward compatibility methods for tests
    def _write_log_rows(
        self,
        log_rows: list[LogRow],
        run_id: str,
        metadata: Dict[str, Any] | None = None,
    ) -> WriteResult:
        """Write log rows directly (for backward compatibility with tests)."""
        return self.storage_manager.write_batch(log_rows, self.config.write_mode)

    def _write_log_rows_batch(
        self, log_rows: list[LogRow], run_id: str, batch_size: int = 100
    ) -> WriteResult:
        """Write log rows in batches (for backward compatibility with tests)."""
        results = []
        for i in range(0, len(log_rows), batch_size):
            batch = log_rows[i : i + batch_size]
            result = self._write_log_rows(batch, run_id)
            results.append(result)

        total_rows = sum(r.get("rows_written", 0) for r in results)
        from datetime import datetime

        return {
            "table_name": self.storage_manager.table_fqn,
            "write_mode": self.config.write_mode.value,
            "rows_written": total_rows,
            "timestamp": datetime.now().isoformat(),
            "success": True,
        }

    def _create_dataframe_from_log_rows(self, log_rows: list[LogRow]) -> Any:
        """Create DataFrame from log rows (for backward compatibility with tests)."""
        # Convert TypedDict to regular dicts for createDataFrame
        dict_rows = [dict(row) for row in log_rows]
        return self.spark.createDataFrame(dict_rows, schema=self.schema)  # type: ignore[attr-defined]

    def detect_anomalies(self, log_rows: list[LogRow]) -> AnomalyReport:
        """
        Flag log rows whose duration exceeds twice the batch average
        (for backward compatibility with tests).

        Rows whose ``duration_secs`` is ``None`` are left out of the average
        and are never flagged, and a ``None`` ``validation_rate`` is reported
        as ``0.0``. Previously a single such row raised a ``TypeError`` that
        was swallowed, so the whole batch came back with no anomalies.

        Args:
            log_rows: Log rows to inspect.

        Returns:
            A report with ``performance_anomalies``, an always-empty
            ``quality_anomalies`` list, ``anomaly_score`` (percentage of rows
            flagged, rounded to 2 decimals), ``total_anomalies`` and
            ``total_executions``. When anomaly detection is disabled in the
            config an all-zero report is returned. If detection itself fails,
            a warning is logged and an empty report is returned.
        """

        def build_report(performance_anomalies: list, total_executions: int) -> AnomalyReport:
            # Single place that shapes the report so every exit path agrees.
            total_anomalies = len(performance_anomalies)
            anomaly_score = (
                (total_anomalies / total_executions * 100)
                if total_executions > 0
                else 0.0
            )
            return {
                "performance_anomalies": performance_anomalies,
                "quality_anomalies": [],
                "anomaly_score": round(anomaly_score, 2),
                "total_anomalies": total_anomalies,
                "total_executions": total_executions,
            }

        if not self.config.enable_anomaly_detection:
            return build_report([], 0)

        try:
            if not log_rows:
                return build_report([], 0)

            # Only rows that actually carry a duration contribute to the average
            durations = [
                row["duration_secs"]
                for row in log_rows
                if row.get("duration_secs") is not None
            ]
            if not durations:
                return build_report([], len(log_rows))

            # A row is anomalous when it takes more than 2x the average
            threshold = (sum(durations) / len(durations)) * 2

            performance_anomalies = []
            for row in log_rows:
                # A missing key counts as 0; an explicit None is skipped
                duration = row.get("duration_secs", 0)
                if duration is None or not duration > threshold:
                    continue
                anomaly: PerformanceAnomaly = {
                    "step": row.get("step_name", "unknown"),
                    "execution_time": float(duration),
                    "validation_rate": float(row.get("validation_rate") or 0.0),
                    "success": bool(row.get("success", False)),
                }
                performance_anomalies.append(anomaly)

            return build_report(performance_anomalies, len(log_rows))
        except Exception as e:
            self.logger.warning(f"Anomaly detection failed: {e}")
            return build_report([], len(log_rows) if log_rows else 0)

    # Additional methods expected by tests
    def validate_log_data_quality(self, log_rows: list[LogRow]) -> DataQualityReport:
        """
        Summarise the quality of a batch of log rows
        (for backward compatibility with tests).

        The rows are loaded into a DataFrame and described with
        ``get_dataframe_info``. Rows whose ``success`` is falsy count as failed
        executions, and the score starts at 100 and drops by the percentage of
        failed rows.

        Args:
            log_rows: Log rows to validate.

        Returns:
            A report with ``is_valid``, ``total_rows``, ``null_counts`` (always
            empty here), ``validation_issues``, ``failed_executions`` and
            ``data_quality_score``. Empty input yields a valid report with a
            score of 100.0. Any exception yields an invalid report whose only
            issue is the exception text.
        """

        def quality_report(is_valid, total_rows, issues, failed_executions, score):
            # Shared shape for every exit path of this method.
            return {
                "is_valid": is_valid,
                "total_rows": total_rows,
                "null_counts": {},
                "validation_issues": issues,
                "failed_executions": failed_executions,
                "data_quality_score": score,
            }

        try:
            if not log_rows:
                return quality_report(True, 0, [], 0, 100.0)

            # Materialise the rows so the DataFrame helper can describe them
            frame = self._create_dataframe_from_log_rows(log_rows)
            frame_info = get_dataframe_info(frame)

            failed_executions = len(
                [row for row in log_rows if not row.get("success", True)]
            )
            total_rows = frame_info.get("row_count", len(log_rows))

            full_score = 100.0  # Simplified: validation rate is assumed perfect
            if failed_executions == 0:
                quality_score = full_score
            else:
                quality_score = max(
                    0, full_score - (failed_executions / total_rows * 100)
                )

            issues = (
                [f"{failed_executions} failed executions"]
                if failed_executions > 0
                else []
            )

            return quality_report(
                failed_executions == 0 and len(issues) == 0,
                total_rows,
                issues,
                failed_executions,
                round(quality_score, 2),
            )
        except Exception as exc:
            return quality_report(
                False, len(log_rows) if log_rows else 0, [str(exc)], 0, 0.0
            )

    # ========================================================================
    # New simplified API methods for working with PipelineReport
    # ========================================================================

    def _convert_report_to_log_rows(
        self, report: PipelineReport, run_id: str | None = None
    ) -> list[LogRow]:
        """
        Convert a PipelineReport to log rows for storage.

        This method extracts data from a PipelineReport and creates one log row
        per pipeline step (bronze, silver, gold) with step-specific metrics.

        Args:
            report: PipelineReport to convert
            run_id: Optional run ID (generated if not provided)

        Returns:
            List of LogRow dictionaries ready for storage (one per step)
        """

        if run_id is None:
            run_id = str(uuid.uuid4())

        log_rows: list[LogRow] = []

        # Helper function to parse datetime strings
        def parse_datetime(dt_str: str | None) -> datetime | None:
            if dt_str is None:
                return None
            try:
                return datetime.fromisoformat(dt_str)
            except (ValueError, AttributeError):
                return None

        # Process bronze steps
        for step_name, step_info in report.bronze_results.items():
            # Calculate valid/invalid rows from validation rate
            rows_processed = int(step_info.get("rows_processed") or 0)
            validation_rate_val = step_info.get("validation_rate")
            validation_rate = float(
                validation_rate_val if validation_rate_val is not None else 100.0
            )
            valid_rows = int(rows_processed * validation_rate / 100.0)
            invalid_rows = rows_processed - valid_rows

            table_fqn = step_info.get("output_table")
            table_total_rows = step_info.get("table_total_rows")
            if table_total_rows is None:
                table_total_rows = self._get_table_total_rows(table_fqn)

            bronze_log_row: LogRow = {
                # Run-level information
                "run_id": run_id,
                "run_mode": report.mode.value,
                "run_started_at": report.start_time,
                "run_ended_at": report.end_time,
                # Execution context
                "execution_id": report.execution_id,
                "pipeline_id": report.pipeline_id,
                "schema": self.config.table_schema,
                # Step-level information
                "phase": "bronze",
                "step_name": step_name,
                "step_type": "bronze",
                # Timing information
                "start_time": parse_datetime(step_info.get("start_time")),
                "end_time": parse_datetime(step_info.get("end_time")),
                "duration_secs": float(step_info.get("duration", 0.0)),
                # Table information
                "table_fqn": step_info.get("output_table"),
                "write_mode": step_info.get("write_mode"),
                # Data metrics
                "rows_processed": rows_processed,
                "rows_written": int(step_info.get("rows_written") or rows_processed),
                "input_rows": int(step_info.get("input_rows") or rows_processed),
                "output_rows": int(step_info.get("rows_written") or rows_processed),
                "table_total_rows": table_total_rows,
                # Validation metrics
                "valid_rows": valid_rows,
                "invalid_rows": invalid_rows,
                "validation_rate": validation_rate,
                # Execution status
                "success": step_info.get("status") == "completed",
                "error_message": step_info.get("error"),
                # Performance metrics
                "memory_usage_mb": None,
                "cpu_usage_percent": None,
                # Metadata
                "metadata": {},
            }
            log_rows.append(bronze_log_row)

        # Process silver steps
        for step_name, step_info in report.silver_results.items():
            # Calculate valid/invalid rows from validation rate
            rows_processed = int(step_info.get("rows_processed") or 0)
            validation_rate_val = step_info.get("validation_rate")
            validation_rate = float(
                validation_rate_val if validation_rate_val is not None else 100.0
            )
            valid_rows = int(rows_processed * validation_rate / 100.0)
            invalid_rows = rows_processed - valid_rows

            table_fqn = step_info.get("output_table")
            table_total_rows = step_info.get("table_total_rows")
            if table_total_rows is None:
                table_total_rows = self._get_table_total_rows(table_fqn)

            silver_log_row: LogRow = {
                # Run-level information
                "run_id": run_id,
                "run_mode": report.mode.value,
                "run_started_at": report.start_time,
                "run_ended_at": report.end_time,
                # Execution context
                "execution_id": report.execution_id,
                "pipeline_id": report.pipeline_id,
                "schema": self.config.table_schema,
                # Step-level information
                "phase": "silver",
                "step_name": step_name,
                "step_type": "silver",
                # Timing information
                "start_time": parse_datetime(step_info.get("start_time")),
                "end_time": parse_datetime(step_info.get("end_time")),
                "duration_secs": float(step_info.get("duration", 0.0)),
                # Table information
                "table_fqn": step_info.get("output_table"),
                "write_mode": step_info.get("write_mode"),
                # Data metrics
                "rows_processed": rows_processed,
                "rows_written": int(step_info.get("rows_written") or rows_processed),
                "input_rows": int(step_info.get("input_rows") or rows_processed),
                "output_rows": int(step_info.get("rows_written") or rows_processed),
                "table_total_rows": table_total_rows,
                # Validation metrics
                "valid_rows": valid_rows,
                "invalid_rows": invalid_rows,
                "validation_rate": validation_rate,
                # Execution status
                "success": step_info.get("status") == "completed",
                "error_message": step_info.get("error"),
                # Performance metrics
                "memory_usage_mb": None,
                "cpu_usage_percent": None,
                # Metadata
                "metadata": {},
            }
            log_rows.append(silver_log_row)

        # Process gold steps
        for step_name, step_info in report.gold_results.items():
            # Calculate valid/invalid rows from validation rate
            rows_processed = int(step_info.get("rows_processed") or 0)
            validation_rate_val = step_info.get("validation_rate")
            validation_rate = float(
                validation_rate_val if validation_rate_val is not None else 100.0
            )
            valid_rows = int(rows_processed * validation_rate / 100.0)
            invalid_rows = rows_processed - valid_rows

            table_fqn = step_info.get("output_table")
            table_total_rows = step_info.get("table_total_rows")
            if table_total_rows is None:
                table_total_rows = self._get_table_total_rows(table_fqn)

            gold_log_row: LogRow = {
                # Run-level information
                "run_id": run_id,
                "run_mode": report.mode.value,
                "run_started_at": report.start_time,
                "run_ended_at": report.end_time,
                # Execution context
                "execution_id": report.execution_id,
                "pipeline_id": report.pipeline_id,
                "schema": self.config.table_schema,
                # Step-level information
                "phase": "gold",
                "step_name": step_name,
                "step_type": "gold",
                # Timing information
                "start_time": parse_datetime(step_info.get("start_time")),
                "end_time": parse_datetime(step_info.get("end_time")),
                "duration_secs": float(step_info.get("duration", 0.0)),
                # Table information
                "table_fqn": step_info.get("output_table"),
                "write_mode": step_info.get("write_mode"),
                # Data metrics
                "rows_processed": rows_processed,
                "rows_written": int(step_info.get("rows_written") or rows_processed),
                "input_rows": int(step_info.get("input_rows") or rows_processed),
                "output_rows": int(step_info.get("rows_written") or rows_processed),
                "table_total_rows": table_total_rows,
                # Validation metrics
                "valid_rows": valid_rows,
                "invalid_rows": invalid_rows,
                "validation_rate": validation_rate,
                # Execution status
                "success": step_info.get("status") == "completed",
                "error_message": step_info.get("error"),
                # Performance metrics
                "memory_usage_mb": None,
                "cpu_usage_percent": None,
                # Metadata
                "metadata": {},
            }
            log_rows.append(gold_log_row)

        return log_rows

    def create_table(
        self, report: PipelineReport, run_id: str | None = None
    ) -> Dict[str, Any]:
        """
        Write a PipelineReport to the log table in OVERWRITE mode.

        The report is converted to log rows, the table is created when it does
        not exist yet, and the rows are written with ``WriteMode.OVERWRITE``
        (replacing any existing data). Writer metrics and the performance
        monitor are updated on both the success and the failure path.

        Args:
            report: PipelineReport to write.
            run_id: Optional run ID (a UUID4 string is generated if omitted).

        Returns:
            Dictionary with:
                - success: Always True when this method returns
                - run_id: The run identifier used
                - operation_id: Identifier generated for this operation
                - rows_written: Number of rows reported by the write
                - table_fqn: Fully qualified table name
                - write_result: Raw result from the storage manager
                - operation_metrics: Result of ending the monitored operation

        Raises:
            Exception: Any failure is recorded in the metrics, logged, and
                re-raised unchanged.

        Example:
            >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
            >>> result = writer.create_table(pipeline_report)
            >>> print(f"Created table with {result['rows_written']} rows")
        """
        operation_id = str(uuid.uuid4())
        run_id = str(uuid.uuid4()) if run_id is None else run_id

        try:
            # Cached table totals are reset at the start of every operation
            self._reset_table_total_rows_cache()

            self.performance_monitor.start_operation(operation_id, "create_table")
            self.logger.info(f"📊 Creating log table {self.table_fqn} for run {run_id}")

            rows_to_store = self._convert_report_to_log_rows(report, run_id)

            # Make sure the target exists, then replace its contents
            self.storage_manager.create_table_if_not_exists(self.schema)
            write_result = self.storage_manager.write_batch(
                rows_to_store, WriteMode.OVERWRITE
            )
            self._update_metrics(write_result, True)

            rows_written = write_result.get("rows_written", 0)
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, rows_written
            )

            summary = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": rows_written,
                "table_fqn": self.table_fqn,
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(
                f"✅ Successfully created log table {self.table_fqn} with "
                f"{rows_written} row(s) for run {run_id}"
            )
            return summary

        except Exception as exc:
            # Close the monitored operation as failed
            self.performance_monitor.end_operation(operation_id, False, 0, str(exc))

            # Record a zero-row failed write in the writer metrics
            failed_write: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(failed_write, False)

            self.logger.error(f"❌ Failed to create log table for run {run_id}: {exc}")
            raise

    def append(
        self, report: PipelineReport, run_id: str | None = None
    ) -> Dict[str, Any]:
        """
        Append the rows of a PipelineReport to the log table.

        The report is converted to log rows, the table is created first when
        it does not exist yet, and the rows are written with
        ``WriteMode.APPEND``. Writer metrics and the performance monitor are
        updated on both the success and the failure path.

        Args:
            report: PipelineReport to append.
            run_id: Optional run ID (a UUID4 string is generated if omitted).

        Returns:
            Dictionary with:
                - success: Always True when this method returns
                - run_id: The run identifier used
                - operation_id: Identifier generated for this operation
                - rows_written: Number of rows reported by the write
                - table_fqn: Fully qualified table name
                - write_result: Raw result from the storage manager
                - operation_metrics: Result of ending the monitored operation

        Raises:
            Exception: Any failure is recorded in the metrics, logged, and
                re-raised unchanged.

        Example:
            >>> writer = LogWriter(spark, schema="analytics", table_name="logs")
            >>> result = writer.append(pipeline_report)
            >>> print(f"Appended {result['rows_written']} rows to {result['table_fqn']}")
        """
        operation_id = str(uuid.uuid4())
        run_id = str(uuid.uuid4()) if run_id is None else run_id

        try:
            # Cached table totals are reset at the start of every operation
            self._reset_table_total_rows_cache()

            self.performance_monitor.start_operation(operation_id, "append")
            self.logger.info(
                f"📊 Appending to log table {self.table_fqn} for run {run_id}"
            )

            rows_to_store = self._convert_report_to_log_rows(report, run_id)

            # The very first append may have to create the table
            self.storage_manager.create_table_if_not_exists(self.schema)
            write_result = self.storage_manager.write_batch(
                rows_to_store, WriteMode.APPEND
            )
            self._update_metrics(write_result, True)

            rows_written = write_result.get("rows_written", 0)
            operation_metrics = self.performance_monitor.end_operation(
                operation_id, True, rows_written
            )

            summary = {
                "success": True,
                "run_id": run_id,
                "operation_id": operation_id,
                "rows_written": rows_written,
                "table_fqn": self.table_fqn,
                "write_result": write_result,
                "operation_metrics": operation_metrics,
            }

            self.logger.info(
                f"✅ Successfully appended {rows_written} row(s) to "
                f"{self.table_fqn} for run {run_id}"
            )
            return summary

        except Exception as exc:
            # Close the monitored operation as failed
            self.performance_monitor.end_operation(operation_id, False, 0, str(exc))

            # Record a zero-row failed write in the writer metrics
            failed_write: WriteResult = {
                "table_name": self.storage_manager.table_fqn,
                "write_mode": self.config.write_mode.value,
                "rows_written": 0,
                "timestamp": "",
                "success": False,
            }
            self._update_metrics(failed_write, False)

            self.logger.error(f"❌ Failed to append to log table for run {run_id}: {exc}")
            raise

In [None]:
# Usage Example
#
# Minimal setup: build a Spark session configured for Delta Lake, then
# construct a PipelineBuilder and a LogWriter on top of it.
#
# NOTE(review): running this cell starts (or reuses) a Spark session and
# presumably requires the Delta Lake jars on Spark's classpath — confirm for
# your environment before running.

# Example: Initialize PipelineBuilder and LogWriter
# (SparkSession is already imported in the notebook's import cell; it is
# repeated here so the example can be read on its own.)
from pyspark.sql import SparkSession

# Initialize Spark: register the Delta SQL extension and use the Delta
# catalog as spark_catalog; getOrCreate() reuses an existing session if any.
spark = (
    SparkSession.builder.appName("PipelineBuilder Example")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

# Initialize PipelineBuilder against the "analytics" schema
builder = PipelineBuilder(spark=spark, schema="analytics")
print("✅ PipelineBuilder initialized")

# Initialize LogWriter (simplified API): schema and table name are passed
# directly instead of a config object
log_writer = LogWriter(spark, schema="analytics", table_name="pipeline_logs")
print("✅ LogWriter initialized")