# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary
# MAGIC 
# MAGIC In this notebook, we've explored functional approaches to DataFrame and schema validation in PySpark:
# MAGIC 
# MAGIC ### Key Concepts Covered
# MAGIC 
# MAGIC 1. **Functional Validation Patterns**
# MAGIC    - Pure validation functions with no side effects
# MAGIC    - Structured error handling with ValidationResult types
# MAGIC    - Immutable validation pipelines
# MAGIC 
# MAGIC 2. **Schema Validation**
# MAGIC    - Column existence checking
# MAGIC    - Data type compatibility validation
# MAGIC    - Schema evolution handling
# MAGIC 
# MAGIC 3. **Data Constraint Validation**
# MAGIC    - Null value checking
# MAGIC    - Range validation for numeric data
# MAGIC    - Pattern matching with regex
# MAGIC    - Custom business rule validation
# MAGIC 
# MAGIC 4. **Composable Validation Pipeline**
# MAGIC    - Rule composition with higher-order functions
# MAGIC    - Validation result aggregation
# MAGIC    - Comprehensive error reporting
# MAGIC 
# MAGIC 5. **Integration Patterns**
# MAGIC    - Great Expectations integration
# MAGIC    - External validation library adaptation
# MAGIC    - Custom validation rule creation
# MAGIC 
# MAGIC ### Best Practices Demonstrated
# MAGIC 
# MAGIC - ✅ **Pure Functions**: All validation functions are side-effect free
# MAGIC - ✅ **Immutable Results**: ValidationResult and ValidationReport are immutable
# MAGIC - ✅ **Composability**: Validation rules can be easily combined and reused
# MAGIC - ✅ **Error Handling**: Comprehensive error information with structured results
# MAGIC - ✅ **Testability**: All functions are easy to unit test
# MAGIC 
# MAGIC ### Next Steps
# MAGIC 
# MAGIC - Practice implementing the exercises provided
# MAGIC - Integrate validation patterns into your data pipelines
# MAGIC - Explore Great Expectations for advanced validation scenarios
# MAGIC - Build reusable validation libraries for your organization
# MAGIC - Implement validation in CI/CD pipelines for data quality assurance
# MAGIC 
# MAGIC This functional approach to validation ensures reliable, maintainable, and composable data quality checking in your PySpark applications.

In [None]:
# COMMAND ----------

# Banner for the exercises cell (console output only).
print("=== EXERCISES: Practice DataFrame Validation ===\n")

# Exercise 1: Create a custom validation rule
# The prints below describe the task; the stub to fill in is defined right after them.
print("📝 EXERCISE 1: Create a unique constraint validator")
print("   Task: Implement validate_unique_values() function")
print("   Requirements:")
print("   - Pure function that checks for duplicate values in a column")
print("   - Return ValidationResult with appropriate details")
print("   - Handle edge cases (missing column, empty DataFrame)")

def validate_unique_values(df: DataFrame, column: str) -> ValidationResult:
    """
    YOUR TASK: Implement this function to validate unique values.
    Should return ValidationResult indicating if all values in column are unique.

    Exercise placeholder: the body is only ``pass``, so until it is implemented
    a call returns ``None`` rather than a ValidationResult. A worked answer is
    provided further down as ``validate_unique_values_solution``.
    """
    # TODO: Implement this function
    pass

# Hint for Exercise 1 (applies to the stub defined just above).
print("   Hint: Use df.select(column).distinct().count() vs df.count()")

# Exercise 2: Create a date format validation rule
print("\n📝 EXERCISE 2: Create a date format validator")
print("   Task: Implement validate_date_format() function")
print("   Requirements:")
print("   - Validate that string dates match specific format (e.g., 'YYYY-MM-DD')")
print("   - Use regex pattern matching")
print("   - Return detailed information about invalid dates")

def validate_date_format(df: DataFrame, column: str, date_pattern: str = r'^\d{4}-\d{2}-\d{2}$') -> ValidationResult:
    """
    YOUR TASK: Implement date format validation.
    Should validate that all dates in column match the specified pattern.

    The default ``date_pattern`` accepts four digits, a dash, two digits, a
    dash and two digits; it constrains the shape of the string only.

    Exercise placeholder: the body is only ``pass``, so until it is implemented
    a call returns ``None``. A worked answer is provided further down as
    ``validate_date_format_solution``.
    """
    # TODO: Implement this function
    pass

# Hint for Exercise 2 (applies to the stub defined just above).
print("   Hint: Use F.col(column).rlike(pattern) for regex matching")

# Exercise 3: Build a validation pipeline
print("\n📝 EXERCISE 3: Build a customer data validator")
print("   Task: Create validation pipeline for customer data")
print("   Requirements:")
print("   - Customer ID: unique, not null")
print("   - Email: valid email format, not null")
print("   - Age: between 18 and 120")
print("   - Registration date: valid date format")

# Sample customer data for testing
# Tuple layout matches customer_schema below: (customer_id, email, age, registration_date).
customer_test_data = [
    (1, "john@email.com", 25, "2023-01-15"),
    (2, "jane@email.com", 30, "2023-02-20"),
    (3, "bob@email.com", 45, "2023-03-10"),
    (4, "alice@email.com", 28, "2023-04-05"),
]

# Every field is declared non-nullable; registration_date is kept as a string
# so it can be checked with a regex-based validator.
customer_schema = StructType([
    StructField("customer_id", IntegerType(), False),
    StructField("email", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("registration_date", StringType(), False)
])

# Relies on the notebook-provided `spark` session.
customer_df = spark.createDataFrame(customer_test_data, customer_schema)

def create_customer_validator() -> DataFrameValidator:
    """
    YOUR TASK: Create a complete validator for customer data.
    Use the validation functions we've created and new ones from exercises 1 & 2.

    Exercise placeholder: the body is only ``pass``, so until it is implemented
    a call returns ``None`` rather than a DataFrameValidator.
    """
    # TODO: Implement this function
    # Should return DataFrameValidator with appropriate rules
    pass

# Hint for Exercise 3 (applies to the stub defined just above).
print("   Hint: Combine existing validation functions with your custom ones")

# Exercise 4: Error aggregation and reporting
print("\n📝 EXERCISE 4: Create enhanced validation reporting")
print("   Task: Implement detailed validation reporting")
print("   Requirements:")
print("   - Group validation results by error type")
print("   - Calculate error percentages")
print("   - Generate actionable recommendations")

def create_detailed_validation_report(report: ValidationReport, df: DataFrame) -> Dict[str, Any]:
    """
    YOUR TASK: Create enhanced reporting from ValidationReport.
    Should include error summaries, percentages, and recommendations.

    Exercise placeholder: the body is only ``pass``, so until it is implemented
    a call returns ``None`` rather than a dictionary.
    """
    # TODO: Implement this function
    # Should return dictionary with detailed analysis
    pass

# Solutions: the reference implementations below are live code and run as part of this cell
print("\n" + "="*60)
print("💡 SOLUTIONS (Reference Implementations)")
print("="*60)

# Solution 1: Unique values validator
def validate_unique_values_solution(df: DataFrame, column: str) -> ValidationResult:
    """Reference answer for Exercise 1: report whether ``column`` repeats any value.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose values must be unique.

    Returns:
        An ERROR result when the column is missing or when the row count
        exceeds the distinct-value count; an INFO result when the DataFrame
        has no rows or every value is distinct.
    """
    rule = "unique_values"

    # Guard: a missing column is reported, not selected.
    if column not in df.columns:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' does not exist",
            rule_name=rule,
            details={"column": column},
        )

    # Guard: nothing to compare when there are no rows at all.
    row_total = df.count()
    if row_total == 0:
        return ValidationResult(
            is_valid=True,
            level=ValidationLevel.INFO,
            message=f"Column '{column}' is empty (no duplicates possible)",
            rule_name=rule,
        )

    distinct_total = df.select(column).distinct().count()
    surplus_rows = row_total - distinct_total

    if surplus_rows <= 0:
        return ValidationResult(
            is_valid=True,
            level=ValidationLevel.INFO,
            message=f"Column '{column}' has all unique values",
            rule_name=rule,
        )

    # Rows beyond the distinct count are the duplicates being reported.
    return ValidationResult(
        is_valid=False,
        level=ValidationLevel.ERROR,
        message=f"Column '{column}' has {surplus_rows} duplicate values",
        rule_name=rule,
        failed_records=surplus_rows,
        details={"total_count": row_total, "unique_count": distinct_total},
    )

# Test the solution
# customer_id values in the sample data are 1..4, so the validator is expected to pass here.
print("\n✅ Testing unique values validator:")
unique_result = validate_unique_values_solution(customer_df, "customer_id")
print(f"   Result: {unique_result.message}")

# Solution 2: Date format validator
def validate_date_format_solution(df: DataFrame, column: str, date_pattern: str = r'^\d{4}-\d{2}-\d{2}$') -> ValidationResult:
    """Reference implementation for date format validation.

    Args:
        df: DataFrame to inspect.
        column: Name of the string column holding the dates.
        date_pattern: Regular expression every value must match. The default
            accepts four digits, a dash, two digits, a dash and two digits;
            it checks the shape of the string only.

    Returns:
        An ERROR result when the column is missing or at least one value does
        not match ``date_pattern``; otherwise an INFO result.
    """
    if column not in df.columns:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' does not exist",
            rule_name="date_format",
            details={"column": column}
        )

    # NOTE(review): for NULL values the negated rlike predicate is presumably
    # NULL as well, so such rows would not be counted here - confirm whether
    # nulls should be reported by this rule or left to a not-null rule.
    invalid_count = df.filter(~F.col(column).rlike(date_pattern)).count()

    if invalid_count > 0:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' has {invalid_count} invalid date formats",
            rule_name="date_format",
            failed_records=invalid_count,
            # The total is only reported on failure, so the extra count job
            # is run here instead of on every call.
            details={"pattern": date_pattern, "total_count": df.count()}
        )

    return ValidationResult(
        is_valid=True,
        level=ValidationLevel.INFO,
        message=f"All dates in '{column}' match expected format",
        rule_name="date_format"
    )

# Test the solution
# All registration_date strings in the sample data follow the default pattern.
print("\n✅ Testing date format validator:")
date_result = validate_date_format_solution(customer_df, "registration_date")
print(f"   Result: {date_result.message}")

# Closing reminder for the exercises cell (console output only).
print(f"\n🎯 Practice implementing these validators to master functional validation patterns!")
print(f"   Remember: Pure functions, immutable results, comprehensive error handling")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Exercises and Practice
# MAGIC 
# MAGIC Practice implementing functional validation patterns with these exercises.

In [None]:
# COMMAND ----------

# BEST PRACTICES for functional DataFrame validation
# This cell contrasts recommended patterns with anti-patterns; the banner below opens the first half.

print("=== BEST PRACTICES ===\n")

# ✅ DO: Use pure functions that return validation results
def good_validation_pattern(df: DataFrame, column: str) -> ValidationResult:
    """
    Count the nulls in ``column`` and describe the outcome as a ValidationResult.

    The function prints nothing and touches no global state; everything the
    caller needs is carried by the returned object. The result is valid (INFO)
    when no nulls are found and invalid (ERROR) otherwise.
    """
    nulls_found = df.filter(F.col(column).isNull()).count()
    has_nulls = nulls_found > 0

    if has_nulls:
        severity = ValidationLevel.ERROR
    else:
        severity = ValidationLevel.INFO

    return ValidationResult(
        is_valid=not has_nulls,
        level=severity,
        message=f"Column '{column}' has {nulls_found} null values",
        rule_name="null_check",
    )

print("✅ GOOD: Pure validation function with structured results")
print("   - Returns ValidationResult with all necessary information")
print("   - No side effects or external dependencies")
print("   - Easy to test and compose")

# ✅ DO: Use immutable validation pipelines
print("\n✅ GOOD: Immutable validation pipeline composition")
base_validator = DataFrameValidator([
    create_validation_rule(validate_not_null, ["employee_id"])
])

# Create new validator without mutating the original
extended_validator = base_validator.add_rule(
    create_validation_rule(validate_range, "age", 0, 120)
)

print(f"   - Base validator has {len(base_validator.rules)} rules")
print(f"   - Extended validator has {len(extended_validator.rules)} rules")
print("   - Original validator unchanged (immutability)")

# ✅ DO: Handle errors gracefully with Result types
print("\n✅ GOOD: Graceful error handling with structured results")
try:
    result = validate_range(valid_df, "nonexistent_column", 0, 100)
except Exception as exc:
    # Reaching this branch means the validator raised instead of returning a
    # result, so say so rather than claiming that nothing was thrown.
    print(f"   - Unexpected exception instead of a ValidationResult: {type(exc).__name__}: {exc}")
else:
    print(f"   - Error handled gracefully: {result.message}")
    print(f"   - Error details preserved: {result.details}")
    print("   - No exceptions thrown, errors captured in ValidationResult")

print("\n" + "="*60)
print("=== ANTI-PATTERNS ===\n")

# ❌ DON'T: Use functions with side effects
# Module-level tally mutated by the anti-pattern below (column name -> nulls seen).
validation_errors = {}


def bad_validation_pattern_with_side_effects(df: DataFrame, column: str) -> bool:
    """
    ANTI-PATTERN: Validation function with side effects.
    Prints directly, modifies global state, hard to test.

    Kept deliberately as a counter-example. It returns True when ``column``
    has no nulls and False otherwise; when nulls are found it also prints an
    error line and adds the count to the module-level ``validation_errors``.
    """
    null_count = df.filter(F.col(column).isNull()).count()

    # ❌ Side effect: Direct printing
    if null_count > 0:
        print(f"ERROR: Column {column} has {null_count} nulls!")
        # ❌ Side effect: Modifying global state
        global validation_errors
        # Accumulate per column. Assigning the sum to the global name itself
        # would replace the dict with an int and break the next call.
        validation_errors[column] = validation_errors.get(column, 0) + null_count

    return null_count == 0

# Console summary of why the side-effect pattern above is a problem.
print("❌ BAD: Validation with side effects")
print("   - Direct printing makes testing difficult")
print("   - Global state modification breaks composability")
print("   - Returns only boolean, loses important information")

# ❌ DON'T: Use mutable validation configuration
class BadMutableValidator:
    """
    ANTI-PATTERN: a validator whose rule list and error list are ordinary
    mutable attributes, so it can be changed at any time after construction.

    Kept deliberately as a counter-example; ``validate`` contains no real
    checks and therefore always reports success.
    """
    def __init__(self):
        self.rules = list()  # ❌ Mutable list
        self.errors = list()  # ❌ Mutable state

    def add_rule(self, rule):
        """Append ``rule`` to this instance in place; returns nothing."""
        self.rules.extend((rule,))  # ❌ Modifies existing object

    def validate(self, df):
        """Reset the stored errors, walk the rules, and return True if no errors remain."""
        del self.errors[:]  # ❌ Side effect: clearing previous state
        for _rule in self.rules:
            # Validation logic would go here
            continue
        return not self.errors

# Console summary of why the mutable validator above is a problem.
print("\n❌ BAD: Mutable validator")
print("   - Mutable state can lead to unexpected behavior")
print("   - Side effects in validation method")
print("   - Hard to reason about state changes")

# ❌ DON'T: Swallow exceptions without proper handling
def bad_validation_with_silent_failures(df: DataFrame, column: str) -> bool:
    """
    ANTI-PATTERN: report any failure as a bare ``False``.

    Kept deliberately as a counter-example. Returns True when selecting and
    counting ``column`` completes, and False when anything at all goes wrong,
    discarding the exception in the process.
    """
    succeeded = True
    try:
        df.select(column).count()
    except Exception:
        # ❌ Silent failure - no information about what went wrong
        succeeded = False
    return succeeded

# Console summary of why the silent-failure pattern above is a problem.
print("\n❌ BAD: Silent exception handling")
print("   - Hides important error information")
print("   - Makes debugging difficult")
print("   - Fails without explanation")

# ✅ BETTER: Proper exception handling
def better_validation_with_error_capture(df: DataFrame, column: str) -> ValidationResult:
    """
    BETTER: turn every outcome, including exceptions, into a ValidationResult.

    Returns an INFO result with the record count when ``column`` can be
    selected, an ERROR result listing the available columns when it is
    missing, and an ERROR result carrying the exception text and type when
    anything raises along the way.
    """
    rule = "column_existence"

    try:
        if column in df.columns:
            record_total = df.select(column).count()
            return ValidationResult(
                is_valid=True,
                level=ValidationLevel.INFO,
                message=f"Column '{column}' accessible with {record_total} records",
                rule_name=rule,
            )

        # Column is absent: report it together with what is available.
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' not found in DataFrame",
            rule_name=rule,
            details={"column": column, "available_columns": df.columns},
        )

    except Exception as error:
        failure_text = str(error)
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Validation failed with exception: {failure_text}",
            rule_name=rule,
            details={"exception": failure_text, "exception_type": type(error).__name__},
        )

# Exercise the structured variant with a column name that is not expected to exist in valid_df.
print("\n✅ BETTER: Structured exception handling")
test_result = better_validation_with_error_capture(valid_df, "nonexistent_column")
print(f"   - Structured error: {test_result.message}")
print(f"   - Error details preserved: {test_result.details}")

# Recap of the cell: five practices to follow, three to avoid (console output only).
print("\n" + "="*60)
print("=== KEY PRINCIPLES ===")
print("1. ✅ Pure functions with no side effects")
print("2. ✅ Immutable data structures and pipelines")  
print("3. ✅ Structured error handling with Result types")
print("4. ✅ Composable validation rules")
print("5. ✅ Comprehensive error information")
print("6. ❌ Avoid side effects in validation functions")
print("7. ❌ Avoid mutable validation state")
print("8. ❌ Avoid silent exception handling")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Best Practices and Anti-Patterns
# MAGIC 
# MAGIC Let's review functional programming best practices and anti-patterns in data validation.

In [None]:
# COMMAND ----------

# Great Expectations integration (conceptual - would require installation)
# This cell demonstrates how you would integrate Great Expectations
# In a real Databricks environment, you would install it first

def create_great_expectations_validator() -> ValidationRule:
    """
    Build a validation rule that imitates a Great Expectations suite.

    Nothing in here imports or calls Great Expectations; the returned rule
    runs a few hand-written checks in the same spirit so that the adapter
    pattern can be shown without installing the library.
    """
    rule_name = "great_expectations"

    def ge_validation_rule(df: DataFrame) -> ValidationResult:
        """
        Run the simulated expectations against ``df``.

        Expects ``age``, ``salary`` and ``employee_id`` columns. Any exception
        raised while checking is converted into an ERROR result.
        """
        try:
            # A real integration would instead:
            #   1. hand the DataFrame to GE (Pandas conversion or Spark backend),
            #   2. build a DataContext and ExpectationSuite,
            #   3. run the suite and translate its results.

            # Row-level expectations: (predicate selecting offending rows, issue text).
            row_level_checks = (
                (F.col("age") < 0, "Found negative ages"),
                (F.col("salary") <= 0, "Found non-positive salaries"),
            )
            problems = [
                issue
                for offending, issue in row_level_checks
                if df.filter(offending).count() > 0
            ]

            # Table-level expectation: row count must equal distinct employee_id count.
            row_total = df.count()
            distinct_ids = df.select("employee_id").distinct().count()
            if row_total != distinct_ids:
                problems.append("Found duplicate employee IDs")

            if not problems:
                return ValidationResult(
                    is_valid=True,
                    level=ValidationLevel.INFO,
                    message="All Great Expectations validations passed",
                    rule_name=rule_name,
                )

            return ValidationResult(
                is_valid=False,
                level=ValidationLevel.ERROR,
                message=f"Great Expectations validation failed: {'; '.join(problems)}",
                rule_name=rule_name,
                details={"issues": problems},
            )

        except Exception as error:
            return ValidationResult(
                is_valid=False,
                level=ValidationLevel.ERROR,
                message=f"Great Expectations validation error: {str(error)}",
                rule_name=rule_name,
                details={"exception": str(error)},
            )

    return ge_validation_rule

# Example of how to configure Great Expectations expectations
def create_employee_expectations_config() -> Dict[str, Any]:
    """
    Configuration for Great Expectations expectations.
    This would be used to set up a proper GE suite.

    Returns:
        A dictionary with an ``expectation_suite_name`` and a list of
        ``expectations``, each holding an ``expectation_type`` and its
        ``kwargs``. A fresh dictionary is built on every call.
    """
    return {
        "expectation_suite_name": "employee_data_suite",
        "expectations": [
            {
                "expectation_type": "expect_table_columns_to_match_ordered_list",
                "kwargs": {
                    "column_list": ["employee_id", "name", "email", "age", "department", "salary", "hire_date"]
                }
            },
            {
                "expectation_type": "expect_column_values_to_not_be_null",
                "kwargs": {"column": "employee_id"}
            },
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "age", "min_value": 18, "max_value": 65}
            },
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "salary", "min_value": 0}
            },
            {
                "expectation_type": "expect_column_values_to_match_regex",
                "kwargs": {
                    "column": "email",
                    # Single backslash: inside a raw string "\\." would mean
                    # "a literal backslash, then any character" and reject
                    # every ordinary address.
                    "regex": r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
                }
            },
            {
                "expectation_type": "expect_column_values_to_be_in_set",
                "kwargs": {
                    "column": "department",
                    "value_set": ["Engineering", "Marketing", "Sales", "HR", "Finance"]
                }
            }
        ]
    }

# Test the conceptual GE integration
print("Testing conceptual Great Expectations integration...")
ge_validator = create_great_expectations_validator()

# Test with valid data
ge_result = ge_validator(valid_df)
print(f"✅ GE validation (valid data): {ge_result.message}")

# Test with problematic data (create data with duplicates)
# Both rows share employee_id 1, which the simulated duplicate check should flag.
duplicate_data = [
    (1, "Alice Johnson", "alice@company.com", 28, "Engineering", 75000.0, "2020-01-15"),
    (1, "Bob Smith", "bob@company.com", 32, "Marketing", 65000.0, "2019-03-10"),  # Duplicate ID
]

# expected_schema and spark are defined outside this cell.
duplicate_df = spark.createDataFrame(duplicate_data, expected_schema)
ge_result_with_issues = ge_validator(duplicate_df)
print(f"❌ GE validation (with duplicates): {ge_result_with_issues.message}")

# Show how to integrate GE with our validation pipeline
# add_rule returns a new validator; enhanced_validator itself is left as it was.
complete_validator = enhanced_validator.add_rule(ge_validator)
print(f"\n📊 Complete validation pipeline now has {len(complete_validator.rules)} rules")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Integration with Great Expectations
# MAGIC 
# MAGIC Great Expectations is a powerful library for data validation. Let's see how to integrate it with our functional validation approach.

In [None]:
# COMMAND ----------

# Test validation pipeline with problematic data
print("Creating test data with validation violations...")

# Create DataFrame with various validation issues
# Tuple layout matches problematic_schema below:
# (employee_id, name, email, age, department, salary, hire_date).
problematic_data = [
    (None, "Alice Johnson", "alice@company.com", 25, "Engineering", 75000.0, "2020-01-15"),  # Null ID
    (2, None, "bob-invalid-email", 17, "Marketing", -5000.0, "2019-03-10"),  # Multiple issues
    (3, "Carol Davis", "carol@company.com", 70, "Engineering", 80000.0, "2021-06-01"),  # Age > 65
    (4, "David Wilson", "david@company.com", 35, "Sales", 70000.0, "2018-11-20"),  # Valid record
]

# Only employee_id and name are nullable here, matching the None values used above.
problematic_schema = StructType([
    StructField("employee_id", IntegerType(), True),  # Allow nulls for testing
    StructField("name", StringType(), True),
    StructField("email", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("department", StringType(), False),
    StructField("salary", DoubleType(), False),
    StructField("hire_date", StringType(), False)
])

problematic_df = spark.createDataFrame(problematic_data, problematic_schema)

print("Problematic data created:")
problematic_df.show(truncate=False)

# Test validation with problematic data
print("\n" + "="*60)
print("Testing validation pipeline with problematic data...")
print("="*60)

# `validator` is the employee pipeline created in another cell.
problematic_report = validator.validate(problematic_df)
problematic_report.print_summary()

# Show detailed failure analysis
print(f"\n🔍 Detailed Failure Analysis:")
for result in problematic_report.results:
    # Failed results and WARNING-level results are both listed under the same marker.
    if not result.is_valid or result.level == ValidationLevel.WARNING:
        print(f"\n❌ {result.rule_name}:")
        print(f"   Message: {result.message}")
        if result.details:
            for key, value in result.details.items():
                print(f"   {key}: {value}")
        if result.failed_records:
            print(f"   Failed Records: {result.failed_records}")

# Demonstrate validation rule composition - add custom business rules
print(f"\n🔧 Adding custom business validation rules...")

def validate_department_codes(df: DataFrame, valid_departments: List[str]) -> ValidationResult:
    """Custom validation for department codes.

    Args:
        df: DataFrame expected to contain a ``department`` column.
        valid_departments: The department values that are accepted.

    Returns:
        An ERROR result when the ``department`` column is missing or when at
        least one row holds a value outside ``valid_departments``; otherwise
        an INFO result.
    """
    if "department" not in df.columns:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message="Department column not found",
            rule_name="department_codes"
        )

    # Check for invalid department codes. Only this count is needed; the
    # former unconditional df.count() was never read and cost a Spark job.
    invalid_count = df.filter(~F.col("department").isin(valid_departments)).count()

    if invalid_count > 0:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Found {invalid_count} records with invalid department codes",
            rule_name="department_codes",
            failed_records=invalid_count,
            details={"valid_departments": valid_departments}
        )

    return ValidationResult(
        is_valid=True,
        level=ValidationLevel.INFO,
        message="All department codes are valid",
        rule_name="department_codes"
    )

# Create enhanced validator with custom business rules
# add_rules returns a new validator, so `validator` keeps its original rule set.
enhanced_validator = validator.add_rules([
    create_validation_rule(validate_department_codes, ["Engineering", "Marketing", "Sales", "HR", "Finance"])
])

print("Testing enhanced validator with business rules...")
enhanced_report = enhanced_validator.validate(valid_df)
enhanced_report.print_summary()

In [None]:
# COMMAND ----------

# Composable validation pipeline
from typing import Protocol

class ValidationRule(Protocol):
    """
    Structural type for a single step of the validation pipeline.

    Any callable that accepts a DataFrame and hands back either one
    ValidationResult or a list of them satisfies this protocol.
    """
    def __call__(self, df: DataFrame) -> Union[ValidationResult, List[ValidationResult]]:
        """Validate ``df`` and return one result or a list of results."""

def create_validation_rule(validation_func: Callable, *args, **kwargs) -> ValidationRule:
    """
    Higher-order function that creates a validation rule from a validation function
    by fixing every argument except the DataFrame.

    Args:
        validation_func: Callable whose first parameter is the DataFrame.
        *args: Positional arguments passed to ``validation_func`` after the DataFrame.
        **kwargs: Keyword arguments passed to ``validation_func``.

    Returns:
        A one-argument callable ``rule(df)`` equivalent to
        ``validation_func(df, *args, **kwargs)``.
    """
    # functools.partial would prepend ``args`` and push the DataFrame to the
    # end of the call, which does not fit validators that take it first.
    def rule(df):
        return validation_func(df, *args, **kwargs)

    # Keep the wrapped function's name so the rule is identifiable when debugging.
    rule.__name__ = getattr(validation_func, "__name__", "validation_rule")
    return rule

class DataFrameValidator:
    """
    Functional validation pipeline that composes multiple validation rules.

    The validator keeps its own copy of the rules it is given, and
    ``add_rule`` / ``add_rules`` return new validators instead of changing
    this one.
    """

    def __init__(self, rules: List[ValidationRule]):
        """Store a private copy of ``rules`` (any iterable of rule callables)."""
        # Copy so that later changes to the caller's list cannot alter this
        # validator behind its back.
        self.rules = list(rules)

    def validate(self, df: DataFrame) -> ValidationReport:
        """
        Execute all validation rules and return a comprehensive report.

        Each rule is called with ``df``. A rule may return a single result or
        a list of results; both are flattened into the report. A rule that
        raises does not stop the run: the exception is recorded as an ERROR
        result and the remaining rules still execute.
        """
        all_results = []

        for rule in self.rules:
            try:
                result = rule(df)
            except Exception as e:
                # Convert exceptions to validation results, noting which rule
                # raised so the failure can be traced.
                all_results.append(ValidationResult(
                    is_valid=False,
                    level=ValidationLevel.ERROR,
                    message=f"Validation rule failed with exception: {str(e)}",
                    rule_name="exception_handler",
                    details={
                        "exception": str(e),
                        "exception_type": type(e).__name__,
                        "rule": getattr(rule, "__name__", repr(rule)),
                    }
                ))
                continue

            # Handle both single results and lists of results
            if isinstance(result, list):
                all_results.extend(result)
            else:
                all_results.append(result)

        return ValidationReport.from_results(all_results)

    def add_rule(self, rule: ValidationRule) -> 'DataFrameValidator':
        """Return a new validator with an additional rule (immutable)."""
        return DataFrameValidator([*self.rules, rule])

    def add_rules(self, rules: List[ValidationRule]) -> 'DataFrameValidator':
        """Return a new validator with additional rules (immutable)."""
        # Unpacking accepts any iterable of rules, not only a list.
        return DataFrameValidator([*self.rules, *rules])

# Create a comprehensive validation pipeline for employee data
def create_employee_data_validator() -> DataFrameValidator:
    """
    Assemble the validation pipeline used for the employee dataset.

    The rules are grouped by concern and concatenated in this order: schema
    checks, not-null checks, numeric range checks, then pattern checks.
    """
    required_columns = ["employee_id", "name", "email", "age", "department", "salary", "hire_date"]
    email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    # Schema validation
    schema_rules = [
        create_validation_rule(validate_columns_exist, required_columns),
        create_validation_rule(validate_schema_compatibility, expected_schema),
    ]

    # Not null constraints
    not_null_rules = [
        create_validation_rule(validate_not_null, ["employee_id", "name", "email"]),
    ]

    # Range validations (salary has a lower bound only)
    range_rules = [
        create_validation_rule(validate_range, "age", 18, 65),
        create_validation_rule(validate_range, "salary", 0, None),
    ]

    # Pattern validations
    pattern_rules = [
        create_validation_rule(validate_regex_pattern, "email", email_regex, "valid email format"),
    ]

    return DataFrameValidator(schema_rules + not_null_rules + range_rules + pattern_rules)

# Test the validation pipeline with valid data
# valid_df is created in another cell of the notebook.
print("Testing validation pipeline with valid data...")
validator = create_employee_data_validator()
report = validator.validate(valid_df)

report.print_summary()

# Show individual results
print(f"\n📋 Individual Validation Results ({len(report.results)} total):")
# Numbering starts at 1 for readability in the printed list.
for i, result in enumerate(report.results, 1):
    status = "✅" if result.is_valid else "❌"
    print(f"  {i}. {status} {result.rule_name}: {result.message}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Composable Validation Pipeline
# MAGIC 
# MAGIC Now let's create a functional validation pipeline that composes multiple validation rules and produces comprehensive reports.

In [None]:
# COMMAND ----------

# Data constraint validation functions
def validate_not_null(df: DataFrame, columns: List[str]) -> List[ValidationResult]:
    """
    Pure function to validate that specified columns don't contain null values.

    Args:
        df: DataFrame to inspect.
        columns: Names of the columns that must not contain nulls.

    Returns:
        One ValidationResult per entry of ``columns``, in the same order:
        ERROR when the column is missing or holds nulls, INFO otherwise.
    """
    results = []
    existing_columns = set(df.columns)
    # The row total does not change between columns; it is fetched at most
    # once, and only if some column actually has nulls to report.
    total_count = None

    for col_name in columns:
        if col_name not in existing_columns:
            results.append(ValidationResult(
                is_valid=False,
                level=ValidationLevel.ERROR,
                message=f"Column '{col_name}' does not exist",
                rule_name="not_null",
                details={"column": col_name}
            ))
            continue

        null_count = df.filter(F.col(col_name).isNull()).count()

        if null_count > 0:
            if total_count is None:
                total_count = df.count()
            results.append(ValidationResult(
                is_valid=False,
                level=ValidationLevel.ERROR,
                message=f"Column '{col_name}' contains {null_count} null values out of {total_count} records",
                rule_name="not_null",
                failed_records=null_count,
                details={"column": col_name, "null_count": null_count, "total_count": total_count}
            ))
        else:
            results.append(ValidationResult(
                is_valid=True,
                level=ValidationLevel.INFO,
                message=f"Column '{col_name}' has no null values",
                rule_name="not_null",
                details={"column": col_name}
            ))

    return results

def validate_range(df: DataFrame, column: str, min_val: Optional[float] = None,
                  max_val: Optional[float] = None) -> ValidationResult:
    """
    Pure function to validate that numeric column values fall within specified range.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
        min_val: Inclusive lower bound, or ``None`` for no lower bound.
        max_val: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        An ERROR result when the column is missing or at least one value lies
        below ``min_val`` or above ``max_val``; an INFO result when every
        checked value is in range or when no bound was supplied.
    """
    if column not in df.columns:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' does not exist",
            rule_name="range_validation",
            details={"column": column}
        )

    # Each entry describes a way of being out of range; they are OR-ed below.
    conditions = []
    range_description = []

    if min_val is not None:
        conditions.append(F.col(column) < min_val)
        range_description.append(f"< {min_val}")

    if max_val is not None:
        conditions.append(F.col(column) > max_val)
        range_description.append(f"> {max_val}")

    if not conditions:
        return ValidationResult(
            is_valid=True,
            level=ValidationLevel.INFO,
            message=f"No range constraints specified for '{column}'",
            rule_name="range_validation"
        )

    # Find records outside the valid range
    invalid_condition = reduce(lambda a, b: a | b, conditions)
    invalid_count = df.filter(invalid_condition).count()

    if invalid_count > 0:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' has {invalid_count} values outside valid range ({' or '.join(range_description)})",
            rule_name="range_validation",
            failed_records=invalid_count,
            details={
                "column": column,
                "invalid_count": invalid_count,
                # The total is only reported on failure, so the extra count
                # job is run here instead of on every call.
                "total_count": df.count(),
                "min_val": min_val,
                "max_val": max_val
            }
        )

    return ValidationResult(
        is_valid=True,
        level=ValidationLevel.INFO,
        message=f"All values in '{column}' are within valid range",
        rule_name="range_validation",
        details={"column": column, "min_val": min_val, "max_val": max_val}
    )

def validate_regex_pattern(df: DataFrame, column: str, pattern: str,
                          description: str = "") -> ValidationResult:
    """
    Validate that the values of a string column match a regex pattern.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose values are checked.
        pattern: Regular expression passed to ``Column.rlike``.
        description: Optional human-readable label for the rule. When empty,
            messages quote the raw pattern instead.

    Returns:
        An ERROR-level ValidationResult if the column does not exist or at
        least one row fails the pattern (``failed_records`` then holds the
        number of offending rows); otherwise an INFO-level passing result.

    Note:
        Null values are not counted as mismatches: ``rlike`` on NULL yields
        NULL, its negation is still NULL, and ``filter`` only keeps rows whose
        predicate is true. Pair this rule with a not-null check if nulls must
        be rejected.
    """
    rule_name = "regex_pattern"

    if column not in df.columns:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message=f"Column '{column}' does not exist",
            rule_name=rule_name,
            details={"column": column}
        )

    # Count records that don't match the pattern
    invalid_count = df.filter(~F.col(column).rlike(pattern)).count()
    rule_desc = description or f"pattern '{pattern}'"

    if invalid_count == 0:
        return ValidationResult(
            is_valid=True,
            level=ValidationLevel.INFO,
            message=f"All values in '{column}' match {rule_desc}",
            rule_name=rule_name,
            details={"column": column, "pattern": pattern, "description": description}
        )

    # The total row count is only reported for failures, so the extra Spark
    # job is triggered here rather than on every call.
    total_count = df.count()

    return ValidationResult(
        is_valid=False,
        level=ValidationLevel.ERROR,
        message=f"Column '{column}' has {invalid_count} values that don't match {rule_desc}",
        rule_name=rule_name,
        failed_records=invalid_count,
        details={
            "column": column,
            "pattern": pattern,
            "description": description,
            "invalid_count": invalid_count,
            "total_count": total_count
        }
    )

# Test constraint validation functions
# Every status icon below is derived from the actual result so that a failing
# validation is never reported with a success marker.
print("Testing constraint validation functions...")

# Test not null validation
null_results = validate_not_null(valid_df, ["employee_id", "name", "email"])
for result in null_results:
    status = "✅" if result.is_valid else "❌"
    print(f"{status} Not null test for {result.details.get('column')}: {result.message}")

# Test range validation
age_range_result = validate_range(valid_df, "age", min_val=18, max_val=65)
status = "✅" if age_range_result.is_valid else "❌"
print(f"{status} Age range test: {age_range_result.message}")

salary_range_result = validate_range(valid_df, "salary", min_val=0)
status = "✅" if salary_range_result.is_valid else "❌"
print(f"{status} Salary range test: {salary_range_result.message}")

# Test regex pattern validation
email_pattern_result = validate_regex_pattern(
    valid_df, "email", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
    "valid email format"
)
status = "✅" if email_pattern_result.is_valid else "❌"
print(f"{status} Email pattern test: {email_pattern_result.message}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Data Constraint Validation
# MAGIC 
# MAGIC Beyond schema validation, we need to validate data content against business rules and constraints. Let's implement functional constraint validation patterns.

In [None]:
# COMMAND ----------

# Core schema validation functions
def validate_columns_exist(df: DataFrame, expected_columns: List[str]) -> ValidationResult:
    """
    Check that every required column name is present in the DataFrame.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        expected_columns: Names that must all be present.

    Returns:
        An INFO-level passing ValidationResult when nothing is missing,
        otherwise an ERROR-level result whose message and
        ``details["missing_columns"]`` list the absent names in sorted order.
    """
    present = set(df.columns)
    # De-duplicate through a set so a name requested twice is reported once.
    absent = sorted({name for name in expected_columns if name not in present})

    if not absent:
        return ValidationResult(
            is_valid=True,
            level=ValidationLevel.INFO,
            message=f"All {len(expected_columns)} required columns present",
            rule_name="columns_exist"
        )

    return ValidationResult(
        is_valid=False,
        level=ValidationLevel.ERROR,
        message=f"Missing required columns: {absent}",
        rule_name="columns_exist",
        details={"missing_columns": absent}
    )

def validate_schema_compatibility(df: DataFrame, expected_schema: StructType) -> ValidationResult:
    """
    Validate that a DataFrame's schema is compatible with an expected schema.

    Two checks are performed: every expected column must exist, and every
    column present in both schemas must have a data type accepted by
    ``_are_types_compatible``. Columns that exist only in ``df`` are ignored,
    and only ``dataType`` is compared (nullability is not checked).

    Args:
        df: DataFrame whose schema is inspected.
        expected_schema: Schema the DataFrame is expected to satisfy.

    Returns:
        An INFO-level passing ValidationResult when no issue is found,
        otherwise an ERROR-level result whose message joins all issues with
        "; " and whose ``details["issues"]`` holds them as a list.
    """
    expected_types = {field.name: field.dataType for field in expected_schema.fields}
    actual_types = {field.name: field.dataType for field in df.schema.fields}

    issues = []

    # Check missing columns
    missing_cols = sorted(expected_types.keys() - actual_types.keys())
    if missing_cols:
        issues.append(f"Missing columns: {missing_cols}")

    # Check type compatibility for common columns. The names are sorted so the
    # resulting message is deterministic; iterating the raw set would let the
    # order of reported mismatches change between runs.
    type_mismatches = [
        f"{col_name}: expected {expected_types[col_name]}, got {actual_types[col_name]}"
        for col_name in sorted(expected_types.keys() & actual_types.keys())
        if not _are_types_compatible(actual_types[col_name], expected_types[col_name])
    ]
    if type_mismatches:
        issues.append(f"Type mismatches: {type_mismatches}")

    if issues:
        return ValidationResult(
            is_valid=False,
            level=ValidationLevel.ERROR,
            message="; ".join(issues),
            rule_name="schema_compatibility",
            details={"issues": issues}
        )

    return ValidationResult(
        is_valid=True,
        level=ValidationLevel.INFO,
        message="Schema is compatible",
        rule_name="schema_compatibility"
    )

def _are_types_compatible(actual: DataType, expected: DataType) -> bool:
    """
    Decide whether an actual column type satisfies an expected one.

    Equal types are always compatible. Beyond that, a fixed list of
    (actual, expected) class pairs is accepted; the pairs are directional,
    so the reverse of a listed pair is rejected.
    """
    if not (actual == expected):
        observed_pair = (type(actual), type(expected))
        # (actual class, expected class) combinations tolerated as implicit conversions
        accepted_pairs = (
            (IntegerType, LongType),
            (FloatType, DoubleType),
            (StringType, VarcharType),
        )
        return observed_pair in accepted_pairs

    return True

# Test the schema validation functions
# Icons are computed from each result instead of being hard-coded, so the
# printed marker always agrees with the is_valid value shown next to it.
print("Testing schema validation functions...")

# Test with valid DataFrame
result1 = validate_columns_exist(valid_df, ["employee_id", "name", "email", "age"])
print(f"{'✅' if result1.is_valid else '❌'} Valid columns test: {result1.is_valid} - {result1.message}")

# Test with missing columns (this validation is expected to fail)
result2 = validate_columns_exist(valid_df, ["employee_id", "name", "missing_column"])
print(f"{'✅' if result2.is_valid else '❌'} Missing columns test: {result2.is_valid} - {result2.message}")

# Test schema compatibility
result3 = validate_schema_compatibility(valid_df, expected_schema)
print(f"{'✅' if result3.is_valid else '❌'} Schema compatibility test: {result3.is_valid} - {result3.message}")

In [None]:
# COMMAND ----------

# Validation result classes for functional error handling
from dataclasses import dataclass
from enum import Enum
from typing import NamedTuple

class ValidationLevel(Enum):
    """Severity attached to a validation outcome; each value is its own name as a string."""

    # Reported for hard failures.
    ERROR = "ERROR"
    # Reported for findings that are tallied separately from failures.
    WARNING = "WARNING"
    # Reported for passing or purely informational outcomes.
    INFO = "INFO"

@dataclass(frozen=True)
class ValidationResult:
    """
    Immutable outcome of a single validation rule.

    The dataclass is frozen, so fields cannot be reassigned after
    construction; use ``dataclasses.replace`` to derive a modified copy.
    Note that the ``details`` dictionary itself is an ordinary dict and is
    not protected against in-place mutation.

    Attributes are documented inline below.
    """
    # Whether the rule passed.
    is_valid: bool
    # Severity of the outcome.
    level: ValidationLevel
    # Human-readable description of the outcome.
    message: str
    # Identifier of the rule that produced this result.
    rule_name: str
    # Number of offending records, when the rule counts them.
    failed_records: Optional[int] = None
    # Extra structured context; normalised to an empty dict when omitted.
    details: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Replace a missing ``details`` with a fresh empty dict."""
        if self.details is None:
            # A frozen dataclass rejects normal attribute assignment, so the
            # default is installed through object.__setattr__ during init.
            object.__setattr__(self, "details", {})

@dataclass(frozen=True)
class ValidationReport:
    """
    Immutable collection of validation results with summary statistics.

    The dataclass is frozen, so the summary fields cannot be reassigned once
    the report is built. Prefer ``from_results`` over the raw constructor so
    the counters always agree with ``results``.
    """
    # The individual results covered by this report.
    results: List[ValidationResult]
    # Number of results in the report.
    total_validations: int
    # Results with is_valid == True.
    passed_validations: int
    # Results that are invalid AND at ERROR level.
    failed_validations: int
    # Results at WARNING level, regardless of is_valid.
    warnings: int

    @classmethod
    def from_results(cls, results: List[ValidationResult]) -> 'ValidationReport':
        """
        Build a report and compute its summary statistics from ``results``.

        The input is copied into a new list first, so any iterable is accepted
        and later changes to the caller's list do not alter the report.
        Invalid results below ERROR level are counted in neither ``passed``
        nor ``failed``, so the two counters need not add up to the total.
        """
        snapshot = list(results)
        total = len(snapshot)
        passed = sum(1 for r in snapshot if r.is_valid)
        failed = sum(1 for r in snapshot if not r.is_valid and r.level == ValidationLevel.ERROR)
        warnings = sum(1 for r in snapshot if r.level == ValidationLevel.WARNING)

        return cls(
            results=snapshot,
            total_validations=total,
            passed_validations=passed,
            failed_validations=failed,
            warnings=warnings
        )

    @property
    def is_valid(self) -> bool:
        """Check if all validations passed (warnings don't fail validation)."""
        return self.failed_validations == 0

    def print_summary(self) -> None:
        """Print the counters, followed by one line per invalid or WARNING-level result."""
        status = "✅ PASSED" if self.is_valid else "❌ FAILED"
        print(f"\n{status} - Validation Report Summary")
        print(f"Total Validations: {self.total_validations}")
        print(f"Passed: {self.passed_validations}")
        print(f"Failed: {self.failed_validations}")
        print(f"Warnings: {self.warnings}")

        # The details section is only shown when there is something to report.
        if not self.is_valid or self.warnings > 0:
            print("\nDetails:")
            for result in self.results:
                if not result.is_valid or result.level == ValidationLevel.WARNING:
                    icon = "⚠️" if result.level == ValidationLevel.WARNING else "❌"
                    print(f"  {icon} {result.rule_name}: {result.message}")

print("✅ Validation result classes defined")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Functional Schema Validation Patterns
# MAGIC 
# MAGIC Let's implement pure functions for various types of schema validation. These functions return validation results without side effects.

In [None]:
# COMMAND ----------

# Create sample data for validation demonstrations
def create_sample_data():
    """
    Build the fixtures used by the validation examples.

    Relies on the module-level ``spark`` session to materialise the valid
    DataFrame.

    Returns:
        A 3-tuple ``(valid_df, invalid_data_scenarios, valid_schema)``:
        the DataFrame of well-formed employee rows, a dict mapping a scenario
        name to raw row tuples that violate some rule, and the StructType the
        valid DataFrame was created with.
    """
    # Schema shared by the valid rows; every column is declared non-nullable.
    column_specs = [
        ("employee_id", IntegerType()),
        ("name", StringType()),
        ("email", StringType()),
        ("age", IntegerType()),
        ("department", StringType()),
        ("salary", DoubleType()),
        ("hire_date", StringType()),
    ]
    employee_schema = StructType(
        [StructField(col_name, col_type, False) for col_name, col_type in column_specs]
    )

    # Valid employee data
    employee_rows = [
        (1, "Alice Johnson", "alice@company.com", 28, "Engineering", 75000.0, "2020-01-15"),
        (2, "Bob Smith", "bob@company.com", 32, "Marketing", 65000.0, "2019-03-10"),
        (3, "Carol Davis", "carol@company.com", 29, "Engineering", 80000.0, "2021-06-01"),
        (4, "David Wilson", "david@company.com", 35, "Sales", 70000.0, "2018-11-20"),
    ]
    employees_df = spark.createDataFrame(employee_rows, employee_schema)

    # Invalid data scenarios for validation testing (raw tuples, not DataFrames)
    scenarios = {}

    # Rows that lack the salary and hire_date columns
    scenarios["missing_columns"] = [
        (1, "Alice Johnson", "alice@company.com", 28, "Engineering"),
        (2, "Bob Smith", "bob@company.com", 32, "Marketing"),
    ]

    # IDs, ages and salaries supplied as strings
    scenarios["wrong_types"] = [
        ("1", "Alice Johnson", "alice@company.com", "28", "Engineering", "75000", "2020-01-15"),
        ("2", "Bob Smith", "bob@company.com", "32", "Marketing", "65000", "2019-03-10"),
    ]

    # A null name in the first row and a null email in the second
    scenarios["null_violations"] = [
        (1, None, "alice@company.com", 28, "Engineering", 75000.0, "2020-01-15"),
        (2, "Bob Smith", None, 32, "Marketing", 65000.0, "2019-03-10"),
    ]

    # Multiple violations per row
    scenarios["constraint_violations"] = [
        (1, "Alice Johnson", "invalid-email", -5, "InvalidDept", -10000.0, "invalid-date"),
        (2, "Bob Smith", "bob@company.com", 150, "Marketing", 1000000.0, "2019-13-45"),
    ]

    return employees_df, scenarios, employee_schema

# Generate sample data
# The three values are bound at module level: the valid DataFrame, the dict of
# invalid raw-row scenarios, and the schema used to build the valid DataFrame.
valid_df, invalid_scenarios, expected_schema = create_sample_data()

print("✅ Sample data created for validation demonstrations")
# truncate=False prints full cell contents instead of shortening long strings.
valid_df.show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Schema Validation Fundamentals
# MAGIC 
# MAGIC Schema validation ensures that DataFrames conform to expected structure, data types, and constraints. Let's start with functional approaches to schema validation.
# MAGIC 
# MAGIC ### Core Validation Types
# MAGIC 
# MAGIC 1. **Structural Validation**: Column presence, order, and naming
# MAGIC 2. **Type Validation**: Data type compatibility and conversion
# MAGIC 3. **Constraint Validation**: Null checks, value ranges, format patterns
# MAGIC 4. **Relationship Validation**: Foreign key constraints and referential integrity

In [None]:
# COMMAND ----------

# Essential imports for DataFrame and schema validation
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.utils import AnalysisException
from typing import Dict, List, Tuple, Optional, Callable, Any, Union
from functools import reduce, partial
import json
from datetime import datetime
import re

# Initialize Spark session (if not already available)
# getOrCreate() returns the already-active session when there is one, so this
# line does not replace an existing session.
spark = SparkSession.builder.appName("DataFrameSchemaValidation").getOrCreate()

print("✅ Setup complete - Ready for DataFrame and schema validation!")

# Databricks notebook source
# MAGIC %md
# MAGIC # 3.2 Validating DataFrames and Schemas
# MAGIC 
# MAGIC This notebook demonstrates functional approaches to DataFrame and schema validation in PySpark. We'll explore various validation strategies that maintain functional programming principles while ensuring data quality and integrity.
# MAGIC 
# MAGIC ## Learning Objectives
# MAGIC 
# MAGIC By the end of this notebook, you will understand how to:
# MAGIC - Implement functional schema validation patterns
# MAGIC - Create reusable validation functions
# MAGIC - Handle validation errors gracefully
# MAGIC - Build validation pipelines with pure functions
# MAGIC - Use Great Expectations with PySpark
# MAGIC - Create custom validation rules
# MAGIC 
# MAGIC ## Prerequisites
# MAGIC 
# MAGIC - Understanding of PySpark DataFrames
# MAGIC - Knowledge of functional programming concepts
# MAGIC - Familiarity with schema definitions