# Silver Pipeline - Databricks Notebook

This notebook implements the Silver layer pipeline for the Medallion architecture (Bronze → Silver → Gold).
The pipeline processes data through three sequential stages:

- **Silver_A**: Schema selection based on configurable business rules
- **Silver_B**: Data transformations with configurable business logic  
- **Silver_C**: Deduplication and filtering (placeholder for future enhancements)

## Setup and Configuration

In [None]:
# Import required libraries
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StructType, StructField, StringType
from typing import Dict, List, Any
import logging

# Import pipeline modules
from utils.config_loader import config_loader
import silver_pipeline_stages as stages

# Configure logging once for the notebook and expose a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the pipeline configuration and the (source, lob, domain) combinations
# that every Silver stage below iterates over
config = config_loader.load_config()
source_combinations = config_loader.get_source_combinations()

print("Configuration loaded successfully")
print(f"Processing {len(source_combinations)} source combinations:")
for source, lob, domain in source_combinations:
    combination_path = f"{source}/{lob}/{domain}"
    print(f"  - {combination_path}")

## Pipeline Configuration

In [None]:
# Resolve the year_month parameter (YYYYMM) used for the Bronze table lookup.
#
# The value is read from a Databricks notebook widget named "year_month" so
# that jobs and interactive users can override it without editing the code.
# When the widget is left empty, or when `dbutils` is not available (e.g. the
# code is executed outside a Databricks runtime), the default below is used.
DEFAULT_YEAR_MONTH = "202407"  # Modify this as needed

try:
    dbutils.widgets.text("year_month", DEFAULT_YEAR_MONTH, "Year-month (YYYYMM)")
    year_month = dbutils.widgets.get("year_month").strip() or DEFAULT_YEAR_MONTH
except NameError:
    # `dbutils` is injected by Databricks; fall back to the default elsewhere
    year_month = DEFAULT_YEAR_MONTH

# Fail fast on a malformed value: it is interpolated into a Bronze table name,
# so a typo would otherwise surface later as a "table does not exist" warning.
year_month_is_valid = (
    len(year_month) == 6
    and year_month.isascii()
    and year_month.isdigit()
    and 1 <= int(year_month[4:]) <= 12
)
if not year_month_is_valid:
    raise ValueError(
        f"year_month must be a YYYYMM string with month 01-12, got {year_month!r}"
    )

print(f"Using year_month: {year_month}")
print(f"Bronze schema: {config.bronze_schema}")
print(f"Silver schema: {config.silver_schema}")
print(f"Schema rules table: {config.schema_rules_table}")
print(f"Transform rules table: {config.transform_rules_table}")

## Silver_A Stage - Schema Selection

This stage reads Bronze tables and applies schema selection rules to determine which fields to include in Silver_A.

In [None]:
def process_silver_a(source: str, lob: str, domain: str, year_month: str) -> DataFrame:
    """
    Build the Silver_A DataFrame for one source/lob/domain combination.

    Resolves the Bronze table name for the given year-month, reads that table,
    looks up the schema rules for the combination and hands both to
    ``stages.apply_schema_selection``. Progress is reported with ``print``.

    Args:
        source: Source identifier
        lob: Line of business
        domain: Domain (pharmacy, medical, member)
        year_month: Year-month string (YYYYMM)

    Returns:
        The DataFrame returned by ``stages.apply_schema_selection``. If the
        Bronze table does not exist, an empty DataFrame with a single nullable
        string column named ``placeholder`` is returned instead.
    """
    print(f"Processing Silver_A for {source}/{lob}/{domain}")

    # Resolve which Bronze table feeds this combination
    input_table = config_loader.get_bronze_table_name(source, lob, domain, year_month)
    print(f"  Bronze table: {input_table}")

    # Bail out early with a placeholder result when there is nothing to read
    table_present = stages.validate_bronze_table_exists(input_table)
    if not table_present:
        print(f"  WARNING: Bronze table {input_table} does not exist")
        placeholder_field = StructField("placeholder", StringType(), True)
        return spark.createDataFrame([], StructType([placeholder_field]))

    # Load the Bronze data
    raw_df = spark.table(input_table)
    raw_row_count = raw_df.count()
    print(f"  Bronze table row count: {raw_row_count}")

    # Look up and list the schema rules for this combination
    rules = stages.get_schema_rules(config, source, lob, domain)
    print(f"  Found {len(rules)} schema rules")
    for rule in rules or ():
        print(f"    - {rule['field_name']} ({rule['data_type']}) [Required: {rule['is_required']}]")

    # Select the Silver_A fields according to the rules
    selected_df = stages.apply_schema_selection(raw_df, rules)
    print(f"  Silver_A table columns: {selected_df.columns}")
    print(f"  Silver_A table row count: {selected_df.count()}")

    return selected_df

# Run the Silver_A stage for every configured combination, keeping each result
# in memory (keyed by table name suffix) and persisting it to the Silver schema.
silver_a_tables = {}

for source, lob, domain in source_combinations:
    table_key = f"{source}_{lob}_{domain}_silver_a"
    silver_a_df = process_silver_a(source, lob, domain, year_month)
    silver_a_tables[table_key] = silver_a_df

    # Overwrite the Silver_A table with the freshly computed result.
    # NOTE(review): process_silver_a returns a one-column "placeholder"
    # DataFrame when the Bronze table is missing, and that placeholder is
    # written here as well — confirm this is intended.
    silver_a_table_name = f"{config.silver_schema}.{table_key}"
    overwrite_writer = silver_a_df.write.mode("overwrite")
    overwrite_writer.saveAsTable(silver_a_table_name)
    print(f"  Saved Silver_A table: {silver_a_table_name}")
    print("=" * 50)

## Silver_B Stage - Business Transformations

This stage reads Silver_A tables and applies business transformation rules.

In [None]:
def process_silver_b(source: str, lob: str, domain: str) -> DataFrame:
    """
    Build the Silver_B DataFrame for one source/lob/domain combination.

    Takes the in-memory Silver_A result for the combination from
    ``silver_a_tables``, looks up the transformation rules and hands both to
    ``stages.apply_transformations``. Progress is reported with ``print``.

    Args:
        source: Source identifier
        lob: Line of business
        domain: Domain (pharmacy, medical, member)

    Returns:
        The DataFrame returned by ``stages.apply_transformations``. If no
        Silver_A result is registered for the combination, an empty DataFrame
        with a single nullable string column named ``placeholder`` is returned
        instead.
    """
    print(f"Processing Silver_B for {source}/{lob}/{domain}")

    # Fetch the upstream Silver_A result produced earlier in this notebook
    upstream_key = f"{source}_{lob}_{domain}_silver_a"
    try:
        upstream_df = silver_a_tables[upstream_key]
    except KeyError:
        print(f"  ERROR: Silver_A table not found for {source}/{lob}/{domain}")
        placeholder_field = StructField("placeholder", StringType(), True)
        return spark.createDataFrame([], StructType([placeholder_field]))

    print(f"  Silver_A row count: {upstream_df.count()}")

    # Look up and list the transformation rules for this combination
    rules = stages.get_transform_rules(config, source, lob, domain)
    print(f"  Found {len(rules)} transformation rules")
    for rule in rules or ():
        print(f"    - {rule['field_name']}: {rule['transform']} (Priority: {rule['priority']})")

    # Run the configured transformations
    transformed_df = stages.apply_transformations(upstream_df, rules)
    print(f"  Silver_B table columns: {transformed_df.columns}")
    print(f"  Silver_B table row count: {transformed_df.count()}")

    return transformed_df

# Run the Silver_B stage for every configured combination, keeping each result
# in memory (keyed by table name suffix) and persisting it to the Silver schema.
silver_b_tables = {}

for source, lob, domain in source_combinations:
    table_key = f"{source}_{lob}_{domain}_silver_b"
    silver_b_df = process_silver_b(source, lob, domain)
    silver_b_tables[table_key] = silver_b_df

    # Overwrite the Silver_B table with the freshly computed result.
    # NOTE(review): process_silver_b returns a one-column "placeholder"
    # DataFrame when no Silver_A result is registered, and that placeholder is
    # written here as well — confirm this is intended.
    silver_b_table_name = f"{config.silver_schema}.{table_key}"
    overwrite_writer = silver_b_df.write.mode("overwrite")
    overwrite_writer.saveAsTable(silver_b_table_name)
    print(f"  Saved Silver_B table: {silver_b_table_name}")
    print("=" * 50)

## Silver_C Stage - Deduplication and Filtering

This stage is currently a pass-through: it forwards the Silver_B data unchanged and serves as a placeholder for future deduplication and filtering logic.

In [None]:
def process_silver_c(source: str, lob: str, domain: str) -> DataFrame:
    """
    Build the Silver_C DataFrame for one source/lob/domain combination.

    At present this stage is a pass-through: the in-memory Silver_B result for
    the combination is returned unchanged. It exists as the hook for future
    deduplication, row filtering and data quality scoring.

    Args:
        source: Source identifier
        lob: Line of business
        domain: Domain (pharmacy, medical, member)

    Returns:
        The Silver_B DataFrame registered in ``silver_b_tables`` for the
        combination. If none is registered, an empty DataFrame with a single
        nullable string column named ``placeholder`` is returned instead.
    """
    print(f"Processing Silver_C for {source}/{lob}/{domain}")

    # Fetch the upstream Silver_B result produced earlier in this notebook
    upstream_key = f"{source}_{lob}_{domain}_silver_b"
    try:
        upstream_df = silver_b_tables[upstream_key]
    except KeyError:
        print(f"  ERROR: Silver_B table not found for {source}/{lob}/{domain}")
        placeholder_field = StructField("placeholder", StringType(), True)
        return spark.createDataFrame([], StructType([placeholder_field]))

    print(f"  Silver_B row count: {upstream_df.count()}")

    # No Silver_C logic yet: the upstream DataFrame is forwarded as-is
    result_df = upstream_df

    print(f"  Silver_C table columns: {result_df.columns}")
    print(f"  Silver_C table row count: {result_df.count()}")

    return result_df

# Run the Silver_C stage for every configured combination, keeping each result
# in memory (keyed by table name suffix) and persisting it to the Silver schema.
silver_c_tables = {}

for source, lob, domain in source_combinations:
    table_key = f"{source}_{lob}_{domain}_silver_c"
    silver_c_df = process_silver_c(source, lob, domain)
    silver_c_tables[table_key] = silver_c_df

    # Overwrite the Silver_C table with the freshly computed result.
    # NOTE(review): process_silver_c returns a one-column "placeholder"
    # DataFrame when no Silver_B result is registered, and that placeholder is
    # written here as well — confirm this is intended.
    silver_c_table_name = f"{config.silver_schema}.{table_key}"
    overwrite_writer = silver_c_df.write.mode("overwrite")
    overwrite_writer.saveAsTable(silver_c_table_name)
    print(f"  Saved Silver_C table: {silver_c_table_name}")
    print("=" * 50)

## Data Quality Validation (Optional)

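When `validation_enabled` is set in the configuration, run required-field validation against the output of each Silver stage (Silver_A, Silver_B, and Silver_C) and report the resulting row counts.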

In [None]:
# Run required-field validation over every Silver stage output, but only when
# the configuration switches validation on.
if not config.validation_enabled:
    print("Data quality validation is disabled")
else:
    print("Applying data quality validation...")

    # (table-key suffix, display label, in-memory results) in pipeline order
    stage_outputs = (
        ("silver_a", "Silver_A", silver_a_tables),
        ("silver_b", "Silver_B", silver_b_tables),
        ("silver_c", "Silver_C", silver_c_tables),
    )

    for source, lob, domain in source_combinations:
        print(f"Validating {source}/{lob}/{domain}")

        for suffix, label, stage_tables in stage_outputs:
            stage_key = f"{source}_{lob}_{domain}_{suffix}"
            if stage_key not in stage_tables:
                # Nothing was produced for this stage/combination; skip it
                continue

            validated_df = stages.validate_required_fields(
                stage_tables[stage_key], config, source, lob, domain
            )
            print(f"  {label} validation passed: {validated_df.count()} rows")

## Pipeline Summary

Display a summary of the processed tables and their row counts.

In [None]:
# Print a banner-delimited summary: run parameters first, then the row count
# of every stage output per source/lob/domain combination.
banner = "=" * 60

print("\n" + banner)
print("SILVER PIPELINE EXECUTION SUMMARY")
print(banner)

print(f"Processed {len(source_combinations)} source combinations")
print(f"Year-month parameter: {year_month}")
print(f"Target schema: {config.silver_schema}")

# (table-key suffix, display label, in-memory results) in pipeline order
stage_outputs = (
    ("silver_a", "Silver_A", silver_a_tables),
    ("silver_b", "Silver_B", silver_b_tables),
    ("silver_c", "Silver_C", silver_c_tables),
)

print("\nTable Summary:")
for source, lob, domain in source_combinations:
    print(f"\n{source}/{lob}/{domain}:")

    for suffix, label, stage_tables in stage_outputs:
        stage_key = f"{source}_{lob}_{domain}_{suffix}"
        if stage_key not in stage_tables:
            print(f"  {label}: Not processed")
            continue

        # NOTE(review): count() is a Spark action and nothing in this notebook
        # caches these DataFrames, so each call likely recomputes the stage —
        # confirm whether that cost is acceptable.
        count = stage_tables[stage_key].count()
        print(f"  {label}: {count} rows")

print("\n" + banner)
print("PIPELINE EXECUTION COMPLETE")
print(banner)

## Optional: Data Exploration

Explore the processed Silver tables to verify results.

In [None]:
# Example: display sample rows from the processed tables.
# The loop below is intentionally commented out so that this cell is a no-op
# during pipeline runs; uncomment and adapt it for ad-hoc exploration.
# It reads from the in-memory `silver_c_tables` dict built by the Silver_C cell.

# for source, lob, domain in source_combinations:
#     print(f"\nSample data for {source}/{lob}/{domain}:")
#     
#     # Show Silver_C sample (final stage)
#     silver_c_key = f"{source}_{lob}_{domain}_silver_c"
#     if silver_c_key in silver_c_tables:
#         silver_c_tables[silver_c_key].show(5, truncate=False)
#     else:
#         print(f"  No data available for {silver_c_key}")

# Only live statement in this cell: a reminder that the example is disabled
print("Data exploration cell ready - uncomment code above to explore tables")