In [None]:
# Common configuration
# Sets the session-wide Parquet read-side datetime rebase mode to CORRECTED.
# NOTE(review): `spark` is never created in the visible notebook, so it is
# presumably the session object injected by the notebook runtime - confirm.
# This line also runs before the imports below, so it relies on that global.
spark.conf.set("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")

# Common imports

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, when, year
import logging

In [None]:
# Configure logging
# INFO-level basic configuration plus the module-level `logger` that
# build_parks_table() and validate_parks_table() below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def build_parks_table():
    """Build the parks table from the Bronze Confirm tables and write it to Silver.

    Steps:
      1. Load the feature-type, area and park-name (attrib type 'PKAN')
         reference tables from Bronze, cache them and register them as the
         temp views `feature_types`, `suburbs` and `park_names`.
      2. Join them onto `confirm_feature` (feature type codes starting with
         'pk', case-insensitive) to produce the parks DataFrame.
      3. Display a 10-row preview and run validate_parks_table() on the result.
      4. Overwrite the Delta table DIMSU_Lakehouse_Silver.asset.confirm_parks,
         partitioned by `Service area`.

    Returns:
        str: "Successfully processed <n> park records".

    Raises:
        Exception: Any failure is logged and re-raised unchanged. Every frame
        cached by this function is released before the exception propagates.
    """
    # Everything cached here is tracked so the finally-block can release it
    # even when a later step (validation, write) fails part-way through.
    cached_frames = []
    try:
        logger.info("Starting parks table transformation")

        # Step 1: Load and cache reference tables
        logger.info("Loading reference tables")

        feature_types_df = spark.sql("""
            SELECT feature_type_code, feature_type_name 
            FROM DIMSU_Lakehouse_Bronze.assets.confirm_feature_type
        """).cache()
        cached_frames.append(feature_types_df)

        suburbs_df = spark.sql("""
            SELECT area_code, area_name 
            FROM DIMSU_Lakehouse_Bronze.assets.confirm_area
        """).cache()
        cached_frames.append(suburbs_df)

        park_names_df = spark.sql("""
            SELECT site_code, plot_number, feat_attrib_notes
            FROM DIMSU_Lakehouse_Bronze.assets.confirm_feat_attrib_type
            WHERE attrib_type_code = 'PKAN'
        """).cache()
        cached_frames.append(park_names_df)

        # Register as temp views for SQL operations
        feature_types_df.createOrReplaceTempView("feature_types")
        suburbs_df.createOrReplaceTempView("suburbs")
        park_names_df.createOrReplaceTempView("park_names")

        # Step 2: Create Parks DataFrame with all transformations
        logger.info("Creating Parks table")

        # Cached because the result is evaluated several times below (preview,
        # validation counts, row count, write). Without the cache each of
        # those actions re-runs the full three-way join.
        parks_df = spark.sql("""
            SELECT 
                f.site_code AS `Site code`,
                f.plot_number AS `Plot No.`,
                f.feature_deadflag AS `Dead flag`,
                f.contract_area_code AS `Contract area code`,
                f.feature_type_code AS `Park type code`,
                f.central_asset_id AS `Central asset ID`,
                f.feature_start_date AS `Park added`,
                CASE 
                    WHEN f.contract_area_code = 'PKN' THEN 'North' 
                    ELSE 'South' 
                END AS `Service area`,
                ft.feature_type_name AS `Park type`,
                pn.feat_attrib_notes AS `Park name`,
                s.area_name AS `Suburb`,
                concat(f.site_code, '-', cast(f.plot_number as string)) AS `Site and plot`,
                year(f.feature_start_date) AS `Year added`
            FROM DIMSU_Lakehouse_Bronze.assets.confirm_feature f
            JOIN feature_types ft ON f.feature_type_code = ft.feature_type_code
            LEFT JOIN park_names pn ON f.site_code = pn.site_code AND f.plot_number = pn.plot_number
            LEFT JOIN suburbs s ON f.area_code = s.area_code
            WHERE lower(f.feature_type_code) LIKE 'pk%'
        """).cache()
        cached_frames.append(parks_df)

        # Display preview for testing
        display(parks_df.limit(10))

        # Step 3: Validate the data (findings are logged as warnings and do
        # not stop the write)
        validate_parks_table(parks_df)

        # Step 4: Write to Silver lakehouse with partitioning
        row_count = parks_df.count()
        logger.info(f"Writing Parks table with {row_count} rows")

        # NOTE(review): the target schema is `asset` while every Bronze source
        # above uses `assets` - confirm this is the intended Silver schema.
        # NOTE(review): the output column names contain spaces and a dot;
        # Delta rejects such names unless column mapping is enabled on the
        # table - confirm the target is configured for it.
        parks_df.write \
            .partitionBy("Service area") \
            .format("delta") \
            .mode("overwrite") \
            .saveAsTable("DIMSU_Lakehouse_Silver.asset.confirm_parks")

        logger.info("Parks table processing complete")
        return f"Successfully processed {row_count} park records"

    except Exception as e:
        logger.error(f"Error in parks table transformation: {str(e)}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise

    finally:
        # Clean up - unpersist cached tables on success and on failure alike
        for frame in cached_frames:
            frame.unpersist()

def validate_parks_table(parks_df, min_expected_rows=10):
    """Run data-quality checks on the Parks DataFrame and log the outcome.

    Checks performed, in this order:
      * null counts for `Site code`, `Plot No.` and `Park type code`
      * `Park type code` values with no match in the `feature_types` temp view
      * a minimum row-count sanity check

    Args:
        parks_df: DataFrame exposing the three columns listed above.
        min_expected_rows: Row count below which a warning is recorded.
            Defaults to 10, the threshold that used to be hard-coded here.

    Returns:
        list[str]: One "WARNING: ..." message per failed check; empty when
        every check passes. Findings are logged, never raised.

    Note:
        Reads the session temp view `feature_types`, so that view must be
        registered before this function is called.
    """
    required_columns = ["Site code", "Plot No.", "Park type code"]
    validation_results = []

    # Column names are backtick-quoted because Spark treats "." in an unquoted
    # attribute name as a nested-field separator; `Plot No.` ends in a dot and
    # an unquoted reference to it is rejected with "syntax error in attribute
    # name". A single aggregate pass returns the total row count plus every
    # null count, instead of one full evaluation of parks_df per check.
    null_count_exprs = [
        f"count(CASE WHEN `{column}` IS NULL THEN 1 END)"
        for column in required_columns
    ]
    stats = parks_df.selectExpr("count(*)", *null_count_exprs).first()
    row_count = stats[0]
    null_counts = stats[1:]

    # Check for required values
    for column, null_count in zip(required_columns, null_counts):
        if null_count > 0:
            validation_results.append(f"WARNING: {null_count} rows have null values in {column}")

    # Check for referential integrity: the anti-join keeps only park rows
    # whose type code has no counterpart in the feature_types view.
    invalid_park_types = parks_df.select("Park type code") \
        .join(spark.table("feature_types").select("feature_type_code"),
              parks_df["Park type code"] == col("feature_type_code"),
              "left_anti") \
        .count()

    if invalid_park_types > 0:
        validation_results.append(f"WARNING: {invalid_park_types} parks have invalid park types")

    # Row count validation - just a sanity check against a near-empty result
    if row_count < min_expected_rows:
        validation_results.append(f"WARNING: Only {row_count} parks found, expected more")

    # Log validation results
    if validation_results:
        for result in validation_results:
            logger.warning(result)
    else:
        logger.info("All validations passed")

    return validation_results

# Execute the transformation
# NOTE: this runs the full pipeline, including the overwrite of
# DIMSU_Lakehouse_Silver.asset.confirm_parks, every time the cell is executed.
result = build_parks_table()
print(result)

In [5]:
# Logging setup for this exploratory cell
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reference data: each lookup is loaded, marked for caching and exposed as a
# temp view so the main query below can join against it by name.
logger.info("Loading reference tables")

# Feature type code -> feature type name
feature_types_df = spark.sql("""
    SELECT feature_type_code, feature_type_name 
    FROM DIMSU_Lakehouse_Bronze.assets.confirm_feature_type
""")
feature_types_df.cache().createOrReplaceTempView("feature_types")

# Area code -> area name
suburbs_df = spark.sql("""
    SELECT area_code, area_name 
    FROM DIMSU_Lakehouse_Bronze.assets.confirm_area
""")
suburbs_df.cache().createOrReplaceTempView("suburbs")

# Attribute notes for attrib type 'PKAN', keyed by site code and plot number
park_names_df = spark.sql("""
    SELECT site_code, plot_number, feat_attrib_notes
    FROM DIMSU_Lakehouse_Bronze.assets.confirm_feat_attrib_type
    WHERE attrib_type_code = 'PKAN'
""")
park_names_df.cache().createOrReplaceTempView("park_names")

# Parks query: features whose type code starts with 'pk' (case-insensitive),
# enriched with the three lookups registered above.
logger.info("Creating Parks table")

parks_df = spark.sql("""
    SELECT 
        f.site_code AS `Site code`,
        f.plot_number AS `Plot No.`,
        f.feature_deadflag AS `Dead flag`,
        f.contract_area_code AS `Contract area code`,
        f.feature_type_code AS `Park type code`,
        f.central_asset_id AS `Central asset ID`,
        f.feature_start_date AS `Park added`,
        CASE 
            WHEN f.contract_area_code = 'PKN' THEN 'North' 
            ELSE 'South' 
        END AS `Service area`,
        ft.feature_type_name AS `Park type`,
        pn.feat_attrib_notes AS `Park name`,
        s.area_name AS `Suburb`,
        concat(f.site_code, '-', cast(f.plot_number as string)) AS `Site and plot`,
        year(f.feature_start_date) AS `Year added`
    FROM DIMSU_Lakehouse_Bronze.assets.confirm_feature f
    JOIN feature_types ft ON f.feature_type_code = ft.feature_type_code
    LEFT JOIN park_names pn ON f.site_code = pn.site_code AND f.plot_number = pn.plot_number
    LEFT JOIN suburbs s ON f.area_code = s.area_code
    WHERE lower(f.feature_type_code) LIKE 'pk%'
""")

# Ten-row preview only; nothing is written from this cell
display(parks_df.limit(10))

StatementMeta(, a4e908d3-b6af-4fca-84ad-7bb4c07a848a, 7, Finished, Available, Finished)

INFO:__main__:Loading reference tables
INFO:__main__:Creating Parks table


SynapseWidget(Synapse.DataFrame, db7285cb-2c9a-4e66-9215-b325d8a3eb1a)

In [None]:
# Testing cached DFs - They persist across cells in the session!
# NOTE(review): the only module-level `feature_types_df` is the one created by
# the exploratory cell above (the frame inside build_parks_table() is local),
# so this cell raises NameError on a fresh kernel unless that cell ran first.
# NOTE(review): build_parks_table() calls unpersist() on an identically
# defined frame; whether the cache is still populated here afterwards should
# be confirmed rather than assumed.
display(feature_types_df.limit(5))  # This will use the cached version