In [0]:
# Databricks notebook source
# --------------------------------------------
# Library: Helper utilities
# Author: Ed Ball (via ChatGPT)
# Purpose:
#   - Provide reusable helper functions for common warehouse actions
# --------------------------------------------


In [0]:
from pyspark.sql.functions import col, lit, current_timestamp

In [0]:
def merge_place_canonicals(
    source_canonical_id: str,
    target_canonical_id: str,
    merge_reason: str,
    merged_by: str = None,
    dry_run: bool = False
):
    """Merge one canonical place record into another.

    Validates both canonical IDs against ``genealogy.silver_place_canonical``,
    then (unless ``dry_run`` is set) performs three writes in order:

    1. marks the source canonical row inactive and records the target,
       reason and timestamp on it;
    2. repoints every ``genealogy.silver_place_variant`` row from the source
       canonical ID to the target canonical ID;
    3. appends a row to ``genealogy.silver_place_merge_log``.

    All caller-supplied values are bound through named parameter markers
    rather than interpolated into the SQL text, so free-text values such as
    a ``merge_reason`` containing an apostrophe cannot break or alter the
    statements.

    NOTE(review): binding Python values via ``spark.sql(..., args=...)``
    follows the PySpark 3.5+ semantics of ``args`` -- confirm the cluster
    runtime supports it.
    NOTE(review): the three statements are issued separately; if a later one
    fails, the earlier ones are not rolled back by this function.

    Args:
        source_canonical_id: ID of the canonical place to retire.
        target_canonical_id: ID of the canonical place that survives.
        merge_reason: Non-blank explanation recorded on the source row and
            in the merge log.
        merged_by: Identifier of who performed the merge; recorded as
            ``"unknown"`` when omitted or empty.
        dry_run: When true, only print what would change and write nothing.

    Returns:
        None.

    Raises:
        ValueError: If the IDs are equal, ``merge_reason`` is blank, either
            canonical does not exist, or either canonical is not active.
    """
    if source_canonical_id == target_canonical_id:
        raise ValueError("Source and target canonical IDs must be different.")

    if not merge_reason or merge_reason.strip() == "":
        raise ValueError("merge_reason is required and must be meaningful.")

    merged_by = merged_by or "unknown"

    canonical = spark.table("genealogy.silver_place_canonical")

    source = canonical.filter(col("canonical_place_id") == source_canonical_id).collect()
    target = canonical.filter(col("canonical_place_id") == target_canonical_id).collect()

    if not source:
        raise ValueError(f"Source canonical {source_canonical_id} does not exist.")

    if not target:
        raise ValueError(f"Target canonical {target_canonical_id} does not exist.")

    if not source[0]["is_active"]:
        raise ValueError(f"Source canonical {source_canonical_id} is already inactive.")

    if not target[0]["is_active"]:
        raise ValueError(f"Target canonical {target_canonical_id} is not active.")

    print(f"Merging canonical place {source_canonical_id} → {target_canonical_id}")
    print(f"Dry run: {dry_run}")

    if dry_run:
        # Preview impact. The predicate is a column expression (not a
        # string-built filter) so the ID is never parsed as SQL text.
        variant_count = (
            spark.table("genealogy.silver_place_variant")
            .filter(col("canonical_place_id") == source_canonical_id)
            .count()
        )

        print(f"Would reassign {variant_count} variants")
        source_name = source[0]["preferred_name"]
        target_name = target[0]["preferred_name"]
        print(f"Would merge source canonical record {source_name} into target canonical record {target_name}")

        return

    # --- Step 1: retire source canonical
    spark.sql(
        """
        UPDATE genealogy.silver_place_canonical
        SET
          is_active = false,
          merged_into_canonical_id = :target_id,
          merge_reason = :merge_reason,
          merge_timestamp = current_timestamp()
        WHERE canonical_place_id = :source_id
        """,
        args={
            "source_id": source_canonical_id,
            "target_id": target_canonical_id,
            "merge_reason": merge_reason,
        },
    )

    # --- Step 2: repoint variants
    spark.sql(
        """
        UPDATE genealogy.silver_place_variant
        SET canonical_place_id = :target_id
        WHERE canonical_place_id = :source_id
        """,
        args={
            "source_id": source_canonical_id,
            "target_id": target_canonical_id,
        },
    )

    # --- Step 3: write merge log
    # Values are positional: the order below must match the log table's
    # column order (the table definition is not visible from this notebook).
    spark.sql(
        """
        INSERT INTO genealogy.silver_place_merge_log
        VALUES (
          :source_id,
          :target_id,
          :merge_reason,
          :merged_by,
          current_timestamp()
        )
        """,
        args={
            "source_id": source_canonical_id,
            "target_id": target_canonical_id,
            "merge_reason": merge_reason,
            "merged_by": merged_by,
        },
    )

    print(f"✅ Merged canonical {source_canonical_id} → {target_canonical_id}")
