In [0]:
# Databricks notebook source
# --------------------------------------------
# Library: GEDCOM text utilities
# Author: Ed Ball (via ChatGPT)
# Purpose:
#   - Resolve CONC / CONT continuation lines
#   - Provide reusable text helpers for GEDCOM pipelines
# --------------------------------------------


In [0]:
# COMMAND ----------

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


In [0]:
# COMMAND ----------

def resolve_gedcom_continuations(
    df: DataFrame,
    *,
    file_col: str = "file_id",
    line_col: str = "line_no",
    parent_col: str = "parent_line_no",
    tag_col: str = "tag",
    value_col: str = "value",
    output_col: str = "full_text",
) -> DataFrame:
    """
    Resolve GEDCOM CONC / CONT continuation lines into a single text field.

    Rules:
      - CONC  -> append text directly
      - CONT  -> append '\\n' + text (an empty CONT still contributes '\\n')

    Design:
      - Tag-agnostic (works for NOTE, TEXT, EVEN, SOUR, OBJE, custom tags)
      - Order-preserving: fragments are concatenated in ascending order of
        `line_col`
      - Returns one row per distinct
        (file_col, line_col, parent_col, tag_col, value_col) combination
        among the non-CONC/CONT rows; any other input columns are not
        carried through
      - Does NOT mutate input DataFrame

    Notes:
      - A continuation row is attached to the owner whose `line_col` equals
        the continuation's `parent_col` within the same `file_col`.
      - Rows whose tag is NULL match neither the continuation filter nor
        its negation, so they do not appear in the result.
      - The helper column names `_continuation_line_no`,
        `_continuation_fragment` and `_continuation_text` are used
        internally.

    Parameters:
      df          : bronze GEDCOM lines DataFrame
      file_col    : column identifying the source file
      line_col    : column holding the line number of each row
      parent_col  : column holding the line number of the row's parent
      tag_col     : column holding the GEDCOM tag
      value_col   : column holding the line's text value
      output_col  : name of resolved text column

    Returns:
      DataFrame with columns file_col, line_col, parent_col, tag_col,
      value_col and output_col, and no CONC/CONT rows
    """

    # Split continuation rows from text owners
    is_continuation = F.col(tag_col).isin("CONC", "CONT")
    continuations = df.filter(is_continuation)
    owners = df.filter(~is_continuation)

    # Join owners to their continuation children (left join keeps owners
    # that have no continuation lines at all)
    joined = (
        owners.alias("o")
        .join(
            continuations.alias("c"),
            (F.col(f"c.{parent_col}") == F.col(f"o.{line_col}")) &
            (F.col(f"c.{file_col}") == F.col(f"o.{file_col}")),
            "left"
        )
    )

    child_tag = F.col(f"c.{tag_col}")
    child_value = F.col(f"c.{value_col}")

    # Apply GEDCOM semantics
    with_fragments = (
        joined
        # Expose the child's line number under a fixed name so the SQL
        # aggregation below does not depend on the caller's column naming.
        .withColumn("_continuation_line_no", F.col(f"c.{line_col}"))
        .withColumn(
            "_continuation_fragment",
            F.when(
                child_tag == "CONT",
                # concat_ws skips NULLs, so a CONT row with no value still
                # yields the newline; plain concat would return NULL here.
                F.concat_ws("", F.lit("\n"), child_value)
            ).when(
                child_tag == "CONC",
                child_value
            )
        )
    )

    # Aggregate fragments in strict line order: structs sort by their first
    # field (the line number), then the text is projected back out.
    aggregated = (
        with_fragments
        .groupBy(
            F.col(f"o.{file_col}").alias(file_col),
            F.col(f"o.{line_col}").alias(line_col),
            F.col(f"o.{parent_col}").alias(parent_col),
            F.col(f"o.{tag_col}").alias(tag_col),
            F.col(f"o.{value_col}").alias(value_col)
        )
        .agg(
            F.expr(
                """
                concat_ws(
                  '',
                  transform(
                    array_sort(
                      collect_list(
                        struct(
                          _continuation_line_no as ln,
                          _continuation_fragment as txt
                        )
                      )
                    ),
                    x -> x.txt
                  )
                )
                """
            ).alias("_continuation_text")
        )
    )

    # Final resolved text; concat_ws treats a NULL owner value as empty
    return (
        aggregated
        .withColumn(
            output_col,
            F.concat_ws("", F.col(value_col), F.col("_continuation_text"))
        )
        .drop("_continuation_text")
    )


In [0]:
# COMMAND ----------

def resolve_bronze_gedcom_text(bronze_df: DataFrame) -> DataFrame:
    """
    Resolve CONC / CONT lines using the standard bronze column names.

    Convenience wrapper around `resolve_gedcom_continuations` so silver
    notebooks do not have to repeat the column configuration.

    Parameters:
      bronze_df : bronze GEDCOM lines DataFrame

    Returns:
      Whatever `resolve_gedcom_continuations` returns for the column
      configuration below, with the resolved text in `full_text`
    """

    # Column configuration for the bronze layout, kept in one place.
    bronze_columns = {
        "file_col": "source_file",
        "line_col": "line_no",
        "parent_col": "parent_line_no",
        "tag_col": "tag",
        "value_col": "value",
        "output_col": "full_text",
    }

    return resolve_gedcom_continuations(bronze_df, **bronze_columns)
