In [0]:
from pyspark.sql import functions as F, types as T, Window

# Enriched bronze GEDCOM lines: the input for everything derived below.
enriched = spark.table("genealogy.bronze_gedcom_enriched")

# A source record is rooted at a level-0 SOUR line. Its record_xref plus the
# line number of that root line form the key used for every per-source join.
is_source_root = (F.col("level") == 0) & (F.col("tag") == "SOUR")
source_roots = enriched.where(is_source_root).select(
    F.col("record_xref"),
    F.col("line_no").alias("source_root_line_no"),
)

# Restrict the enriched lines to those belonging to a source record: a line is
# kept when its (record_xref, source_root_line_no) pair matches a source root.
# NOTE(review): relies on the enriched table already carrying a
# source_root_line_no column on every line - confirm against the bronze schema.
source_scope = enriched.join(
    source_roots,
    ["record_xref", "source_root_line_no"],
    "inner",
)

# Helper to extract the first attribute per source (may turn out to be
# unnecessary if a source can only carry one line of each tag).
def first_source_attr(tag_name, out_col):
    """Return the first `tag_name` value found within each source's scope.

    Lines of `source_scope` whose tag equals `tag_name` are numbered per
    (record_xref, source_root_line_no) in ascending `line_no` order, and only
    the first one is kept. No level filter is applied, so a matching line at
    any depth inside the source qualifies, and only that single line's value
    is returned.

    Args:
        tag_name: Tag to look for, e.g. "TITL".
        out_col: Name given to the returned value column.

    Returns:
        DataFrame with columns record_xref, source_root_line_no and `out_col`,
        holding at most one row per source.
    """
    source_key = ("record_xref", "source_root_line_no")
    by_line = Window.partitionBy(*source_key).orderBy("line_no")

    tagged = source_scope.where(F.col("tag") == tag_name)
    ranked = tagged.withColumn("rn", F.row_number().over(by_line))
    earliest = ranked.where(F.col("rn") == 1)
    return earliest.select(*source_key, F.col("value").alias(out_col))

# Extract the standard source attributes, one single-column lookup per tag.
# Each frame is keyed by (record_xref, source_root_line_no).
source_title = first_source_attr(tag_name="TITL", out_col="source_title")
source_author = first_source_attr(tag_name="AUTH", out_col="source_author")
source_publ = first_source_attr(tag_name="PUBL", out_col="source_publication")
source_text = first_source_attr(tag_name="TEXT", out_col="source_text")
source_repo = first_source_attr(tag_name="REPO", out_col="source_repo")

# Assemble note text, step 1: gather the raw pieces.
# A piece is any NOTE / CONC / CONT line inside a source that has been
# assigned to a note (non-null note_root_line_no).
is_source_note_line = (
    F.col("tag").isin("NOTE", "CONC", "CONT")
    & F.col("note_root_line_no").isNotNull()
)
note_pieces = source_scope.where(is_source_note_line).select(
    "record_xref",
    "source_root_line_no",
    "note_root_line_no",
    "line_no",
    "tag",
    "value",
)

# Step 2: rebuild the text of each note from its pieces. Pieces are sorted by
# line_no (the first struct field) and folded left to right: NOTE and CONC
# values are appended as-is, CONT values are appended after a newline, and a
# missing value counts as an empty string.
note_by_root = (
    note_pieces
    .groupBy("record_xref", "source_root_line_no", "note_root_line_no")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("line_no", "tag", "value"))
        ).alias("pieces")
    )
    .select(
        "record_xref",
        "source_root_line_no",
        "note_root_line_no",
        F.expr("""
            aggregate(
              pieces,
              '',
              (acc, x) ->
                case
                  when x.tag = 'NOTE' then concat(acc, coalesce(x.value, ''))
                  when x.tag = 'CONC' then concat(acc, coalesce(x.value, ''))
                  when x.tag = 'CONT' then concat(acc, '\n', coalesce(x.value, ''))
                  else acc
                end
            )
        """).alias("note_text"),
    )
)

# Step 3: one note string per source. The notes of a source are ordered by
# their note_root_line_no and joined with a blank line between them.
source_note = (
    note_by_root
    .groupBy("record_xref", "source_root_line_no")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("note_root_line_no", "note_text"))
        ).alias("notes")
    )
    .select(
        "record_xref",
        "source_root_line_no",
        F.expr("array_join(transform(notes, x -> x.note_text), '\n\n')").alias("source_note"),
    )
)

# Assemble final silver_source: start from one row per source root and attach
# each optional attribute with a left join, so a source that lacks an
# attribute keeps its row with a null in that column. The tuple order below is
# the resulting column order.
silver_source = source_roots
for source_attr in (
    source_title,
    source_author,
    source_publ,
    source_text,
    source_note,
    source_repo,
):
    silver_source = silver_source.join(
        source_attr,
        on=["record_xref", "source_root_line_no"],
        how="left",
    )

# Write table: replace genealogy.silver_source (data and schema) with the
# frame assembled above, stored as Delta.
silver_source.write.saveAsTable(
    "genealogy.silver_source",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


In [0]:
# Events produced by an earlier step; joined in below to obtain event_id.
events = spark.table("genealogy.silver_event")

# Identify event-level citations: SOUR lines at level 2 or deeper that have
# been attributed to an event. The line's value is taken as the xref of the
# cited source.
citations = (
    enriched
    .where(F.col("tag") == "SOUR")
    .where(F.col("level") >= 2)
    .where(F.col("event_line_no").isNotNull())
    .select(
        F.col("record_xref"),
        F.col("line_no").alias("citation_root_line_no"),
        F.col("event_line_no"),
        F.col("value").alias("source_xref"),
    )
)

# Map citation -> source (attach the matching source's root line number as its id)
# Lookup of known sources, keyed by the xref that citations point at.
sources = spark.table("genealogy.silver_source").select(
    F.col("record_xref").alias("source_xref"),
    "source_root_line_no",
)

# Left join so that a citation whose xref has no row in silver_source is still
# kept (its source_root_line_no is then null).
citations = citations.join(sources, "source_xref", "left")

# Pull citation attributes (PAGE, TEXT etc.)
# Keep every enriched line that sits under one of the event-level citations,
# matched on (citation_root_line_no, event_line_no).
#
# This is a pure filter on `enriched`, so a semi join is used instead of an
# inner join: `citations` has already been left-joined to silver_source, and
# if a source xref ever occurs more than once there the citation keys repeat.
# An inner join would then multiply the scope rows (and duplicate pieces in
# the note text assembled from them); a semi join returns each enriched row at
# most once and yields the same columns.
citation_scope = enriched.join(
    citations.select("citation_root_line_no", "event_line_no"),
    on=["citation_root_line_no", "event_line_no"],
    how="left_semi",
)

# Helper for the first attribute per citation (maybe redundant if a citation
# can only carry one line of each tag).
def first_citation_attr(tag_name, out_col):
    """Return the first `tag_name` value found within each citation's scope.

    Lines of `citation_scope` whose tag equals `tag_name` are numbered per
    (record_xref, citation_root_line_no, event_line_no) in ascending `line_no`
    order, and only the first one is kept. No level filter is applied, so a
    matching line at any depth inside the citation qualifies.

    Args:
        tag_name: Tag to look for, e.g. "PAGE".
        out_col: Name given to the returned value column.

    Returns:
        DataFrame with columns record_xref, citation_root_line_no,
        event_line_no and `out_col`, holding at most one row per citation.
    """
    citation_key = ("record_xref", "citation_root_line_no", "event_line_no")
    by_line = Window.partitionBy(*citation_key).orderBy("line_no")

    tagged = citation_scope.where(F.col("tag") == tag_name)
    ranked = tagged.withColumn("rn", F.row_number().over(by_line))
    earliest = ranked.where(F.col("rn") == 1)
    return earliest.select(*citation_key, F.col("value").alias(out_col))

# Extract the standard citation attributes, one single-column lookup per tag.
# Each frame is keyed by (record_xref, citation_root_line_no, event_line_no).
citation_date = first_citation_attr(tag_name="DATE", out_col="citation_date")
citation_page = first_citation_attr(tag_name="PAGE", out_col="citation_page")
citation_text = first_citation_attr(tag_name="TEXT", out_col="citation_text")
citation_url = first_citation_attr(tag_name="WWW", out_col="citation_url")
# Underscore-prefixed tags are vendor extensions present in the input data.
citation_ancestry_person_id = first_citation_attr(
    tag_name="_APID", out_col="citation_ancestry_person_id"
)
citation_ancestry_husband_id = first_citation_attr(
    tag_name="_HPID", out_col="citation_ancestry_husband_id"
)
citation_ancestry_wife_id = first_citation_attr(
    tag_name="_WPID", out_col="citation_ancestry_wife_id"
)

# Assemble citation note text, step 1: gather the raw pieces.
# A piece is any NOTE / CONC / CONT line inside a citation that has been
# assigned to a note (non-null note_root_line_no).
is_citation_note_line = (
    F.col("tag").isin("NOTE", "CONC", "CONT")
    & F.col("note_root_line_no").isNotNull()
)
note_pieces = citation_scope.where(is_citation_note_line).select(
    "record_xref",
    "citation_root_line_no",
    "event_line_no",
    "note_root_line_no",
    "line_no",
    "tag",
    "value",
)

# Step 2: rebuild the text of each citation note from its pieces. Pieces are
# sorted by line_no (the first struct field) and folded left to right: NOTE
# and CONC values are appended as-is, CONT values are appended after a
# newline, and a missing value counts as an empty string.
note_by_root = (
    note_pieces
    .groupBy("record_xref", "citation_root_line_no", "event_line_no", "note_root_line_no")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("line_no", "tag", "value"))
        ).alias("pieces")
    )
    .select(
        "record_xref",
        "citation_root_line_no",
        "event_line_no",
        "note_root_line_no",
        F.expr("""
            aggregate(
              pieces,
              '',
              (acc, x) ->
                case
                  when x.tag = 'NOTE' then concat(acc, coalesce(x.value, ''))
                  when x.tag = 'CONC' then concat(acc, coalesce(x.value, ''))
                  when x.tag = 'CONT' then concat(acc, '\n', coalesce(x.value, ''))
                  else acc
                end
            )
        """).alias("note_text"),
    )
)

# Step 3: one note string per citation. The notes of a citation are ordered by
# their note_root_line_no and joined with a blank line between them.
citation_note = (
    note_by_root
    .groupBy("record_xref", "citation_root_line_no", "event_line_no")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("note_root_line_no", "note_text"))
        ).alias("notes")
    )
    .select(
        "record_xref",
        "citation_root_line_no",
        "event_line_no",
        F.expr("array_join(transform(notes, x -> x.note_text), '\n\n')").alias("citation_note"),
    )
)

# Join to events and assemble the final table.
# Citations are first matched to their event (left join, so a citation without
# a matching event row is kept with a null event_id), then every optional
# citation attribute is attached with a left join on the citation key.
silver_event_source = citations.join(
    events,
    on=["record_xref", "event_line_no"],
    how="left",
)
for citation_attr in (
    citation_date,
    citation_page,
    citation_text,
    citation_note,
    citation_url,
    citation_ancestry_person_id,
    citation_ancestry_husband_id,
    citation_ancestry_wife_id,
):
    silver_event_source = silver_event_source.join(
        citation_attr,
        on=["record_xref", "citation_root_line_no", "event_line_no"],
        how="left",
    )

# Publish only the event link, the cited source xref and the citation fields.
silver_event_source = silver_event_source.select(
    "event_id",
    "source_xref",
    "citation_root_line_no",
    "citation_date",
    "citation_page",
    "citation_text",
    "citation_note",
    "citation_url",
    "citation_ancestry_person_id",
    "citation_ancestry_husband_id",
    "citation_ancestry_wife_id",
)

# Write table: replace genealogy.silver_event_source (data and schema) with
# the frame assembled above, stored as Delta.
silver_event_source.write.saveAsTable(
    "genealogy.silver_event_source",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)
