In [0]:
%run ./lib_gedcom_text_utils


In [0]:
from pyspark.sql import functions as F, types as T, Window
import pandas as pd

# ----------------------------
# 0) Load your GEDCOM-lines table
# ----------------------------
# Read the bronze GEDCOM-lines table, keeping only the columns the enrichment uses.
ged_raw = (
    spark.table("genealogy.bronze_gedcom_with_record")
    .select("record_xref", "line_no", "level", "tag", "value", "source_file")
)

# Header lines are deliberately NOT dropped here. The filter that used to do so,
#   ged = ged_raw.where(F.col("record_xref").isNotNull())
# was disabled because it was also removing the root source records.

# Optional: spread the lines across partitions by record to improve parallelism
# when there are lots of records.
ged = ged_raw.repartition("record_xref")

# ----------------------------
# 1) Configure your extraction logic
# ----------------------------
# Tags that open an "event" subtree. A line carrying one of these tags owns
# itself; any other line is owned by its nearest ancestor carrying one.
EVENT_WHITELIST = (
    # Vital events
    {"BIRT", "BAPM", "CHR", "DEAT", "BURI", "CREM", "PROB"}
    # Migration / census events
    | {"CENS", "EMIG", "IMMI", "NATU"}
    # Family formation / breakdown events
    | {"MARR", "DIV", "_SEPR"}
    # Event-like
    | {"RESI", "EDUC", "OCCU", "GRAD", "RETI"}
    # Generic
    | {"EVEN", "FACT"}
    # Custom events
    | {"_MILT", "_FUN", "_EMPLOY", "_MDCL"}
    # Attributes
    | {"NAME", "SEX"}
)

# Lines underneath one of these subtree roots are flagged as blocked, so event
# attributes are not picked up from them (e.g. SOUR.DATE).
IGNORE_SUBTREE_ROOTS = {
    "SOUR",
    "OBJE",
    "REPO",
}

# Tags we want to pivot into columns (extend as needed).
ATTR_TAGS = {
    "DATE",
    "PLAC",
    "NOTE",
    "TYPE",
    "CONC",
    "CONT",
}

# ----------------------------
# 2) Enrich each line with parent + owning event + blocked flags (stack walk per record_xref)
# ----------------------------
# Output schema of enrich_record: the input line columns followed by the
# structural links computed by the stack walk. Field order and nullability
# must match what applyInPandas receives back from the function.
enriched_schema = (
    T.StructType()
    # Columns carried over from the input lines.
    .add("record_xref", T.StringType(), True)
    .add("line_no", T.IntegerType(), False)
    .add("level", T.IntegerType(), False)
    .add("tag", T.StringType(), True)
    .add("value", T.StringType(), True)
    # Line number of the direct parent line, if any.
    .add("parent_line_no", T.IntegerType(), True)
    # Owning event for this line (nearest ancestor in EVENT_WHITELIST).
    .add("event_line_no", T.IntegerType(), True)
    .add("event_tag", T.StringType(), True)
    # Whether this line sits inside an ignored subtree (e.g. under SOUR).
    .add("blocked", T.BooleanType(), False)
    # For NOTE/CONC/CONT assembly: the NOTE line this piece belongs to.
    .add("note_root_line_no", T.IntegerType(), True)
    # For tracking sources and citations.
    .add("source_root_line_no", T.IntegerType(), True)
    .add("citation_root_line_no", T.IntegerType(), True)
    .add("source_file", T.StringType(), False)
)

def _nearest_ancestor(stack, tags):
    """Return the innermost ``(level, line_no, tag)`` entry of *stack* whose tag is in *tags*, or None."""
    for entry in reversed(stack):
        if entry[2] in tags:
            return entry
    return None


def enrich_record(pdf: pd.DataFrame) -> pd.DataFrame:
    """Annotate each GEDCOM line of one group with its structural context.

    The lines are walked in ``(source_file, line_no)`` order while a stack of
    open ancestors ``(level, line_no, tag)`` is maintained. For every line the
    following columns are derived:

    * ``parent_line_no`` - line number of the direct parent (top of the stack
      once every entry with level >= the current level has been popped).
    * ``event_line_no`` / ``event_tag`` - the line itself when its tag is in
      ``EVENT_WHITELIST``, otherwise its nearest ancestor with such a tag.
    * ``blocked`` - True when any ancestor tag is in ``IGNORE_SUBTREE_ROOTS``.
    * ``note_root_line_no`` - the line itself for ``NOTE``; the nearest
      ``NOTE`` ancestor for ``CONC`` / ``CONT``; otherwise missing.
    * ``source_root_line_no`` / ``citation_root_line_no`` - taken from the
      nearest ``SOUR`` ancestor: a level-0 ``SOUR`` fills the source root, any
      deeper ``SOUR`` fills the citation root.

    Args:
        pdf: Lines of one group with at least the columns ``record_xref``,
            ``line_no``, ``level``, ``tag``, ``value`` and ``source_file``.

    Returns:
        The lines sorted by ``(source_file, line_no)`` with the derived
        columns added, restricted to the columns of ``enriched_schema``.
    """
    # Sort by file first: if a group ever holds lines from more than one file
    # (e.g. the same xref used in two files), sorting on line_no alone would
    # interleave the files and corrupt the ancestor stack.
    pdf = pdf.sort_values(["source_file", "line_no"], kind="mergesort").reset_index(drop=True)

    # stack items: (level, line_no, tag)
    stack = []
    current_file = object()  # sentinel that compares unequal to every file name

    out_parent = []
    out_event_line = []
    out_event_tag = []
    out_blocked = []
    out_note_root = []
    out_source_root = []
    out_citation_root = []

    # zip over the columns avoids building one Series per row (iterrows).
    for source_file, line_no, level, tag in zip(
        pdf["source_file"], pdf["line_no"], pdf["level"], pdf["tag"]
    ):
        line_no = int(line_no)
        level = int(level)

        # Ancestors never carry over from one file to the next.
        if source_file != current_file:
            stack.clear()
            current_file = source_file

        # Pop until top has level < current level (so top becomes parent).
        while stack and stack[-1][0] >= level:
            stack.pop()

        out_parent.append(stack[-1][1] if stack else None)

        # Blocked if any ancestor tag is an ignored subtree root.
        out_blocked.append(_nearest_ancestor(stack, IGNORE_SUBTREE_ROOTS) is not None)

        # Owning event: the line itself if it is an event, else nearest event ancestor.
        if tag in EVENT_WHITELIST:
            owner = (level, line_no, tag)
        else:
            owner = _nearest_ancestor(stack, EVENT_WHITELIST)
        out_event_line.append(owner[1] if owner is not None else None)
        out_event_tag.append(owner[2] if owner is not None else None)

        # NOTE root tracking for NOTE/CONC/CONT pieces.
        note_root = None
        if tag == "NOTE":
            note_root = line_no
        elif tag in ("CONC", "CONT"):
            note_entry = _nearest_ancestor(stack, ("NOTE",))
            if note_entry is not None:
                note_root = note_entry[1]
        out_note_root.append(note_root)

        # Nearest SOUR ancestor: level 0 is a source definition, any deeper
        # level is treated as a citation.
        source_root = None
        citation_root = None
        sour_entry = _nearest_ancestor(stack, ("SOUR",))
        if sour_entry is not None:
            if sour_entry[0] == 0:
                source_root = sour_entry[1]
            else:
                citation_root = sour_entry[1]
        out_source_root.append(source_root)
        out_citation_root.append(citation_root)

        # Push current node so later, deeper lines see it as an ancestor.
        stack.append((level, line_no, tag))

    pdf["parent_line_no"] = out_parent
    pdf["event_line_no"] = out_event_line
    pdf["event_tag"] = out_event_tag
    pdf["blocked"] = out_blocked
    pdf["note_root_line_no"] = out_note_root
    pdf["source_root_line_no"] = out_source_root
    pdf["citation_root_line_no"] = out_citation_root

    return pdf[[
        "record_xref", "line_no", "level", "tag", "value",
        "parent_line_no", "event_line_no", "event_tag",
        "blocked", "note_root_line_no", "source_root_line_no", "citation_root_line_no",
        "source_file"
    ]]

# Run the stack walk once per (file, record). Grouping on record_xref alone
# would put same-named records from different source files (and every line
# with a null record_xref) into one pandas group, mixing their lines.
enriched = (
    ged.groupBy("source_file", "record_xref")
    .applyInPandas(enrich_record, schema=enriched_schema)
)

# resolve_bronze_gedcom_text comes from the %run'd lib_gedcom_text_utils notebook.
resolved = resolve_bronze_gedcom_text(enriched)

# Join the structural columns back onto the resolved text. line_no alone only
# identifies a line if it is unique across all files, so source_file is added
# to the key whenever the resolved frame exposes it; otherwise fall back to
# the original line_no-only join.
# NOTE(review): confirm whether line_no is unique per file or per table.
join_keys = ["line_no", "source_file"] if "source_file" in resolved.columns else ["line_no"]

# NOTE(review): if the resolved frame already carries record_xref / level, the
# joined result will contain those column names twice - confirm against the helper.
enrichment_cols = [
    "record_xref", "level", "event_line_no", "event_tag", "blocked",
    "note_root_line_no", "source_root_line_no", "citation_root_line_no",
]

enriched_resolved = resolved.join(
    enriched.select(*join_keys, *enrichment_cols),
    on=join_keys,
    how="left",
)

# Replace the target table (data and schema) with the enriched lines.
(enriched_resolved.write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema", "true")
  .saveAsTable("genealogy.bronze_gedcom_enriched")
)
