In [0]:
# Staging location that is listed below and that the latest export path is built from.
STAGING_DIR = "/Volumes/workspace/staging_google_drive/gedcom/Ancestry GEDCOM Exports/"

# Print the directory listing so the run output shows what was available.
files = dbutils.fs.ls(STAGING_DIR)
print(files)

# Metadata row describing the file to ingest. `first()` returns None for an
# empty table, which lets the run fail with an explicit message instead of the
# bare IndexError that `collect()[0]` raises, and avoids collecting every row.
latest = spark.table("genealogy.control_latest_gedcom").first()
if latest is None:
    raise RuntimeError(
        "BLOCK_PIPELINE: genealogy.control_latest_gedcom is empty; no GEDCOM file to ingest"
    )

latest_filename = latest['source_filename']
latest_file_id = latest['file_id']
latest_size = latest['file_size']
latest_sync_time = latest['_fivetran_synced']
# STAGING_DIR ends with "/", so the filename is appended directly.
latest_path = f"{STAGING_DIR}{latest_filename}"

latest_path


In [0]:
# Idempotency check on the exact file id: stop the run if the bronze table
# already holds rows tagged with this file id.

bronze_gedcom = spark.table("genealogy.bronze_gedcom")

# Compare through a Column expression rather than an f-string SQL fragment, so
# a quote character inside the id cannot break (or alter) the predicate.
already_ingested = (
    bronze_gedcom
    .filter(bronze_gedcom["source_file_id"] == latest_file_id)
    .limit(1)  # one matching row is enough to answer the question
    .count()
    > 0
)

if already_ingested:
    raise RuntimeError(
        f"SKIP_PIPELINE: GEDCOM file {latest_file_id} has already been ingested"
    )

In [0]:
# Directory and fixed file name that the latest GEDCOM snapshot is copied to
# and subsequently parsed from.
CANONICAL_DIR = "/Volumes/workspace/genealogy/file_uploads/"
# CANONICAL_DIR already ends with "/", so no extra separator is inserted here
# (inserting one produced ".../file_uploads//Genealogy of Ed Ball.ged").
CANONICAL_FILE = f"{CANONICAL_DIR}Genealogy of Ed Ball.ged"

In [0]:
# Ensure destination directory exists
dbutils.fs.mkdirs(CANONICAL_DIR)

# Remove existing canonical file if present. This stays best-effort (a failure
# here does not stop the run), but the error is reported instead of being
# silently discarded so a real problem is visible in the run output.
try:
    dbutils.fs.rm(CANONICAL_FILE)
except Exception as rm_error:
    print(f"Could not remove existing canonical file {CANONICAL_FILE}: {rm_error}")

# Copy latest snapshot into canonical location
dbutils.fs.cp(latest_path, CANONICAL_FILE)


In [0]:
GEDCOM_PATH = CANONICAL_FILE


def _parse_gedcom_line(line):
    """Split one GEDCOM line into ``(level, xref_id, tag, value)``.

    Recognised shapes (fields separated by single spaces):

    * ``LEVEL``                  -> level only
    * ``LEVEL TAG``              -> tag, no value
    * ``LEVEL TAG VALUE``        -> tag and value (value keeps inner spaces)
    * ``LEVEL @XREF@ TAG``       -> xref and tag
    * ``LEVEL @XREF@ TAG VALUE`` -> xref, tag and value

    Missing fields are returned as ``None``.

    Raises:
        ValueError: if the first field is not an integer level.
    """
    parts = line.split(" ", 2)
    level = int(parts[0])
    xref_id = tag = value = None

    if len(parts) == 2:
        tag = parts[1]
    elif len(parts) == 3:
        if parts[1].startswith("@") and parts[1].endswith("@"):
            xref_id = parts[1]
            # The remainder may still carry a value after the tag
            # (e.g. "0 @N1@ NOTE some text"); keep only the tag in `tag`.
            tag, _, remainder = parts[2].partition(" ")
            value = remainder or None
        else:
            tag, value = parts[1], parts[2]

    return level, xref_id, tag, value


rows = []

# "utf-8-sig" drops a leading byte-order mark if one is present; with plain
# "utf-8" the BOM stays attached to the first field and int() rejects it.
with open(GEDCOM_PATH.replace("file:", ""), "r", encoding="utf-8-sig", errors="ignore") as f:
    for i, line in enumerate(f):
        line = line.rstrip("\n")
        # Blank lines carry no level and would make int("") fail; skip them.
        # `i` still counts them, so line_no keeps matching the position in the file.
        if not line.strip():
            continue
        rows.append((i, *_parse_gedcom_line(line)))

df = spark.createDataFrame(
    rows,
    ["line_no", "level", "xref_id", "tag", "value"]
)

display(df.limit(50))


In [0]:
# Guardrails for a file that has a new file id but whose content is either
# unchanged or anomalous compared with the previous ingestion.

from pyspark.sql.functions import sha2, concat_ws, collect_list, sort_array

# Thresholds used by the checks below.
IDENTICAL_TOLERANCE = 0.01  # max relative size/record change still treated as "same"
MIN_RECORD_RATIO = 0.5      # below this fraction of the previous record count, block

new_record_count = df.count()

# One hash per row, then one hash over all row hashes. collect_list() gives no
# ordering guarantee, so the row hashes are sorted before being combined;
# otherwise identical content could produce different file-level hashes.
# NOTE: hashes stored by earlier runs were built from unsorted lists, so the
# first comparison after this change may report the content as changed.
content_hash = (
    df
    .select(sha2(concat_ws("||", *df.columns), 256).alias("row_hash"))
    .agg(sha2(concat_ws("||", sort_array(collect_list("row_hash"))), 256).alias("hash"))
    .collect()[0]["hash"]
)

# Most recent entry of the ingestion control table (None when it has no rows).
prev_rows = (
    spark.table("genealogy.control_ingested_gedcom")
    .orderBy("ingested_at", ascending=False)
    .limit(1)
    .collect()
)

prev = prev_rows[0] if prev_rows else None

checks = []

if prev is not None:
    # Ratios fall back to 1 ("no change") when the previous value is missing or zero.
    size_ratio = latest_size / prev["size"] if prev["size"] else 1
    record_ratio = new_record_count / prev["record_count"] if prev["record_count"] else 1

    checks.append(("size_change", size_ratio))
    checks.append(("record_change", record_ratio))
    checks.append(("hash_changed", content_hash != prev["hash"]))

    # Surface the computed checks in the run output before any decision is made.
    print(checks)

    if (
        abs(size_ratio - 1) < IDENTICAL_TOLERANCE
        and abs(record_ratio - 1) < IDENTICAL_TOLERANCE
        and content_hash == prev["hash"]
    ):
        raise RuntimeError(
            "INTENTIONAL_SKIP: GEDCOM identical to previous ingestion"
        )

    if record_ratio < MIN_RECORD_RATIO:
        raise RuntimeError(
            "BLOCK_PIPELINE: GEDCOM record count dropped >50% (likely broken export)"
        )


In [0]:
from pyspark.sql.functions import lit

# NOTE(review): `source_file` is not referenced again in this notebook.
source_file = latest["source_path"]

# Attach the provenance of the ingested file to every parsed line. `df` only
# has the five parser columns, so appending aliased literals adds three new
# columns without replacing any existing one.
bronze_df = df.select(
    "*",
    lit(latest_filename).alias("source_file"),
    lit(latest_file_id).alias("source_file_id"),
    lit(latest_sync_time).alias("source_file_synced_at"),
)

# Replace the contents of the bronze Delta table with this file's lines.
bronze_writer = bronze_df.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("genealogy.bronze_gedcom")

In [0]:
from pyspark.sql.functions import current_timestamp

# Append one row describing this ingestion to the control table; the guardrail
# cell reads the newest row of this table as the "previous" ingestion.
control_columns = ["file_id", "source_path", "size", "record_count", "hash"]
control_values = (latest_file_id, latest_path, latest_size, new_record_count, content_hash)

(
    spark.createDataFrame([control_values], control_columns)
    .withColumn("ingested_at", current_timestamp())
    .write
    .mode("append")
    .saveAsTable("genealogy.control_ingested_gedcom")
)


In [0]:
%sql
-- Sanity check: number of rows currently in the bronze table.
SELECT COUNT(*) FROM genealogy.bronze_gedcom;


In [0]:
%sql
-- Adds record_xref: the xref of the level-0 record each line belongs to.
--
-- Every level-0 line opens a new record. A marker is emitted on level-0 lines
-- only (the xref, or '' when the record has none, e.g. "0 HEAD" / "0 TRLR"),
-- the latest marker is carried forward in line order, and '' is mapped back
-- to NULL. Forward-filling xref_id alone would attribute lines of a level-0
-- record without an xref to the preceding record.
CREATE OR REPLACE VIEW genealogy.bronze_gedcom_with_record AS
SELECT
  *,
  NULLIF(
    last_value(
      CASE WHEN level = 0 THEN COALESCE(xref_id, '') END,
      true  -- ignore NULLs: skip the non-level-0 lines
    ) OVER (
      ORDER BY line_no
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ),
    ''
  ) AS record_xref
FROM genealogy.bronze_gedcom;


In [0]:
%sql
-- The ten records with the most lines in the bronze table.
WITH record_sizes AS (
  SELECT
    record_xref,
    COUNT(*) AS lines
  FROM genealogy.bronze_gedcom_with_record
  GROUP BY record_xref
)
SELECT
  record_xref,
  lines
FROM record_sizes
ORDER BY lines DESC
LIMIT 10;
