In [0]:
from pyspark.sql import functions as F, types as T, Window
from datetime import date
import calendar
import re

In [0]:
# Month-name lookup used when parsing GEDCOM dates: every three-letter
# abbreviation and every full English month name (upper case) maps to its
# month number 1-12. "MAY" is both an abbreviation and a full name, so it
# appears in both tuples and ends up as a single key.
MONTHS = {
  label: number
  for labels in (
    ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"),
    ("JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE",
     "JULY", "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBER", "DECEMBER"),
  )
  for number, label in enumerate(labels, start=1)
}

In [0]:
def parse_gedcom_date(date_raw):
    """
    Parse a GEDCOM DATE value into a structured start/end interval.

    The input is upper-cased, periods are removed and runs of whitespace are
    collapsed before matching, so "Abt. 1945" and "15  mar 1871" are accepted.

    Handles:
        - Exact dates: 15 MAR 1871, MAR 1871, 1871
        - Prefixes: ABT, EST, CAL, BEF, AFT
        - Ranges: BET 1870 AND 1875, FROM 1870 TO 1875

    Interval semantics:
        - AFT: date_end is set to 9999-12-31.
        - BEF: date_start is set to 0001-01-01.
        - ABT with YEAR precision: widened by one year on each side.
        - ABT with MONTH precision: widened by one month on each side,
          clamped to the same calendar year (no roll-over into the
          neighbouring year).
        - EST / CAL / ABT with DAY precision: only date_type is recorded.

    Args:
        date_raw: Raw DATE string, or None.

    Returns:
        None when date_raw is None, otherwise a dict with keys:
            date_raw: the input, unchanged.
            date_type: "EXACT", "RANGE", or the matched prefix.
            date_start, date_end: datetime.date bounds, or None.
            date_precision: "DAY", "MONTH", "YEAR", "RANGE", or None.
            date_parse_confidence: "HIGH" or "LOW".

        Text that cannot be turned into a real date (unrecognised layout,
        unknown month name, impossible calendar date such as 31 FEB, year
        0000) yields LOW confidence with no dates instead of raising.
    """
    if date_raw is None:
        return None

    # Normalise: upper case, drop periods ("Abt." -> "ABT"), collapse whitespace.
    s = " ".join(date_raw.upper().replace(".", "").split())
    result = {
        "date_raw": date_raw,
        "date_type": "EXACT",
        "date_start": None,
        "date_end": None,
        "date_precision": None,
        "date_parse_confidence": "HIGH"
    }

    # 1) Handle ranges first: BET ... AND ..., FROM ... TO ...
    range_match = re.fullmatch(r"(BET|FROM) (.+) (AND|TO) (.+)", s)
    if range_match:
        left = parse_gedcom_date(range_match.group(2).strip())
        right = parse_gedcom_date(range_match.group(4).strip())
        # A range is only as trustworthy as its weaker endpoint. Compare
        # explicitly: min() on the strings would rank "HIGH" below "LOW".
        both_high = (
            left["date_parse_confidence"] == "HIGH"
            and right["date_parse_confidence"] == "HIGH"
        )
        result.update({
            "date_type": "RANGE",
            "date_start": left["date_start"],
            "date_end": right["date_end"],
            "date_precision": "RANGE",
            "date_parse_confidence": "HIGH" if both_high else "LOW"
        })
        return result

    # 2) Handle prefixes
    for prefix in ["ABT", "EST", "CAL", "BEF", "AFT"]:
        if s.startswith(prefix):
            result["date_type"] = prefix
            s = s[len(prefix):].strip()
            break  # only one prefix

    # 3) Parse what is left, most specific layout first.
    start = end = precision = None
    try:
        dmy = re.fullmatch(r"(\d{1,2}) ([A-Z]{3,9}) (\d{4})", s)
        my = re.fullmatch(r"([A-Z]{3,9}) (\d{4})", s)
        if dmy:
            # DAY MONTH YEAR
            month = MONTHS.get(dmy.group(2))
            if month is not None:
                start = end = date(int(dmy.group(3)), month, int(dmy.group(1)))
                precision = "DAY"
        elif my:
            # MONTH YEAR
            month = MONTHS.get(my.group(1))
            if month is not None:
                year = int(my.group(2))
                start = date(year, month, 1)
                end = date(year, month, calendar.monthrange(year, month)[1])
                precision = "MONTH"
        elif re.fullmatch(r"\d{4}", s):
            # YEAR only
            year = int(s)
            start, end = date(year, 1, 1), date(year, 12, 31)
            precision = "YEAR"
    except ValueError:
        # date() rejects impossible values (day 31 in February, year 0, ...).
        start = end = precision = None

    if start is None:
        # Unparseable fallback: keep any detected prefix, report LOW confidence.
        result["date_parse_confidence"] = "LOW"
        return result

    result.update({
        "date_start": start,
        "date_end": end,
        "date_precision": precision
    })

    # 4) Apply prefix semantics to adjust start/end
    date_type = result["date_type"]
    if date_type == "AFT":
        # start = parsed, end = far future (9999-12-31)
        result["date_end"] = date.max
    elif date_type == "BEF":
        # start = distant past (0001-01-01), end = parsed
        result["date_start"] = date.min
    elif date_type == "ABT":
        if precision == "YEAR":
            # About: widen ±1 year, staying inside the range date() supports.
            y = start.year
            result["date_start"] = date(max(y - 1, date.min.year), 1, 1)
            result["date_end"] = date(min(y + 1, date.max.year), 12, 31)
        elif precision == "MONTH":
            # About: widen ±1 month, clamped to January..December of that year.
            y, m = start.year, start.month
            start_m = m - 1 if m > 1 else 1
            end_m = m + 1 if m < 12 else 12
            result["date_start"] = date(y, start_m, 1)
            result["date_end"] = date(y, end_m, calendar.monthrange(y, end_m)[1])

    return result


In [0]:
# Smoke test: run the parser over one sample of each supported layout and
# print the resulting dicts for visual inspection.
tests = [
    # plain dates at day / month / year precision
    "15 MAR 1871", "MAR 1871", "1871",
    # approximate / estimated / calculated prefixes
    "ABT 1875", "EST MAR 1871", "CAL 15 MAR 1871",
    # open-ended before / after
    "BEF 15 MAR 1890", "AFT MAR 1871",
    # ranges
    "BET 1870 AND 1875", "FROM 1 JAN 1900 TO 31 DEC 1905",
    # mixed case, trailing period, full month name
    "Abt. 1945", "October 1756",
]

for t in tests:
    parsed = parse_gedcom_date(t)
    print(parsed)



In [0]:
# Output schema for the parsed-date rows: the event key plus one column per
# key of the dict returned by parse_gedcom_date. Only event_id is declared
# non-nullable.
date_schema = T.StructType([
    T.StructField(column, dtype, nullable)
    for column, dtype, nullable in (
        ("event_id", T.StringType(), False),
        ("date_raw", T.StringType(), True),
        ("date_type", T.StringType(), True),
        ("date_start", T.DateType(), True),
        ("date_end", T.DateType(), True),
        ("date_precision", T.StringType(), True),
        ("date_parse_confidence", T.StringType(), True),
    )
])


In [0]:
# Distinct (event_id, event_date) pairs from the silver event table that
# actually carry a date; event_date is duplicated under the name
# event_date_raw, which is the column the pandas parser reads.
events_to_parse = (
    spark.table("genealogy.silver_event")
    .filter(F.col("event_date").isNotNull())
    .select(
        F.col("event_id"),
        F.col("event_date"),
        F.col("event_date").alias("event_date_raw"),
    )
    .distinct()
)

In [0]:
import pandas as pd

def parse_dates_pdf(pdf: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the event_date_raw value of every row in a pandas batch.

    Args:
        pdf: Frame with at least the columns event_id and event_date_raw.

    Returns:
        A frame with one row per input row, holding the fields produced by
        parse_gedcom_date plus the row's event_id, with columns ordered as in
        date_schema.names.
    """
    records = [
        {**parse_gedcom_date(raw_value), "event_id": event_id}
        for event_id, raw_value in zip(pdf["event_id"], pdf["event_date_raw"])
    ]
    return pd.DataFrame(records, columns=date_schema.names)

# Run the pandas parser once per event_id group and collect the rows under
# date_schema.
parsed_dates = (
    events_to_parse
    .groupBy("event_id")
    .applyInPandas(parse_dates_pdf, schema=date_schema)
)


In [0]:
target_table = "genealogy.silver_event_date"

# Replace the Delta table (data and schema) with the freshly parsed dates.
# format, mode and extra writer options are handed to saveAsTable directly
# instead of being chained on the writer.
parsed_dates.write.saveAsTable(
    target_table,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


In [0]:
# Preview the first 50 rows of the written table. Both sort keys are
# ascending, so rows with confidence "HIGH" are listed before "LOW".
display(
    spark.table(target_table)
    .sort(F.col("date_parse_confidence").asc(), F.col("event_id").asc())
    .limit(50)
)
