In [0]:
from pyspark.sql.types import (
    StructType, StructField, StringType, ArrayType, IntegerType
)

# Struct layout used as the return type when normalize_place is registered as
# a Spark UDF. Field order matters: it must match the order of the tuple the
# function returns. Every field is nullable.
place_schema = (
    StructType()
    .add("place_display", StringType(), True)
    .add("place_normalised", StringType(), True)
    .add("place_tokens", ArrayType(StringType()), True)
    .add("token_count", IntegerType(), True)
    .add("inferred_country", StringType(), True)
    .add("normalisation_level", StringType(), True)
)


In [0]:
def normalize_place(place_raw: str) -> tuple:
    """Normalise a raw place string into display, key, token and country fields.

    The input is Unicode-normalised (NFKD), whitespace is collapsed, and
    ``;`` / ``/`` are treated as alternative separators to ``,``. The string
    is then split into comma-separated tokens, whole-token abbreviations are
    expanded, and a country is inferred when the last token is a known alias.

    Args:
        place_raw: The raw place text, or ``None``.

    Returns:
        A 6-tuple ``(place_display, place_normalised, place_tokens,
        token_count, inferred_country, normalisation_level)``:

        * ``place_display`` - expanded tokens joined with ``", "``.
        * ``place_normalised`` - lower-cased display string with everything
          except word characters and whitespace removed.
        * ``place_tokens`` - list of expanded tokens.
        * ``token_count`` - number of tokens.
        * ``inferred_country`` - canonical country name, or ``None``.
        * ``normalisation_level`` - ``"INFERRED"`` if a country was found,
          else ``"TOKENISED"`` for more than one token, else ``"CLEANED"``.

        When ``place_raw`` is ``None`` or contains no usable token (empty,
        whitespace-only, or separators only), the result is
        ``(None, None, None, None, None, "RAW")``.
    """
    # Imports stay function-local so the function does not depend on any
    # notebook-level import cell.
    import re
    import unicodedata

    nothing_to_normalise = (None, None, None, None, None, "RAW")
    if place_raw is None:
        return nothing_to_normalise

    text = unicodedata.normalize("NFKD", place_raw.strip())
    text = re.sub(r"\s+", " ", text)

    # Unify the alternative separators first, then fix comma spacing once.
    text = re.sub(r"[;/]", ",", text)
    text = re.sub(r"\s*,\s*", ", ", text)
    text = text.strip(", ")

    tokens = [t.strip() for t in text.split(",") if t.strip()]

    # Blank or separator-only input yields no tokens; without this guard the
    # last-token lookup below would raise IndexError.
    if not tokens:
        return nothing_to_normalise

    # Matched against the whole (lower-cased) token, not substrings of it.
    ABBREVIATIONS = {
        "st.": "St",
        "co.": "County",
        "nr": "Near"
    }

    expanded = [ABBREVIATIONS.get(t.lower(), t) for t in tokens]

    place_display = ", ".join(expanded)

    # Build the matching key: lower-case, drop anything that is not a word
    # character or whitespace, then collapse whitespace.
    norm = place_display.lower()
    norm = re.sub(r"[^\w\s]", "", norm)
    norm = re.sub(r"\s+", " ", norm).strip()

    COUNTRY_ALIASES = {
        "england": "England",
        "scotland": "Scotland",
        "wales": "Wales",
        "ireland": "Ireland",
        "uk": "United Kingdom",
        "u.k.": "United Kingdom",
        "united kingdom": "United Kingdom",
        "usa": "United States of America",
        "australia": "Australia",
        "new zealand": "New Zealand",
        "canada": "Canada"
    }

    # Only the final token is considered when inferring the country.
    inferred_country = COUNTRY_ALIASES.get(expanded[-1].lower())

    level = "CLEANED"
    if len(expanded) > 1:
        level = "TOKENISED"
    if inferred_country:
        level = "INFERRED"

    return (
        place_display,
        norm,
        expanded,
        len(expanded),
        inferred_country,
        level
    )


In [0]:
# Make the Python function callable from SQL cells as normalize_place(...);
# its result is returned as a struct described by place_schema.
spark.udf.register(name="normalize_place", f=normalize_place, returnType=place_schema)

In [0]:
%sql
-- Select the catalog and schema for this session, so the two-part
-- genealogy.<table> names used in later cells resolve inside the
-- workspace catalog.
USE CATALOG workspace;
USE SCHEMA genealogy;
-- List functions whose name matches 'normalize_place'.
-- NOTE(review): presumably a sanity check that the UDF registered in the
-- previous cell is visible to SQL - confirm.
SHOW FUNCTIONS LIKE 'normalize_place';

In [0]:
%sql
-- Build one row per distinct, non-null event_place found in silver_event.
-- Each row carries a SHA-256 id of the raw string plus the fields of the
-- struct returned by normalize_place, flattened into top-level columns.
CREATE OR REPLACE TABLE genealogy.silver_place_variant
USING DELTA
AS
WITH distinct_place AS (
  -- De-duplicate first so the UDF is applied once per unique raw string.
  SELECT DISTINCT
    event_place AS place_raw
  FROM genealogy.silver_event
  WHERE event_place IS NOT NULL
),
normalised_place AS (
  SELECT
    place_raw,
    normalize_place(place_raw) AS place_struct
  FROM distinct_place
)
SELECT
  sha2(place_raw, 256) AS place_variant_id,
  place_raw,

  place_struct.place_display,
  place_struct.place_normalised,
  place_struct.place_tokens,
  place_struct.token_count,
  place_struct.inferred_country,
  place_struct.normalisation_level

FROM normalised_place;


In [0]:
%sql
-- Validation: no nulls where they shouldn't be.
-- Returns every row of silver_place_variant in which place_display,
-- place_normalised or token_count is NULL. An empty result means the check
-- passed; any rows returned are place strings that were not normalised.
-- This cell only displays the offending rows; it does not fail the run.
SELECT *
FROM genealogy.silver_place_variant
WHERE place_display IS NULL
   OR place_normalised IS NULL
   OR token_count IS NULL;


In [0]:
%sql
-- Attach descriptive metadata to the table and to its id column.
-- The table comment names the function exactly as it is registered in this
-- notebook (normalize_place) so readers can find it.
COMMENT ON TABLE genealogy.silver_place_variant IS
'Distinct raw GEDCOM place strings normalised via normalize_place. One row per unique place string.';

COMMENT ON COLUMN genealogy.silver_place_variant.place_variant_id IS
'Deterministic hash of place_raw. Identifies a specific textual variant, not a canonical place.';

In [0]:
%sql
-- Back-fill silver_event.event_place_variant_id with the id of the place
-- variant whose raw string equals the event's place. Events with no matching
-- variant (including those with a NULL event_place) are set to NULL.
UPDATE genealogy.silver_event AS ev
SET event_place_variant_id = (
  -- place_raw is unique per variant row, so MAX only serves to make the
  -- subquery provably scalar; at most one id is expected per place string.
  SELECT MAX(pv.place_variant_id)
  FROM genealogy.silver_place_variant AS pv
  WHERE ev.event_place = pv.place_raw
  GROUP BY pv.place_raw
)