In [0]:
# %run ./grant_access_to_address_data
# %run ./local_dev/spark_setup.ipynb

In [0]:
# Install all required dependencies.
# `--pre` lets pip select pre-release versions, and neither package is
# version-pinned, so the installed versions can change between runs.
%pip install --pre uk_address_matcher duckdb


In [0]:
# DuckDB and uk_address_matcher imports used by the address matcher
import duckdb
from uk_address_matcher import (
    clean_data_using_precomputed_rel_tok_freq,
    get_linker,
    best_matches_with_distinguishability,
    improve_predictions_using_distinguishing_tokens,
)
import pandas as pd

def matcher(
    is_top_k_address_search,
    input_path="/dbfs/tmp/address_data/input/*.parquet",
    canonical_path="/dbfs/tmp/address_data/all/*.parquet",
):
    """
    Match the input addresses against the canonical address set.

    Both parquet datasets are loaded into an in-memory DuckDB database,
    cleaned with uk_address_matcher, scored by the linker, and refined with
    the distinguishing-tokens second pass. The DuckDB connection is closed
    before returning, so the result is handed back as a materialised
    DataFrame rather than a lazy relation.

    Note carried over from the original docstring: by default top_k is set
    to 5 in the MoJ algorithm.

    Args:
        is_top_k_address_search: If truthy, return all refined candidate
            matches; otherwise return only the output of
            ``best_matches_with_distinguishability``.
        input_path: Parquet glob of the addresses to match.
        canonical_path: Parquet glob of the addresses to search within.

    Returns:
        The selected result converted with ``.df()``.
    """
    con = duckdb.connect(database=':memory:')
    try:
        duck_df_input = con.read_parquet(input_path).order("postcode")
        duck_df_all = con.read_parquet(canonical_path).order("postcode")

        # Clean both datasets using uk_address_matcher's pre-trained model.
        # Both calls return DuckDB relations.
        duckdb_canonical_clean = clean_data_using_precomputed_rel_tok_freq(duck_df_all, con=con)
        duckdb_query_clean = clean_data_using_precomputed_rel_tok_freq(duck_df_input, con=con)

        # Create the address matcher (linker).
        linker = get_linker(
            df_addresses_to_match=duckdb_query_clean,
            df_addresses_to_search_within=duckdb_canonical_clean,
            con=con,
            include_full_postcode_block=True,
            additional_columns_to_retain=["original_address_concat"],
        )

        # First-pass match (good for top k matches); the low threshold keeps
        # weak candidates so the second pass can re-score them.
        df_predict = linker.inference.predict(
            threshold_match_weight=-50, experimental_optimisation=True
        )
        # Convert the prediction result to a DuckDB relation for the next stage.
        df_predict_ddb = df_predict.as_duckdbpyrelation()

        # Second-pass refinement using distinguishing tokens (DuckDB relation).
        df_predict_improved = improve_predictions_using_distinguishing_tokens(
            df_predict=df_predict_ddb,
            con=con,
            match_weight_threshold=-20,
        )

        if is_top_k_address_search:
            return df_predict_improved.df()

        # Get the best match per input record (DuckDB relation).
        best_matches = best_matches_with_distinguishability(
            df_predict=df_predict_improved,
            df_addresses_to_match=duck_df_input,
            con=con,
        )
        return best_matches.df()
    finally:
        # The return expressions above are fully evaluated before this runs,
        # so the connection can be released without invalidating the result.
        con.close()


In [0]:
# Run the companion notebook that builds the test data. It creates
# `address_samples`, which the cells below join the matcher output against.
%run ./create_address_test_data

In [0]:
from pyspark.sql.functions import col

# Run the matcher in best-match mode (is_top_k_address_search=False), lift the
# result into a Spark DataFrame and rename the id columns for the evaluation.
# NOTE(review): withColumnRenamed is a no-op when the source column is absent;
# the evaluation cell later selects "original_address_concat_l", so confirm
# that the third rename targets a column that exists in the matcher output.
matched_sdf = (
    spark.createDataFrame(matcher(False))
    .withColumnRenamed("unique_id_l", "matched_uprn")
    .withColumnRenamed("unique_id_r", "test_address_input_id")
    .withColumnRenamed("original_address_concat", "address_concat_l")
)

# Left join: keep every matcher output row and attach the `address_samples`
# row whose unique_id equals the matcher's input id.
df_eval = matched_sdf.join(
    address_samples,
    on=address_samples.unique_id == matched_sdf.test_address_input_id,
    how="left",
)

# Preview the first 10 joined rows without truncating long values.
df_eval.show(n=10, truncate=False)

 

In [0]:
from pyspark.sql.functions import col, when, round as spark_round, mean

def classify_match(df):
    """
    Add a "match_result" column labelling each row as TP, FP, FN or TN.

    The label is derived from "matched_uprn" and "expected_uprn", with the
    rules evaluated in this order:
      * both null                      -> "TN"
      * matched present, expected null -> "FP"
      * matched null, expected present -> "FN"
      * both present and equal         -> "TP"
      * both present and different     -> "FP"
      * anything else                  -> "TN" (fallback; presumably never
        reached, since the cases above cover every null/non-null combination)

    Args:
        df: DataFrame with "matched_uprn" and "expected_uprn" columns.

    Returns:
        The DataFrame with the extra "match_result" column.
    """
    matched = col("matched_uprn")
    expected = col("expected_uprn")
    matched_missing = matched.isNull()
    expected_missing = expected.isNull()

    label = (
        when(matched_missing & expected_missing, "TN")
        .when(matched.isNotNull() & expected_missing, "FP")
        .when(matched_missing & expected.isNotNull(), "FN")
        .when(expected == matched, "TP")
        .when(expected != matched, "FP")
        .otherwise("TN")
    )
    return df.withColumn("match_result", label)

# Label every evaluation row as TP / FP / FN / TN.
df_eval = classify_match(df_eval)

# Round match weights to 2 decimals for display; round() leaves nulls as null,
# so no explicit null guard is needed.
df_eval = df_eval.withColumn("match_weight", spark_round(col("match_weight"), 2))

# Count all four outcomes in a single Spark job instead of one job per label.
# Labels that never occur are absent from the grouped result, hence .get(..., 0).
result_counts = {
    row["match_result"]: row["count"]
    for row in df_eval.groupBy("match_result").count().collect()
}
tp = result_counts.get("TP", 0)
fp = result_counts.get("FP", 0)
fn = result_counts.get("FN", 0)
tn = result_counts.get("TN", 0)

# Show the misclassified rows (FPs and FNs) with the columns needed to review them.
review_columns = [
    "address_concat_r",
    "original_address_concat_l",
    "matched_uprn",
    "expected_uprn",
    "match_weight",
    "match_result",
]
for label, n_rows in (("FP", fp), ("FN", fn)):
    if n_rows > 0:
        print(f"{label} List")
        display(df_eval.filter(col("match_result") == label).select(*review_columns))

# Mean match weight and mean distinguishability, computed in one pass.
match_weight, distinguishability = df_eval.select(
    spark_round(mean("match_weight"), 2),
    spark_round(mean("distinguishability"), 2),
).collect()[0]

# Guard every ratio against an empty denominator (recall previously raised
# ZeroDivisionError when there were no TP or FN rows).
precision = round(tp / (tp + fp), 2) if (tp + fp) != 0 else 0
recall = round(tp / (tp + fn), 2) if (tp + fn) != 0 else 0
# F1 from the raw counts (2TP / (2TP + FP + FN)) rather than from the already
# rounded precision and recall, which would compound rounding error.
f1 = round(2 * tp / (2 * tp + fp + fn), 2) if tp != 0 else 0

print("Predictive successes:")
print("TP/True Positive: Model correctly predicted a match.")
print(tp)
print("TN/True Negative: Model correctly predicted no match.")
print(tn)
print("Predictive errors:")
print("FP/False Positive: Model predicted a match when there were none (or predicted the wrong match).")
print(fp)
print("FN/False Negative: Model predicted no match, when there was one.")
print(fn)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")
print(f"Match weight: {match_weight}")
print(f"Distinguishability: {distinguishability}")