In [0]:
%run ./grant_access_to_address_data


In [0]:
# Install all required dependencies
%pip install --pre uk_address_matcher duckdb



In [0]:
%run ./create_address_test_data

In [0]:
# Imports for DuckDB-based address matching with uk_address_matcher
import duckdb
from uk_address_matcher import (
    clean_data_using_precomputed_rel_tok_freq,
    get_linker,
    best_matches_with_distinguishability,
    improve_predictions_using_distinguishing_tokens,
)
import pandas as pd

def matcher(is_top_k_address_search, query_addresses_pdf, canonical_addresses_pdf=None):
    """Match query addresses against the canonical address set.

    Both datasets are registered in a throwaway in-memory DuckDB database,
    cleaned with uk_address_matcher, scored in a first pass, and then refined
    in a second pass using distinguishing tokens.

    Args:
        is_top_k_address_search: When truthy, return every candidate pair left
            after the second-pass refinement. Otherwise return the output of
            best_matches_with_distinguishability for the query records.
        query_addresses_pdf: DataFrame of addresses to match. It is registered
            as "query_df" and handed to the cleaning step unchanged.
            NOTE(review): presumably it needs the same unique_id /
            address_concat / postcode columns as the canonical set - confirm.
        canonical_addresses_pdf: Optional DataFrame with "uprn", "fulladdress"
            and "postcode" columns to search within. Defaults to the
            notebook-level ``all_address_pdf`` so existing callers keep working.

    Returns:
        The match results materialised via the DuckDB relation's ``.df()``.
    """
    if canonical_addresses_pdf is None:
        # Backward-compatible fallback to the frame defined elsewhere in the notebook.
        canonical_addresses_pdf = all_address_pdf

    # Keep only the fields needed for matching and rename them to the
    # column names uk_address_matcher expects.
    canonical_subset_pdf = canonical_addresses_pdf[
        ["uprn", "fulladdress", "postcode"]
    ].rename(columns={"uprn": "unique_id", "fulladdress": "address_concat"})

    con = duckdb.connect(database=':memory:')
    try:
        # Register canonical and query datasets so DuckDB can see them as tables.
        con.register("canonical_df", canonical_subset_pdf)
        con.register("query_df", query_addresses_pdf)

        # Clean both datasets using the precomputed relative token frequencies.
        duckdb_canonical_clean = clean_data_using_precomputed_rel_tok_freq(
            con.table("canonical_df"), con=con
        )
        duckdb_query_clean = clean_data_using_precomputed_rel_tok_freq(
            con.table("query_df"), con=con
        )

        # Create the address matcher (linker).
        linker = get_linker(
            df_addresses_to_match=duckdb_query_clean,
            df_addresses_to_search_within=duckdb_canonical_clean,
            con=con,
            include_full_postcode_block=True,
            additional_columns_to_retain=["original_address_concat"]
        )

        # First-pass match. The threshold is very low (-50), presumably so that
        # weak candidates survive for the refinement step - TODO confirm.
        df_predict = linker.inference.predict(
            threshold_match_weight=-50, experimental_optimisation=True
        )
        # Convert the prediction result into a DuckDB relation for the next step.
        df_predict_ddb = df_predict.as_duckdbpyrelation()

        # Second-pass refinement using distinguishing tokens.
        df_predict_improved = improve_predictions_using_distinguishing_tokens(
            df_predict=df_predict_ddb,
            con=con,
            match_weight_threshold=5
        )

        if is_top_k_address_search:
            # .df() materialises the result before the connection is closed.
            return df_predict_improved.df()

        # Reduce the refined candidates to the best match per input record.
        best_matches = best_matches_with_distinguishability(
            df_predict=df_predict_improved,
            df_addresses_to_match=con.table("query_df"),
            con=con
        )
        return best_matches.df()
    finally:
        # Release the in-memory database; previously every call leaked one.
        con.close()


In [0]:
import pandas as pd

# Store the query ids with the nullable integer dtype before joining on them.
address_search_with_uprn_pdf["unique_id"] = address_search_with_uprn_pdf["unique_id"].astype("Int64")

# The matcher input must not carry the expected UPRN, so drop it if present.
address_search_pdf = address_search_with_uprn_pdf.drop(columns=["expected_uprn"], errors='ignore')

# Best match per query record, with the id columns given descriptive names.
matched_pdf = matcher(False, address_search_pdf).rename(
    columns={"unique_id_l": "matched_uprn", "unique_id_r": "input_unique_id"}
)

# Left join keeps every query row, including those that received no match.
df_eval = pd.merge(
    address_search_with_uprn_pdf,
    matched_pdf,
    how="left",
    left_on="unique_id",
    right_on="input_unique_id",
)
display(df_eval)
 

In [0]:

def classify_match(row):
    """Classify one evaluation row by comparing matched and expected UPRNs.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing
            "matched_uprn" and "expected_uprn".

    Returns:
        One of:
            "TP": matched UPRN equals the expected UPRN (correct match)
            "FP": a match was returned but it is wrong, or none was expected
            "FN": a match was expected but none was returned
            "TN": no match expected and none returned
    """
    has_match = pd.notnull(row["matched_uprn"])
    has_expected = pd.notnull(row["expected_uprn"])

    if not has_match and not has_expected:
        return "TN"
    if not has_expected:
        # Something was matched although nothing should have been.
        return "FP"
    if not has_match:
        return "FN"

    # Both present. NOTE(review): equality assumes both columns hold
    # comparable numeric values (not e.g. str vs int) - confirm upstream.
    if row["expected_uprn"] == row["matched_uprn"]:
        return "TP"

    # A wrong match is a false positive; it was previously mislabelled "TN",
    # which hid wrong matches from the precision calculation.
    return "FP"
 
# Label every evaluation row as TP / FP / FN / TN.
df_eval["match_result"] = df_eval.apply(classify_match, axis=1)
# Round each row's match weight to 2 decimal places (missing values stay missing).
df_eval["match_weight"] = df_eval["match_weight"].round(2)

# Count outcomes; labels that never occur count as 0.
result_counts = df_eval["match_result"].value_counts()
tp = int(result_counts.get("TP", 0))
fp = int(result_counts.get("FP", 0))
fn = int(result_counts.get("FN", 0))

# Guard the denominators so an empty class yields NaN instead of a
# divide-by-zero warning.
precision_raw = tp / (tp + fp) if (tp + fp) else float("nan")
recall_raw = tp / (tp + fn) if (tp + fn) else float("nan")
# F1 from the raw counts (equivalent to 2PR / (P + R)), so it is not distorted
# by precision and recall having already been rounded.
f1_raw = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) else float("nan")

match_weight = round(df_eval["match_weight"].mean(), 2)
precision = round(precision_raw, 2)
recall = round(recall_raw, 2)
f1 = round(f1_raw, 2)
distinguishability = round(df_eval["distinguishability"].mean(), 2)

print(f"TP: {tp}")
print(f"FP: {fp}")
print(f"FN: {fn}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")
print(f"Match weight: {match_weight}")
print(f"Distinguishability: {distinguishability}")