### Ensure prerequisites are installed
1. Install libpostal
    - Instructions
        - [Mac/Linux instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux)
        - [Windows instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-windows)
    - In building these examples, we used the optional [Senzing data model](https://github.com/Senzing/libpostal-data) for libpostal. Instructions for using this model are [here](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-with-an-alternative-data-model).

2. Install Python dependencies
    ```bash
    pip3 install -r requirements.txt
    ```

> **Note**: These samples use the [open-data list of address points](https://opendata.dc.gov/datasets/DCGIS::address-points/about) from the Washington, D.C. [Master Address Repository (MAR)](https://opendata.dc.gov/pages/addressing-in-dc). 

### Parsing addresses for pre-filtering
This sample demonstrates the usage of `libpostal` to parse unstructured address data. The MAR address point CSV is structured (city, state, and ZIP code are separate fields), so we format each address into a single string before parsing it with `libpostal`, which splits the address into labelled components.

The output will be saved to `./output/parsed.csv`.

In [15]:
import os
import pandas as pd
from collections import namedtuple
from postal.parser import parse_address

# Directory that receives the CSV files produced by this notebook.
output_directory = './output'

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(output_directory, exist_ok=True)

def process_row(row):
    """
    Parse one MAR address row with libpostal.

    The row's ADDRESS, CITY, STATE and ZIPCODE attributes are joined into a
    single space-separated string and passed to `parse_address`.

    Args:
        row: An object exposing ADDRESS, CITY, STATE and ZIPCODE attributes
            (for example a tuple produced by `DataFrame.itertuples()`).

    Returns:
        tuple: `(parse_result, has_street)`. `parse_result` is a namedtuple
        whose field names are the labels returned by the parser, and
        `has_street` is True only when both `house_number` and `road` are
        among those fields. `(None, False)` is returned when a ValueError is
        raised while parsing or while building the namedtuple.
    """
    try:
        address_text = f"{row.ADDRESS} {row.CITY} {row.STATE} {row.ZIPCODE}"
        components = parse_address(address_text, language='en', country='us')

        # Each component is indexed as (value, label); the labels become the
        # field names. namedtuple raises ValueError for repeated field names,
        # so a parse that repeats a label is handled by the except below.
        ParseResult = namedtuple('ParseResult', [component[1] for component in components])
        parse_result = ParseResult(*(component[0] for component in components))
    except ValueError:
        return None, False

    has_street = all(hasattr(parse_result, field) for field in ('house_number', 'road'))
    return parse_result, has_street

# Load the MAR address points, reading the listed columns with the pandas
# 'string' dtype rather than letting pandas infer a type for them.
string_columns = ('ADDRESS_NUMBER', 'ADDRESS_NUMBER_SUFFIX', 'ZIPCODE')
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={column: 'string' for column in string_columns},
)

# Keep only the rows for which process_row reports a street
# (both a house number and a road were found).
parsed = []
for row in mar_address_points.itertuples():
    processed_data, has_street = process_row(row)
    if not has_street:
        continue
    parsed.append(processed_data._asdict())

# Write the parsed components to CSV, or report that there was nothing to write.
if not parsed:
    print('No output to write for df_parsed')
else:
    df_parsed = pd.DataFrame(parsed)
    df_parsed.to_csv(os.path.join(output_directory, 'parsed.csv'))


### Define a mapping of common abbreviations to their USPS standard expansions
`USPS_SUFFIX_MAP` defines a dictionary of common road suffix abbreviations and their corresponding standard expansions.
The `standardize_street_designators` function tokenizes a string, replaces abbreviation tokens with their expansions, and reassembles the string. This allows normalizing `123 Main St` to `123 Main Street`.

In [5]:
# Maps lowercase street-designator abbreviations (and a few already-expanded
# words) to their uppercase expansions. Keys must be lowercase because the
# lookup in standardize_street_designators lowercases each token first.
USPS_SUFFIX_MAP = {
    # Alleys
    "aly": "ALLEY", "allee": "ALLEY",

    # Annex
    "anx": "ANNEX",

    # Arcades
    "arc": "ARCADE",

    # Avenues
    "ave": "AVENUE", "av": "AVENUE", "avn": "AVENUE",

    # Beaches
    "bch": "BEACH",

    # Bluffs
    "blf": "BLUFF", "blfs": "BLUFFS",

    # Bottoms
    "btm": "BOTTOM", "btms": "BOTTOMS",

    # Boulevards
    "blvd": "BOULEVARD", "boul": "BOULEVARD",

    # Branches
    "br": "BRANCH",

    # Bridges
    "brg": "BRIDGE",

    # Brooks
    "brk": "BROOK", "brks": "BROOKS",

    # Burgs
    "bg": "BURG", "bgs": "BURGS",

    # Bypasses
    "byp": "BYPASS", "bypa": "BYPASS", "bypas": "BYPASS",

    # Camps
    "cp": "CAMP",

    # Canyons
    "cyn": "CANYON",

    # Capes
    "cpe": "CAPE",

    # Causeways ("cswy" was previously listed twice; the second entry is now
    # the USPS-listed variant "causwa")
    "cswy": "CAUSEWAY", "causwa": "CAUSEWAY",

    # Centers
    "ctr": "CENTER", "cent": "CENTER", "cntr": "CENTER", "centr": "CENTER",

    # Circles
    "cir": "CIRCLE", "circ": "CIRCLE", "circl": "CIRCLE",

    # Courts
    "ct": "COURT", "cts": "COURTS",

    # Coves
    "cov": "COVE", "covs": "COVES",

    # Creeks
    "crk": "CREEK",

    # Crescents
    "cres": "CRESCENT", "crsnt": "CRESCENT", "crscnt": "CRESCENT",

    # Crests
    "crst": "CREST",

    # Crossings
    "xing": "CROSSING", "xng": "CROSSING",

    # Dale
    "dl": "DALE", "dles": "DALES",

    # Dams
    "dm": "DAM",

    # Divides
    "dv": "DIVIDE", "dvd": "DIVIDE",

    # Drives
    "dr": "DRIVE", "driv": "DRIVE", "drv": "DRIVE",

    # Estates
    "est": "ESTATE", "ests": "ESTATES",

    # Expressways
    "expy": "EXPRESSWAY", "expr": "EXPRESSWAY", "express": "EXPRESSWAY",

    # Extensions
    "ext": "EXTENSION", "exts": "EXTENSIONS",

    # Falls
    "fall": "FALL", "fls": "FALLS",

    # Ferries
    "frry": "FERRY", "fry": "FERRY",

    # Fields
    "fld": "FIELD", "flds": "FIELDS",

    # Flats
    "flat": "FLAT", "flt": "FLAT", "flts": "FLATS",

    # Fords
    "frd": "FORD", "frds": "FORDS",

    # Forests
    "frst": "FOREST",

    # Forges
    "frg": "FORGE", "frgs": "FORGES",

    # Forks
    "frk": "FORK", "frks": "FORKS",

    # Forts
    "ft": "FORT",

    # Freeways
    "fwy": "FREEWAY",

    # Gardens
    "gdn": "GARDEN", "gdns": "GARDENS", "grdn": "GARDEN", "grdns": "GARDENS",

    # Gateways
    "gtwy": "GATEWAY", "gatwy": "GATEWAY", "gatewy": "GATEWAY",

    # Glens
    "gln": "GLEN", "glns": "GLENS",

    # Greens
    "grn": "GREEN", "grns": "GREENS",

    # Groves
    "grv": "GROVE", "grvs": "GROVES",

    # Harbors
    "hbr": "HARBOR", "hbrs": "HARBORS", "hrbr": "HARBOR",

    # Havens
    "hvns": "HAVENS", "hvn": "HAVEN",

    # Heights
    "hgt": "HEIGHTS", "hgts": "HEIGHTS", "ht": "HEIGHTS", "hts": "HEIGHTS",

    # Highways
    "hwy": "HIGHWAY", "highwy": "HIGHWAY", "hiwy": "HIGHWAY",

    # Hills
    "hl": "HILL", "hls": "HILLS",

    # Hollows
    "holw": "HOLLOW", "hollw": "HOLLOW", "holws": "HOLLOWS",

    # Inlets
    "inlt": "INLET",

    # Islands
    "is": "ISLAND", "isls": "ISLANDS", "islnd": "ISLAND",

    # Isles
    "isle": "ISLE", "isles": "ISLES",

    # Junctions
    "jct": "JUNCTION", "jctn": "JUNCTION", "junctn": "JUNCTION", "juncton": "JUNCTION", "jcts": "JUNCTIONS",

    # Keys
    "ky": "KEY", "kys": "KEYS",

    # Knolls
    "knl": "KNOLL", "knls": "KNOLLS",

    # Lakes
    "lk": "LAKE", "lks": "LAKES",

    # Lands
    "land": "LAND",

    # Landings
    "lndg": "LANDING", "landg": "LANDING", "lndng": "LANDING",

    # Lanes
    "ln": "LANE",

    # Light (singular)
    "lgt": "LIGHT",

    # Lights (plural)
    "lgts": "LIGHTS",

    # Loaf
    "lf": "LOAF",

    # Locks
    "lck": "LOCK", "lcks": "LOCKS",

    # Lodges
    "ldg": "LODGE", "ldge": "LODGE",

    # Loops
    "loop": "LOOP",

    # Malls
    "mall": "MALL",

    # Manors
    "mnr": "MANOR", "mnrs": "MANORS",

    # Meadows
    "mdw": "MEADOW", "mdws": "MEADOWS",

    # Mews
    "mews": "MEWS",

    # Mills
    "ml": "MILL", "mls": "MILLS",

    # Missions
    "msn": "MISSION", "mssn": "MISSION",

    # Mount
    "mt": "MOUNT",

    # Mountains
    "mtn": "MOUNTAIN", "mntn": "MOUNTAIN", "mountin": "MOUNTAIN", "mtns": "MOUNTAINS",

    # Necks
    "nck": "NECK",

    # Orchards
    "orch": "ORCHARD", "orchrd": "ORCHARD",

    # Ovals
    "oval": "OVAL",

    # Parks
    "park": "PARK", "prk": "PARK",

    # Parkways
    "pky": "PARKWAY", "pkway": "PARKWAY", "pkwy": "PARKWAY", "pkwys": "PARKWAYS",

    # Passes and passages
    "pas": "PASS", "psg": "PASSAGE",

    # Paths
    "path": "PATH", "paths": "PATHS",

    # Pikes
    "pike": "PIKE", "pikes": "PIKES",

    # Pines
    "pine": "PINE", "pines": "PINES",

    # Places
    "pl": "PLACE",

    # Plains
    "pln": "PLAIN", "plns": "PLAINS",

    # Plazas
    "plz": "PLAZA", "plza": "PLAZA",

    # Points
    "pt": "POINT", "pts": "POINTS",

    # Port
    "prt": "PORT", "prts": "PORTS",

    # Prairie
    "pr": "PRAIRIE", "prair": "PRAIRIE", "prr": "PRAIRIE",

    # Ranches
    "rnch": "RANCH", "ranchs": "RANCHES",

    # Rapids
    "rpd": "RAPID", "rpds": "RAPIDS",

    # Rest
    "rst": "REST",

    # Ridges
    "rdg": "RIDGE", "rdgs": "RIDGES",

    # River
    "riv": "RIVER", "rvr": "RIVER", "rivr": "RIVER",

    # Roads
    "rd": "ROAD", "rds": "ROADS",

    # Row
    "row": "ROW",

    # Rue
    "rue": "RUE",

    # Runs
    "run": "RUN", "runs": "RUNS",

    # Shoals and shores
    "shl": "SHOAL", "shls": "SHOALS", "shr": "SHORE", "shrs": "SHORES",

    # Skyway
    "skwy": "SKYWAY",

    # Springs
    "spg": "SPRING", "spng": "SPRING", "sprng": "SPRING", "spgs": "SPRINGS", "spngs": "SPRINGS", "sprngs": "SPRINGS",

    # Spurs
    "spur": "SPUR", "spurs": "SPURS",

    # Squares
    "sq": "SQUARE", "sqr": "SQUARE", "sqre": "SQUARE", "sqrs": "SQUARES",

    # Streets
    "st": "STREET", "strt": "STREET",

    # Summits
    "smt": "SUMMIT", "sumit": "SUMMIT", "sumitt": "SUMMIT",

    # Terraces
    "ter": "TERRACE", "terr": "TERRACE",

    # Throughway
    "thwy": "THROUGHWAY",

    # Traces
    "trce": "TRACE",

    # Tracks
    "trak": "TRACK", "trk": "TRACK", "trks": "TRACKS",

    # Trails
    "trl": "TRAIL", "trls": "TRAILS",

    # Tunnels ("tunl" was previously listed twice; the second entry is now
    # the USPS-listed variant "tunel")
    "tunl": "TUNNEL", "tunel": "TUNNEL", "tunnl": "TUNNEL", "tunnels": "TUNNELS",

    # Turns
    "turn": "TURN",

    # Underpass
    "upas": "UNDERPASS",

    # Union
    "un": "UNION", "uns": "UNIONS",

    # Valleys
    "val": "VALLEY", "vly": "VALLEY", "vlly": "VALLEY", "vlys": "VALLEYS",

    # Viaduct
    "via": "VIADUCT", "viadct": "VIADUCT",

    # Views
    "vws": "VIEWS", "vw": "VIEW",

    # Villages
    "vlg": "VILLAGE", "vllg": "VILLAGE", "vllge": "VILLAGE", "vlgs": "VILLAGES",

    # Ville
    "vl": "VILLE",

    # Vista
    "vis": "VISTA", "vist": "VISTA", "vst": "VISTA",

    # Walks
    "wk": "WALK", "wlks": "WALKS",

    # Walls
    "wall": "WALL",

    # Ways
    "way": "WAY", "ways": "WAYS",

    # Wells
    "wl": "WELL", "wls": "WELLS",
}

In [6]:
import re

def standardize_street_designators(address_string: str, usps_map: "dict[str, str] | None" = None) -> str:
    """
    Expand street designator abbreviations in an unstructured address string.

    The input is uppercased, and every run of word characters (`\\w+`) whose
    lowercase form is a key of the mapping is replaced by the mapped value.
    Whitespace and punctuation between words are preserved as-is; leading and
    trailing whitespace is stripped from the result.

    Note that every matching token is replaced, not only the one in suffix
    position, so "St Louis" becomes "STREET LOUIS" with the default map.

    Args:
        address_string (str): The input address string.
        usps_map (dict, optional): A dictionary where keys are lowercase
            abbreviations and values are their expansions. Defaults to the
            module-level `USPS_SUFFIX_MAP` when omitted or None.

    Returns:
        str: The uppercased address string with abbreviations expanded.
    """
    if usps_map is None:
        usps_map = USPS_SUFFIX_MAP

    def expand(match):
        # Map keys are lowercase, so look the word up in lowercase and fall
        # back to the (already uppercased) word when it is not an abbreviation.
        word = match.group(0)
        return usps_map.get(word.lower(), word)

    # Substituting only the word tokens leaves all delimiters untouched, which
    # is equivalent to tokenizing into words/delimiters and rejoining.
    return re.sub(r'\w+', expand, address_string.upper()).strip()

### Using Splink record linkage between known address points and sample addresses
[Splink](https://moj-analytical-services.github.io/splink/index.html) is a [probabilistic record linkage](https://en.wikipedia.org/wiki/Record_linkage#Probabilistic_record_linkage) library for Python.

Here we're using it to link incoming sample addresses (which we standardize and then parse with libpostal) to the list of known addresses in the area. We define blocking rules on state, ZIP code, and the first three digits of the ZIP code, and we compare candidate pairs by matching exactly on address number, state, and ZIP code. We fuzzy-match the street name and city name using the Jaro-Winkler string similarity.

In [None]:
import duckdb
import pandas as pd
from postal.parser import parse_address
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
import splink.comparison_library as cl

def parse_zip_code(address):
    """Return the first value the parser labels 'postcode' in `address`, or None if there is none."""
    for component in parse_address(address):
        # Components are indexed as (value, label).
        if component[1] == 'postcode':
            return component[0]
    return None

def parse_address_string(address):
    """
    Standardize and parse a free-form address into the fields used for linkage.

    The address is first passed through `standardize_street_designators` and
    then through `parse_address` (language "en", country "us").

    Returns:
        dict: Keys `address_number`, `street`, `city`, `state` and `zip`, each
        holding the first parsed value carrying the corresponding label
        (`house_number`, `road`, `city`, `state`, `postcode`), or None when
        the parser did not emit that label.
    """
    # Parser label -> output field name, in output column order.
    field_for_label = {
        'house_number': 'address_number',
        'road': 'street',
        'city': 'city',
        'state': 'state',
        'postcode': 'zip',
    }

    components = parse_address(
        standardize_street_designators(address), language="en", country="us"
    )

    # Keep only the first value seen for each label.
    first_value = {}
    for value, label in components:
        first_value.setdefault(label, value)

    return {field: first_value.get(label) for label, field in field_for_label.items()}

# Load the known address points, reading the listed columns with the pandas
# 'string' dtype rather than letting pandas infer a type for them.
df_mar = pd.read_csv('./data/mar_address_points.csv', header=0, low_memory=False, dtype={
    'ADDRESS_NUMBER': 'string',
    'ADDRESS_NUMBER_SUFFIX': 'string',
    'ZIPCODE': 'string'
})

# Build one lowercase street string from name, type and quadrant. Missing
# parts are skipped: casting them with astype(str) would insert the literal
# token "nan" into the street and distort the fuzzy street comparison.
street_parts = df_mar[["STREET_NAME", "STREET_TYPE", "QUADRANT"]]
df_mar["STREET"] = [
    ' '.join(str(part).lower() for part in parts if pd.notna(part))
    for parts in street_parts.itertuples(index=False, name=None)
]

# Reduce the MAR frame to the columns used for linkage and give them the same
# names as the parsed sample columns.
mar_columns=["MAR_ID", "ADDRESS_NUMBER", "STREET", "CITY", "STATE", "ZIPCODE"]
mar_new_columns=["unique_id","address_number","street", "city", "state","zip"]
mar_col_map=dict(zip(mar_columns, mar_new_columns))
df_mar = df_mar[mar_columns].rename(columns=mar_col_map)
# The street column is already lowercase from the step above.
df_mar["city"] = df_mar["city"].str.lower()
df_mar["state"] = df_mar["state"].str.lower()

# Parse every sample address into the same columns and number the rows from 1.
df_samples = pd.read_csv("./data/address_samples.csv")
df_samples = pd.json_normalize(df_samples["address"].apply(parse_address_string))
df_samples["unique_id"] = df_samples.reset_index(drop=True).index + 1

# The SQL text refers to the DataFrames by their Python variable names, so
# `df_mar` and `df_samples` must keep these exact names.
df_mar_duck = duckdb.sql("SELECT * FROM df_mar")
df_samples_duck = duckdb.sql("SELECT * FROM df_samples")

con = duckdb.connect(":default:")
db_api = DuckDBAPI(connection=con)

settings = SettingsCreator(
    link_type="link_only",
    # Candidate pairs are generated for records sharing a state, a full ZIP
    # code, or the first three characters of the ZIP code.
    blocking_rules_to_generate_predictions=[
        block_on("state"),
        block_on("zip"),
        block_on("substr(zip,1,3)")
    ],
    # Exact comparison for address number, state and ZIP; Jaro-Winkler
    # comparison for street and city.
    comparisons=[
        cl.ExactMatch("address_number"),
        cl.JaroWinklerAtThresholds("street"),
        cl.JaroWinklerAtThresholds("city"),
        cl.ExactMatch("state"),
        cl.ExactMatch("zip"),
    ],

    probability_two_random_records_match=1/1e6,
    retain_intermediate_calculation_columns=True,
    retain_matching_columns=True,
)

linker = Linker([df_mar_duck, df_samples_duck], settings, db_api)
linker.training.estimate_u_using_random_sampling(max_pairs=1e7)
# NOTE(review): the returned chart object is discarded here; a notebook only
# auto-displays a cell's last expression -- confirm whether it should be shown explicitly.
linker.visualisations.match_weights_chart()

# Execute the record linkage
splink_df_predict = linker.inference.predict()

# Keep only the pairs whose match probability exceeds the threshold, and only
# the columns needed to inspect the matches.
df_predict = splink_df_predict.as_pandas_dataframe()
threshold = 0.9
df_predict = df_predict[df_predict["match_probability"] > threshold]
output_cols = ["unique_id_l", "unique_id_r", "match_probability", "address_number_l", "street_l", "address_number_r", "street_r"]
df_predict = df_predict[output_cols]

# Display the results
print(df_predict)
