## Ensure prerequisites are installed
1. Install libpostal
    - Instructions
        - [Mac/Linux instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux)
        - [Windows instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-windows)
    - In building these examples, we used the optional [Senzing data model](https://github.com/Senzing/libpostal-data) for libpostal. Instructions for using this model are [here](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-with-an-alternative-data-model).

2. Install Python dependencies
    ```bash
    pip3 install -r requirements.txt
    ```

## Create a random sample of known addresses with random string edits

In [None]:
import pandas as pd
import random
import string

def random_string_edit(text, probability=1.0):
    """Apply one random character-level edit to ``text`` with a given probability.

    One of four operations is chosen uniformly at random: insert a character,
    delete a character, substitute a character, or transpose two adjacent
    characters. Inserted and substituted characters are drawn from uppercase
    ASCII letters, digits, and the space character.

    Args:
        text: The string to edit.
        probability: Chance (0.0 to 1.0) that an edit is attempted at all.

    Returns:
        The edited string, or ``text`` unchanged when no edit is attempted or
        the chosen operation cannot be applied (delete/substitute on an empty
        string, transpose on fewer than two characters).
    """
    if random.random() >= probability:
        return text  # No edit applied

    alphabet = string.ascii_uppercase + string.digits + ' '
    operation = random.choice(['insert', 'delete', 'substitute', 'transpose'])

    if operation == 'insert':
        # Insertion has len(text) + 1 valid slots, including the very end.
        index = random.randrange(len(text) + 1)
        return text[:index] + random.choice(alphabet) + text[index:]
    if operation == 'delete' and text:
        # The index must address an existing character; an index equal to
        # len(text) would leave the string unchanged.
        index = random.randrange(len(text))
        return text[:index] + text[index + 1:]
    if operation == 'substitute' and text:
        # Same bound as delete; otherwise the substitution becomes an append.
        index = random.randrange(len(text))
        return text[:index] + random.choice(alphabet) + text[index + 1:]
    if operation == 'transpose' and len(text) >= 2:
        index = random.randrange(len(text) - 1)
        return text[:index] + text[index + 1] + text[index] + text[index + 2:]
    return text  # Chosen operation is not applicable to this text

def transform_row(row):
    """Build a full address string for ``row`` with a randomly edited street part.

    The street name, street type and quadrant are joined and passed through
    ``random_string_edit`` twice; the address number, optional number suffix,
    city, state and ZIP code are left untouched.

    Args:
        row: A record exposing ADDRESS_NUMBER, ADDRESS_NUMBER_SUFFIX,
            STREET_NAME, STREET_TYPE, QUADRANT, CITY, STATE and ZIPCODE
            as attributes.

    Returns:
        A string of the form
        ``"<number> [<suffix>] <edited street>, <city>, <state> <zip>"``.
    """
    # ADDRESS_NUMBER,ADDRESS_NUMBER_SUFFIX,STREET_NAME,STREET_TYPE,QUADRANT
    segment_to_edit = f"{row.STREET_NAME} {row.STREET_TYPE} {row.QUADRANT}"
    edited_segment = random_string_edit(random_string_edit(segment_to_edit))

    street_parts = [str(row.ADDRESS_NUMBER)]
    suffix = row.ADDRESS_NUMBER_SUFFIX
    # pd.isna covers None, NaN and pd.NA. Comparing with `!= pd.NA` yields
    # pd.NA, which cannot be used as a condition.
    if not pd.isna(suffix) and str(suffix).strip():
        street_parts.append(str(suffix).strip())
    street_parts.append(edited_segment)

    # Joining only the parts that are present avoids a double space when the
    # suffix is missing.
    return f"{' '.join(street_parts)}, {row.CITY}, {row.STATE} {row.ZIPCODE}"
    

# Load the MAR address points; ADDRESS_NUMBER and ZIPCODE are read with the
# pandas 'string' dtype.
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={
        'ADDRESS_NUMBER': 'string',
        # 'ADDRESS_NUMBER_SUFFIX': 'string',
        'ZIPCODE': 'string',
    },
)

# Keep only the rows whose ADDRESS_TYPE is 'ADDRESS'.
mar_address_points = mar_address_points.loc[
    mar_address_points['ADDRESS_TYPE'].eq('ADDRESS')
]

# Draw 25 rows at random and build an edited address string for each.
sampled = mar_address_points.sample(n=25)
transformed = sampled.apply(transform_row, axis=1)

# Write the edited address next to the original ADDRESS value.
df_output = pd.DataFrame({
    'address': transformed,
    'original': sampled['ADDRESS'],
})
df_output.to_csv('./data/address_samples.csv', index=False)

## Parsing addresses for pre-filtering

In [None]:
import os
import pandas as pd
from collections import namedtuple
from postal.parser import parse_address

# Directory that receives the CSV files written by this cell.
output_directory = './output'

# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_directory, exist_ok=True)

# Cache of ParseResult namedtuple classes keyed by their tuple of labels, so a
# class is built once per distinct label sequence instead of once per row.
_PARSE_RESULT_TYPES = {}


def _parse_result_type(labels):
    """Return the ``ParseResult`` namedtuple class for ``labels``, building it once."""
    labels = tuple(labels)
    result_type = _PARSE_RESULT_TYPES.get(labels)
    if result_type is None:
        # namedtuple() raises ValueError for duplicate or invalid field names;
        # nothing is cached in that case, so the error is raised on every call.
        result_type = namedtuple('ParseResult', labels)
        _PARSE_RESULT_TYPES[labels] = result_type
    return result_type


def process_row(row):
    """Parse a row's address with libpostal and report whether it has a street.

    Args:
        row: A record exposing ADDRESS, CITY, STATE and ZIPCODE as attributes.

    Returns:
        A ``(parse_result, has_street)`` tuple. ``parse_result`` is a
        ``ParseResult`` namedtuple whose field names are the labels returned by
        ``parse_address`` and whose values are the matching components.
        ``has_street`` is True only when both ``house_number`` and ``road``
        are present. If a ``ValueError`` is raised while parsing or building
        the result, ``(None, False)`` is returned.
    """
    try:
        formatted = f"{row.ADDRESS} {row.CITY} {row.STATE} {row.ZIPCODE}"
        parsed = parse_address(formatted, language='en', country='us')

        # Each parsed item is indexed as (value, label).
        labels = [item[1] for item in parsed]
        values = [item[0] for item in parsed]

        parse_result = _parse_result_type(labels)(*values)
    except ValueError:
        # Presumably guards against repeated labels in the parser output,
        # which namedtuple rejects -- confirm against real data.
        return None, False

    has_street = hasattr(parse_result, 'house_number') and hasattr(parse_result, 'road')
    return parse_result, has_street

# Load the MAR address points; ADDRESS_NUMBER and ZIPCODE are read with the
# pandas 'string' dtype.
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={
        'ADDRESS_NUMBER': 'string',
        # 'ADDRESS_NUMBER_SUFFIX': 'string',
        'ZIPCODE': 'string',
    },
)

# Collect, as dicts, the parse results that contain both a house number and
# a road.
street_present = []
for row in mar_address_points.itertuples():
    processed_data, has_street = process_row(row)
    if not has_street:
        continue
    street_present.append(processed_data._asdict())

# Write the collected rows, or report that there was nothing to write.
if not street_present:
    print('No output to write for street_present')
else:
    df_street_present = pd.DataFrame(street_present)
    df_street_present.to_csv(os.path.join(output_directory, 'street_present.csv'))


## Building an inverted index for near-dupe matching
This sample uses the [open-data list of address points](https://opendata.dc.gov/datasets/DCGIS::address-points/about) from the Washington, D.C. [Master Address Repository (MAR)](https://opendata.dc.gov/pages/addressing-in-dc). 



In [None]:
# import pandas as pd
# import pickle
# from collections import defaultdict
# from functools import reduce
# # from fuzzywuzzy import fuzz
# from postal.parser import parse_address
# from postal.expand import expand_address
# from postal.near_dupe import near_dupe_hashes

# mar_address_points = pd.read_csv('./data/mar_address_points.csv', header=0, low_memory=False, dtype={
#     'ADDRESS_NUMBER': 'string',
#     'ADDRESS_NUMBER_SUFFIX': 'string',
#     'ZIPCODE': 'string'
# })
# mar_address_points = mar_address_points[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

# flat_map = lambda f, xs, **kwargs: reduce(lambda a, b: a + b, map(lambda x: f(x, **kwargs), xs))

# def get_expansion_hash(expansion, exclude_house_number=False):
#     parsed_address = parse_address(expansion, language='en', country='us')
#     labels=[]
#     values=[]
#     for value, label in parsed_address:
#         if exclude_house_number == True and label == 'house_number':
#             continue
#         labels.append(label)
#         values.append(value)
#     return near_dupe_hashes(labels, values, languages=['en'], address_only_keys=True, )

# def get_hashes(address, exclude_house_number=False):
#     expansions = expand_address(address, languages=['en'])
#     return flat_map(get_expansion_hash, expansions, exclude_house_number=exclude_house_number)

# def build_inverted_index() -> defaultdict:
#     index = defaultdict(set)

#     for row in mar_address_points.itertuples():
#         formatted=f"{row.ADDRESS}, {row.CITY}, {row.STATE} {row.ZIPCODE}"
#         full_hashes = set(get_hashes(formatted))
#         no_house_number_hashes = set(get_hashes(formatted, exclude_house_number=True))
#         hashes = list(full_hashes | no_house_number_hashes)
#         result = (row.MAR_ID, row.ADDRESS, str(row.ADDRESS_NUMBER), row.ADDRESS_NUMBER_SUFFIX, row.STREET_NAME, row.STREET_TYPE, row.QUADRANT, row.CITY, row.STATE, row.ZIPCODE, row.COUNTRY)

#         for hash in hashes:
#             index[hash].add(result)

#     return index

# def load_or_build_inverted_index() -> defaultdict:
#     try:
#         with open('./data/address_index.pkl', 'rb') as read:
#             print('Loading MAR index from file...')
#             return pickle.load(read)
#     except Exception:
#         print('Building MAR index from scratch...')
#         index = build_inverted_index()
#         with open('./data/address_index.pkl', 'wb') as write:
#             pickle.dump(index, write)
#         return index

# # df_samples = pd.read_csv("./data/address_samples.csv")

# inverted_index = load_or_build_inverted_index()



Loading MAR index from file...


In [12]:
import duckdb
import pandas as pd
from postal.parser import parse_address
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
import splink.comparison_library as cl

def parse_zip_code(address):
    """Return the first component of ``address`` labelled 'postcode', or None.

    The address is parsed with ``parse_address``; each parsed item is indexed
    as (value, label).
    """
    for component in parse_address(address):
        if component[1] == 'postcode':
            return component[0]
    return None

# Load the MAR address points with number, suffix and ZIP columns read as the
# pandas 'string' dtype.
df_mar = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={
        'ADDRESS_NUMBER': 'string',
        'ADDRESS_NUMBER_SUFFIX': 'string',
        'ZIPCODE': 'string',
    },
)

# Compose "<ADDRESS>, <CITY>, <STATE> <ZIPCODE>" as a single address column.
df_mar["FULL_ADDRESS"] = (
    df_mar["ADDRESS"].astype(str)
    + ', '
    + df_mar["CITY"].astype(str)
    + ', '
    + df_mar["STATE"].astype(str)
    + ' '
    + df_mar["ZIPCODE"]
)

# Keep only the columns used for linking and rename them to the names used in
# the Splink settings.
mar_col_map = {
    "MAR_ID": "unique_id",
    "FULL_ADDRESS": "address",
    "ZIPCODE": "zip",
}
mar_columns = list(mar_col_map)
mar_new_columns = list(mar_col_map.values())
df_mar = df_mar[mar_columns].rename(columns=mar_col_map)

# Load the edited sample addresses, derive a ZIP for each one with
# parse_zip_code, and number the rows starting at 1.
df_samples = pd.read_csv("./data/address_samples.csv")
df_samples["zip"] = df_samples["address"].map(parse_zip_code)
df_samples["unique_id"] = range(1, len(df_samples) + 1)

# Expose both pandas DataFrames as DuckDB relations. The SQL text refers to
# the DataFrames by their Python variable names (df_mar, df_samples), so
# those names must stay in scope and unchanged.
df_mar_duck = duckdb.sql("SELECT * FROM df_mar")
df_samples_duck = duckdb.sql("SELECT * FROM df_samples")

# Connection handed to Splink's DuckDB backend.
# NOTE(review): ":default:" presumably selects the same module-level default
# connection that duckdb.sql() used above -- confirm against the DuckDB docs.
con = duckdb.connect(":default:")
db_api = DuckDBAPI(connection=con)

# Splink model configuration for linking the two input tables.
settings = SettingsCreator(
    link_type="link_only",
    # Compare addresses by Levenshtein distance at thresholds 2 and 4, and
    # ZIP codes by exact match.
    comparisons=[
        cl.LevenshteinAtThresholds("address", [2, 4]),
        cl.ExactMatch("zip"),
    ],
    # Candidate pairs are generated by blocking on the "zip" column.
    blocking_rules_to_generate_predictions=[block_on("zip")],
    probability_two_random_records_match=0.0002,
    # Keep the matching and intermediate calculation columns in the output.
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
)

# Build the linker over the MAR table and the sample table.
linker = Linker([df_mar_duck, df_samples_duck], settings, db_api)

# Execute the record linkage
splink_df_predict = linker.inference.predict()

# Keep only pairs whose match probability is above the threshold, reduced to
# the identifying and address columns.
threshold = 0.65
output_cols = [
    "unique_id_l",
    "unique_id_r",
    "match_probability",
    "address_l",
    "address_r",
]
df_predict = splink_df_predict.as_pandas_dataframe()
df_predict = df_predict.loc[df_predict["match_probability"] > threshold, output_cols]

# Display the results
print(df_predict)


Blocking time: 0.05 seconds
Predict time: 0.77 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'address':
    m values not fully trained
Comparison: 'address':
    u values not fully trained
Comparison: 'zip':
    m values not fully trained
Comparison: 'zip':
    u values not fully trained


        unique_id_l  unique_id_r  match_probability  \
54863         75011            8           0.995255   
286137       295422            1           0.995255   

                                            address_l  \
54863   1038 BARNABY TERRACE SE, WASHINGTON, DC 20032   
286137       4511 3RD STREET SE, WASHINGTON, DC 20032   

                                            address_r  
54863   1038 BARNABY TERRACE SE, WASHINGTON, DC 20032  
286137       4511 3RD STREET SE, WASHINGTON, DC 20032  
