## Ensure prerequisites are installed
1. Install libpostal
    - Instructions
        - [Mac/Linux instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux)
        - [Windows instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-windows)
    - In building these examples, we used the optional [Senzing data model](https://github.com/Senzing/libpostal-data) for libpostal. Instructions for using this model are [here](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-with-an-alternative-data-model).

2. Install Python dependencies
    ```bash
    pip3 install -r requirements.txt
    ```

## Create a random sample of known addresses with various string edits

In [None]:
import pandas as pd
import random
import string

def random_string_edit(text, probability=1.0):
    """Apply one random character-level edit to ``text`` with a given probability.

    The edit is one of: insert a character, delete a character, substitute a
    character, or transpose two adjacent characters. Only operations that are
    applicable to ``text`` are considered (an empty string can only receive an
    insertion; transposition needs at least two characters), and every position
    is drawn from the range that is valid for the chosen operation, so a chosen
    delete or substitute always lands on an existing character.

    Args:
        text: The string to edit.
        probability: Chance in ``[0, 1]`` that an edit is applied at all.

    Returns:
        The edited string, or ``text`` unchanged when no edit is applied.
        Transposing two identical neighbours can still yield an equal string.
    """
    if random.random() >= probability:
        return text  # No edit applied

    operations = ['insert']
    if text:
        operations += ['delete', 'substitute']
    if len(text) >= 2:
        operations.append('transpose')
    operation = random.choice(operations)

    alphabet = string.ascii_uppercase + string.digits + ' '

    if operation == 'insert':
        # +1 so the new character may also be appended after the last one.
        index = random.randrange(len(text) + 1)
        return text[:index] + random.choice(alphabet) + text[index:]

    if operation == 'transpose':
        # Swap the character at ``index`` with its right-hand neighbour.
        index = random.randrange(len(text) - 1)
        return text[:index] + text[index + 1] + text[index] + text[index + 2:]

    # 'delete' and 'substitute' must target an existing character.
    index = random.randrange(len(text))
    if operation == 'delete':
        return text[:index] + text[index + 1:]

    # Exclude the current character so a substitution is never a no-op.
    replacement = random.choice([char for char in alphabet if char != text[index]])
    return text[:index] + replacement + text[index + 1:]

def transform_row(row):
    """Return a one-line address for ``row`` whose ADDRESS part went through two edit passes.

    The ADDRESS value is converted to a string and passed through
    ``random_string_edit`` twice; CITY, STATE and ZIPCODE are appended as-is.
    """
    street = str(row.ADDRESS)
    for _ in range(2):
        street = random_string_edit(street)
    return f"{street}, {row.CITY}, {row.STATE} {row.ZIPCODE}"

# Load the MAR address points; ZIPCODE is read as a string column instead of being inferred.
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={'ZIPCODE': 'string'},
)
# Keep only the rows whose ADDRESS_TYPE is 'ADDRESS'.
mar_address_points = mar_address_points.loc[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

# Draw 25 random rows and build an edited address line for each of them.
sampled = mar_address_points.sample(n=25)
transformed = sampled.apply(transform_row, axis=1)

# Store each edited line next to the unedited ADDRESS value it was derived from.
df_output = pd.DataFrame({'address': transformed, 'original': sampled['ADDRESS']})
df_output.to_csv('./data/address_samples.csv', index=False)

## Parsing addresses for pre-filtering

In [None]:
import os
import pandas as pd
from collections import namedtuple
from postal.expand import expand_address
from postal.parser import parse_address

# Directory that receives the CSV files written by this cell.
output_directory = './output'

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the gap between a separate existence check and the creation call.
os.makedirs(output_directory, exist_ok=True)

def process_row(row):
    """Parse one address row and report whether it has a house number and a road.

    The row's ADDRESS, CITY, STATE and ZIPCODE fields are joined into a single
    string and run through ``parse_address``. The labelled components are then
    packed into a ``ParseResult`` namedtuple whose field names are the labels.

    Args:
        row: An object exposing ``ADDRESS``, ``CITY``, ``STATE`` and ``ZIPCODE``
            attributes (for example a tuple from ``DataFrame.itertuples()``).

    Returns:
        A ``(parse_result, has_street)`` pair. ``parse_result`` is the
        namedtuple of parsed components, and ``has_street`` is True only when
        both a ``house_number`` and a ``road`` label are present. When a
        ``ValueError`` is raised while parsing or while building the
        namedtuple, ``(None, False)`` is returned instead.
    """
    try:
        formatted = f"{row.ADDRESS} {row.CITY} {row.STATE} {row.ZIPCODE}"
        parsed = parse_address(formatted, language='en', country='us')

        # Each parsed item is a (value, label) pair.
        keys = [label for _, label in parsed]
        values = [value for value, _ in parsed]

        # namedtuple() raises ValueError for unusable field names, e.g. when
        # the same label appears more than once in ``keys``.
        ParseResult = namedtuple('ParseResult', keys)
    except ValueError:
        return None, False

    parse_result = ParseResult(*values)
    has_street = 'house_number' in keys and 'road' in keys
    return parse_result, has_street


# Load the MAR address points; ZIPCODE is read as a string column instead of being inferred.
mar_address_points = pd.read_csv('./data/mar_address_points.csv', header=0, low_memory=False, dtype={
    'ZIPCODE': 'string'
})

# Parsed rows, split by whether both a house number and a road were found.
street_present = []
street_missing = []

for row in mar_address_points.itertuples():
    processed_data, has_street = process_row(row)

    if has_street:
        street_present.append(processed_data._asdict())
    elif processed_data is not None:
        # Parsed successfully, but without a house number and/or a road.
        street_missing.append(processed_data._asdict())

# Write each non-empty bucket to its own CSV file in the output directory.
if street_present:
    df_street_present = pd.DataFrame(street_present)
    df_street_present.to_csv(os.path.join(output_directory, 'street_present.csv'))
else:
    print('No output to write for street_present')

if street_missing:
    df_street_missing = pd.DataFrame(street_missing)
    df_street_missing.to_csv(os.path.join(output_directory, 'street_missing.csv'))
else:
    print('No output to write for street_missing')


## Building an inverted index for near-dupe matching
This sample uses the [open-data list of address points](https://opendata.dc.gov/datasets/DCGIS::address-points/about) from the Washington, D.C. [Master Address Repository (MAR)](https://opendata.dc.gov/pages/addressing-in-dc). 



In [None]:
import pandas as pd
import pickle
from collections import defaultdict
from functools import reduce
from fuzzywuzzy import fuzz
from postal.parser import parse_address
from postal.expand import expand_address
from postal.near_dupe import near_dupe_hashes

# Load the MAR address points; ADDRESS_NUMBER and ZIPCODE are read as string columns.
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={'ADDRESS_NUMBER': 'string', 'ZIPCODE': 'string'},
)
# Keep only the rows whose ADDRESS_TYPE is 'ADDRESS'.
mar_address_points = mar_address_points.loc[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

def flat_map(f, xs, **kwargs):
    """Apply ``f`` to every item of ``xs`` and concatenate the results with ``+``.

    Args:
        f: Callable invoked as ``f(x, **kwargs)`` for each item; its results
            must support ``+`` with each other (for example lists).
        xs: Iterable of inputs.
        **kwargs: Extra keyword arguments forwarded to every call of ``f``.

    Returns:
        The concatenation of all results, or an empty list when ``xs`` yields
        no items (``reduce`` without an initial value would raise there).
    """
    results = [f(x, **kwargs) for x in xs]
    if not results:
        return []
    return reduce(lambda a, b: a + b, results)

def get_expansion_hash(expansion, exclude_house_number=False):
    """Compute near-duplicate hashes for a single expanded address string.

    The expansion is parsed with ``parse_address`` and the resulting labels and
    values are handed to ``near_dupe_hashes``.

    Args:
        expansion: One address string (as produced by ``expand_address``).
        exclude_house_number: When true, components labelled ``house_number``
            are left out before hashing.

    Returns:
        Whatever ``near_dupe_hashes`` returns for the retained components.
    """
    parsed_address = parse_address(expansion, language='en', country='us')

    # Each parsed item is a (value, label) pair; drop house numbers on request.
    retained = [
        (label, value)
        for value, label in parsed_address
        if not (exclude_house_number and label == 'house_number')
    ]
    labels = [label for label, _ in retained]
    values = [value for _, value in retained]

    return near_dupe_hashes(labels, values, languages=['en'], address_only_keys=True)

def get_hashes(address, exclude_house_number=False):
    """Return the combined near-duplicate hashes of every expansion of ``address``.

    ``address`` is expanded with ``expand_address`` and each expansion is hashed
    by ``get_expansion_hash``; the per-expansion results are concatenated.
    """
    return flat_map(
        get_expansion_hash,
        expand_address(address, languages=['en']),
        exclude_house_number=exclude_house_number,
    )

def build_inverted_index() -> defaultdict:
    """Build a hash -> address-records index over ``mar_address_points``.

    Every row is hashed twice, once with and once without its house number,
    and the row's identifying fields are stored under each resulting hash.

    Returns:
        A ``defaultdict(set)`` mapping each hash to the set of record tuples
        that produced it.
    """
    inverted = defaultdict(set)

    for record in mar_address_points.itertuples():
        address_text = f"{record.ADDRESS}, {record.CITY}, {record.STATE} {record.ZIPCODE}"
        with_number = set(get_hashes(address_text))
        without_number = set(get_hashes(address_text, exclude_house_number=True))

        entry = (
            record.MAR_ID,
            record.ADDRESS,
            str(record.ADDRESS_NUMBER),
            record.ADDRESS_NUMBER_SUFFIX,
            record.STREET_NAME,
            record.STREET_TYPE,
            record.QUADRANT,
            record.CITY,
            record.STATE,
            record.ZIPCODE,
            record.COUNTRY,
        )

        # File the record under the union of both hash sets.
        for key in with_number | without_number:
            inverted[key].add(entry)

    return inverted

def load_or_build_inverted_index() -> defaultdict:
    """Return the address index, loading the pickled copy when possible.

    The index is read from ``./data/address_index.pkl``. If opening or
    unpickling that file raises any ``Exception``, the index is rebuilt with
    ``build_inverted_index`` and written back to the same path.
    """
    index_path = './data/address_index.pkl'
    try:
        with open(index_path, 'rb') as cached_file:
            print('Loading MAR index from file...')
            loaded = pickle.load(cached_file)
            return loaded
    except Exception:
        # Any failure to load falls back to a full rebuild.
        print('Building MAR index from scratch...')
        rebuilt = build_inverted_index()
        with open(index_path, 'wb') as cache_file:
            pickle.dump(rebuilt, cache_file)
        return rebuilt

# Edited address samples; the file has 'address' and 'original' columns (read below via row.address / row.original).
df_samples = pd.read_csv("./data/address_samples.csv")

# Hash -> address-records index, loaded from the pickle cache or rebuilt when loading fails.
inverted_index = load_or_build_inverted_index()

def find_candidate_matches(address: str, min_house_number_score: int = 95):
    """Look up and print MAR addresses that may match a (possibly misspelled) address.

    The address is hashed without its house number, the hashes are looked up in
    ``inverted_index``, and the resulting MAR rows are then narrowed to those
    whose ADDRESS_NUMBER is similar to the parsed house number.

    Args:
        address: The free-text address to search for.
        min_house_number_score: A candidate is kept only when its
            ``fuzz.ratio`` similarity to the parsed house number (an integer
            on a 0-100 scale) is strictly greater than this value.

    Returns:
        None. The intermediate values and the surviving candidate rows are
        printed rather than returned.
        NOTE(review): a caller-facing return value was sketched in earlier
        commented-out code but never enabled; confirm the intended result type
        before returning the candidates.
    """
    print(f"Searching for matches for: {address}")
    parsed = parse_address(address, language='en', country='us')
    print(parsed)

    # Each parsed item is a (value, label) pair; use the first house number, if any.
    house_numbers = [value for value, label in parsed if label == 'house_number']
    house_number = house_numbers[0] if house_numbers else None
    print(f"House number: {house_number}")

    hashes = get_hashes(address, exclude_house_number=True)
    print(f"Hashes: {hashes}")

    # .get() avoids inserting an empty set into the defaultdict for every miss.
    # The first field of each indexed record is its MAR_ID.
    possible_mar_ids = {
        match[0]
        for address_hash in hashes
        for match in inverted_index.get(address_hash, ())
    }
    print("Possible MAR IDs", possible_mar_ids)

    possible_addresses = mar_address_points[mar_address_points['MAR_ID'].isin(possible_mar_ids)]
    possible_addresses = possible_addresses[['MAR_ID', 'ADDRESS', 'ADDRESS_NUMBER']]

    def house_number_score(candidate_number) -> int:
        """Similarity (0-100) between a candidate's number and the parsed house number."""
        if house_number is None or pd.isna(candidate_number):
            # Nothing to compare against: score 0, so the candidate is dropped.
            return 0
        # The parser lowercases its output, so compare case-insensitively.
        return fuzz.ratio(str(candidate_number).lower(), str(house_number).lower())

    candidate_numbers = possible_addresses['ADDRESS_NUMBER']
    scores = pd.Series(
        [house_number_score(number) for number in candidate_numbers],
        index=candidate_numbers.index,
        dtype='int64',
    )

    # Keep sufficiently similar candidates, ordered by ascending similarity
    # (the default order of the sort that was originally written here).
    ranked = scores[scores > min_house_number_score].sort_values(kind='mergesort')
    possible_addresses = possible_addresses.loc[ranked.index]
    print(f"Possible addresses: {possible_addresses}")

    return None

# Run the lookup for the first sample only and show what it returns.
for row in df_samples.head(1).itertuples():
    address, original = row.address, row.original

    candidate_matches = find_candidate_matches(address)
    print(candidate_matches)


Loading MAR index from file...
Searching for matches for: 112T UPSHUR STREE NE, WASHINGTON, DC 20017
[('112t', 'house_number'), ('upshur stree ne', 'road'), ('washington', 'city'), ('dc', 'state'), ('20017', 'postcode')]
House number: 112t
Hashes: ['sct|upshur street ne|washington', 'sct|upshur street nebraska|washington', 'sct|upshur street northeast|washington', 'sct|upshur|washington', 'spc|upshur street ne|20017', 'spc|upshur street nebraska|20017', 'spc|upshur street northeast|20017', 'spc|upshur|20017', 'sct|upshur street ne|washington', 'sct|upshur street nebraska|washington', 'sct|upshur street northeast|washington', 'sct|upshur|washington', 'spc|upshur street ne|20017', 'spc|upshur street nebraska|20017', 'spc|upshur street northeast|20017', 'spc|upshur|20017', 'sct|upshur street|washington', 'sct|upshur street|nebraska', 'sct|upshur|washington', 'sct|upshur|nebraska', 'spc|upshur street|20017', 'spc|upshur|20017', 'sct|upshur street|washington', 'sct|upshur street|nebraska', 