## Ensure prerequisites are installed
1. Install libpostal
    - Instructions
        - [Mac/Linux instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux)
        - [Windows instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-windows)
    - In building these examples, we used the optional [Senzing data model](https://github.com/Senzing/libpostal-data) for libpostal. Instructions for using this model are [here](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-with-an-alternative-data-model).

2. Install Python dependencies
    ```bash
    pip3 install -r requirements.txt
    ```

In [34]:
import pandas as pd
import random
import string

def random_string_edit(text, probability=1.0):
    """Apply at most one random character-level edit to ``text``.

    With chance ``probability`` one of four operations is chosen uniformly:
    ``insert`` a random character, ``delete`` a character, ``substitute`` a
    character, or ``transpose`` two adjacent characters.

    Args:
        text: The string to perturb.
        probability: Chance in ``[0, 1]`` that an edit is attempted.

    Returns:
        The edited string. ``text`` is returned unchanged when no edit is
        attempted, or when the chosen operation cannot apply (delete or
        substitute on an empty string, transpose on fewer than two
        characters). A substitution may still pick the character that is
        already there.
    """
    if random.random() >= probability:
        return text  # No edit applied

    alphabet = string.ascii_letters + string.digits + ' '
    operation = random.choice(['insert', 'delete', 'substitute', 'transpose'])

    # The position is drawn per operation: only an insertion may target the
    # slot after the last character. Sharing one index in [0, len(text)] made
    # "delete" a silent no-op and "substitute" an append at the end position.
    if operation == 'insert':
        index = random.randrange(len(text) + 1)
        return text[:index] + random.choice(alphabet) + text[index:]
    if operation == 'delete' and text:
        index = random.randrange(len(text))
        return text[:index] + text[index + 1:]
    if operation == 'substitute' and text:
        index = random.randrange(len(text))
        return text[:index] + random.choice(alphabet) + text[index + 1:]
    if operation == 'transpose' and len(text) >= 2:
        index = random.randrange(len(text) - 1)
        return text[:index] + text[index + 1] + text[index] + text[index + 2:]
    return text  # Chosen operation not applicable to this input

def transform_row(row):
    """Build a noisy one-line address string from an address row.

    ``random_string_edit`` is applied twice in sequence to the row's
    ``ADDRESS`` value; the city, state and ZIP code are appended unchanged.
    """
    first_pass = random_string_edit(str(row.ADDRESS))
    second_pass = random_string_edit(first_pass)
    return f"{second_pass}, {row.CITY}, {row.STATE} {row.ZIPCODE}"

# Load the address points, reading ZIPCODE as a string column, and keep only
# the rows whose ADDRESS_TYPE is 'ADDRESS'.
mar_address_points = pd.read_csv(
    './data/mar_address_points.csv',
    header=0,
    low_memory=False,
    dtype={'ZIPCODE': 'string'},
)
mar_address_points = mar_address_points.loc[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

# Draw 25 random rows and produce a randomly edited address string for each.
sampled = mar_address_points.sample(n=25)
transformed = sampled.apply(transform_row, axis=1)

# Write each edited address next to the ADDRESS value it was derived from.
df_output = pd.DataFrame({'address': transformed, 'original': sampled['ADDRESS']})
df_output.to_csv('./data/address_samples.csv', index=False)

## Parsing addresses for pre-filtering

In [None]:
import os
import pandas as pd
from collections import namedtuple
from postal.expand import expand_address
from postal.parser import parse_address

# Directory that receives the CSV files written by this cell.
output_directory = './output'

# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(output_directory, exist_ok=True)

def process_row(row):
    """Parse one address row with libpostal and report whether it has a street.

    Args:
        row: A record exposing ``ADDRESS``, ``CITY``, ``STATE`` and ``ZIPCODE``
            attributes (e.g. a tuple from ``DataFrame.itertuples()``).

    Returns:
        A ``(parse_result, has_street)`` pair. ``parse_result`` is a
        ``ParseResult`` namedtuple with one field per parsed label, and
        ``has_street`` is True when both ``house_number`` and ``road`` labels
        are present. When the labels cannot form a namedtuple (for example
        the same label appears twice), diagnostics are printed and
        ``(None, False)`` is returned.
    """
    formatted = f"{row.ADDRESS} {row.CITY} {row.STATE} {row.ZIPCODE}"
    parsed = parse_address(formatted, language='en', country='us')

    # parse_address yields (value, label) pairs.
    keys = [label for _, label in parsed]
    values = [value for value, _ in parsed]

    # Only the namedtuple construction is guarded: it raises ValueError for
    # duplicate or otherwise invalid field names. Keeping the try this narrow
    # guarantees `formatted` and `parsed` are bound inside the handler.
    try:
        ParseResult = namedtuple('ParseResult', keys)
        parse_result = ParseResult(*values)
    except ValueError:
        expanded = expand_address(formatted, languages=['en'])

        # Dump the input, its expansions and the raw parse for inspection.
        print(formatted)
        print(expanded)
        print(parsed)

        return None, False

    # An unconditional early `return None, False` used to sit here, which made
    # the classification below unreachable; it has been removed.
    has_street = 'house_number' in keys and 'road' in keys
    return parse_result, has_street


# Load the address points, reading ZIPCODE as a string column.
mar_address_points = pd.read_csv('./data/mar_address_points.csv', header=0, low_memory=False, dtype={
    'ZIPCODE': 'string'
})

# Parsed rows, as dicts, split by whether process_row flagged a street.
street_present = []
street_missing = []

for row in mar_address_points.itertuples():
    processed_data, has_street = process_row(row)

    # Rows for which process_row returned no parse result are skipped.
    if processed_data is None:
        continue

    if has_street:
        street_present.append(processed_data._asdict())
    else:
        street_missing.append(processed_data._asdict())

# Write each bucket to its own CSV; an empty bucket only prints a notice.
if street_present:
    df_street_present = pd.DataFrame(street_present)
    df_street_present.to_csv(os.path.join(output_directory, 'street_present.csv'))
else:
    print('No output to write for street_present')

if street_missing:
    df_street_missing = pd.DataFrame(street_missing)
    df_street_missing.to_csv(os.path.join(output_directory, 'street_missing.csv'))
else:
    print('No output to write for street_missing')


1218 H STREET NE WASHINGTON DC 20002
['1218 h street ne washington district of columbia 20002', '1218 h street ne washington dc 20002', '1218 h street nebraska washington district of columbia 20002', '1218 h street nebraska washington dc 20002', '1218 h street northeast washington district of columbia 20002', '1218 h street northeast washington dc 20002']
[('1218', 'postcode'), ('h street ne', 'road'), ('washington', 'city'), ('dc', 'state'), ('20002', 'postcode')]
4800 U STREET NW WASHINGTON DC 20007
['4800 unit street northwest washington district of columbia 20007', '4800 unit street northwest washington dc 20007', '4800 unit street nw washington district of columbia 20007', '4800 unit street nw washington dc 20007', '4800 u street northwest washington district of columbia 20007', '4800 u street northwest washington dc 20007', '4800 u street nw washington district of columbia 20007', '4800 u street nw washington dc 20007']
[('4800', 'postcode'), ('u street nw', 'road'), ('washington

## Building an inverted index for near-dupe matching
This sample uses the [open-data list of address points](https://opendata.dc.gov/datasets/DCGIS::address-points/about) from the Washington, D.C. [Master Address Repository (MAR)](https://opendata.dc.gov/pages/addressing-in-dc). 



In [None]:
import pandas as pd
from collections import defaultdict
from functools import reduce
from postal.parser import parse_address
from postal.expand import expand_address
from postal.near_dupe import near_dupe_hashes

def flat_map(f, xs):
    """Apply ``f`` to every element of ``xs`` and concatenate the results.

    Args:
        f: Callable returning an iterable (a list, in this file) per element.
        xs: Iterable of inputs.

    Returns:
        A single list holding the items of each ``f(x)`` in order. An empty
        ``xs`` yields ``[]``; the earlier ``reduce``-based version raised
        ``TypeError`` there and re-copied the accumulated list at every step.
    """
    return [item for x in xs for item in f(x)]

def get_expansion_hash(expansion):
    """Return the near-dupe hashes for a single expanded address string.

    The string is parsed as a US English address, and the resulting labels
    and values are passed to ``near_dupe_hashes`` with
    ``address_only_keys=True``.
    """
    labels = []
    values = []
    # Each parsed component is a (value, label) pair.
    for value, label in parse_address(expansion, language='en', country='us'):
        labels.append(label)
        values.append(value)
    return near_dupe_hashes(labels, values, languages=['en'], address_only_keys=True)

def get_hashes(address):
    """Collect near-dupe hashes across every English expansion of ``address``.

    Hashes are concatenated in expansion order and may contain duplicates.
    """
    return flat_map(get_expansion_hash, expand_address(address, languages=['en']))

def build_inverted_index(csv_path: str = './data/mar_address_points.csv') -> defaultdict:
    """Build a near-dupe-hash -> address-records index from the address CSV.

    Args:
        csv_path: Location of the address points CSV. Defaults to the path
            used throughout this notebook.

    Returns:
        A ``defaultdict(set)`` keyed by near-dupe hash. Each value is the set
        of record tuples ``(MAR_ID, ADDRESS, ADDRESS_NUMBER,
        ADDRESS_NUMBER_SUFFIX, STREET_NAME, STREET_TYPE, QUADRANT, CITY,
        STATE, ZIPCODE, COUNTRY)`` whose formatted address produced that hash.
    """
    mar_address_points = pd.read_csv(csv_path, header=0, low_memory=False, dtype={
        'ZIPCODE': 'string'
    })
    # Only rows whose ADDRESS_TYPE is 'ADDRESS' are indexed.
    mar_address_points = mar_address_points[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

    index = defaultdict(set)

    for row in mar_address_points.itertuples():
        formatted = f"{row.ADDRESS}, {row.CITY}, {row.STATE} {row.ZIPCODE}"
        result = (row.MAR_ID, row.ADDRESS, row.ADDRESS_NUMBER, row.ADDRESS_NUMBER_SUFFIX, row.STREET_NAME, row.STREET_TYPE, row.QUADRANT, row.CITY, row.STATE, row.ZIPCODE, row.COUNTRY)

        # De-duplicate the hashes so each bucket is touched once per row; the
        # loop variable is named to avoid shadowing the builtin `hash`.
        for address_hash in set(get_hashes(formatted)):
            index[address_hash].add(result)

    return index

# Build the hash -> address-record index when this cell runs. This reads the
# address CSV and hashes every ADDRESS-type row, and the result is kept in a
# module-level name for later lookups.
inverted_index = build_inverted_index()

# TODO - load some sample addresses and find candidate matches in the inverted index.