## Ensure prerequisites are installed
1. Install libpostal
    - Instructions
        - [Mac/Linux instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux)
        - [Windows instructions](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-windows)
    - In building these examples, we used the optional [Senzing data model](https://github.com/Senzing/libpostal-data) for libpostal. Instructions for using this model are [here](https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-with-an-alternative-data-model).

2. Install Python dependencies
    ```bash
    pip3 install -r requirements.txt
    ```

## Parsing addresses for pre-filtering

## Building an inverted index for near-dupe matching
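This sample uses the [open-data list of address points](https://opendata.dc.gov/datasets/DCGIS::address-points/about) from the Washington, D.C. [Master Address Repository (MAR)](https://opendata.dc.gov/pages/addressing-in-dc). Download the dataset as a CSV file and save it as `./data/mar_address_points.csv` before running the cell below, which reads it from that path.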



In [None]:
import pandas as pd
from collections import defaultdict
from functools import reduce
from postal.parser import parse_address
from postal.expand import expand_address
from postal.near_dupe import near_dupe_hashes

def flat_map(f, xs):
    """Apply ``f`` to every item of ``xs`` and flatten the results into one list.

    ``f`` must return an iterable for each item. The results are concatenated
    in order in a single pass, so an empty ``xs`` yields ``[]`` (a
    ``reduce``-based concatenation without an initial value raises
    ``TypeError`` there) and the accumulated list is not re-copied for every
    item.

    Args:
        f: Callable mapping one item of ``xs`` to an iterable of results.
        xs: Iterable of inputs.

    Returns:
        A new list holding every element produced by ``f``, in input order.
    """
    return [item for result in map(f, xs) for item in result]

def get_expansion_hash(expansion):
    """Return the near-dupe hashes for a single expanded address string.

    The expansion is parsed as a US English address, the parse result is
    split into parallel label and value lists, and both are handed to
    ``near_dupe_hashes`` with address-only keys enabled.
    """
    components = parse_address(expansion, language='en', country='us')

    # Each parsed component carries its value at index 0 and its label at
    # index 1; near_dupe_hashes wants them as two parallel lists.
    values = []
    labels = []
    for component in components:
        values.append(component[0])
        labels.append(component[1])

    return near_dupe_hashes(
        labels,
        values,
        languages=['en'],
        address_only_keys=True,
    )

def get_hashes(address):
    """Return the near-dupe hashes of every English expansion of ``address``.

    The address is expanded with ``expand_address``; each expansion is hashed
    by ``get_expansion_hash`` and the per-expansion results are concatenated
    by ``flat_map``.
    """
    return flat_map(
        get_expansion_hash,
        expand_address(address, languages=['en']),
    )

def _join_present(parts, separator):
    """Join the non-missing, non-empty ``parts`` with ``separator``."""
    texts = (str(part) for part in parts if not pd.isna(part))
    return separator.join(text for text in texts if text)


def build_inverted_index(csv_path: str = './data/mar_address_points.csv') -> defaultdict:
    """Build an inverted index from near-dupe hash to MAR address records.

    Reads the address-points CSV, keeps only the rows whose ``ADDRESS_TYPE``
    is ``'ADDRESS'``, formats each row as a one-line address, and files the
    row's identifying fields under every hash ``get_hashes`` returns for it.

    Args:
        csv_path: Location of the address-points CSV. Defaults to the path
            this sample reads from.

    Returns:
        A ``defaultdict(set)`` mapping each hash to the set of record tuples
        ``(MAR_ID, ADDRESS, ADDRESS_NUMBER, ADDRESS_NUMBER_SUFFIX,
        STREET_NAME, STREET_TYPE, QUADRANT, CITY, STATE, ZIPCODE, COUNTRY)``
        whose formatted address produced that hash.
    """
    mar_address_points = pd.read_csv(
        csv_path,
        header=0,
        low_memory=False,
        # Keep ZIP codes as text instead of letting pandas infer a dtype.
        dtype={'ZIPCODE': 'string'},
    )
    mar_address_points = mar_address_points[mar_address_points['ADDRESS_TYPE'] == 'ADDRESS']

    index = defaultdict(set)

    for row in mar_address_points.itertuples():
        # Missing cells arrive as NaN / pd.NA. Interpolating them directly
        # would feed the literal text "nan" or "<NA>" to the address parser,
        # so absent components are dropped. A fully populated row still
        # formats as "ADDRESS, CITY, STATE ZIPCODE".
        state_and_zip = _join_present((row.STATE, row.ZIPCODE), ' ')
        formatted = _join_present((row.ADDRESS, row.CITY, state_and_zip), ', ')

        result = (
            row.MAR_ID,
            row.ADDRESS,
            row.ADDRESS_NUMBER,
            row.ADDRESS_NUMBER_SUFFIX,
            row.STREET_NAME,
            row.STREET_TYPE,
            row.QUADRANT,
            row.CITY,
            row.STATE,
            row.ZIPCODE,
            row.COUNTRY,
        )

        # Different expansions of one address can yield the same hash;
        # de-duplicate so each bucket is touched once per row.
        for address_hash in set(get_hashes(formatted)):
            index[address_hash].add(result)

    return index

# Build the index once when this cell runs. This reads the address-points CSV
# and computes near-dupe hashes for every retained row.
inverted_index = build_inverted_index()

# TODO - load some sample addresses and find candidate matches in the inverted index.