In [12]:
!python --version

Python 3.9.12


In [4]:
# !pip install polars==1.26.0

In [155]:
import re
import os
import zipfile
import random
import string
import polars as pl
from pathlib import Path
from typing import List, Tuple, Dict

print(f'polars version: {pl.__version__}')

project_dir = Path(os.getcwd())

polars version: 1.9.0


In [9]:
# !wget https://nationaladdressdata.s3.amazonaws.com/NAD_r18_TXT.zip

In [10]:
# zip_data_file_path = project_dir / 'NAD_r18_TXT.zip'

# with zipfile.ZipFile(zip_data_file_path, 'r') as zip_ref:
#     zip_ref.extractall(".")

In [7]:
raw_file_path = project_dir / 'TXT/NAD_r18.txt'

In [15]:
!pip list | grep polars

polars                        1.26.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [38]:
schema_overrides = {
    "OID_": pl.Int64,
    "AddNum_Pre": pl.Utf8,
    "Add_Number": pl.Int64,
    "AddNum_Suf": pl.Utf8,
    "AddNo_Full": pl.Int64,
    "St_PreMod": pl.Utf8,
    "St_PreDir": pl.Utf8,
    "St_PreTyp": pl.Utf8,
    "St_PreSep": pl.Utf8,
    "St_Name": pl.Utf8,
    "St_PosTyp": pl.Utf8,
    "St_PosDir": pl.Utf8,
    "St_PosMod": pl.Utf8,
    "StNam_Full": pl.Utf8,
    "Building": pl.Utf8,
    "Floor": pl.Utf8,
    "Unit": pl.Utf8,
    "Room": pl.Utf8,
    "Seat": pl.Utf8,
    "Addtl_Loc": pl.Utf8,
    "SubAddress": pl.Utf8,
    "LandmkName": pl.Utf8,
    "County": pl.Utf8,
    "Inc_Muni": pl.Utf8,
    "Post_City": pl.Utf8,
    "Census_Plc": pl.Utf8,
    "Uninc_Comm": pl.Utf8,
    "Nbrhd_Comm": pl.Utf8,
    "NatAmArea": pl.Utf8,
    "NatAmSub": pl.Utf8,
    "Urbnztn_PR": pl.Utf8,
    "PlaceOther": pl.Utf8,
    "PlaceNmTyp": pl.Utf8,
    "State": pl.Utf8,
    "Zip_Code": pl.Int64,
    "Plus_4": pl.Int64,
    "UUID": pl.Utf8,
    "AddAuth": pl.Int64,
    "AddrRefSys": pl.Utf8,
    "Longitude": pl.Float64,
    "Latitude": pl.Float64,
    "NatGrid": pl.Utf8,
    "Elevation": pl.Utf8,
    "Placement": pl.Utf8,
    "AddrPoint": pl.Utf8,
    "Related_ID": pl.Utf8,
    "RelateType": pl.Utf8,
    "ParcelSrc": pl.Utf8,
    "Parcel_ID": pl.Utf8,
    "AddrClass": pl.Utf8,
    "Lifecycle": pl.Utf8,
    "Effective": pl.Utf8,
    "Expire": pl.Utf8,
    "DateUpdate": pl.Utf8,
    "AnomStatus": pl.Utf8,
    "LocatnDesc": pl.Utf8,
    "Addr_Type": pl.Utf8,
    "DeliverTyp": pl.Utf8,
    "NAD_Source": pl.Utf8,
    "DataSet_ID": pl.Utf8,
    "StreetAddress": pl.Utf8,
    "SecondaryAddress": pl.Utf8,
    "CityStateZip": pl.Utf8,
    "FullAddress": pl.Utf8,
}

In [64]:
df = pl.read_csv(
    raw_file_path, 
    ignore_errors=True, 
    separator=",", 
    infer_schema_length=0, 
    quote_char=None, 
    schema_overrides=schema_overrides,
    truncate_ragged_lines=True,
    null_values=["Not stated"]
)

In [65]:
df = df.filter(pl.col('State').is_not_null())

In [66]:
valid_states = [
    'TX', 'LA', 'ME', 'WY', 'KY', 'MI', 'WA', 'VT', 'ND', 'TN',
    'IN', 'WV', 'MN', 'RI', 'DE', 'IL', 'SD', 'AK', 'MS', 'OK',
    'PA', 'WI', 'NY', 'KS', 'NM', 'AZ', 'SC', 'FL', 'NC', 'MD',
    'UT', 'NE', 'NH', 'VA', 'GA', 'AL', 'CA', 'MA', 'CT', 'AR',
    'CO', 'MT', 'DC', 'ID', 'IA', 'OH', 'MO'
]

In [67]:
df = df.filter(
    pl.col('State').is_in(valid_states)
)

In [68]:
print(f'Number of records: {len(df)}')

Number of records: 80044721


In [69]:
df = df.with_columns(
    pl.concat_str(
        [
            pl.col("AddNum_Pre"),
            pl.col("Add_Number").cast(str),
            pl.col("AddNum_Suf"),
            pl.col("St_PreMod"),
            pl.col("St_PreDir"),
            pl.col("St_PreTyp"),
            pl.col("St_Name"),
            pl.col("St_PosTyp"),
            pl.col("St_PosDir"),
            pl.col("St_PosMod")
        ],
        separator=" ",
        ignore_nulls=True
    ).alias("StreetAddress"),
    pl.concat_str(
        [
            pl.col("Building"),
            pl.col("Floor"),
            pl.col("Unit"),
            pl.col("Room"),
            pl.col("Seat"),
            pl.col("Addtl_Loc"),
            pl.col("SubAddress")
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("SecondaryAddress"),
    pl.concat_str(
        [
            pl.col("Post_City"),
            pl.col("State"),
            pl.concat_str(
                [pl.col("Zip_Code").cast(str), pl.col("Plus_4").cast(str)],
                separator="-",
                ignore_nulls=True
            )
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("CityStateZip")
)

df = df.with_columns(
    pl.concat_str(
        [
            pl.col("StreetAddress"),
            pl.col("SecondaryAddress"),
            pl.col("CityStateZip")
        ],
        separator="\n",
        ignore_nulls=True
    ).alias("FullAddress")
)

In [70]:
mean_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].mean()
median_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].median()

print(f'Mean Address Length: {mean_add_len} | Median Address Length: {median_add_len}')

Mean Address Length: 42.41529813065374 | Median Address Length: 41.0


In [82]:
state_name_abbr_tuples = [
    ("Alabama", "AL"),
    ("Alaska", "AK"),
    ("Arizona", "AZ"),
    ("Arkansas", "AR"),
    ("California", "CA"),
    ("Colorado", "CO"),
    ("Connecticut", "CT"),
    ("Delaware", "DE"),
    ("District of Columbia", "DC"),
    ("Florida", "FL"),
    ("Georgia", "GA"),
    ("Idaho", "ID"),
    ("Illinois", "IL"),
    ("Indiana", "IN"),
    ("Iowa", "IA"),
    ("Kansas", "KS"),
    ("Kentucky", "KY"),
    ("Louisiana", "LA"),
    ("Maine", "ME"),
    ("Maryland", "MD"),
    ("Massachusetts", "MA"),
    ("Michigan", "MI"),
    ("Minnesota", "MN"),
    ("Mississippi", "MS"),
    ("Missouri", "MO"),
    ("Montana", "MT"),
    ("Nebraska", "NE"),
    ("New Hampshire", "NH"),
    ("New Mexico", "NM"),
    ("New York", "NY"),
    ("North Carolina", "NC"),
    ("North Dakota", "ND"),
    ("Ohio", "OH"),
    ("Oklahoma", "OK"),
    ("Pennsylvania", "PA"),
    ("Rhode Island", "RI"),
    ("South Carolina", "SC"),
    ("South Dakota", "SD"),
    ("Tennessee", "TN"),
    ("Texas", "TX"),
    ("Utah", "UT"),
    ("Vermont", "VT"),
    ("Virginia", "VA"),
    ("Washington", "WA"),
    ("West Virginia", "WV"),
    ("Wisconsin", "WI"),
    ("Wyoming", "WY"),
]

In [79]:
address_per_state = 10_000

In [91]:
def get_state_df(df: pl.DataFrame, state_abv: str, samples: int = address_per_state) -> pl.DataFrame:
    state_df = df.filter(pl.col('State') == state_abv)

    sample_with_replacement = True if len(state_df) < samples else False

    return state_df.sample(n=samples, seed=0, with_replacement=sample_with_replacement, shuffle=True) 

def build_dataset(df: pl.DataFrame, states: List[Tuple[str, str]]) -> pl.DataFrame:
    dfs = [get_state_df(df, state_abv) for state, state_abv in states]
    return pl.concat(dfs)

In [92]:
sampled_df = build_dataset(df, state_name_abbr_tuples)

In [94]:
print(f'Number of samples: {len(sampled_df)}')

Number of samples: 470000


In [95]:
print(f'Number of unique states: {len(sampled_df["State"].unique())}')

Number of unique states: 47


In [98]:
sampled_df.write_parquet(project_dir / 'nad_sample_address.parquet', compression='gzip')

## Sequence to Sequence Dataset prep

In [164]:
project_dir = Path(os.getcwd()).parent
data_dir = project_dir / 'Data'

In [167]:
df = pl.read_parquet(data_dir / 'address_dataset.parquet')
print(f'Shape of data: {df.shape}')
print(f"States: {df['State'].unique().to_list()}")

Shape of data: (470000, 64)
States: ['AK', 'MS', 'SD', 'TX', 'PA', 'AZ', 'NC', 'AL', 'KY', 'GA', 'LA', 'MA', 'MT', 'ID', 'SC', 'OK', 'RI', 'CO', 'FL', 'VT', 'IL', 'MO', 'WV', 'NM', 'DC', 'IN', 'NY', 'NE', 'OH', 'MI', 'DE', 'CT', 'CA', 'WI', 'NH', 'MD', 'UT', 'IA', 'ME', 'KS', 'TN', 'AR', 'VA', 'WY', 'ND', 'WA', 'MN']


In [168]:
ma_df = df.filter(
    pl.col('State') == 'MA'
).sample(n=100, shuffle=True, with_replacement=False)

print(f'Number of samples: {len(ma_df)}')

Number of samples: 100


In [169]:
def build_clean_address(row: Dict) -> str:
    """Construct address from structured fields with type consistency"""
    parts = []
    
    # Street component
    street = []
    if row.get('Add_Number') is not None:
        street.append(str(int(row['Add_Number'])))
    if row.get('St_Name'):
        street.append(str(row['St_Name']).lower())
    if street:
        parts.append(' '.join(street))
    
    # Location component
    location = []
    if row.get('Post_City'):
        location.append(str(row['Post_City']).lower())
    if row.get('State'):
        location.append(str(row['State']).lower())
    if row.get('Zip_Code') is not None:
        location.append(f"{int(row['Zip_Code']):05d}"[:5])
    if location:
        parts.append(', '.join(location))
    
    return ', '.join(parts)

def add_character_noise(component: str) -> str:
    """Add character noise while maintaining string type"""
    return ''.join([
        random.choice(string.ascii_lowercase) 
        if c.isalpha() and random.random() < 0.2 
        else c
        for c in component
    ]) if component else component

def generate_noisy_address(row: Dict) -> str:
    """Generate noisy address with type-safe modifications"""
    modified = row.copy()
    
    # 50% chance to remove street number (set to None)
    if random.random() < 0.5:
        modified['Add_Number'] = None
    
    # Add noise to street name (keep as string)
    if modified.get('St_Name'):
        modified['St_Name'] = add_character_noise(str(modified['St_Name']))
    
    # 30% chance to remove city (set to None)
    if random.random() < 0.3:
        modified['Post_City'] = None
    
    # 20% chance to modify zip code (keep as integer)
    if modified.get('Zip_Code') and random.random() < 0.2:
        zip_code = int(modified['Zip_Code'])
        if 10000 <= zip_code <= 99999:
            modified['Zip_Code'] = zip_code // 10  # Truncate last digit
    
    return build_clean_address(modified)

def create_address_pairs(df: pl.DataFrame, n_noisy_varient_per_add: int = 3) -> pl.DataFrame:
    """Generate address pairs with schema consistency"""
    results = []

    for row in df.to_dicts():
        oid = row['OID_']
        state = row['State']

        # Original clean target
        clean_target = build_clean_address(row)
        
        # Add clean pair
        results.append({
            'oid': oid,
            'source': clean_target,
            'target': clean_target,
            'state': state
        })
        
        # Generate n noisy variants
        for _ in range(n_noisy_varient_per_add):
            noisy_source = generate_noisy_address(row)
            results.append({
                'oid': oid,
                'source': noisy_source,
                'target': clean_target,
                'state': state
            })
    # Ensure schema consistency
    return pl.DataFrame(results).unique()

In [170]:
ma_pairs = create_address_pairs(ma_df, n_noisy_varient_per_add=3)

In [171]:
ma_pairs.sort('oid')

oid,source,target,state
i64,str,str,str
21805693,"""147 bafvzs, ma, 01776""","""147 haynes, ma, 01776""","""MA"""
21805693,"""147 haynes, ma, 01776""","""147 haynes, ma, 01776""","""MA"""
21805693,"""haynes, ma, 01776""","""147 haynes, ma, 01776""","""MA"""
21914186,"""95 squantum, ma, 02171""","""95 squantum, ma, 02171""","""MA"""
21914186,"""95 sqrantum, ma, 02171""","""95 squantum, ma, 02171""","""MA"""
…,…,…,…
25320323,"""35 timberlane, ma, 02649""","""35 timberlane, ma, 02649""","""MA"""
25320323,"""35 twmbeutane, ma, 02649""","""35 timberlane, ma, 02649""","""MA"""
25336079,"""188 ryver, ma, 01011""","""188 river, ma, 01011""","""MA"""
25336079,"""188 river, ma, 01011""","""188 river, ma, 01011""","""MA"""
