# Imports

In [41]:
import re
import os
import zipfile
import random
import string
import polars as pl
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict, Any
from geopy.geocoders import Nominatim

print(f'polars version: {pl.__version__}')

geolocator = Nominatim(user_agent="geocoder_llm_project", timeout=300)

project_dir = Path(os.getcwd())

polars version: 1.9.0


In [9]:
# !wget https://nationaladdressdata.s3.amazonaws.com/NAD_r18_TXT.zip

In [10]:
# zip_data_file_path = project_dir / 'NAD_r18_TXT.zip'

# with zipfile.ZipFile(zip_data_file_path, 'r') as zip_ref:
#     zip_ref.extractall(".")

In [7]:
raw_file_path = project_dir / 'TXT/NAD_r18.txt'

Dataset schema. Everything is a string here.

In [38]:
schema_overrides = {
    "OID_": pl.Int64,
    "AddNum_Pre": pl.Utf8,
    "Add_Number": pl.Int64,
    "AddNum_Suf": pl.Utf8,
    "AddNo_Full": pl.Int64,
    "St_PreMod": pl.Utf8,
    "St_PreDir": pl.Utf8,
    "St_PreTyp": pl.Utf8,
    "St_PreSep": pl.Utf8,
    "St_Name": pl.Utf8,
    "St_PosTyp": pl.Utf8,
    "St_PosDir": pl.Utf8,
    "St_PosMod": pl.Utf8,
    "StNam_Full": pl.Utf8,
    "Building": pl.Utf8,
    "Floor": pl.Utf8,
    "Unit": pl.Utf8,
    "Room": pl.Utf8,
    "Seat": pl.Utf8,
    "Addtl_Loc": pl.Utf8,
    "SubAddress": pl.Utf8,
    "LandmkName": pl.Utf8,
    "County": pl.Utf8,
    "Inc_Muni": pl.Utf8,
    "Post_City": pl.Utf8,
    "Census_Plc": pl.Utf8,
    "Uninc_Comm": pl.Utf8,
    "Nbrhd_Comm": pl.Utf8,
    "NatAmArea": pl.Utf8,
    "NatAmSub": pl.Utf8,
    "Urbnztn_PR": pl.Utf8,
    "PlaceOther": pl.Utf8,
    "PlaceNmTyp": pl.Utf8,
    "State": pl.Utf8,
    "Zip_Code": pl.Int64,
    "Plus_4": pl.Int64,
    "UUID": pl.Utf8,
    "AddAuth": pl.Int64,
    "AddrRefSys": pl.Utf8,
    "Longitude": pl.Float64,
    "Latitude": pl.Float64,
    "NatGrid": pl.Utf8,
    "Elevation": pl.Utf8,
    "Placement": pl.Utf8,
    "AddrPoint": pl.Utf8,
    "Related_ID": pl.Utf8,
    "RelateType": pl.Utf8,
    "ParcelSrc": pl.Utf8,
    "Parcel_ID": pl.Utf8,
    "AddrClass": pl.Utf8,
    "Lifecycle": pl.Utf8,
    "Effective": pl.Utf8,
    "Expire": pl.Utf8,
    "DateUpdate": pl.Utf8,
    "AnomStatus": pl.Utf8,
    "LocatnDesc": pl.Utf8,
    "Addr_Type": pl.Utf8,
    "DeliverTyp": pl.Utf8,
    "NAD_Source": pl.Utf8,
    "DataSet_ID": pl.Utf8,
    "StreetAddress": pl.Utf8,
    "SecondaryAddress": pl.Utf8,
    "CityStateZip": pl.Utf8,
    "FullAddress": pl.Utf8,
}

Given the data is huge (31GB) and build over years, there are some inconsistencies. So we ignore errors, infer schema and provide null values as well.

In [64]:
df = pl.read_csv(
    raw_file_path, 
    ignore_errors=True, 
    separator=",", 
    infer_schema_length=0, 
    quote_char=None, 
    schema_overrides=schema_overrides,
    truncate_ragged_lines=True,
    null_values=["Not stated"]
)

Filter out states which are not null

In [65]:
df = df.filter(pl.col('State').is_not_null())

These are the states available in the data. We have data from 47 states.

In [66]:
valid_states = [
    'TX', 'LA', 'ME', 'WY', 'KY', 'MI', 'WA', 'VT', 'ND', 'TN',
    'IN', 'WV', 'MN', 'RI', 'DE', 'IL', 'SD', 'AK', 'MS', 'OK',
    'PA', 'WI', 'NY', 'KS', 'NM', 'AZ', 'SC', 'FL', 'NC', 'MD',
    'UT', 'NE', 'NH', 'VA', 'GA', 'AL', 'CA', 'MA', 'CT', 'AR',
    'CO', 'MT', 'DC', 'ID', 'IA', 'OH', 'MO'
]

In [67]:
df = df.filter(
    pl.col('State').is_in(valid_states)
)

In [68]:
print(f'Number of records: {len(df)}')

Number of records: 80044721


Concatinating different columns into single strings to get street and country information, and finally build the FullAddress column.

In [7]:
df = df.with_columns(
    pl.concat_str(
        [
            pl.col("AddNum_Pre"),
            pl.col("Add_Number").cast(str),
            pl.col("AddNum_Suf"),
            pl.col("St_PreMod"),
            pl.col("St_PreDir"),
            pl.col("St_PreTyp"),
            pl.col("St_Name"),
            pl.col("St_PosTyp"),
            pl.col("St_PosDir"),
            pl.col("St_PosMod"),
        ],
        separator=" ",
        ignore_nulls=True
    ).alias("StreetAddress"),
    pl.concat_str(
        [
            pl.col("Building"),
            pl.col("Floor"),
            pl.col("Unit"),
            pl.col("Room"),
            pl.col("Seat"),
            pl.col("Addtl_Loc"),
            pl.col("SubAddress")
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("SecondaryAddress"),
    pl.concat_str(
        [
            pl.col("LandmkName"),
            pl.col("County"),
            pl.col("Inc_Muni"),
            pl.col("Post_City"),
            pl.col("State"),
            pl.concat_str(
                [pl.col("Zip_Code").cast(str), pl.col("Plus_4").cast(str)],
                separator="-",
                ignore_nulls=True
            )
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("CityStateZip")
)

df = df.with_columns(
    pl.concat_str(
        [
            pl.col("StreetAddress"),
            pl.col("SecondaryAddress"),
            pl.col("CityStateZip")
        ],
        separator="\n",
        ignore_nulls=True
    ).alias("FullAddress")
)

Some inconsistencies found in the `Inc_Muni` columns

In [None]:
['UNKN', '250240201300', '510000105900', '**PREVIOUS NAME REMOVED BY FDC. (MAYBERRY COURT)', '631332003700', 'SWWJDU 15-5']

In [None]:
'Mh Sw' = 'south west', 'Mm 100.8 I95 Sb Hwy', 'Hwy' = 'highway', 'Lti' = '', 

In [None]:
remove_chars = '?'

In [12]:
idx = 10
print(df['FullAddress'][idx])

617 MEXBORO Road

Monroe, UNINCORPORATED, FRISCO CITY, AL, 36445


Mean & Median address length: 41-42 characters.

In [70]:
mean_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].mean()
median_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].median()

print(f'Mean Address Length: {mean_add_len} | Median Address Length: {median_add_len}')

Mean Address Length: 42.41529813065374 | Median Address Length: 41.0


State full name and abbreviations

In [82]:
state_name_abbr_tuples = [
    ("Alabama", "AL"),
    ("Alaska", "AK"),
    ("Arizona", "AZ"),
    ("Arkansas", "AR"),
    ("California", "CA"),
    ("Colorado", "CO"),
    ("Connecticut", "CT"),
    ("Delaware", "DE"),
    ("District of Columbia", "DC"),
    ("Florida", "FL"),
    ("Georgia", "GA"),
    ("Idaho", "ID"),
    ("Illinois", "IL"),
    ("Indiana", "IN"),
    ("Iowa", "IA"),
    ("Kansas", "KS"),
    ("Kentucky", "KY"),
    ("Louisiana", "LA"),
    ("Maine", "ME"),
    ("Maryland", "MD"),
    ("Massachusetts", "MA"),
    ("Michigan", "MI"),
    ("Minnesota", "MN"),
    ("Mississippi", "MS"),
    ("Missouri", "MO"),
    ("Montana", "MT"),
    ("Nebraska", "NE"),
    ("New Hampshire", "NH"),
    ("New Mexico", "NM"),
    ("New York", "NY"),
    ("North Carolina", "NC"),
    ("North Dakota", "ND"),
    ("Ohio", "OH"),
    ("Oklahoma", "OK"),
    ("Pennsylvania", "PA"),
    ("Rhode Island", "RI"),
    ("South Carolina", "SC"),
    ("South Dakota", "SD"),
    ("Tennessee", "TN"),
    ("Texas", "TX"),
    ("Utah", "UT"),
    ("Vermont", "VT"),
    ("Virginia", "VA"),
    ("Washington", "WA"),
    ("West Virginia", "WV"),
    ("Wisconsin", "WI"),
    ("Wyoming", "WY"),
]

Sampling 10K addresses for each state, from the entire dataset. We do sampling without replacement as we don't want duplicates in the dataset. So we have a dataset of 470K records.

In [79]:
address_per_state = 10_000

In [91]:
def get_state_df(df: pl.DataFrame, state_abv: str, samples: int = address_per_state) -> pl.DataFrame:
    state_df = df.filter(pl.col('State') == state_abv)

    sample_with_replacement = True if len(state_df) < samples else False

    return state_df.sample(n=samples, seed=0, with_replacement=sample_with_replacement, shuffle=True) 

def build_dataset(df: pl.DataFrame, states: List[Tuple[str, str]]) -> pl.DataFrame:
    dfs = [get_state_df(df, state_abv) for state, state_abv in states]
    return pl.concat(dfs)

In [92]:
sampled_df = build_dataset(df, state_name_abbr_tuples)

In [94]:
print(f'Number of samples: {len(sampled_df)}')

Number of samples: 470000


In [95]:
print(f'Number of unique states: {len(sampled_df["State"].unique())}')

Number of unique states: 47


We store the dataset into a parquet format because it is a columnar store and compresses efficiently. 

In [98]:
sampled_df.write_parquet(project_dir / 'nad_sample_address.parquet', compression='gzip')

## Sequence to Sequence Dataset prep

In [2]:
project_dir = Path(os.getcwd()).parent
data_dir = project_dir / 'Data'
null_values = ["unkn", "unincorporated", "unknown", "null", "nan", "null", "nill", "na", "none"]

In [3]:
df = pl.read_parquet(data_dir / 'address_dataset.parquet')
print(f'Shape of data: {df.shape}')
print(f"States: {df['State'].unique().to_list()}")

Shape of data: (470000, 64)
States: ['NC', 'FL', 'OH', 'KS', 'AK', 'NE', 'SC', 'ND', 'SD', 'TN', 'NM', 'VT', 'CO', 'NY', 'LA', 'AZ', 'IL', 'PA', 'WI', 'CA', 'AL', 'MD', 'ID', 'NH', 'WY', 'MS', 'IN', 'MN', 'UT', 'AR', 'WA', 'WV', 'CT', 'MI', 'ME', 'TX', 'DE', 'IA', 'MT', 'OK', 'RI', 'MO', 'MA', 'GA', 'KY', 'DC', 'VA']


In [4]:
state_mapping = {
    'TX': 'Texas',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'WY': 'Wyoming',
    'KY': 'Kentucky',
    'MI': 'Michigan',
    'WA': 'Washington',
    'VT': 'Vermont',
    'ND': 'North Dakota',
    'TN': 'Tennessee',
    'IN': 'Indiana',
    'WV': 'West Virginia',
    'MN': 'Minnesota',
    'RI': 'Rhode Island',
    'DE': 'Delaware',
    'IL': 'Illinois',
    'SD': 'South Dakota',
    'AK': 'Alaska',
    'MS': 'Mississippi',
    'OK': 'Oklahoma',
    'PA': 'Pennsylvania',
    'WI': 'Wisconsin',
    'NY': 'New York',
    'KS': 'Kansas',
    'NM': 'New Mexico',
    'AZ': 'Arizona',
    'SC': 'South Carolina',
    'FL': 'Florida',
    'NC': 'North Carolina',
    'MD': 'Maryland',
    'UT': 'Utah',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'VA': 'Virginia',
    'GA': 'Georgia',
    'AL': 'Alabama',
    'CA': 'California',
    'MA': 'Massachusetts',
    'CT': 'Connecticut',
    'AR': 'Arkansas',
    'CO': 'Colorado',
    'MT': 'Montana',
    'DC': 'District of Columbia',
    'ID': 'Idaho',
    'IA': 'Iowa',
    'OH': 'Ohio',
    'MO': 'Missouri'
}

First lower case all the string columns, and then replace the occurances of null value strings with np.nan and the replace them with empty strings ``

In [5]:
for column in df.select(pl.col(pl.Utf8)).columns:
    df = df.with_columns(
        pl.col(column).str.to_lowercase().alias(column)
    )

In [6]:
for column in df.select(pl.col(pl.Utf8)).columns:
    df = df.with_columns(
        pl.when(pl.col(column).is_in(null_values)).then(np.nan).otherwise(pl.col(column)).alias(column)
    )

In [7]:
df = df.fill_null('')
df = df.fill_nan('')

In [8]:
for column in df.select(pl.col(pl.Utf8)).columns:
    x = df.filter(
        pl.col(column) == 'nan'
    )
    if len(x) > 0:
        print(x)

Function to build the full address and format it appropriately. I try to follow the address format from geopy's `Nominatim` class. This class connects with the openstreet maps data and provides the latitude, longitude and the full address.

In [9]:
def format_usdot_to_freeform_granular(data: dict, state_map: dict) -> str:
    # Custom null-like values to filter
    NULL_STRINGS = {"", None, "nan", "null"}

    def safe_get(key):
        val = data.get(key)
        if isinstance(val, str):
            val = val.lower()
        return None if (val in NULL_STRINGS or str(val).strip() in NULL_STRINGS) else str(val).strip()

    def safe_title(key):
        val = safe_get(key)
        return val.title() if val else None

    # House number
    number = " ".join(filter(None, [safe_get("AddNum_Pre"),
                                    safe_get("Add_Number"),
                                    safe_get("AddNum_Suf")]))

    # Street full
    street_parts = [
        safe_get("St_PreDir"),
        safe_title("St_Name"),
        safe_title("St_PosTyp"),
        safe_get("St_PosDir")
    ]
    street = " ".join(part for part in street_parts if part)

    # Unit/building details
    sub_parts = []
    if safe_get("Building"): sub_parts.append(f"Bldg {safe_get('Building')}")
    if safe_get("Floor"): sub_parts.append(f"Floor {safe_get('Floor')}")
    if safe_get("Unit"): sub_parts.append(f"Unit {safe_get('Unit')}")
    if safe_get("Room"): sub_parts.append(f"Room {safe_get('Room')}")

    sub_address = ", ".join(sub_parts)

    # Town/City
    town = safe_title("Uninc_Comm") or safe_title("Inc_Muni")

    # County
    county = safe_title("County")

    # State
    state_abbr = safe_get("State")
    state_full = state_map.get(state_abbr.upper(), state_abbr) if state_abbr else None

    # ZIP
    zip_raw = safe_get("Zip_Code")
    zip_code = zip_raw.zfill(5) if zip_raw and zip_raw.isdigit() else None

    # Compose full address
    components = [number, street]
    if sub_address:
        components.append(sub_address)
    components.extend([
        town,
        f"{county} County" if county else None,
        state_full,
        zip_code
    ])

    return ", ".join([c for c in components if c])

In [10]:
formatted_addresses = [
    format_usdot_to_freeform_granular(r, state_mapping) 
    for r in df.rows(named=True)
    ]

In [11]:
df = df.with_columns(
    pl.Series("FormattedFullAddress", formatted_addresses)
)

The mean and median address lengths have increased to 65 characters now. This new formatting makes the address strings more clear and easier to read

In [13]:
mean_add_len = df.with_columns(pl.col('FormattedFullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].mean()
median_add_len = df.with_columns(pl.col('FormattedFullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].median()

print(f'Mean Address Length: {mean_add_len} | Median Address Length: {median_add_len}')

Mean Address Length: 64.9411170212766 | Median Address Length: 64.0


In [14]:
idx = 0
df.select(['OID_', 'FullAddress', 'FormattedFullAddress', 'Latitude', 'Longitude'])[idx].to_dict(as_series=False)

{'OID_': [72099617],
 'FullAddress': ['472 south main street\n\ncamp hill, al, 36850'],
 'FormattedFullAddress': ['472, south Main Street, Camp Hill, Tallapoosa County, Alabama, 36850'],
 'Latitude': ['32.79596540137694'],
 'Longitude': ['-85.6535596907001']}

Validating the Formatted Full Address with the Full Address from Nominatim using reverse geocoding

In [15]:
# location = geolocator.geocode("13, John Daniels Place, New Haven County, Connecticut, 06511")
# location = geolocator.geocode({"postalcode": int("06511"), "country": "US"})
location = geolocator.reverse(['41.317783902424', '-72.9320178229665'])

if location is None:
    print("Location not found.")
else:
    print(location.address)
    print(location.latitude, location.longitude)

13, Daniels Place, Dixwell, New Haven, Connecticut, 06511, United States
41.3177839 -72.932018


In [15]:
idx = 100
print(df[idx]['FormattedFullAddress'].item())
print(df[idx]['Latitude'].item(), df[idx]['Longitude'].item())

4524, Old Caldwell Mill Road, Shelby County, Alabama, 35242
33.41236637208968 -86.73952124099591


In [None]:
# df.write_parquet(data_dir / 'new_formatted_addresses.parquet', compression='gzip')

Clearly the newly formatted address matches more closely with the standard open street maps address.

Now building the source-target pairs for supervised fine tuning. The source is the unnormalized / address with mistakes and target is the cleaned address. 
To generate noisy source addresses, we inject the following noise: <br>
 Noise Types Introduced

| Noise Type              | Field           | Description                                                                 |
|-------------------------|------------------|-----------------------------------------------------------------------------|
| Street Number Removal   | `Add_Number`     | 50% chance to remove the house/building number (`None`)                    |
| Character Corruption    | `St_Name`        | 20% per character: replace characters randomly (simulating typos)          |
| City Dropping           | `Post_City`      | 30% chance to remove city field                                             |
| ZIP Code Truncation     | `Zip_Code`       | 20% chance to truncate ZIP (e.g., `36078` → `3607`)                         |

# Instruction Fine Tuning Dataset

In [12]:
import json

In [None]:
ADDRESS_JSON_FORMAT = {
    "AddNum_Pre": "",
    "Add_Number": "",
    "AddNum_Suf": "",
    "St_PreDir": "",
    "St_Name": "",
    "St_PosTyp": "",
    "St_PosDir": "",
    "Building": "",
    "Floor": "",
    "Unit": "",
    "Room": "",
    "Uninc_Comm": "",
    "Inc_Muni": "",
    "County": "",
    "State": "",
    "Zip_Code": ""
}

In [24]:
# State abbreviation to full name mapping
STATE_MAP = {
    'TX': 'Texas',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'WY': 'Wyoming',
    'KY': 'Kentucky',
    'MI': 'Michigan',
    'WA': 'Washington',
    'VT': 'Vermont',
    'ND': 'North Dakota',
    'TN': 'Tennessee',
    'IN': 'Indiana',
    'WV': 'West Virginia',
    'MN': 'Minnesota',
    'RI': 'Rhode Island',
    'DE': 'Delaware',
    'IL': 'Illinois',
    'SD': 'South Dakota',
    'AK': 'Alaska',
    'MS': 'Mississippi',
    'OK': 'Oklahoma',
    'PA': 'Pennsylvania',
    'WI': 'Wisconsin',
    'NY': 'New York',
    'KS': 'Kansas',
    'NM': 'New Mexico',
    'AZ': 'Arizona',
    'SC': 'South Carolina',
    'FL': 'Florida',
    'NC': 'North Carolina',
    'MD': 'Maryland',
    'UT': 'Utah',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'VA': 'Virginia',
    'GA': 'Georgia',
    'AL': 'Alabama',
    'CA': 'California',
    'MA': 'Massachusetts',
    'CT': 'Connecticut',
    'AR': 'Arkansas',
    'CO': 'Colorado',
    'MT': 'Montana',
    'DC': 'District of Columbia',
    'ID': 'Idaho',
    'IA': 'Iowa',
    'OH': 'Ohio',
    'MO': 'Missouri'
}

### Task 1: Address Parsing

In [None]:
def extract_address_json(row: dict) -> dict:
    """Extract address fields from a row to create the address JSON."""
    result = ADDRESS_JSON_FORMAT.copy()
    
    field_standardizers = {
        "AddNum_Pre": lambda x: str(x).strip(),
        "Add_Number": lambda x: str(x).strip(),
        "AddNum_Suf": lambda x: str(x).strip(),
        "St_PreDir": lambda x: str(x).lower().strip(),  # Directionals standardized to lowercase
        "St_Name": lambda x: str(x).lower().strip(),    # Street names standardized to lowercase
        "St_PosTyp": lambda x: str(x).lower().strip(),  # Street types standardized to lowercase
        "St_PosDir": lambda x: str(x).lower().strip(),  # Directionals standardized to lowercase
        "Building": lambda x: str(x).strip(),
        "Floor": lambda x: str(x).strip(),
        "Unit": lambda x: str(x).strip(),
        "Room": lambda x: str(x).strip(),
        "Uninc_Comm": lambda x: str(x).lower().strip(), # Community names standardized to lowercase
        "Inc_Muni": lambda x: str(x).lower().strip(),   # Municipality standardized to lowercase
        "County": lambda x: str(x).lower().strip(),     # County standardized to lowercase
        "State": lambda x: str(x).lower().strip(),      # State standardized to lowercase
        "Zip_Code": lambda x: str(x).strip()
    }
    
    for key in result.keys():
        if key in row and row[key] is not None and str(row[key]).strip():
            # Apply the appropriate standardization function
            if key in field_standardizers:
                result[key] = field_standardizers[key](row[key])
            else:
                result[key] = str(row[key])
    
    return result

def create_address_parsing_task(row: dict) -> tuple:
    """Create instruction and ground truth for address parsing task."""
    address_json = extract_address_json(row)
    
    instruction = "Parse the following address into a structured JSON with these fields: AddNum_Pre, Add_Number, AddNum_Suf, St_PreDir, St_Name, St_PosTyp, St_PosDir, Building, Floor, Unit, Room, Uninc_Comm, Inc_Muni, County, State, Zip_Code."
    
    address_string = row.get('FormattedFullAddress')

    task_instruction = f"{instruction}\nAddress: {address_string}"
    ground_truth = json.dumps(address_json, indent=2)
    
    return task_instruction, ground_truth

def build_task1_instructions(df: pl.DataFrame) -> pl.DataFrame:
    """
    Build an instruction fine-tuning dataset for address parsing (Task 1).
    
    Args:
        df: Input polars DataFrame with address data
    
    Returns:
        A polars DataFrame with instruction fine-tuning tasks
    """
    # Convert polars DataFrame to list of dictionaries for easier processing
    rows = df.to_dicts()
    
    # Create lists to store the results
    instructions = []
    groundtruths = []
    
    for row in rows:
        # Address Parsing Task
        instruction, groundtruth = create_address_parsing_task(row)
        
        # Append to result lists
        instructions.append(instruction)
        groundtruths.append(groundtruth)

    df = df.with_columns(
        pl.Series(name='task1_instruction', values=instructions),
        pl.Series(name='task1_groundtruth', values=groundtruths)
    )
    
    return df

In [None]:
df_task1 = build_task1_instructions(df)

In [21]:
def print_instruction(df: pl.DataFrame, idx: int):
    record = df[idx]
    print(record["task1_instruction"].item())
    print(record["task1_groundtruth"].item())

In [23]:
print_instruction(df_task1, 3)

Parse the following address into a structured JSON with these fields: AddNum_Pre, Add_Number, AddNum_Suf, St_PreDir, St_Name, St_PosTyp, St_PosDir, Building, Floor, Unit, Room, Uninc_Comm, Inc_Muni, County, State, Zip_Code.
Address: 5551, Wares Ferry Road, Montgomery County, Alabama, 36117
{
  "AddNum_Pre": "",
  "Add_Number": "5551",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "wares ferry",
  "St_PosTyp": "road",
  "St_PosDir": "",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "",
  "Inc_Muni": "nan",
  "County": "montgomery",
  "State": "al",
  "Zip_Code": "36117"
}


### Task 2: Address Entity Prediction

In [None]:
# def add_character_noise(component: str) -> str:
#     """Add character noise while maintaining string type"""
#     return ''.join([
#         random.choice(string.ascii_lowercase) 
#         if c.isalpha() and random.random() < 0.2 
#         else c
#         for c in component
#     ]) if component else component

# def generate_noisy_address(row: Dict, state_map: Dict = STATE_MAP) -> str:
#     """Generate noisy address with type-safe modifications"""
#     modified = row.copy()
    
#     # 50% chance to remove street number (set to None)
#     if random.random() < 0.5:
#         modified['Add_Number'] = None
    
#     # Add noise to street name (keep as string)
#     if modified.get('St_Name'):
#         modified['St_Name'] = add_character_noise(str(modified['St_Name']))
    
#     # 30% chance to remove city (set to None)
#     if random.random() < 0.3:
#         modified['Post_City'] = None
    
#     # 20% chance to modify zip code (keep as integer)
#     if modified.get('Zip_Code') and random.random() < 0.2:
#         zip_code = int(modified['Zip_Code'])
#         if 10000 <= zip_code <= 99999:
#             modified['Zip_Code'] = zip_code // 10  # Truncate last digit
    
#     return format_usdot_to_freeform_granular(modified, state_map)

# def create_address_pairs(df: pl.DataFrame, n_noisy_varient_per_add: int = 3) -> pl.DataFrame:
#     """Generate address pairs with schema consistency"""
#     results = []

#     for row in df.to_dicts():
#         oid = row['OID_']
#         state = row['State']

#         # Original clean target
#         clean_target = row['FormattedFullAddress']
        
#         # Add clean pair
#         results.append({
#             'oid': oid,
#             'source': clean_target,
#             'target': clean_target,
#             'state': state
#         })
        
#         # Generate n noisy variants
#         for _ in range(n_noisy_varient_per_add):
#             noisy_source = generate_noisy_address(row)
#             results.append({
#                 'oid': oid,
#                 'source': noisy_source,
#                 'target': clean_target,
#                 'state': state
#             })
#     # Ensure schema consistency
#     return pl.DataFrame(results).unique()

In [None]:
# noisy_df = create_address_pairs(df_task1, n_noisy_varient_per_add=5)

In [None]:
# df_task1.join(noisy_df, left_on=['OID_'], right_on=['oid', ], how='inner')

In [74]:
def inject_noise_into_json(cleaned_json_str: str, noise_level: str = "medium") -> str:
    """
    Inject realistic noise into a well-formatted Address JSON string.
    Combines structured field-specific transformations with random noise.
    
    Args:
        cleaned_json_str: A string containing well-formatted Address JSON
        noise_level: Level of noise to inject ("low", "medium", "high", "extreme")
        
    Returns:
        A string containing the corrupted Address JSON
    """
    try:
        # Parse the JSON string
        if isinstance(cleaned_json_str, str):
            cleaned_json = json.loads(cleaned_json_str)
        else:
            # If it's already a dict, use it directly
            cleaned_json = cleaned_json_str
            
        # Make a copy to modify
        noisy_json = cleaned_json.copy()
        
        # Adjust noise parameters based on noise level
        if noise_level == "low":
            error_rate = 0.15
            max_fields_to_modify = 2
            field_weights = {'typo': 0.4, 'empty': 0.1, 'alternate_form': 0.3, 'random_noise': 0.2}
            empty_field_probability = 0.1
            special_mod_probability = 0.05
        elif noise_level == "medium":
            error_rate = 0.25
            max_fields_to_modify = 4
            field_weights = {'typo': 0.3, 'empty': 0.2, 'alternate_form': 0.2, 'random_noise': 0.3}
            empty_field_probability = 0.2
            special_mod_probability = 0.15
        elif noise_level == "high":
            error_rate = 0.4
            max_fields_to_modify = 6
            field_weights = {'typo': 0.25, 'empty': 0.25, 'alternate_form': 0.15, 'random_noise': 0.35}
            empty_field_probability = 0.3
            special_mod_probability = 0.25
        else:  # extreme
            error_rate = 0.6
            max_fields_to_modify = 8
            field_weights = {'typo': 0.2, 'empty': 0.3, 'alternate_form': 0.1, 'random_noise': 0.4}
            empty_field_probability = 0.4
            special_mod_probability = 0.4
        
        # Helper functions for adding different types of noise
        def add_typo(text: str) -> str:
            """Add realistic typos to text strings with higher error rate"""
            if not text or len(text) < 2:
                return text
                
            result = list(text)
            
            # Multiple errors possible in longer text
            num_errors = 1
            if len(text) > 5:
                num_errors = random.randint(1, min(len(text) // 3, 3))
            
            for _ in range(num_errors):
                # Determine error type based on text length
                error_types = []
                
                if len(text) >= 2:
                    error_types.extend(['swap', 'delete', 'substitute', 'insert_random'])
                if len(text) >= 4:
                    error_types.extend(['case', 'multiple_insert'])
                if len(text) >= 6:
                    error_types.extend(['double_letter', 'remove_multiple'])
                    
                error_type = random.choice(error_types)
                
                if error_type == 'swap' and len(text) >= 2:
                    # Swap two adjacent characters
                    pos = random.randint(0, len(result) - 2)
                    if pos < len(result) - 1:  # Ensure we're not at the end
                        result[pos], result[pos + 1] = result[pos + 1], result[pos]
                    
                elif error_type == 'delete':
                    # Delete a character
                    if len(result) > 1:  # Ensure we don't delete all characters
                        pos = random.randint(0, len(result) - 1)
                        result[pos] = ''
                    
                elif error_type == 'substitute':
                    # Substitute a character with a similar one
                    adjacent_keys = {
                        'a': 'sqzw', 'b': 'vghn', 'c': 'xdfv', 'd': 'serfcx', 'e': 'wrsdf',
                        'f': 'drtgv', 'g': 'ftyhv', 'h': 'gyujbn', 'i': 'ujko', 'j': 'huiknm',
                        'k': 'jiolm', 'l': 'kop', 'm': 'njk', 'n': 'bhjm', 'o': 'iklp',
                        'p': 'ol', 'q': 'wa', 'r': 'edft', 's': 'awedxz', 't': 'rfgy',
                        'u': 'yhji', 'v': 'cfgb', 'w': 'qase', 'x': 'zsdc', 'y': 'tghu',
                        'z': 'asx', '0': '9po', '1': '2q', '2': '1w3', '3': '2e4', '4': '3r5',
                        '5': '4t6', '6': '5y7', '7': '6u8', '8': '7i9', '9': '8o0'
                    }
                    
                    if result:  # Ensure we have characters to work with
                        pos = random.randint(0, len(result) - 1)
                        if pos < len(result):  # Safety check
                            char = result[pos].lower()
                            if char in adjacent_keys:
                                replacement = random.choice(adjacent_keys[char])
                                # Match case if original was uppercase
                                if result[pos].isupper():
                                    replacement = replacement.upper()
                                result[pos] = replacement
                        
                elif error_type == 'case':
                    # Change case of one or more characters
                    num_to_change = random.randint(1, min(len(result), 3))
                    for _ in range(num_to_change):
                        if result:  # Ensure we have characters
                            pos = random.randint(0, len(result) - 1)
                            if pos < len(result) and result[pos].isalpha():  # Safety check
                                if result[pos].islower():
                                    result[pos] = result[pos].upper()
                                else:
                                    result[pos] = result[pos].lower()
                        
                elif error_type == 'insert_random':
                    # Insert random characters
                    if len(result) > 0:  # Ensure we have a string to work with
                        pos = random.randint(0, len(result))
                        random_char = random.choice(string.ascii_lowercase + string.digits)
                        if pos <= len(result):  # Safety check
                            result.insert(pos, random_char)
                
                elif error_type == 'multiple_insert':
                    # Insert multiple random characters
                    num_inserts = random.randint(1, 3)
                    for _ in range(num_inserts):
                        if len(result) > 0:  # Safety check
                            pos = random.randint(0, len(result))
                            random_char = random.choice(string.ascii_lowercase + string.digits)
                            if pos <= len(result):  # Safety check
                                result.insert(pos, random_char)
                
                elif error_type == 'double_letter':
                    # Double a letter (common typo)
                    if result:  # Ensure we have characters
                        pos = random.randint(0, len(result) - 1)
                        if pos < len(result) and result[pos].isalpha():  # Safety check
                            result.insert(pos + 1, result[pos])
                
                elif error_type == 'remove_multiple':
                    # Remove multiple characters in a row
                    if len(result) > 3:  # Ensure we have enough characters
                        start_pos = random.randint(0, len(result) - 3)
                        num_to_remove = random.randint(1, min(len(result) - start_pos - 1, 3))
                        for i in range(num_to_remove):
                            if start_pos < len(result):  # Safety check
                                result[start_pos] = ''
                                # Don't increment start_pos since we're removing items in place
            
            return ''.join(result)
            
        def add_character_noise(text: str) -> str:
            """Add character noise while maintaining string type (with higher error rate)"""
            if not text:
                return text
                
            result = []
            for c in text:
                # Higher chance of inserting random characters
                if c.isalpha() and random.random() < error_rate:
                    result.append(random.choice(string.ascii_lowercase + string.digits))
                else:
                    result.append(c)
                    
                # Chance to insert an extra character after
                if random.random() < error_rate / 2:
                    result.append(random.choice(string.ascii_lowercase + string.digits))
                    
            # Chance to remove random characters (up to 20%)
            if len(result) > 5 and random.random() < error_rate:
                num_to_remove = random.randint(1, max(1, int(len(result) * 0.2)))
                indices_to_remove = random.sample(range(len(result)), min(num_to_remove, len(result)))
                for i in sorted(indices_to_remove, reverse=True):
                    if 0 <= i < len(result):  # Safety check
                        result.pop(i)
                        
            return ''.join(result)
        
        def field_specific_noise(field: str, value: str) -> str:
            """Apply field-specific transformations and abbreviations"""
            if not value:
                return value
                
            # Sometimes use original character noise approach with higher probability
            if random.random() < error_rate:
                return add_character_noise(value)
                
            # Field-specific common errors/variations
            if field == "St_PosTyp":
                # Street type variations
                mapping = {
                    'street': ['st', 'str', 'strt', 'stret', 'streeet', 'strret'],
                    'avenue': ['ave', 'av', 'aven', 'avenu', 'avnue', 'avne'],
                    'boulevard': ['blvd', 'boul', 'blv', 'bld', 'boulevrd', 'boulevd'],
                    'road': ['rd', 'rod', 'rad', 'rroad', 'roadd'],
                    'lane': ['ln', 'la', 'lne', 'lanne'],
                    'drive': ['dr', 'drv', 'dirve', 'driv', 'drvie'],
                    'circle': ['cir', 'circ', 'crcl', 'cicle'],
                    'court': ['ct', 'crt', 'cort', 'courrt'],
                    'place': ['pl', 'plc', 'plce', 'plac']
                }
                
                value_lower = value.lower()
                for full, variants in mapping.items():
                    if value_lower == full:
                        return random.choice(variants)
                    elif value_lower in variants:
                        # Randomly expand abbreviation
                        if random.random() < 0.4:
                            return full
                        else:
                            return value
            
            elif field in ["St_PreDir", "St_PosDir"]:
                # Directional variations
                mapping = {
                    'north': ['n', 'no', 'nrth', 'nth', 'nort', 'norht'],
                    'south': ['s', 'so', 'sth', 'sout', 'souht', 'souh'],
                    'east': ['e', 'ea', 'est', 'eas', 'esat', 'esst'],
                    'west': ['w', 'wt', 'wst', 'wes', 'vest', 'wesst'],
                    'northwest': ['nw', 'n w', 'nortwest', 'north west', 'nor west', 'northwst'],
                    'northeast': ['ne', 'n e', 'northest', 'north east', 'nrth east', 'noreast'],
                    'southwest': ['sw', 's w', 'soutwest', 'south west', 'sout west', 'southwst'],
                    'southeast': ['se', 's e', 'southest', 'south east', 'sout east', 'sutheast']
                }
                
                value_lower = value.lower()
                for full, variants in mapping.items():
                    if value_lower == full:
                        return random.choice(variants)
                    elif value_lower in variants:
                        # Randomly expand abbreviation
                        if random.random() < 0.4:
                            return full
                        else:
                            return value
            
            elif field == "State":
                # State abbreviation/full name variations with typos
                state_mapping = {
                    'al': ['alabama', 'alabamma', 'alambama', 'alabma'],
                    'ak': ['alaska', 'alasca', 'alaskaa', 'alaka'],
                    'az': ['arizona', 'arizonna', 'arizonaa', 'arizon'],
                    'ar': ['arkansas', 'arkansass', 'arkansa', 'arkanss'],
                    'ca': ['california', 'califrnia', 'califronia', 'californa'],
                    'co': ['colorado', 'colordo', 'colorada', 'colorodo'],
                    'ct': ['connecticut', 'conneticut', 'conecticut', 'conneticut'],
                    'de': ['delaware', 'deleware', 'delware', 'delawar'],
                    'fl': ['florida', 'flrida', 'flordia', 'floria'],
                    'ga': ['georgia', 'gergia', 'geogia', 'georga'],
                    'hi': ['hawaii', 'hawai', 'hawii', 'hawai'],
                    'id': ['idaho', 'idao', 'idahoo', 'ideho'],
                    'il': ['illinois', 'illnois', 'illinoi', 'illinos'],
                    'in': ['indiana', 'indana', 'indianna', 'indina'],
                    'ia': ['iowa', 'ioa', 'iowaa', 'iwa'],
                    'ks': ['kansas', 'knsas', 'kansa', 'kanss'],
                    'ky': ['kentucky', 'kentuky', 'kentuckyy', 'kentuckey'],
                    'la': ['louisiana', 'lousiana', 'louisian', 'louisana'],
                    'me': ['maine', 'main', 'manie', 'mane'],
                    'md': ['maryland', 'mayland', 'marland', 'mariland'],
                    'ma': ['massachusetts', 'massachusets', 'massachsetts', 'massachusets'],
                    'mi': ['michigan', 'michign', 'michgan', 'michigann'],
                    'mn': ['minnesota', 'minesota', 'minnesot', 'minnisota'],
                    'ms': ['mississippi', 'missisippi', 'mississipi', 'missisipi'],
                    'mo': ['missouri', 'misouri', 'missori', 'missoury'],
                    'mt': ['montana', 'montna', 'montanna', 'montaa'],
                    'ne': ['nebraska', 'nebraka', 'nebaska', 'nebraksa'],
                    'nv': ['nevada', 'nevda', 'nevaa', 'neveda'],
                    'nh': ['new hampshire', 'new hamshire', 'new hampshir', 'new hampsher'],
                    'nj': ['new jersey', 'new jersery', 'new jrsey', 'new jersy'],
                    'nm': ['new mexico', 'new mexio', 'new mexcio', 'new mexco'],
                    'ny': ['new york', 'new yrk', 'new yorke', 'newyork'],
                    'nc': ['north carolina', 'north carolia', 'north carlina', 'north carolna'],
                    'nd': ['north dakota', 'north dakta', 'north dakoa', 'north dakotta'],
                    'oh': ['ohio', 'ohi', 'ohioo', 'ohio'],
                    'ok': ['oklahoma', 'oklahma', 'oklahoa', 'oklahome'],
                    'or': ['oregon', 'orgon', 'oregan', 'oreegon'],
                    'pa': ['pennsylvania', 'pennsylvnia', 'pensylvania', 'pennsylvana'],
                    'ri': ['rhode island', 'rhod island', 'rhode islnd', 'rode island'],
                    'sc': ['south carolina', 'south carolia', 'south carlina', 'south carolna'],
                    'sd': ['south dakota', 'south dakta', 'south dakoa', 'south dakotta'],
                    'tn': ['tennessee', 'tennesee', 'tennese', 'tennessse'],
                    'tx': ['texas', 'texass', 'texs', 'texxas'],
                    'ut': ['utah', 'utahh', 'uta', 'utha'],
                    'vt': ['vermont', 'vermnt', 'vermon', 'vermot'],
                    'va': ['virginia', 'virgina', 'virgnia', 'virginiaa'],
                    'wa': ['washington', 'washingtn', 'washinton', 'washingtton'],
                    'wv': ['west virginia', 'west virgina', 'west virgnia', 'west virginiaa'],
                    'wi': ['wisconsin', 'wisconsn', 'wisconsinn', 'wiscosin'],
                    'wy': ['wyoming', 'wyomng', 'wyomin', 'woming']
                }
                
                value_lower = value.lower()
                
                # If it's an abbreviation, possibly expand it (with typos)
                if value_lower in state_mapping:
                    if random.random() < 0.6:  # Higher chance of expansion
                        full_options = state_mapping[value_lower]
                        return random.choice(full_options)
                
                # If it's a full name, possibly abbreviate it or replace with another typo version
                for abbr, full_options in state_mapping.items():
                    if value_lower in full_options:
                        if random.random() < 0.4:
                            return abbr
                        else:
                            # Use a different typo variation
                            other_options = [opt for opt in full_options if opt != value_lower]
                            if other_options:
                                return random.choice(other_options)
            
            # Default: apply standard typo with higher error rate
            return add_typo(value)
        
        # Add special modifications based on field type with higher chance of changes
        def field_specific_modifications():
            # Handle Add_Number specially - might remove completely or modify
            if "Add_Number" in noisy_json and noisy_json["Add_Number"]:
                mod_choice = random.choices(
                    ['remove', 'truncate', 'add_digit', 'transpose'], 
                    weights=[0.3, 0.2, 0.2, 0.3], 
                    k=1
                )[0]
                
                if mod_choice == 'remove':
                    noisy_json["Add_Number"] = ""
                elif mod_choice == 'truncate' and len(str(noisy_json["Add_Number"])) > 1:
                    # Remove first or last digit
                    if random.random() < 0.5:
                        noisy_json["Add_Number"] = str(noisy_json["Add_Number"])[1:]
                    else:
                        noisy_json["Add_Number"] = str(noisy_json["Add_Number"])[:-1]
                elif mod_choice == 'add_digit':
                    # Add a random digit at beginning or end
                    digit = str(random.randint(0, 9))
                    if random.random() < 0.5:
                        noisy_json["Add_Number"] = digit + str(noisy_json["Add_Number"])
                    else:
                        noisy_json["Add_Number"] = str(noisy_json["Add_Number"]) + digit
                elif mod_choice == 'transpose' and len(str(noisy_json["Add_Number"])) > 1:
                    # Transpose digits
                    num_str = str(noisy_json["Add_Number"])
                    pos = random.randint(0, len(num_str) - 2)
                    noisy_json["Add_Number"] = num_str[:pos] + num_str[pos+1] + num_str[pos] + num_str[pos+2:]
                
            # Handle Zip_Code specially - might remove, truncate, or modify
            if "Zip_Code" in noisy_json and noisy_json["Zip_Code"]:
                mod_choice = random.choices(
                    ['remove', 'truncate', 'add_digit', 'transpose'], 
                    weights=[0.2, 0.3, 0.2, 0.3], 
                    k=1
                )[0]
                
                if mod_choice == 'remove':
                    noisy_json["Zip_Code"] = ""
                elif mod_choice == 'truncate' and len(str(noisy_json["Zip_Code"])) > 1:
                    # Remove last digit(s)
                    num_to_remove = random.randint(1, min(2, len(str(noisy_json["Zip_Code"])) - 1))
                    noisy_json["Zip_Code"] = str(noisy_json["Zip_Code"])[:-num_to_remove]
                elif mod_choice == 'add_digit':
                    # Add a random digit at beginning or end
                    digit = str(random.randint(0, 9))
                    if random.random() < 0.3:  # less likely at beginning
                        noisy_json["Zip_Code"] = digit + str(noisy_json["Zip_Code"])
                    else:
                        noisy_json["Zip_Code"] = str(noisy_json["Zip_Code"]) + digit
                elif mod_choice == 'transpose' and len(str(noisy_json["Zip_Code"])) > 1:
                    # Transpose digits
                    num_str = str(noisy_json["Zip_Code"])
                    pos = random.randint(0, len(num_str) - 2)
                    noisy_json["Zip_Code"] = num_str[:pos] + num_str[pos+1] + num_str[pos] + num_str[pos+2:]
            
            # Possible random structural changes with higher probability
            if random.random() < special_mod_probability:
                struct_mod = random.choice([
                    'merge_fields', 'swap_fields', 'duplicate_content', 'add_suffix', 'add_prefix'
                ])
                
                if struct_mod == 'merge_fields':
                    # Merge content of two fields
                    field_pairs = [
                        ("St_Name", "St_PosTyp"),
                        ("AddNum_Pre", "Add_Number"),
                        ("Add_Number", "St_Name"),
                        ("St_PreDir", "St_Name"),
                        ("Inc_Muni", "County")
                    ]
                    valid_pairs = [(f1, f2) for f1, f2 in field_pairs 
                                  if noisy_json.get(f1) and noisy_json.get(f2)]
                    
                    if valid_pairs:
                        f1, f2 = random.choice(valid_pairs)
                        # Merge with or without space
                        separator = "" if random.random() < 0.5 else " "
                        noisy_json[f1] = str(noisy_json[f1]) + separator + str(noisy_json[f2])
                        noisy_json[f2] = ""
                
                elif struct_mod == 'swap_fields':
                    # Swap content of two fields
                    field_pairs = [
                        ("St_Name", "Inc_Muni"),
                        ("County", "Inc_Muni"),
                        ("St_PreDir", "St_PosDir"),
                        ("AddNum_Pre", "AddNum_Suf")
                    ]
                    valid_pairs = [(f1, f2) for f1, f2 in field_pairs 
                                  if noisy_json.get(f1) and noisy_json.get(f2)]
                    
                    if valid_pairs:
                        f1, f2 = random.choice(valid_pairs)
                        noisy_json[f1], noisy_json[f2] = noisy_json[f2], noisy_json[f1]
                
                elif struct_mod == 'duplicate_content':
                    # Duplicate content from one field to another
                    source_fields = [f for f in noisy_json.keys() if noisy_json.get(f)]
                    target_fields = [f for f in noisy_json.keys() if not noisy_json.get(f)]
                    
                    if source_fields and target_fields:
                        source = random.choice(source_fields)
                        target = random.choice(target_fields)
                        noisy_json[target] = noisy_json[source]
                
                elif struct_mod == 'add_suffix' and "St_Name" in noisy_json and noisy_json["St_Name"]:
                    # Add random suffix to street name
                    suffixes = ["st", "street", "ave", "avenue", "rd", "road"]
                    noisy_json["St_Name"] += " " + random.choice(suffixes)
                
                elif struct_mod == 'add_prefix' and "St_Name" in noisy_json and noisy_json["St_Name"]:
                    # Add random prefix to street name
                    prefixes = ["n", "s", "e", "w", "north", "south", "east", "west"]
                    noisy_json["St_Name"] = random.choice(prefixes) + " " + noisy_json["St_Name"]
            
            # Random deletion of additional fields
            num_extra_fields_to_delete = random.randint(0, 2)
            potential_fields_to_delete = ["Inc_Muni", "County", "Room", "Unit", "Building", "Floor", "St_PosDir"]
            fields_with_values = [f for f in potential_fields_to_delete if f in noisy_json and noisy_json[f]]
            
            if fields_with_values:
                fields_to_delete = random.sample(
                    fields_with_values, 
                    min(num_extra_fields_to_delete, len(fields_with_values))
                )
                for field in fields_to_delete:
                    noisy_json[field] = ""
        
        # Select random fields to modify (with more fields based on noise level)
        all_fields = list(noisy_json.keys())
        non_empty_fields = [field for field in all_fields if noisy_json[field]]
        
        # Only corrupt fields that have values
        if non_empty_fields:
            # Determine how many fields to modify (based on noise level)
            num_fields_to_modify = random.randint(1, min(max_fields_to_modify, len(non_empty_fields)))
            fields_to_modify = random.sample(non_empty_fields, num_fields_to_modify)
            
            for field in fields_to_modify:
                field_value = noisy_json[field]
                if not field_value:
                    continue
                    
                # Choose modification type with weights based on noise level
                mod_type = random.choices(
                    ['typo', 'empty', 'alternate_form', 'random_noise'], 
                    weights=[
                        field_weights['typo'], 
                        field_weights['empty'], 
                        field_weights['alternate_form'], 
                        field_weights['random_noise']
                    ], 
                    k=1
                )[0]
                
                if mod_type == 'typo':
                    # Add a typo to the field
                    noisy_json[field] = field_specific_noise(field, field_value)
                elif mod_type == 'empty':
                    # Empty the field
                    noisy_json[field] = ""
                elif mod_type == 'alternate_form':
                    # Apply field-specific transformation
                    noisy_json[field] = field_specific_noise(field, field_value)
                elif mod_type == 'random_noise':
                    # Apply original random character insertion approach
                    noisy_json[field] = add_character_noise(field_value)
        
        # Apply additional field-specific modifications
        field_specific_modifications()
        
        # Return as formatted JSON string
        return json.dumps(noisy_json, indent=2)
        
    except Exception as e:
        # If any error occurs, return the original string
        print(f"Error injecting noise: {str(e)}")
        return cleaned_json_str

In [92]:
def create_task2_instruction_dataset(df, n_noise_variants_per_address: int = 3):
    """
    Create Task 2 (entity correction) instruction dataset with multiple noise variants.
    
    Args:
        df: A DataFrame containing task1_groundtruth column with clean Address JSON
        n_noise_variants_per_address: Number of noisy variants to create per clean address
        
    Returns:
        DataFrame with expanded rows containing task2_instruction and task2_groundtruth columns
    """
    # Convert the input DataFrame to a list of dictionaries for easier manipulation
    rows_as_dicts = df.to_dicts()
    all_rows = []
    
    for row_idx, original_row_dict in enumerate(rows_as_dicts):
        # Get the clean JSON from Task 1
        clean_json = original_row_dict['task1_groundtruth']
        
        # Create multiple noisy variants
        for variant_idx in range(n_noise_variants_per_address):
            # Create a copy of the original row
            new_row = original_row_dict.copy()
            
            # Determine noise level - mix of different noise levels
            if variant_idx == 0:
                noise_level = "medium"  # First variant is medium noise
            elif variant_idx == n_noise_variants_per_address - 1:
                noise_level = "extreme"  # Last variant is extreme noise
            else:
                # Other variants are random levels
                noise_level = random.choice(["low", "medium", "high", "extreme"])
            
            # Inject noise to create corrupted JSON
            noisy_json = inject_noise_into_json(clean_json, noise_level=noise_level)
            
            # Create the instruction
            task2_instruction = f"Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON\nAddress JSON: {noisy_json}"
            
            # The ground truth is the original clean JSON
            task2_groundtruth = clean_json
            
            # Add task2 columns
            new_row['task2_instruction'] = task2_instruction
            new_row['task2_groundtruth'] = task2_groundtruth
            new_row['noise_level'] = noise_level
            new_row['variant_idx'] = variant_idx
            
            # Add to results
            all_rows.append(new_row)
    
    # Ensure consistent data types by explicitly defining the schema
    # First get a list of all columns in the new rows
    column_names = list(all_rows[0].keys())
    
    # Create a schema to help with data type consistency
    schema = {}
    for col in column_names:
        # Sample the first value to determine type
        sample_val = all_rows[0][col]
        
        if isinstance(sample_val, int):
            schema[col] = pl.Int64
        elif isinstance(sample_val, float):
            schema[col] = pl.Float64
        else:
            schema[col] = pl.Utf8  # Default to string for everything else
    
    result_df = pl.DataFrame(all_rows, schema=schema)
    
    return result_df

In [93]:
df_task2 = create_task2_instruction_dataset(df_task1, n_noise_variants_per_address=3)
df_task2 = df_task2.unique()
print(f'Number of samples: {len(df_task2)}')

Number of samples: 1404780


In [94]:
record = df_task2[3000]
print(f"Correct Address: {record['FormattedFullAddress'].item()}")
print(f"Input:\n{record['task2_instruction'].item()}")
print(f"Target:\n{record['task2_groundtruth'].item()}")

Correct Address: 2807, New Hope Road, Marianna, Jackson County, Florida, 32448
Input:
Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "2870",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "nn8 hofp3e",
  "St_PosTyp": "",
  "St_PosDir": "",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "mar4nnh",
  "Inc_Muni": "",
  "County": "",
  "State": "fl",
  "Zip_Code": "324"
}
Target:
{
  "AddNum_Pre": "",
  "Add_Number": "2807",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "new hope",
  "St_PosTyp": "road",
  "St_PosDir": "",
  "

In [95]:
df_task2.select(['OID_', 'FormattedFullAddress', 'task1_instruction', 'task1_groundtruth', 'task2_instruction', 'task2_groundtruth']).schema

Schema([('OID_', Int64),
        ('FormattedFullAddress', String),
        ('task1_instruction', String),
        ('task1_groundtruth', String),
        ('task2_instruction', String),
        ('task2_groundtruth', String)])

In [96]:
df_task2.write_parquet(data_dir / 'address_with_instructions.parquet', compression='gzip')