In [1]:
# Dataset Preparation for Parquet
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import multiprocessing as mp
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


Load only the required columns from the Parquet file that contains the address details.

In [2]:
# Method 1: restrict the read to the columns listed below to keep memory usage down.
columns_to_load = [
    'OID_',
    'State',
    'Zip_Code',
    'StreetAddress',
    'CityStateZip',
    'FullAddress',
    'FormattedFullAddress',
    'Latitude',
    'Longitude',
]
df = pd.read_parquet(
    'data/address_with_instructions.parquet',
    columns=columns_to_load,
)

# Canonicalize 'State' a single time (trim surrounding whitespace, then lowercase)
# so later filters do not have to repeat the string operations.
state_stripped = df['State'].str.strip()
df['State'] = state_stripped.str.lower()

Helper functions for creating the triplet dataset

In [3]:
# --- Batchwise Processing ---
def process_ma_batches(df, total_count=5000, batch_size=500):
    """Generate address triplets from a random sample of rows whose State is 'ma'.

    The sample is sorted by 'Zip_Code', split into consecutive slices of
    ``batch_size`` rows, and each slice is passed on its own to
    ``generate_triplets_parallel``. Triplets are therefore only ever formed
    between rows of the same batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'State' (already stripped and lowercased, since it is
        compared against the literal 'ma'), 'Zip_Code', 'OID_', 'Latitude',
        'Longitude' and 'FormattedFullAddress'.
    total_count : int, default 5000
        Number of 'ma' rows to sample. If fewer rows are available, all of
        them are used instead of raising.
    batch_size : int, default 500
        Number of rows per batch.

    Returns
    -------
    list
        The concatenated triplets returned by ``generate_triplets_parallel``
        for every batch.
    """
    ma_rows = df[df['State'] == 'ma']

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap the request at what is available.
    sample_size = min(total_count, len(ma_rows))
    ma_df = ma_rows.sample(n=sample_size, random_state=42)

    # Sorting by zip code before slicing presumably keeps geographically close
    # addresses in the same batch - TODO confirm that this is the intent.
    ma_df = ma_df.sort_values('Zip_Code').reset_index(drop=True)

    # Convert 'Latitude' and 'Longitude' to numeric before processing;
    # unparseable values become NaN (errors='coerce').
    ma_df['Latitude'] = pd.to_numeric(ma_df['Latitude'], errors='coerce')
    ma_df['Longitude'] = pd.to_numeric(ma_df['Longitude'], errors='coerce')

    # Ceiling division: a final partial batch still counts as a batch.
    total_batches = (len(ma_df) + batch_size - 1) // batch_size
    all_triplets = []

    for i in range(0, len(ma_df), batch_size):
        batch_df = ma_df.iloc[i:i+batch_size]
        print(f"Processing MA batch {i // batch_size + 1} of {total_batches}...")

        triplets = generate_triplets_parallel(
            batch_df,
            id_col='OID_',
            lat_col='Latitude',
            lon_col='Longitude',
            address_col='FormattedFullAddress',
            pos_thres=300,
            neg_thres=1000,
            max_triplets=None
        )

        all_triplets.extend(triplets)

    return all_triplets

# --- Triplet Generation Optimized ---
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in meters between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in meters on a sphere of radius 6,371,000 m. Inputs are
        combined with NumPy operations, so arrays broadcast element-wise.
        NaN inputs yield NaN outputs.
    """
    R = 6371000  # Earth radius in meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    # Mathematically 0 <= a <= 1, but rounding can push it marginally past 1
    # for (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def process_anchor(anchor_row, data, id_col, lat_col, lon_col, address_col, pos_thres, neg_thres):
    """Build at most one [anchor, positive, negative] triplet for a single anchor row.

    Every other row of ``data`` with non-missing coordinates is classified by
    its haversine distance to the anchor: ``<= pos_thres`` is a positive
    candidate, otherwise ``>= neg_thres`` is a negative candidate, and
    anything in between is ignored. One positive and one negative address are
    then drawn with ``random.choice`` (module-level random state; no seeding
    is done here).

    Parameters
    ----------
    anchor_row : mapping-like (e.g. pandas.Series)
        Row providing ``id_col``, ``lat_col``, ``lon_col`` and ``address_col``.
    data : pandas.DataFrame
        Candidate rows; must contain the same four columns.
    id_col, lat_col, lon_col, address_col : str
        Column names for the row id, latitude, longitude and address text.
    pos_thres, neg_thres : float
        Distance thresholds, in the unit returned by ``haversine_distance``.

    Returns
    -------
    list
        ``[[anchor_text, positive_text, negative_text]]`` when at least one
        positive and one negative candidate exist, otherwise ``[]``.
    """
    anchor_id = anchor_row[id_col]
    anchor_lat = anchor_row[lat_col]
    anchor_lon = anchor_row[lon_col]
    anchor_text = anchor_row[address_col]
    if pd.isna(anchor_lat) or pd.isna(anchor_lon):
        return []

    # Candidates: every row except the anchor itself that has usable coordinates.
    usable = (data[id_col] != anchor_id) & data[lat_col].notna() & data[lon_col].notna()
    candidates = data.loc[usable]
    if candidates.empty:
        return []

    # One vectorized distance computation instead of a Python-level row loop.
    dists = haversine_distance(
        anchor_lat,
        anchor_lon,
        candidates[lat_col].to_numpy(dtype=float),
        candidates[lon_col].to_numpy(dtype=float),
    )
    addresses = candidates[address_col].to_numpy(dtype=object)

    is_pos = dists <= pos_thres
    # Mirror the if/elif precedence: a row counted as positive is never negative.
    is_neg = (dists >= neg_thres) & ~is_pos

    # Boolean indexing keeps the row order, so the candidate lists match what a
    # sequential scan over ``data`` would have produced.
    pos = addresses[is_pos].tolist()
    neg = addresses[is_neg].tolist()

    if pos and neg:
        return [[anchor_text, random.choice(pos), random.choice(neg)]]
    return []

# Multiprocessing method

def generate_triplets_parallel(df, id_col, lat_col, lon_col, address_col, pos_thres, neg_thres, max_triplets=None):
    """Generate triplets for every row of ``df`` using a multiprocessing pool.

    Each row is used once as an anchor and handed to ``process_anchor``
    together with the full four-column frame. Results are collected in
    completion order (``imap_unordered``), so the order of the returned
    triplets is not tied to the row order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col``, ``lat_col``, ``lon_col`` and ``address_col``.
    id_col, lat_col, lon_col, address_col : str
        Column names forwarded to ``process_anchor``.
    pos_thres, neg_thres : float
        Distance thresholds forwarded to ``process_anchor``.
    max_triplets : int or None, default None
        Stop collecting once this many triplets exist and truncate the result
        to that length. Any falsy value (``None`` or ``0``) means no limit.

    Returns
    -------
    list
        The collected triplets.
    """
    data = df[[id_col, lat_col, lon_col, address_col]]
    data_list = [row for _, row in data.iterrows()]

    func = partial(process_anchor, data=data, id_col=id_col, lat_col=lat_col, lon_col=lon_col, address_col=address_col, pos_thres=pos_thres, neg_thres=neg_thres)

    triplets = []
    # The context manager terminates the pool on exit, so worker processes are
    # released even if a worker raises, and an early break does not wait for
    # tasks whose results would be discarded anyway.
    with mp.Pool(mp.cpu_count()) as pool:
        for result in tqdm(pool.imap_unordered(func, data_list), total=len(data_list)):
            if result:
                triplets.extend(result)
                if max_triplets and len(triplets) >= max_triplets:
                    break

    return triplets[:max_triplets] if max_triplets else triplets


# Run the batched triplet generation: sample 30000 'ma' rows, 500 rows per batch.
triplets = process_ma_batches(df, batch_size=500, total_count=30000)

# Store the triplets as a three-column CSV without the index column.
# NOTE(review): 'retiever' in the file name looks like a typo for 'retriever';
# left unchanged because other code may read this exact path - confirm.
triplets_df = pd.DataFrame(data=triplets, columns=['anchor', 'positive', 'negative'])
triplets_df.to_csv(path_or_buf='data/retiever_triplets_dataset.csv', index=False)

Processing MA batch 1 of 60...


100%|██████████| 500/500 [00:01<00:00, 284.14it/s]


Processing MA batch 2 of 60...


100%|██████████| 500/500 [00:01<00:00, 279.02it/s]


Processing MA batch 3 of 60...


100%|██████████| 500/500 [00:01<00:00, 278.87it/s]


Processing MA batch 4 of 60...


100%|██████████| 500/500 [00:01<00:00, 277.82it/s]


Processing MA batch 5 of 60...


100%|██████████| 500/500 [00:01<00:00, 260.71it/s]

Processing MA batch 6 of 60...



100%|██████████| 500/500 [00:01<00:00, 279.00it/s]


Processing MA batch 7 of 60...


100%|██████████| 500/500 [00:01<00:00, 279.03it/s]

Processing MA batch 8 of 60...



100%|██████████| 500/500 [00:01<00:00, 262.34it/s]


Processing MA batch 9 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.53it/s]


Processing MA batch 10 of 60...


100%|██████████| 500/500 [00:01<00:00, 268.33it/s]


Processing MA batch 11 of 60...


100%|██████████| 500/500 [00:01<00:00, 270.14it/s]

Processing MA batch 12 of 60...



100%|██████████| 500/500 [00:01<00:00, 280.07it/s]


Processing MA batch 13 of 60...


100%|██████████| 500/500 [00:01<00:00, 271.05it/s]


Processing MA batch 14 of 60...


100%|██████████| 500/500 [00:01<00:00, 270.08it/s]


Processing MA batch 15 of 60...


100%|██████████| 500/500 [00:01<00:00, 280.17it/s]

Processing MA batch 16 of 60...



100%|██████████| 500/500 [00:01<00:00, 258.98it/s]


Processing MA batch 17 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.12it/s]

Processing MA batch 18 of 60...



100%|██████████| 500/500 [00:01<00:00, 276.49it/s]


Processing MA batch 19 of 60...


100%|██████████| 500/500 [00:01<00:00, 271.14it/s]


Processing MA batch 20 of 60...


100%|██████████| 500/500 [00:01<00:00, 275.33it/s]


Processing MA batch 21 of 60...


100%|██████████| 500/500 [00:01<00:00, 270.59it/s]


Processing MA batch 22 of 60...


100%|██████████| 500/500 [00:01<00:00, 272.88it/s]


Processing MA batch 23 of 60...


100%|██████████| 500/500 [00:01<00:00, 277.46it/s]


Processing MA batch 24 of 60...


100%|██████████| 500/500 [00:01<00:00, 271.52it/s]


Processing MA batch 25 of 60...


100%|██████████| 500/500 [00:01<00:00, 272.79it/s]

Processing MA batch 26 of 60...



100%|██████████| 500/500 [00:01<00:00, 277.83it/s]


Processing MA batch 27 of 60...


100%|██████████| 500/500 [00:01<00:00, 269.69it/s]


Processing MA batch 28 of 60...


100%|██████████| 500/500 [00:01<00:00, 272.46it/s]


Processing MA batch 29 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.78it/s]


Processing MA batch 30 of 60...


100%|██████████| 500/500 [00:01<00:00, 271.23it/s]


Processing MA batch 31 of 60...


100%|██████████| 500/500 [00:01<00:00, 280.09it/s]

Processing MA batch 32 of 60...



100%|██████████| 500/500 [00:01<00:00, 271.65it/s]


Processing MA batch 33 of 60...


100%|██████████| 500/500 [00:01<00:00, 275.52it/s]


Processing MA batch 34 of 60...


100%|██████████| 500/500 [00:01<00:00, 279.32it/s]


Processing MA batch 35 of 60...


100%|██████████| 500/500 [00:01<00:00, 274.05it/s]


Processing MA batch 36 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.09it/s]


Processing MA batch 37 of 60...


100%|██████████| 500/500 [00:01<00:00, 274.98it/s]


Processing MA batch 38 of 60...


100%|██████████| 500/500 [00:01<00:00, 278.41it/s]


Processing MA batch 39 of 60...


100%|██████████| 500/500 [00:01<00:00, 277.98it/s]

Processing MA batch 40 of 60...



100%|██████████| 500/500 [00:01<00:00, 272.82it/s]


Processing MA batch 41 of 60...


100%|██████████| 500/500 [00:01<00:00, 273.74it/s]


Processing MA batch 42 of 60...


100%|██████████| 500/500 [00:01<00:00, 279.49it/s]

Processing MA batch 43 of 60...



100%|██████████| 500/500 [00:01<00:00, 271.89it/s]


Processing MA batch 44 of 60...


100%|██████████| 500/500 [00:01<00:00, 275.75it/s]


Processing MA batch 45 of 60...


100%|██████████| 500/500 [00:01<00:00, 273.66it/s]

Processing MA batch 46 of 60...



100%|██████████| 500/500 [00:01<00:00, 271.73it/s]

Processing MA batch 47 of 60...



100%|██████████| 500/500 [00:01<00:00, 279.03it/s]


Processing MA batch 48 of 60...


100%|██████████| 500/500 [00:01<00:00, 271.84it/s]


Processing MA batch 49 of 60...


100%|██████████| 500/500 [00:01<00:00, 272.15it/s]


Processing MA batch 50 of 60...


100%|██████████| 500/500 [00:01<00:00, 275.05it/s]


Processing MA batch 51 of 60...


100%|██████████| 500/500 [00:01<00:00, 273.07it/s]


Processing MA batch 52 of 60...


100%|██████████| 500/500 [00:01<00:00, 272.86it/s]


Processing MA batch 53 of 60...


100%|██████████| 500/500 [00:01<00:00, 274.29it/s]


Processing MA batch 54 of 60...


100%|██████████| 500/500 [00:01<00:00, 280.08it/s]


Processing MA batch 55 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.77it/s]

Processing MA batch 56 of 60...



100%|██████████| 500/500 [00:01<00:00, 269.74it/s]


Processing MA batch 57 of 60...


100%|██████████| 500/500 [00:01<00:00, 270.55it/s]

Processing MA batch 58 of 60...



100%|██████████| 500/500 [00:01<00:00, 279.35it/s]


Processing MA batch 59 of 60...


100%|██████████| 500/500 [00:01<00:00, 264.73it/s]


Processing MA batch 60 of 60...


100%|██████████| 500/500 [00:01<00:00, 276.10it/s]
