# Geographical Evaluation using OpenStreetMap

In [45]:
import os
import polars as pl
import numpy as np
from pathlib import Path
from geopy.distance import geodesic
from typing import List, Tuple, Dict, Union
from geopy.geocoders import Nominatim

# Record the polars release alongside the saved cell outputs.
print(f'polars version: {pl.__version__}')

# NOTE(review): assumes the notebook is launched from a sub-folder of the
# project, so the parent of the working directory is the project root.
project_dir = Path.cwd().parent
data_dir = project_dir.joinpath('Data')

# Notebook-level Nominatim client with a generous 300 s request timeout.
geolocator = Nominatim(timeout=300, user_agent="geocoder_llm_project")

polars version: 1.9.0


In [46]:
# Load the formatted-address table from the project's data folder.
addresses_file = data_dir / 'new_formatted_addresses.parquet'
df = pl.read_parquet(addresses_file)

In [47]:
# Spot-check one record: show its formatted address and its coordinates.
idx = 100
sample_row = df[idx]
print(sample_row['FormattedFullAddress'].item())
print(sample_row['Latitude'].item(), sample_row['Longitude'].item())
# Uncomment to dump every column of the selected row:
# sample_row.to_dict(as_series=False)

4524, Old Caldwell Mill Road, Shelby County, Alabama, 35242
33.41236637208968 -86.73952124099591


The `GeoEvaluation` class is designed to evaluate the accuracy of predicted geographic locations by comparing them with ground truth coordinates. It uses OpenStreetMap's `Nominatim` geocoding service to convert textual addresses into geographic coordinates (latitude and longitude) and then calculates the geodesic distance between the predicted and actual locations.

A radius threshold of 0.2 kilometers (200 meters) is used to determine whether two locations are considered the same.

Methods:<br>
* `geocode_address(address: str) -> Union[Tuple[float, float], Tuple[None, None]]`<br>
This method takes a textual address as input and returns its geographic coordinates.

* `compute_geographic_distance(latitude1: float, longitude1: float, latitude2: float, longitude2: float) -> float`<br>
This method calculates the geodesic distance between two geographic points.

* `are_same(predicted_address: str, groundtruth_latitude: float, groundtruth_longitude: float) -> bool`<br>
This method determines whether a predicted address corresponds to the same location as the ground truth coordinates.

Process:
* Geocodes the predicted address to obtain its coordinates (returns False immediately if the address cannot be geocoded)
* Calculates the distance between the predicted and ground truth coordinates
* Prints the distance for debugging/information purposes
* Returns True if the distance is less than or equal to the radius threshold (0.2 km)

Evaluation Logic:<br>
The class considers two locations to be the same if they are within 200 meters (0.2 kilometers) of each other. This threshold accounts for:

* Geocoding Imprecision: Different geocoding services might return slightly different coordinates for the same address
* Address Ambiguity: Addresses can sometimes refer to large buildings or areas rather than precise points
* Practical Equivalence: For many applications, locations within 200 meters are functionally equivalent

The threshold can be adjusted by changing the `radius_threshold` attribute, depending on the specific requirements of your evaluation task. A smaller threshold would increase precision requirements, while a larger threshold would be more lenient in considering locations as matches.

In [58]:
class GeoEvaluation:
    """Check whether a predicted address lies close to known coordinates.

    A predicted address is geocoded with geopy's ``Nominatim`` client and the
    geodesic distance to the ground-truth point is compared against
    ``radius_threshold`` (kilometers).

    Args:
        radius_threshold: Maximum distance, in kilometers, at which two
            locations still count as the same place. Defaults to 0.2.
        timeout: Timeout, in seconds, handed to the Nominatim client.
            Defaults to 300.
        user_agent: User-agent string handed to the Nominatim client.
    """

    def __init__(
        self,
        radius_threshold: float = 0.2,
        timeout: int = 300,
        user_agent: str = "geocoder_llm_evaluation_agent",
    ):
        self.timeout = timeout
        self.geocoder = Nominatim(user_agent=user_agent, timeout=self.timeout)
        self.radius_threshold = radius_threshold  # in kilometers

    def geocode_address(self, address: str) -> Union[Tuple[float, float], Tuple[None, None]]:
        """Resolve a textual address to ``(latitude, longitude)``.

        Args:
            address: The address to look up.

        Returns:
            The coordinates of the match, or ``(None, None)`` when the address
            is missing/blank, nothing was found, or the lookup raised. Failures
            are reported on stdout rather than propagated.
        """
        # A missing or blank address can never resolve, so skip the request
        # to the geocoding service entirely.
        if address is None or (isinstance(address, str) and not address.strip()):
            print(f"Did not find location for: {address}")
            return (None, None)

        try:
            location = self.geocoder.geocode(address)
            if location:
                return (location.latitude, location.longitude)
            print(f"Did not find location for: {address}")
        except Exception as e:
            # Deliberately best-effort: any lookup failure is reported and
            # turned into the "not found" result instead of aborting a run.
            print(f"Geocoding error for address '{address}': {e}")
        return (None, None)

    def compute_geographic_distance(self, latitude1: float, longitude1: float, latitude2: float, longitude2: float) -> float:
        """
        Compute the geodesic distance between two lat/lon pairs in kilometers using geopy.
        """
        return geodesic((latitude1, longitude1), (latitude2, longitude2)).kilometers

    def are_same(self, predicted_address: str, groundtruth_latitude: float, groundtruth_longitude: float) -> bool:
        """Tell whether a predicted address matches the ground-truth point.

        Args:
            predicted_address: Address to geocode and compare.
            groundtruth_latitude: Latitude of the reference location.
            groundtruth_longitude: Longitude of the reference location.

        Returns:
            ``True`` when the geocoded address is within ``radius_threshold``
            kilometers (inclusive) of the ground truth; ``False`` when it is
            farther away or could not be geocoded.
        """
        # Geocode predicted address
        pred_latitude, pred_longitude = self.geocode_address(predicted_address)
        if pred_latitude is None or pred_longitude is None:
            return False

        # Compute the geographical distance
        distance = self.compute_geographic_distance(
            float(pred_latitude), float(pred_longitude),
            float(groundtruth_latitude), float(groundtruth_longitude)
        )

        print(f'Distance from ground truth: {round(distance, 3)} KM')

        return distance <= self.radius_threshold

In [59]:
# Worked example: compare one predicted address against known coordinates.
evaluator = GeoEvaluation()
predicted_address = "south main st, Camp Hill, Alabama, 36850"
groundtruth_lat = 32.79596540137694
groundtruth_lon = -85.6535596907001

is_match = evaluator.are_same(
    predicted_address=predicted_address,
    groundtruth_latitude=groundtruth_lat,
    groundtruth_longitude=groundtruth_lon,
)
print("Match:", is_match)

Distance from ground truth: 0.195 KM
Match: True
