In [None]:
from time import sleep

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from Code.UtilityFunctions.get_data_path import get_path

For the GeoPy Documentation, see https://geopy.readthedocs.io/en/latest/#

For the Nominatim Documentation, see https://nominatim.org/release-docs/latest/

For the Nominatim Terms of Service, see https://operations.osmfoundation.org/policies/nominatim/

### Toy Example

In [None]:
# Set RateLimiter parameters
min_delay_seconds = 2
max_retries = 3
error_wait_seconds = 5
zoom_level = 14  # Yields location to the neighbourhood level

# Nominatim client plus a rate-limited wrapper around its forward-geocoding call
# (the wrapper and zoom_level are defined here but not used by the lookup below)
geolocator = Nominatim(user_agent="YelpLocationMatching")
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=min_delay_seconds,
    max_retries=max_retries,
    error_wait_seconds=error_wait_seconds,
)

# Reverse-geocode one hard-coded "lat, lon" pair and show the raw response
location = geolocator.reverse("40.7127281, -74.0060152")
print(location.raw)

In [None]:
# Pull the 'address' entry out of the raw response of the toy lookup and show it
full_address = location.raw['address']
print(full_address)

### Actual Code

In [None]:
# Load the business records (the file is read as one JSON object per line)
business_file = "yelp_academic_dataset_business.json"
businesses = pd.read_json(get_path(business_file), lines=True)

# Discard the dataset's own city and state columns
businesses = businesses.drop(columns=['city', 'state'])

# Build a "lat,lon" key from the coordinates rounded to two decimals
businesses["coordinate_set"] = (
    businesses["latitude"].apply(round, args=(2,)).astype(str)
    + ","
    + businesses["longitude"].apply(round, args=(2,)).astype(str)
)

# Distinct rounded coordinate keys
unique_locations = list(businesses["coordinate_set"].unique())

In [None]:
# Set RateLimiter parameters
min_delay_seconds = 2  # also used as the per-location delay in the time estimate
max_retries = 3  # only referenced by the commented-out RateLimiter call below
error_wait_seconds = 5  # only referenced by the commented-out RateLimiter call below
zoom_level = 14  # Yields location to the neighbourhood level

# Create geolocator
geolocator = Nominatim(user_agent="YelpLocationMatching")
# and set rate limiter to (attempt to) avoid API timeout - DOESN'T WORK
# NOTE(review): the limiter below wraps geolocator.geocode (forward geocoding), while the
# lookup loop in this notebook calls .reverse(); wrapping geolocator.reverse is presumably
# what was intended - confirm before re-enabling.
# geocode = RateLimiter(geolocator.geocode,
#                       min_delay_seconds=min_delay_seconds,
#                       max_retries=max_retries,
#                       error_wait_seconds=error_wait_seconds)

In [None]:
# Calculate approximate time for geocoding: min_delay_seconds is budgeted for every
# unique location, and the total is converted from seconds to hours
total_time = len(unique_locations) * min_delay_seconds / 60 / 60
print(f"Geocoding will take approximately {round(total_time, 2)} hours")

In [None]:
# Keep only the first 100 unique locations for the lookups that follow
unique_locations = unique_locations[:100]

In [None]:
# Create a dictionary with location as key and address as value
# (None marks a location whose lookup failed or returned no result).
# The request goes through geolocator.reverse directly: `geocode`, where it is defined in
# this notebook, is a RateLimiter around geolocator.geocode and has no `.reverse` attribute,
# so `geocode.reverse(...)` raised AttributeError on every iteration and the bare except
# silently turned every entry into None.
location_dict = {}
for location in unique_locations:
    try:
        result = geolocator.reverse(location, zoom=zoom_level)
        # reverse() yields None when nothing was found for the coordinates
        location_dict[location] = result.raw['address'] if result is not None else None
    except Exception:  # best effort on API errors, but Ctrl-C still stops the loop
        location_dict[location] = None
    sleep(min_delay_seconds)  # Sleep to avoid API timeout

In [None]:
desired_address_levels = ["neighbourhood", "city", "county", "state", "country"]

# For every location that has an address, keep only the desired address levels
# (levels absent from an address are simply left out of its entry)
address_dict = {
    location: {level: address[level] for level in desired_address_levels if level in address}
    for location, address in location_dict.items()
    if address is not None
}

In [None]:
# Count entries in address_dict that have no city value
empty_city_count = sum(1 for address in address_dict.values() if "city" not in address)

print(f"There are {empty_city_count} entries with empty city value")

In [None]:
# Build the address table: one row per coordinate key, one column per address level
address_df = pd.DataFrame.from_dict(address_dict, orient="index")

# Left-join the addresses onto the businesses (coordinate_set column against the
# address_df index), then discard the rounded coordinate set column
updated_business = (
    businesses
    .merge(address_df, how="left", left_on="coordinate_set", right_index=True)
    .drop(columns="coordinate_set")
)

In [None]:
# Display the merged business DataFrame
updated_business

min_delay_seconds = 2
max_retries = 3
error_wait_seconds = 5
zoom_level = 14  # Yields location to the neighbourhood level

In [None]:
def _reverse_with_retries(reverse, query, zoom_level, max_retries, error_wait_seconds):
    """Reverse-geocode ``query`` and return its address dict, or None if unavailable.

    A call that raises is retried up to ``max_retries`` more times, waiting
    ``error_wait_seconds`` between attempts. A lookup that completes but yields
    no result is not retried.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            result = reverse(query, zoom=zoom_level)
        except Exception:  # not a bare except: Ctrl-C must still stop a long run
            if attempt + 1 < attempts:
                sleep(error_wait_seconds)
            continue
        if result is None:
            return None
        return result.raw.get("address")
    return None


def update_business_locations(df: pd.DataFrame,
                              coordinate_rounding: int=2,
                              min_delay_seconds: int=2,
                              max_retries: int=3,
                              error_wait_seconds: int=5,
                              zoom_level: int=14,
                              report_missing: bool=False,
                              reverse_geocoder=None) -> pd.DataFrame:
    """Replace the city/state columns of ``df`` with reverse-geocoded address levels.

    Coordinates are rounded and combined into a "lat,lon" key so that each
    distinct rounded location is looked up only once. The returned frame has
    the ``city`` and ``state`` columns of ``df`` removed and the columns
    ``neighbourhood``, ``city``, ``county``, ``state`` and ``country`` added
    (always all five, in that order); a level that could not be determined
    for a row is missing (NaN). ``df`` itself is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``latitude``, ``longitude``, ``city`` and ``state`` columns.
    coordinate_rounding : int
        Number of decimals the coordinates are rounded to before the lookup.
    min_delay_seconds : int
        Pause after each location's lookup, in seconds.
    max_retries : int
        Number of additional attempts after a lookup raised an exception.
    error_wait_seconds : int
        Pause before each retry, in seconds.
    zoom_level : int
        Passed as ``zoom`` to the reverse-geocoding call.
    report_missing : bool
        If True, print for every address level how many geocoded locations
        lack that level.
    reverse_geocoder : callable, optional
        Called as ``reverse_geocoder(query, zoom=zoom_level)``; must return an
        object with a ``raw`` dict or None. Defaults to the ``reverse`` method
        of a Nominatim geolocator.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same rows and index as ``df``.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the required columns.
    """
    desired_address_levels = ["neighbourhood", "city", "county", "state", "country"]

    # Work on a copy so the caller's frame keeps its columns; dropping first also
    # surfaces missing columns before any time is spent on lookups
    updated_businesses = df.drop(columns=['city', 'state'])
    round_lat = df["latitude"].round(coordinate_rounding).astype(str)
    round_lon = df["longitude"].round(coordinate_rounding).astype(str)
    coordinate_set = round_lat + ',' + round_lon

    # Create geolocator using the Nominatim API unless a lookup callable was supplied
    if reverse_geocoder is None:
        reverse_geocoder = Nominatim(user_agent="YelpLocationMatching").reverse

    # Map each unique rounded location to its desired address levels;
    # locations whose lookup failed or returned nothing are left out
    address_dict = {}
    for location in coordinate_set.unique():
        address = _reverse_with_retries(reverse_geocoder, location, zoom_level,
                                        max_retries, error_wait_seconds)
        if address is not None:
            address_dict[location] = {level: address[level]
                                      for level in desired_address_levels
                                      if level in address}
        sleep(min_delay_seconds)  # Sleep to avoid API timeout

    if report_missing:
        # Count the geocoded locations that lack each desired address level
        for level in desired_address_levels:
            level_missing = sum(1 for address in address_dict.values() if level not in address)
            print(f"There are {level_missing} entries with empty {level} value")

    # Attach one column per address level by looking up each row's coordinate key;
    # keys without a value for the level map to NaN
    for level in desired_address_levels:
        level_lookup = {location: address[level]
                        for location, address in address_dict.items()
                        if level in address}
        updated_businesses[level] = coordinate_set.map(level_lookup)

    return updated_businesses