In [14]:
from time import sleep

import pandas as pd
from geopy.geocoders import Nominatim

from Code.UtilityFunctions.get_data_path import get_path

For the GeoPy documentation, see https://geopy.readthedocs.io/en/latest/

For the Nominatim Documentation, see https://nominatim.org/release-docs/latest/

For the Nominatim Terms of Service, see https://operations.osmfoundation.org/policies/nominatim/

### Actual Code

In [15]:
# Load the Yelp business records (the file holds one JSON object per line)
business_file = "yelp_academic_dataset_business.json"
businesses = pd.read_json(path_or_buf=get_path(business_file), lines=True)

# The original city/state columns are discarded; they are replaced later by
# the reverse-geocoded values
businesses = businesses.drop(columns=['city', 'state'])

# Build a "lat,lon" string key from the coordinates rounded to 2 decimals,
# so that nearby businesses share a single geocoding lookup
businesses["coordinate_set"] = (
    businesses["latitude"].apply(lambda lat: round(lat, 2)).astype(str)
    + ","
    + businesses["longitude"].apply(lambda lon: round(lon, 2)).astype(str)
)

# Distinct rounded coordinate keys, in order of first appearance
unique_locations = list(businesses["coordinate_set"].unique())

In [16]:
# Seconds to pause between consecutive geocoding requests
min_delay_seconds = 1

# Create the Nominatim geolocator used for the reverse lookups below
geolocator = Nominatim(user_agent="YelpLocationMatching")
# NOTE: wrapping the geocoder in geopy's RateLimiter was tried here to avoid
# API timeouts but is recorded as not working, so it is left disabled.
# (max_retries and error_wait_seconds are not defined at this point in the
# notebook, so the snippet below would not run as written.)
# geocode = RateLimiter(geolocator.geocode,
#                       min_delay_seconds=min_delay_seconds,
#                       max_retries=max_retries,
#                       error_wait_seconds=error_wait_seconds)

In [17]:
# Estimate the geocoding runtime in hours from the enforced pause alone:
# one pause of min_delay_seconds per unique location (request latency is not included)
location_count = len(unique_locations)
total_time = location_count * min_delay_seconds / 60 / 60
print(f"Geocoding will take approximately {round(total_time, 2)} hours")

Geocoding will take approximately 3.17 hours


In [18]:
# Development shortcut: keep only the first 100 unique locations.
# Businesses at any other location receive no address in the later left merge.
unique_locations = unique_locations[:100]

In [None]:
# Map each rounded "lat,lon" key to the address dict returned by Nominatim,
# or to None when the lookup fails or finds nothing.
location_dict = {}
for location in unique_locations:
    try:
        result = geolocator.reverse(location, zoom=14)
        # reverse() can return None when no match is found; record that
        # explicitly instead of relying on an AttributeError being swallowed.
        location_dict[location] = result.raw['address'] if result is not None else None
    except Exception:
        # Best effort: a failed lookup must not abort the whole run.
        # `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt
        # working, so this long-running loop can still be stopped by hand.
        location_dict[location] = None
    sleep(min_delay_seconds)  # Pause between requests to avoid API timeouts

In [26]:
# Display the raw lookup results: rounded "lat,lon" key -> address dict (or None on failure)
location_dict

{'34.43,-119.71': {'neighbourhood': 'Westside',
  'city': 'Santa Barbara',
  'county': 'Santa Barbara County',
  'state_district': 'CAL Fire Southern Region',
  'state': 'California',
  'ISO3166-2-lvl4': 'US-CA',
  'postcode': '93101',
  'country': 'United States',
  'country_code': 'us'},
 '38.55,-90.34': {'town': 'Affton',
  'county': 'Saint Louis County',
  'state': 'Missouri',
  'ISO3166-2-lvl4': 'US-MO',
  'postcode': '63123',
  'country': 'United States',
  'country_code': 'us'},
 '32.22,-110.88': {'commercial': 'Williams Centre',
  'city': 'Tucson',
  'county': 'Pima County',
  'state': 'Arizona',
  'ISO3166-2-lvl4': 'US-AZ',
  'country': 'United States',
  'country_code': 'us'},
 '39.96,-75.16': {'suburb': 'Center City',
  'city': 'Philadelphia',
  'county': 'Philadelphia County',
  'state': 'Pennsylvania',
  'ISO3166-2-lvl4': 'US-PA',
  'country': 'United States',
  'country_code': 'us'},
 '40.34,-75.47': {'hamlet': 'Perkiomenville',
  'city': 'Marlborough Township',
  'county

In [27]:
# Address components to keep from each geocoding result
desired_address_levels = ["neighbourhood", "postcode", "city", "county", "state", "country"]

# For every successfully geocoded location, keep only the desired components
# that are actually present; locations whose lookup failed (None) are left out.
address_dict = {
    location: {level: address[level] for level in desired_address_levels if level in address}
    for location, address in location_dict.items()
    if address is not None
}

In [28]:
# Display the filtered addresses (only the desired address levels are kept)
address_dict

{'34.43,-119.71': {'neighbourhood': 'Westside',
  'postcode': '93101',
  'city': 'Santa Barbara',
  'county': 'Santa Barbara County',
  'state': 'California',
  'country': 'United States'},
 '38.55,-90.34': {'postcode': '63123',
  'county': 'Saint Louis County',
  'state': 'Missouri',
  'country': 'United States'},
 '32.22,-110.88': {'city': 'Tucson',
  'county': 'Pima County',
  'state': 'Arizona',
  'country': 'United States'},
 '39.96,-75.16': {'city': 'Philadelphia',
  'county': 'Philadelphia County',
  'state': 'Pennsylvania',
  'country': 'United States'},
 '40.34,-75.47': {'postcode': '18074',
  'city': 'Marlborough Township',
  'county': 'Montgomery County',
  'state': 'Pennsylvania',
  'country': 'United States'},
 '36.27,-87.06': {'postcode': '37015',
  'county': 'Cheatham County',
  'state': 'Tennessee',
  'country': 'United States'},
 '38.63,-90.34': {'county': 'Saint Louis County',
  'state': 'Missouri',
  'country': 'United States'},
 '27.77,-82.73': {'postcode': '33707',

In [29]:
# Count the entries in address_dict that have no "city" key
empty_city_count = sum("city" not in address for address in address_dict.values())

print(f"There are {empty_city_count} entries with empty city value")

There are 12 entries with empty city value


In [30]:
# Turn the per-location addresses into a table indexed by the rounded coordinate key
address_df = pd.DataFrame.from_dict(address_dict, orient="index")

# Left-join the addresses onto the businesses via the coordinate key (matched
# against the index of address_df), then discard the helper key column
updated_business = (
    businesses
    .merge(address_df, how="left", left_on="coordinate_set", right_index=True)
    .drop(columns="coordinate_set")
)

In [31]:
# Display the businesses joined with their reverse-geocoded address columns
updated_business

Unnamed: 0,business_id,name,address,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,neighbourhood,postcode,city,county,state,country
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,Westside,93101,Santa Barbara,Santa Barbara County,California,United States
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...",,63123,,Saint Louis County,Missouri,United States
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...",,,Tucson,Pima County,Arizona,United States
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",,,Philadelphia,Philadelphia County,Pennsylvania,United States
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",,18074,Marlborough Township,Montgomery County,Pennsylvania,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3...",,,,,,
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3...",,,,,,
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",,,,,,,
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",,,,,,


# Geocoding settings; the values mirror the defaults of update_business_locations.

# Reverse-geocoding detail level: 14 yields location to the neighbourhood level
zoom_level = 14

# Seconds to pause between consecutive requests
min_delay_seconds = 2

# Retry settings for failed requests
max_retries = 3
error_wait_seconds = 5

In [24]:
def _reverse_geocode_with_retry(geolocator, location, zoom_level, max_retries, error_wait_seconds):
    """Reverse-geocode one "lat,lon" string, retrying when the request raises.

    Returns the ``address`` mapping from the result's raw payload, or ``None``
    when the geocoder finds nothing, the payload has no address, or every
    attempt fails.
    """
    attempts = max(int(max_retries), 0) + 1  # first try plus the allowed retries
    for attempt in range(attempts):
        try:
            result = geolocator.reverse(location, zoom=zoom_level)
        except Exception:
            # Broad on purpose (network errors, timeouts, service errors), but
            # not a bare `except:` so KeyboardInterrupt still stops the run.
            if attempt < attempts - 1:
                sleep(error_wait_seconds)
            continue
        if result is None:
            # The geocoder answered but found no match; retrying will not help.
            return None
        return result.raw.get("address")
    return None


def update_business_locations(df: pd.DataFrame,
                              coordinate_rounding: int = 2,
                              min_delay_seconds: int = 2,
                              max_retries: int = 3,
                              error_wait_seconds: int = 5,
                              zoom_level: int = 14,
                              report_missing: bool = False,
                              geolocator=None) -> pd.DataFrame:
    """Replace the city/state columns of ``df`` with reverse-geocoded address columns.

    Coordinates are rounded and combined into a "lat,lon" key so that each
    distinct rounded location is looked up only once. The input frame is not
    modified; a new frame is returned.

    Args:
        df: Business table with ``latitude`` and ``longitude`` columns. Existing
            ``city`` and ``state`` columns, if present, are dropped.
        coordinate_rounding: Number of decimals the coordinates are rounded to
            before being used as lookup key.
        min_delay_seconds: Pause after each location lookup.
        max_retries: Number of additional attempts after a failed request.
        error_wait_seconds: Pause before each retry.
        zoom_level: ``zoom`` value passed to the geocoder's ``reverse`` call
            (14 yields location to the neighbourhood level).
        report_missing: If true, print for each address level how many geocoded
            locations lack it.
        geolocator: Object with a ``reverse(query, zoom=...)`` method. Defaults
            to a Nominatim client with the "YelpLocationMatching" user agent.

    Returns:
        A copy of ``df`` without ``city``/``state`` and with the columns
        ``neighbourhood``, ``city``, ``county``, ``state`` and ``country``
        added. Values are missing where the lookup failed or the level was
        not part of the result.
    """
    desired_address_levels = ["neighbourhood", "city", "county", "state", "country"]

    # Work on a new frame so the caller's DataFrame is left untouched
    businesses_geo = df.drop(columns=["city", "state"], errors="ignore")
    round_lat = businesses_geo["latitude"].round(coordinate_rounding).astype(str)
    round_lon = businesses_geo["longitude"].round(coordinate_rounding).astype(str)
    businesses_geo = businesses_geo.assign(coordinate_set=round_lat + "," + round_lon)

    # Distinct rounded coordinate keys, in order of first appearance
    unique_locations = list(businesses_geo["coordinate_set"].unique())

    # Create geolocator using the Nominatim API unless the caller supplied one
    if geolocator is None:
        geolocator = Nominatim(user_agent="YelpLocationMatching")

    # Look up each location once and keep only the desired address levels
    address_dict = {}
    for location in unique_locations:
        address = _reverse_geocode_with_retry(
            geolocator, location, zoom_level, max_retries, error_wait_seconds
        )
        if address is not None:
            address_dict[location] = {
                level: address[level] for level in desired_address_levels if level in address
            }
        sleep(min_delay_seconds)  # Sleep to avoid API timeout

    if report_missing:
        # For each level, count the geocoded locations whose address lacks it
        for level in desired_address_levels:
            level_missing = sum(level not in address for address in address_dict.values())
            print(f"There are {level_missing} entries with empty {level} value")

    # One row per geocoded location; every desired level gets a column even
    # when no location provided it, so the output schema is stable
    address_df = pd.DataFrame(
        list(address_dict.values()),
        index=list(address_dict),
        columns=desired_address_levels,
    )

    # Left-join the addresses onto the businesses passed in (not a notebook global)
    updated_businesses = businesses_geo.merge(
        address_df, how="left", left_on="coordinate_set", right_index=True
    )

    # Remove the rounded coordinate key column
    return updated_businesses.drop(columns="coordinate_set")