In [32]:
import html
import os
import re
import urllib
import urllib.parse
from pathlib import Path
from typing import Optional

import geopandas as gp
import googlemaps
import pandas as pd
import pygris as pygris
from IPython.display import display, HTML
from pygris.geocode import geolookup, batch_geocode
from shapely import wkt
from shapely.geometry import Point
# import geodatasets
# import swifter

In [9]:
# --- Notebook configuration: file locations and display settings ---

# Base directory. The value is a placeholder: point it at the directory containing the CSV file.
BASE_PATH = Path("Path / to / directory / containing / CSV / file")
# Input CSV (read further down in the notebook).
SCHOOL_ADDR_PATH = BASE_PATH.joinpath('school_geo_nophi_handcode.csv')
# Output CSV path.
OUTPUT_PATH = BASE_PATH.joinpath('hbn_school_addrs_geo.csv')

# --- pandas display ---
# Passing None removes each limit so whole dataframes are shown.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [16]:
### Geocoding Functions ###

# NOTE: This function isn't tested! I'm not sure if PyGris works with just address or name,
# or if you need to specify city, state, etc, so may not be useful, but leaving here as an example
# of how to use pygris. Let me know if you want to try it and I'll fix it up :)
def add_geometry_pygris(df: pd.DataFrame):
    """Geocode ``df`` with pygris' ``batch_geocode`` and join the result onto it.

    ``batch_geocode`` is called with ``address='address'`` and ``as_gdf=True``.
    Its output is re-indexed on its ``'id'`` column and joined to ``df``;
    overlapping column names coming from the geocoder get a ``'_geo'`` suffix.

    NOTE(review): this function is untested. Whether the geocoder works from
    an address alone, and whether its ``'id'`` values line up with ``df``'s
    index, both need to be confirmed.

    Returns:
        The joined frame produced by ``df.join``.
    """
    geocoder_output = batch_geocode(df, address='address', as_gdf=True)
    keyed_by_id = geocoder_output.set_index('id')
    return df.join(keyed_by_id, rsuffix='_geo')


### The functions below add columns with geocoded information to the input DataFrame (in place).
# Pass a suffix so the new columns don't overwrite existing geometry columns, or add merging logic to fill in only blank values.


# Geocode using Photon.
def add_geometry_column_photon(df: pd.DataFrame, suffix: Optional[str] = None):
    """Geocode ``df['address']`` via ``gp.tools.geocode`` and store the result on ``df``.

    No provider is passed, so geopandas' default geocoder is used
    (presumably Photon, going by this function's name -- verify).

    Two columns are written in place, taken positionally from the geocoder
    output: ``'geometry'`` and ``'geocoded_address'``, each with ``suffix``
    appended when one is given. Nothing is returned.
    """
    lookup = gp.tools.geocode(df['address'])
    tail = suffix if suffix is not None else ''
    df[['geometry' + tail, 'geocoded_address' + tail]] = lookup

# Geocode using Nominatim.
def add_geometry_column_nominatim(df: pd.DataFrame, suffix: Optional[str] = None):
    """Geocode ``df['address']`` through the Nominatim provider and store the result on ``df``.

    ``gp.tools.geocode`` is called with ``provider='nominatim'`` and
    ``user_agent='cmiDiskGeo'``. Two columns are written in place, taken
    positionally from the geocoder output: ``'geometry'`` and
    ``'geocoded_address'``, each with ``suffix`` appended when one is given.
    Nothing is returned.
    """
    lookup = gp.tools.geocode(df['address'], provider='nominatim', user_agent='cmiDiskGeo')
    tail = suffix if suffix is not None else ''
    df[['geometry' + tail, 'geocoded_address' + tail]] = lookup
        

def google_geocode(
    address: str,
    gm: "googlemaps.Client",  # string annotation: not evaluated when the function is defined
    region: Optional[str] = None,
    bounds: Optional[dict[str, dict[str, float]]] = None):
    """Geocode one address with a Google Maps client and look up the top hit's name.

    This is a best-effort lookup: every failure is printed and reported as an
    all-``None`` result instead of raising, so one bad row does not abort a
    whole batch.

    Args:
        address: Free-text address to geocode. Non-string or blank values
            (e.g. NaN from an empty CSV cell) are skipped without calling the API.
        gm: Client whose ``geocode`` and ``place`` methods are called.
        region: Passed through to ``gm.geocode``.
        bounds: Passed through to ``gm.geocode``; a dict with ``'northeast'``
            and ``'southwest'`` entries, each holding ``'lat'`` / ``'lng'``.

    Returns:
        A five-item list
        ``[geometry, formatted_address, place_id, place_name, multiple_result_ids]``.
        ``geometry`` is a ``Point(lng, lat)`` built from the first result;
        ``place_name`` comes from ``gm.place`` for the first result's place id;
        ``multiple_result_ids`` is a comma-joined string of every result's
        place id when two or more results came back, else ``None``.
        All five items are ``None`` when the address is missing, nothing was
        found, or an error occurred.
    """
    no_match = [None, None, None, None, None]
    # Blank CSV cells arrive as NaN (a float); never send those to the API.
    if not isinstance(address, str) or not address.strip():
        print(f"Skipping missing address: {address!r}")
        return no_match
    try:
        res = gm.geocode(address, region=region, bounds=bounds)
        if not res:
            print("No results for: " + address)
            return no_match
        best = res[0]
        place_ids = None
        if len(res) >= 2:
            # Show the top two candidates so ambiguous matches can be checked by hand.
            print("Multiple results for: " + address)
            print(best['formatted_address'])
            print(res[1]['formatted_address'])
            place_ids = ','.join(r['place_id'] for r in res)
        loc = best['geometry']['location']
        # Point takes (x, y), i.e. (longitude, latitude).
        geometry = Point(loc['lng'], loc['lat'])
        place_id = best['place_id']
        place_res = gm.place(place_id)
        return [geometry, best['formatted_address'], place_id, place_res['result']['name'], place_ids]
    except Exception as e:
        # Deliberately broad: report the failure and keep the batch going.
        print(f'Error for: {address}\n{e}')
        return no_match

# Geocode using Google.
def add_geometry_google(df: pd.DataFrame, suffix: Optional[str] = None, api_key: Optional[str] = None):
    """Adds geometry and related columns to df using the Google geocoder.

    Each value of ``df['address']`` is passed to ``google_geocode`` and the
    five items it returns are written to ``df`` in place as the columns
    ``geometry``, ``geocoded_address``, ``place_id``, ``place_name`` and
    ``multiple_result_ids`` (each with ``suffix`` appended when given).

    Args:
        df: Frame with an ``'address'`` column; modified in place.
        suffix: Optional text appended to each new column name so existing
            columns are not overwritten.
        api_key: Google Maps API key. When omitted, the
            ``GOOGLE_MAPS_API_KEY`` environment variable is used, so the key
            never has to be written into the notebook.

    Raises:
        ValueError: If no API key is available.
    """
    key = api_key or os.environ.get('GOOGLE_MAPS_API_KEY')
    if not key:
        raise ValueError(
            "No Google Maps API key: pass api_key=... or set the "
            "GOOGLE_MAPS_API_KEY environment variable.")
    gm = googlemaps.Client(key=key)
    region = 'US'
    # Search box used to bias results. The northeast corner must carry the
    # larger (more easterly) longitude, otherwise the box is inverted.
    bounds = {
        'northeast': {'lat': 41.714520, 'lng': -72.461996},
        'southwest': {'lat': 39.991077, 'lng': -76.001612}}
    tag = '' if suffix is None else suffix
    columns = [
        name + tag
        for name in ('geometry', 'geocoded_address', 'place_id', 'place_name', 'multiple_result_ids')]
    # Plain Python loop: the calls are network-bound, and building the frame
    # from a list also works for an empty df.
    rows = [google_geocode(address, gm=gm, region=region, bounds=bounds) for address in df['address']]
    geocoded = pd.DataFrame(rows, index=df.index, columns=columns)
    for column in columns:
        df[column] = geocoded[column]


In [36]:
# Read CSV file to pandas DataFrame (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)
school_geo_nophi = pd.read_csv(SCHOOL_ADDR_PATH, index_col='Unnamed: 0')

# How many links to show, counted by row position. Set to None to print all.
MAX_LINKS = 20


def maps_search_link(address) -> str:
    """Return an HTML anchor that searches Google Maps for ``address`` plus the word "school".

    The query is URL-encoded and both the href and the link text are
    HTML-escaped, so addresses containing quotes, ``&`` or ``#`` still give a
    working link.
    """
    query = urllib.parse.quote_plus(f"{address} school")
    url = f"https://google.com/maps/?q={query}"
    return f"<a href='{html.escape(url, quote=True)}'>{html.escape(str(address))}</a>"


# Can also show other info here to help map back to spreadsheet/etc
addresses = school_geo_nophi['address']
if MAX_LINKS is not None:
    addresses = addresses.head(MAX_LINKS)
for address in addresses:
    display(HTML(maps_search_link(address)))