In [1]:
#Import libraries
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.point import Point

import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
#reads in National Address Database data for the Bronx (36005) - 'county_data'
file_path = '../dspg23_reverse_geocoder/Data/36005.csv'
county_data = pd.read_csv(file_path)

In [3]:
#Creates Reverse geocoder tool
#Input: df w/ a 'latitude' and 'longitude' column, Output: same df with updated 'address' column
def reverse_geocode_tool(df):
    #Creates Geo column: combines latitude and longitude 
    df["Geo"] = df["latitude"].astype(str)+ ',' + df["longitude"].astype(str)
    
    geolocator = Nominatim(user_agent="reverse_geocoder", timeout=10) 
    rgeocode = RateLimiter(geolocator.reverse, min_delay_seconds=0.001)
    
    #Reverse geocodes 'Geo' column and populates 'address' column with results
    df['address'] = df['Geo'].apply(rgeocode)
    
    return df

In [4]:
#'Missing_data': df containing all the rows of 'county_data' that have missing values in 'address' columns
missing_data = county_data[county_data['address'].isna()]

In [5]:
#Reads in shape file - 'county_shape'
shape_file_path = '../dspg23_reverse_geocoder/Data/tl_2020_36005_tabblock20/tl_2020_36005_tabblock20.shp'
county_shape = gpd.read_file(shape_file_path)

#Creates 'centroid' column based on 'geometry' - gives us latitude and longitude values
county_shape['centroid'] = county_shape['geometry'].centroid

#Converts 'GEOID20' column to int, so it can be merged with 'missing_data'
county_shape['GEOID20']=county_shape['GEOID20'].astype(int)


  county_shape['centroid'] = county_shape['geometry'].centroid


In [6]:
#Merges 'missing_data' with 'county_shape' on 'GEOID20' column (left join)
merged_df = missing_data.merge(county_shape, on='GEOID20', how="left")
merged_df = merged_df.drop(['STATEFP20', 'COUNTYFP20', 'TRACTCE20', 'BLOCKCE20', 'NAME20', 'MTFCC20', 'UR20', 'UACE20', 'UATYPE20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'geometry'], axis=1)

#Poulates 'latitude' and 'longitude' columns of 'merged_df' with latitude and longitude values from 'centroid'
for i in merged_df.index:   
    point = merged_df['centroid'][i]
    lat = point.y
    long = point.x
    
    merged_df['latitude'][i] = lat
    merged_df['longitude'][i] = long
    
merged_df = merged_df.drop('centroid', axis=1)

#Reverse geocodes 'merged_df'
reverse_geocode_tool(merged_df)
merged_df = merged_df.drop('Geo', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['latitude'][i] = lat
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['longitude'][i] = long


In [7]:
#Checks number of 'address' values that are NaN
#Should be 1008
sum(county_data['address'].isna())

1008

In [8]:
np.where(county_data['address'].isna())

(array([    76,    242,    264, ..., 105440, 105443, 106275]),)

In [9]:
#Checks if 'address' column in 'county_data' is NaN
#if so, updates 'address', 'latitude', and 'longitude' columns of 'county_data' 
#with 'address', 'latitude', and 'longitude' columns of 'merged_df', based on 'GEOID20'
for i in county_data.index:
    if pd.isna(county_data['address'][i]):  
        county_data['address'][i] = merged_df['address'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]
        county_data['latitude'][i] = merged_df['latitude'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]
        county_data['longitude'][i] = merged_df['longitude'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_data['address'][i] = merged_df['address'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_data['latitude'][i] = merged_df['latitude'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_data['longitude'][i] = merged_df['longitude'][merged_df.loc[merged_df['GEOID20']==county_data['GEOID20'][i]].index[0]]


In [10]:
county_data

Unnamed: 0,address,GEOID20,longitude,latitude
0,"3001 henry hudson parkway west,bronx,ny,10463",360050301001001,-73.916170,40.882786
1,"3053 henry hudson parkway west,bronx,ny,10463",360050301001001,-73.915858,40.883261
2,"3051 henry hudson parkway west,bronx,ny,10463",360050301001001,-73.915876,40.883211
3,"3055 henry hudson parkway west,bronx,ny,10463",360050301001001,-73.915833,40.883307
4,"3057 henry hudson parkway west,bronx,ny,10463",360050301001001,-73.915808,40.883353
...,...,...,...,...
106600,"1122 pugsley avenue,bronx,ny,10472",360050078001003,-73.856574,40.829606
106601,"2012 haviland avenue,bronx,ny,10472",360050078001003,-73.856259,40.829611
106602,"2016 haviland avenue,bronx,ny,10472",360050078001003,-73.856139,40.829627
106603,"2014 haviland avenue,bronx,ny,10472",360050078001003,-73.856200,40.829619


In [11]:
##Checks number of 'address' values that are NaN
#Should be 0
sum(county_data['address'].isna())

0

In [12]:
np.where(county_data['address'].isna())

(array([], dtype=int64),)

In [13]:
#Saves output df as csv
county_data.to_csv('../dspg23_reverse_geocoder/36005_output.csv')