In [1]:
import numpy as np
import pandas as pd

from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

## Lat/Long Information for Street Easy Neighborhoods

While working with Tableau, errors occurred when visualizing data that had been blended from two different sources. To solve this problem, we add lat/long information to the existing `street_easy_nyc.csv` file and use the resulting file as the data source in Tableau.

In [2]:
# Load the Street Easy data that will be enriched with lat/long coordinates.
street_easy_path = '../../data/dataxp/airbnb_project/cleaned_data/street_easy_nyc.csv'
df = pd.read_csv(street_easy_path)

In [3]:
# Making sure that this dataframe is equivalent in size to the SELECT * FROM street_easy_nyc SQL query.
# df.info() prints the row count, column names, non-null counts and dtypes used for that comparison.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110880 entries, 0 to 110879
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   area_name  110880 non-null  object 
 1   borough    110320 non-null  object 
 2   bedrooms   110880 non-null  object 
 3   price      49234 non-null   float64
 4   year       110880 non-null  int64  
 5   month      110880 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 5.1+ MB


## Adding Lat/Long Information for Each Neighborhood

In [4]:
# Every neighborhood in the DataFrame needs a lat/long coordinate, so start from
# one row per distinct 'area_name'. Keeping the column name 'area_name' makes the
# later merge back onto `df` easier.

locations = df[['area_name']].drop_duplicates().reset_index(drop=True)

In [5]:
# Append ', New York City' to every name so the geocoder looks the
# neighborhood up within New York City.

locations['area_name'] = locations['area_name'].astype(str) + ', New York City'

In [6]:
# Preview the first rows to confirm the suffix was appended.

locations.head(n=5)

Unnamed: 0,area_name
0,"Downtown, New York City"
1,"Midtown, New York City"
2,"Upper East Side, New York City"
3,"Upper Manhattan, New York City"
4,"Upper West Side, New York City"


In [7]:
# Nominatim geocoder client used for every lookup below (timeout of 10 per request).
geolocator = Nominatim(user_agent='my_geolocator', timeout=10)

In [8]:
# This is how we will get the lat/long coordinates:
# indexing the geocode result with [1] gives the (latitude, longitude) pair.

geolocator.geocode('Upper East Side, New York City')[1]

(40.7737016, -73.9641196)

In [9]:
# Geocode every neighborhood, keeping `lat_long` aligned row-for-row with `locations`.
#
# The public Nominatim service allows at most one request per second, so the
# lookups go through geopy's RateLimiter instead of being fired back-to-back.
# swallow_exceptions=False means a geocoder error that persists after the
# limiter's retries still stops the loop instead of being recorded as a
# "not found" neighborhood.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False)

lat_long = []

for neighborhood in locations['area_name']:
    location = geocode(neighborhood)

    # geocode() returns None when nothing matches the query. Check for that
    # explicitly rather than catching TypeError/NameError around the call,
    # which would also hide genuine bugs (e.g. an undefined `geolocator`).
    if location is None:
        lat_long.append(None)
    else:
        # Index 1 of the result is the (latitude, longitude) tuple.
        lat_long.append(location[1])

In [10]:
# Sanity check: there should be one geocoding result per neighborhood,
# so the two lengths printed below must be equal.

print(len(lat_long), len(locations), sep='\n')

195
195


In [11]:
# Attach the geocoding results to their neighborhoods as a new 'coords' column.
locations = locations.assign(coords=lat_long)

In [12]:
# Preview the first rows, which now include the 'coords' column.
locations.head()

Unnamed: 0,area_name,coords
0,"Downtown, New York City","(40.5997561, -73.9463899)"
1,"Midtown, New York City","(40.76008455, -73.97815861015326)"
2,"Upper East Side, New York City","(40.7737016, -73.9641196)"
3,"Upper Manhattan, New York City","(40.7870455, -73.9754163)"
4,"Upper West Side, New York City","(40.7870455, -73.9754163)"


In [13]:
# List the neighborhoods for which no coordinates were found ('coords' is None).

missing_coords = locations['coords'].isna()
locations[missing_coords]

Unnamed: 0,area_name,coords
136,"Old Mill Basin, New York City",
146,"Prospect Lefferts Gardens, New York City",
172,"Stuyvesant Town/PCV, New York City",
175,"The Rockaways, New York City",
186,"Westchester Village, New York City",


After checking these neighborhoods for pricing information in SQL, it was determined that Prospect Lefferts Gardens (PLG), Stuyvesant Town, and the Rockaways were the only neighborhoods that had pricing information in the database.

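We will add the coordinates for PLG, Stuyvesant Town, and the Rockaways using individual geocoding searches. Some searches use an abbreviated or nearby place name (for example `PLG, Brooklyn` or `Rockaway Beach, Queens`) in place of the original neighborhood name, which returned no result.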

In [14]:
# Manual geocoding queries for the neighborhoods that have pricing data but got
# no coordinates from the automatic lookup, keyed by the exact 'area_name' value
# stored in `locations`. Matching on the name instead of a hard-coded row label
# (146, 172, 175) keeps the fix correct if the row order of `locations` changes.
manual_queries = {
    'Prospect Lefferts Gardens, New York City': 'PLG, Brooklyn, New York, New York',
    'Stuyvesant Town/PCV, New York City': 'Stuyvesant Town, New York City',
    'The Rockaways, New York City': 'Rockaway Beach, Queens, New York',
}

for area_name, query in manual_queries.items():
    result = geolocator.geocode(query)

    # Fail with a clear message: a None result would otherwise surface as an
    # opaque "'NoneType' object is not subscriptable" error.
    if result is None:
        raise ValueError(f'No geocoding result for manual query: {query!r}')

    matching_rows = locations.index[locations['area_name'] == area_name]
    if matching_rows.empty:
        raise KeyError(f'{area_name!r} not found in locations')

    for row_label in matching_rows:
        # .at addresses a single cell, so the (lat, long) tuple is stored as one value.
        locations.at[row_label, 'coords'] = result[1]

In [15]:
# Re-check after the manual lookups: only neighborhoods still lacking coordinates are listed.

still_missing = locations['coords'].isna()
locations[still_missing]

Unnamed: 0,area_name,coords
136,"Old Mill Basin, New York City",
186,"Westchester Village, New York City",


In [16]:
# Drop the two rows listed above (they have no pricing information, so their
# missing coordinates do not matter), then renumber the index from 0.

locations = locations.dropna().reset_index(drop=True)

In [17]:
# Split each (lat, long) tuple in 'coords' into separate 'lat' and 'long' columns.

locations = locations.assign(
    lat=locations['coords'].map(lambda pair: pair[0]),
    long=locations['coords'].map(lambda pair: pair[1]),
)

In [18]:
# 'coords' is now redundant: the same values live in the 'lat' and 'long' columns.

locations = locations.drop(columns='coords')

In [19]:
# Remove ', New York City' from the 'area_name' column again: it was only needed
# for geolocation, and the names have to match `df` when the frames are merged.

locations['area_name'] = locations['area_name'].str.replace(', New York City', '', regex=False)

In [20]:
# Spot-check a random handful of rows after the alterations above.

locations.sample(n=10)

Unnamed: 0,area_name,lat,long
82,Gravesend,40.596134,-73.973943
58,East Elmhurst,40.761212,-73.865136
160,Sheepshead Bay,40.591216,-73.944582
74,Forest Hills,40.719594,-73.844855
3,Upper Manhattan,40.787045,-73.975416
14,Bellerose,40.732778,-73.717778
134,Oakland Gardens,40.753991,-73.765966
111,Marine Park,40.595568,-73.91661
116,Midtown South,40.749842,-73.984251
183,West Harlem,40.801136,-73.959446


## Merging the DataFrames

In [22]:
# Left-join the coordinates onto every row of `df`, so rows whose neighborhood
# has no entry in `locations` are kept (with missing lat/long).
# validate='many_to_one' makes the merge raise if an 'area_name' appears more
# than once in `locations`, which would otherwise silently duplicate rows of `df`.
df = df.merge(locations, on='area_name', how='left', validate='many_to_one')

In [23]:
# Spot-check a random handful of merged rows for the new 'lat' and 'long' columns.

df.sample(n=10)

Unnamed: 0,area_name,borough,bedrooms,price,year,month,lat,long
15602,Roosevelt Island,Manhattan,2,3100.0,2011,8,40.761418,-73.950228
71777,Kew Gardens Hills,Queens,2,1975.0,2017,7,40.727499,-73.82172
86731,Bath Beach,Brooklyn,2,1900.0,2019,2,40.596451,-73.994659
28214,Kensington,Brooklyn,2,,2012,12,40.646215,-73.970694
1398,Bedford Park,Bronx,3 plus,,2010,2,40.8701,-73.885691
54070,Bensonhurst,Brooklyn,1,1450.0,2015,9,40.604977,-73.993406
19893,Inwood,Manhattan,0,,2012,2,40.869258,-73.920495
38104,Hillcrest,Queens,0,,2014,1,40.709643,-73.802642
28760,Country Club,Bronx,1,,2013,1,40.839167,-73.819722
62445,Fort Greene,Brooklyn,3 plus,3998.0,2016,7,40.690771,-73.976624


## Saving the Results as a New `.csv`

In [24]:
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column that is not part of the data.
df.to_csv('../../data/nyc_housing_rec/cleaned_data/street_easy_lat_long.csv', index=False)