## Data Acquisition

In [75]:
import json

In [76]:
with open('nyc_geo.json') as f:
    nyc_geo_json = json.load(f)

In [77]:
nyc_geo_json['features'][0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [78]:
from pandas import json_normalize  
features = json_normalize(nyc_geo_json, record_path ='features')
features.head()

Unnamed: 0,type,id,geometry_name,geometry.type,geometry.coordinates,properties.name,properties.stacked,properties.annoline1,properties.annoline2,properties.annoline3,properties.annoangle,properties.borough,properties.bbox
0,Feature,nyu_2451_34572.1,geom,Point,"[-73.84720052054902, 40.89470517661]",Wakefield,1,Wakefield,,,0.0,Bronx,"[-73.84720052054902, 40.89470517661, -73.84720..."
1,Feature,nyu_2451_34572.2,geom,Point,"[-73.82993910812398, 40.87429419303012]",Co-op City,2,Co-op,City,,0.0,Bronx,"[-73.82993910812398, 40.87429419303012, -73.82..."
2,Feature,nyu_2451_34572.3,geom,Point,"[-73.82780644716412, 40.887555677350775]",Eastchester,1,Eastchester,,,0.0,Bronx,"[-73.82780644716412, 40.887555677350775, -73.8..."
3,Feature,nyu_2451_34572.4,geom,Point,"[-73.90564259591682, 40.89543742690383]",Fieldston,1,Fieldston,,,0.0,Bronx,"[-73.90564259591682, 40.89543742690383, -73.90..."
4,Feature,nyu_2451_34572.5,geom,Point,"[-73.9125854610857, 40.890834493891305]",Riverdale,1,Riverdale,,,0.0,Bronx,"[-73.9125854610857, 40.890834493891305, -73.91..."


In [79]:
features['properties.name'].value_counts(), features['properties.annoline1'].value_counts()

(Bay Terrace      2
 Murray Hill      2
 Chelsea          2
 Sunnyside        2
 Wakefield        1
                 ..
 Fort Hamilton    1
 Ocean Parkway    1
 South Side       1
 North Side       1
 Fox Hills        1
 Name: properties.name, Length: 302, dtype: int64,
 East                5
 New                 5
 Sunnyside           3
 Jamaica             3
 West                3
                    ..
 Central             1
 Yorkville           1
 Lenox               1
 Roosevelt Island    1
 Fox                 1
 Name: properties.annoline1, Length: 263, dtype: int64)

In [80]:
features['geometry.coordinates'].value_counts(), features['properties.bbox'].value_counts()

([-73.84720052054902, 40.89470517661]        1
 [-74.1071817826561, 40.63187892654607]      1
 [-74.11918058534842, 40.61333593766742]     1
 [-74.17464532993542, 40.63968297845542]     1
 [-74.15008537046981, 40.632546390481124]    1
                                            ..
 [-73.93690027985234, 40.85190252555305]     1
 [-73.99427936255978, 40.71561842231432]     1
 [-74.03197914537984, 40.61476812694226]     1
 [-73.96836678035541, 40.61305976667942]     1
 [-74.08173992211962, 40.61731079252983]     1
 Name: geometry.coordinates, Length: 306, dtype: int64,
 [-73.84720052054902, 40.89470517661, -73.84720052054902, 40.89470517661]            1
 [-74.1071817826561, 40.63187892654607, -74.1071817826561, 40.63187892654607]        1
 [-74.11918058534842, 40.61333593766742, -74.11918058534842, 40.61333593766742]      1
 [-74.17464532993542, 40.63968297845542, -74.17464532993542, 40.63968297845542]      1
 [-74.15008537046981, 40.632546390481124, -74.15008537046981, 40.63254639048112

## Data Organization
#### Parse the json nyc_geo.json into the dataframe with the following columns:

- Borough
- Neighborhood
- Latitude
- Longitude

In [82]:
import pandas as pd
LatLon = pd.DataFrame(features['geometry.coordinates'].to_list(), columns=['Longitude', 'Latitude'])
LatLon.head()

Unnamed: 0,Longitude,Latitude
0,-73.847201,40.894705
1,-73.829939,40.874294
2,-73.827806,40.887556
3,-73.905643,40.895437
4,-73.912585,40.890834


In [83]:
df1=features[['properties.name', 'properties.borough']]
df1.head()

Unnamed: 0,properties.name,properties.borough
0,Wakefield,Bronx
1,Co-op City,Bronx
2,Eastchester,Bronx
3,Fieldston,Bronx
4,Riverdale,Bronx


In [84]:
dataFrames=[df1, LatLon]
nyc_df = pd.concat(dataFrames, axis=1)
nyc_df = nyc_df.rename(columns={'properties.name' : 'Neighborhood', 'properties.borough': 'Borough'})
nyc_df.head()

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
0,Wakefield,Bronx,-73.847201,40.894705
1,Co-op City,Bronx,-73.829939,40.874294
2,Eastchester,Bronx,-73.827806,40.887556
3,Fieldston,Bronx,-73.905643,40.895437
4,Riverdale,Bronx,-73.912585,40.890834


In [86]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(nyc_df['Borough'].unique()),
        nyc_df.shape[0]))

The dataframe has 5 boroughs and 306 neighborhoods.


### Dealing with Duplicates

##### The following neighborhoods share a name with another neighborhood in a separate borough.

In [87]:
nyc_doubles = nyc_df['Neighborhood'].value_counts()
nyc_doubles[nyc_doubles>1]

Bay Terrace    2
Murray Hill    2
Chelsea        2
Sunnyside      2
Name: Neighborhood, dtype: int64

In [88]:
nyc_df[nyc_df['Neighborhood'] == 'Chelsea']

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
116,Chelsea,Manhattan,-74.003116,40.744035
244,Chelsea,Staten Island,-74.18956,40.594726


##### We will parse through the dataframe to place the borough name at the end of the neighborhood name so as to distinguish between them.

In [89]:
for i in range(nyc_df.shape[0]):
    nyc_in = nyc_df.loc[i, 'Neighborhood']
    if nyc_df[nyc_df['Neighborhood'] == nyc_in].shape[0] > 1:
        nyc_in1 = nyc_df[nyc_df['Neighborhood'] == nyc_in].index.tolist()
        for j in nyc_in1:
            nyc_in2 = nyc_df.loc[j, 'Borough']
            nyc_df.loc[j, 'Neighborhood'] = nyc_in + ', ' + nyc_in2

In [90]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Chelsea')]

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
116,"Chelsea, Manhattan",Manhattan,-74.003116,40.744035
244,"Chelsea, Staten Island",Staten Island,-74.18956,40.594726


In [91]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Sunnyside')]

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
140,"Sunnyside, Queens",Queens,-73.926916,40.740176
220,"Sunnyside, Staten Island",Staten Island,-74.097126,40.61276
277,Sunnyside Gardens,Queens,-73.918193,40.745652


In [92]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Murray Hill')]

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
115,"Murray Hill, Manhattan",Manhattan,-73.978332,40.748303
180,"Murray Hill, Queens",Queens,-73.812763,40.764126


In [93]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Bay Terrace')]

Unnamed: 0,Neighborhood,Borough,Longitude,Latitude
175,"Bay Terrace, Queens",Queens,-73.776802,40.782843
235,"Bay Terrace, Staten Island",Staten Island,-74.139166,40.553988


In [94]:
nyc_df.isnull().sum()

Neighborhood    0
Borough         0
Longitude       0
Latitude        0
dtype: int64

In [95]:
nyc_df.to_csv('nyc_df.csv')

In [125]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library
import matplotlib.pyplot as plt

address = 'New York City, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [126]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10, 
                         min_zoom=9, max_zoom=15)

# add markers to map
for lat, lon, borough, neighborhood in zip(nyc_df['Latitude'], 
                                           nyc_df['Longitude'], 
                                           nyc_df['Borough'], 
                                           nyc_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        weight=2,
        color='#333333',
        fill=True,
        fill_color='#ffb300',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### Downloading and Parsing the NYC Housing Mean Prices DataFrame

In [96]:
nyc_housing = pd.read_excel('nyc_housing_mean_prices.xlsx')
nyc_housing.head()

Unnamed: 0,neighborhood,studio,Unnamed: 2,neighborhood.1,1_bedroom,Unnamed: 5,neighborhood.2,2_bedroom
0,Upper West Side,"$1,975",,Upper West Side,"$2,750",,Upper West Side,"$4,350"
1,Bedford-Stuyvesant,"$1,650",,Bedford-Stuyvesant,"$1,975",,Bedford-Stuyvesant,"$2,200"
2,Bushwick,"$1,731",,Bushwick,"$2,150",,Bushwick,"$2,100"
3,Upper East Side,"$1,898",,Upper East Side,"$2,395",,Upper East Side,"$3,295"
4,Williamsburg,"$2,491",,Williamsburg,"$2,500",,Williamsburg,"$2,584"


In [97]:
nyc_housing.shape

(205, 8)

In [98]:
nyc_housing.isnull().sum()

neighborhood       68
studio             68
Unnamed: 2        205
neighborhood.1     11
1_bedroom          11
Unnamed: 5        205
neighborhood.2      0
2_bedroom           0
dtype: int64

In [99]:
nyc_housing = nyc_housing.drop(['Unnamed: 2', 'Unnamed: 5'], axis=1)

In [100]:
nyc_housing['studio'] = nyc_housing['studio'].str.replace('$', '')
nyc_housing['1_bedroom'] = nyc_housing['1_bedroom'].str.replace('$', '')
nyc_housing['2_bedroom'] = nyc_housing['2_bedroom'].str.replace('$', '')
nyc_housing.head()

  nyc_housing['studio'] = nyc_housing['studio'].str.replace('$', '')
  nyc_housing['1_bedroom'] = nyc_housing['1_bedroom'].str.replace('$', '')
  nyc_housing['2_bedroom'] = nyc_housing['2_bedroom'].str.replace('$', '')


Unnamed: 0,neighborhood,studio,neighborhood.1,1_bedroom,neighborhood.2,2_bedroom
0,Upper West Side,1975,Upper West Side,2750,Upper West Side,4350
1,Bedford-Stuyvesant,1650,Bedford-Stuyvesant,1975,Bedford-Stuyvesant,2200
2,Bushwick,1731,Bushwick,2150,Bushwick,2100
3,Upper East Side,1898,Upper East Side,2395,Upper East Side,3295
4,Williamsburg,2491,Williamsburg,2500,Williamsburg,2584


In [101]:
nyc_housing['studio']=nyc_housing['studio'].str.replace(',', '')

In [102]:
nyc_housing['1_bedroom']=nyc_housing['1_bedroom'].str.replace(',', '')
nyc_housing['2_bedroom']=nyc_housing['2_bedroom'].str.replace(',', '')

In [103]:
nyc_housing['studio']=nyc_housing['studio'].astype(float)
nyc_housing['1_bedroom']=nyc_housing['1_bedroom'].astype(float)
nyc_housing['2_bedroom']=nyc_housing['2_bedroom'].astype(float)
nyc_housing.head(70)

Unnamed: 0,neighborhood,studio,neighborhood.1,1_bedroom,neighborhood.2,2_bedroom
0,Upper West Side,1975.0,Upper West Side,2750.0,Upper West Side,4350.0
1,Bedford-Stuyvesant,1650.0,Bedford-Stuyvesant,1975.0,Bedford-Stuyvesant,2200.0
2,Bushwick,1731.0,Bushwick,2150.0,Bushwick,2100.0
3,Upper East Side,1898.0,Upper East Side,2395.0,Upper East Side,3295.0
4,Williamsburg,2491.0,Williamsburg,2500.0,Williamsburg,2584.0
...,...,...,...,...,...,...
65,Bensonhurst,4200.0,Bensonhurst,1500.0,Bensonhurst,1795.0
66,Kew Gardens Hills,1450.0,Kew Gardens Hills,1699.0,Kew Gardens Hills,2100.0
67,Glendale,1400.0,Glendale,1675.0,Glendale,2000.0
68,Richmond Hill,1675.0,Middle Village,1800.0,Middle Village,2100.0


In [104]:
nyc_housing.groupby('neighborhood').sum()


Unnamed: 0_level_0,studio,1_bedroom,2_bedroom
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Allerton,1450.0,1590.0,2200.0
Astoria,1575.0,1875.0,2200.0
Auburndale,1795.0,1555.0,2888.0
Bath Beach,3000.0,1700.0,1900.0
Battery Park City,2200.0,2733.0,4431.0
...,...,...,...
Westchester Village,1100.0,1625.0,2200.0
Williamsburg,2491.0,2500.0,2584.0
Windsor Terrace,1900.0,2325.0,2800.0
Woodhaven,1627.0,1550.0,2295.0


In [105]:
nyc_housing.set_index('neighborhood.2').stack().droplevel(1).reset_index()

Unnamed: 0,neighborhood.2,0
0,Upper West Side,Upper West Side
1,Upper West Side,1975.0
2,Upper West Side,Upper West Side
3,Upper West Side,2750.0
4,Upper West Side,4350.0
...,...,...
862,Van Cortlandt Park,1983.0
863,Castleton Corners,1400.0
864,Brooklyn Navy Yard,5345.0
865,Travis - Chelsea,1800.0


#### The shape and neighborhood columns of this dataframe will not match either the average prices data set, nor the original dataframe from NYC Json. We will re-visit this.

In [106]:
df_neighborhood = nyc_housing[['neighborhood','studio', '1_bedroom', '2_bedroom']]

In [107]:
df_neighborhood = df_neighborhood.dropna(subset = ['neighborhood'])
df_neighborhood

Unnamed: 0,neighborhood,studio,1_bedroom,2_bedroom
0,Upper West Side,1975.0,2750.0,4350.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0
2,Bushwick,1731.0,2150.0,2100.0
3,Upper East Side,1898.0,2395.0,3295.0
4,Williamsburg,2491.0,2500.0,2584.0
...,...,...,...,...
132,University Heights,1300.0,1700.0,2645.0
133,Foxhurst,947.0,1500.0,2198.0
134,Flushing Meadows-Corona Park,1650.0,1350.0,2350.0
135,Van Cortlandt Park,1400.0,1575.0,1750.0


In [108]:
df_neighborhood1 = nyc_housing[['neighborhood.1','studio', '1_bedroom', '2_bedroom']]
df_neighborhood1 = df_neighborhood1.dropna(subset = ['neighborhood.1'])
df_neighborhood1 = df_neighborhood1.rename(columns={'neighborhood.1': 'neighborhood'})
df_neighborhood1

Unnamed: 0,neighborhood,studio,1_bedroom,2_bedroom
0,Upper West Side,1975.0,2750.0,4350.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0
2,Bushwick,1731.0,2150.0,2100.0
3,Upper East Side,1898.0,2395.0,3295.0
4,Williamsburg,2491.0,2500.0,2584.0
...,...,...,...,...
189,Flushing Meadows-Corona Park,,1975.0,1475.0
190,Manor Heights,,1200.0,1900.0
191,Blissville,,1600.0,1700.0
192,Van Cortlandt Park,,1500.0,1575.0


In [109]:
df_neighborhood2 = nyc_housing[['neighborhood.2','studio', '1_bedroom', '2_bedroom']]
df_neighborhood2 = df_neighborhood2.dropna(subset = ['neighborhood.2'])
df_neighborhood2 = df_neighborhood2.rename(columns={'neighborhood.2': 'neighborhood'})

In [110]:
df_neighborhood2

Unnamed: 0,neighborhood,studio,1_bedroom,2_bedroom
0,Upper West Side,1975.0,2750.0,4350.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0
2,Bushwick,1731.0,2150.0,2100.0
3,Upper East Side,1898.0,2395.0,3295.0
4,Williamsburg,2491.0,2500.0,2584.0
...,...,...,...,...
200,Van Cortlandt Park,,,1983.0
201,Castleton Corners,,,1400.0
202,Brooklyn Navy Yard,,,5345.0
203,Travis - Chelsea,,,1800.0


In [111]:
df_neighborhood.merge(df_neighborhood1, how='outer', on='neighborhood')

Unnamed: 0,neighborhood,studio_x,1_bedroom_x,2_bedroom_x,studio_y,1_bedroom_y,2_bedroom_y
0,Upper West Side,1975.0,2750.0,4350.0,1975.0,2750.0,4350.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0,1650.0,1975.0,2200.0
2,Bushwick,1731.0,2150.0,2100.0,1731.0,2150.0,2100.0
3,Upper East Side,1898.0,2395.0,3295.0,1898.0,2395.0,3295.0
4,Williamsburg,2491.0,2500.0,2584.0,2491.0,2500.0,2584.0
...,...,...,...,...,...,...,...
189,Fort Wadsworth,,,,,3350.0,1825.0
190,Holliswood,,,,,1895.0,2300.0
191,Huguenot,,,,,1350.0,1850.0
192,Manor Heights,,,,,1200.0,1900.0


In [112]:
df_merge1 = df_neighborhood.merge(df_neighborhood1, how='outer', on='neighborhood')

In [113]:
df_merge2 = df_merge1.merge(df_neighborhood2, how='outer', on='neighborhood')
df_combined = df_merge2[['neighborhood', 'studio', '1_bedroom', '2_bedroom']]
df_combined

Unnamed: 0,neighborhood,studio,1_bedroom,2_bedroom
0,Upper West Side,1975.0,2750.0,4350.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0
2,Bushwick,1731.0,2150.0,2100.0
3,Upper East Side,1898.0,2395.0,3295.0
4,Williamsburg,2491.0,2500.0,2584.0
...,...,...,...,...
211,Grant City,,1200.0,1900.0
212,Co-Op City,,1500.0,1575.0
213,Castleton Corners,,,1400.0
214,Travis - Chelsea,,,1800.0


In [121]:
%store nyc_df

Stored 'nyc_df' (DataFrame)


In [115]:
%store df_combined

Stored 'df_combined' (DataFrame)


In [123]:
nyc_df = nyc_df[['Neighborhood', 'Borough', 'Latitude', 'Longitude']]

In [124]:
nyc_df

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
0,Wakefield,Bronx,40.894705,-73.847201
1,Co-op City,Bronx,40.874294,-73.829939
2,Eastchester,Bronx,40.887556,-73.827806
3,Fieldston,Bronx,40.895437,-73.905643
4,Riverdale,Bronx,40.890834,-73.912585
...,...,...,...,...
301,Hudson Yards,Manhattan,40.756658,-74.000111
302,Hammels,Queens,40.587338,-73.805530
303,Bayswater,Queens,40.611322,-73.765968
304,Queensbridge,Queens,40.756091,-73.945631


In [None]:
!pip install folium

In [122]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10, 
                         min_zoom=9, max_zoom=15)

# add markers to map
for lat, lon, borough, neighborhood in zip(nyc_df['Latitude'], 
                                           nyc_df['Longitude'], 
                                           nyc_df['Borough'], 
                                           nyc_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        weight=2,
        color='#333333',
        fill=True,
        fill_color='#ffb300',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork