## Data Acquisition

In [56]:
import json

In [57]:
with open('nyc_geo.json') as f:
    nyc_geo_json = json.load(f)

In [None]:
nyc_geo_json['features']

In [59]:
from pandas import json_normalize  
features = json_normalize(nyc_geo_json, record_path ='features')
features.head()

Unnamed: 0,type,id,geometry_name,geometry.type,geometry.coordinates,properties.name,properties.stacked,properties.annoline1,properties.annoline2,properties.annoline3,properties.annoangle,properties.borough,properties.bbox
0,Feature,nyu_2451_34572.1,geom,Point,"[-73.84720052054902, 40.89470517661]",Wakefield,1,Wakefield,,,0.0,Bronx,"[-73.84720052054902, 40.89470517661, -73.84720..."
1,Feature,nyu_2451_34572.2,geom,Point,"[-73.82993910812398, 40.87429419303012]",Co-op City,2,Co-op,City,,0.0,Bronx,"[-73.82993910812398, 40.87429419303012, -73.82..."
2,Feature,nyu_2451_34572.3,geom,Point,"[-73.82780644716412, 40.887555677350775]",Eastchester,1,Eastchester,,,0.0,Bronx,"[-73.82780644716412, 40.887555677350775, -73.8..."
3,Feature,nyu_2451_34572.4,geom,Point,"[-73.90564259591682, 40.89543742690383]",Fieldston,1,Fieldston,,,0.0,Bronx,"[-73.90564259591682, 40.89543742690383, -73.90..."
4,Feature,nyu_2451_34572.5,geom,Point,"[-73.9125854610857, 40.890834493891305]",Riverdale,1,Riverdale,,,0.0,Bronx,"[-73.9125854610857, 40.890834493891305, -73.91..."


In [60]:
features['properties.name'].value_counts(), features['properties.annoline1'].value_counts()

(Bay Terrace      2
 Murray Hill      2
 Chelsea          2
 Sunnyside        2
 Wakefield        1
                 ..
 Fort Hamilton    1
 Ocean Parkway    1
 South Side       1
 North Side       1
 Fox Hills        1
 Name: properties.name, Length: 302, dtype: int64,
 East                5
 New                 5
 Sunnyside           3
 Jamaica             3
 West                3
                    ..
 Central             1
 Yorkville           1
 Lenox               1
 Roosevelt Island    1
 Fox                 1
 Name: properties.annoline1, Length: 263, dtype: int64)

In [61]:
features['geometry.coordinates'].value_counts(), features['properties.bbox'].value_counts()

([-73.84720052054902, 40.89470517661]        1
 [-74.1071817826561, 40.63187892654607]      1
 [-74.11918058534842, 40.61333593766742]     1
 [-74.17464532993542, 40.63968297845542]     1
 [-74.15008537046981, 40.632546390481124]    1
                                            ..
 [-73.93690027985234, 40.85190252555305]     1
 [-73.99427936255978, 40.71561842231432]     1
 [-74.03197914537984, 40.61476812694226]     1
 [-73.96836678035541, 40.61305976667942]     1
 [-74.08173992211962, 40.61731079252983]     1
 Name: geometry.coordinates, Length: 306, dtype: int64,
 [-73.84720052054902, 40.89470517661, -73.84720052054902, 40.89470517661]            1
 [-74.1071817826561, 40.63187892654607, -74.1071817826561, 40.63187892654607]        1
 [-74.11918058534842, 40.61333593766742, -74.11918058534842, 40.61333593766742]      1
 [-74.17464532993542, 40.63968297845542, -74.17464532993542, 40.63968297845542]      1
 [-74.15008537046981, 40.632546390481124, -74.15008537046981, 40.63254639048112

## Data Organization
#### Parse the json nyc_geo.json into the dataframe with the following columns:

- Borough
- Neighborhood
- Latitude
- Longitude

In [67]:
import pandas as pd
LatLon = pd.DataFrame(features['geometry.coordinates'].to_list(), columns=['Latitude', 'Longitude'])
LatLon.head()

Unnamed: 0,Latitude,Longitude
0,-73.847201,40.894705
1,-73.829939,40.874294
2,-73.827806,40.887556
3,-73.905643,40.895437
4,-73.912585,40.890834


In [63]:
df1=features[['properties.name', 'properties.borough']]
df1.head()

Unnamed: 0,properties.name,properties.borough
0,Wakefield,Bronx
1,Co-op City,Bronx
2,Eastchester,Bronx
3,Fieldston,Bronx
4,Riverdale,Bronx


In [71]:
dataFrames=[df1, LatLon]
nyc_df = pd.concat(dataFrames, axis=1)
nyc_df = nyc_df.rename(columns={'properties.name' : 'Neighborhood', 'properties.borough': 'Borough'})
nyc_df.head()

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
0,Wakefield,Bronx,-73.847201,40.894705
1,Co-op City,Bronx,-73.829939,40.874294
2,Eastchester,Bronx,-73.827806,40.887556
3,Fieldston,Bronx,-73.905643,40.895437
4,Riverdale,Bronx,-73.912585,40.890834


In [76]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(nyc_df['Borough'].unique()),
        nyc_df.shape[0]))

The dataframe has 5 boroughs and 306 neighborhoods.


### Dealing with Duplicates

##### The following neighborhoods share a name with another neighborhood in a separate borough.

In [77]:
nyc_doubles = nyc_df['Neighborhood'].value_counts()
nyc_doubles[nyc_doubles>1]

Bay Terrace    2
Murray Hill    2
Chelsea        2
Sunnyside      2
Name: Neighborhood, dtype: int64

In [78]:
nyc_df[nyc_df['Neighborhood'] == 'Chelsea']

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
116,Chelsea,Manhattan,-74.003116,40.744035
244,Chelsea,Staten Island,-74.18956,40.594726


##### We will parse through the dataframe to place the borough name at the end of the neighborhood name so as to distinguish between them.

In [80]:
for i in range(nyc_df.shape[0]):
    nyc_in = nyc_df.loc[i, 'Neighborhood']
    if nyc_df[nyc_df['Neighborhood'] == nyc_in].shape[0] > 1:
        nyc_in1 = nyc_df[nyc_df['Neighborhood'] == nyc_in].index.tolist()
        for j in nyc_in1:
            nyc_in2 = nyc_df.loc[j, 'Borough']
            nyc_df.loc[j, 'Neighborhood'] = nyc_in + ', ' + nyc_in2

In [81]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Chelsea')]

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
116,"Chelsea, Manhattan",Manhattan,-74.003116,40.744035
244,"Chelsea, Staten Island",Staten Island,-74.18956,40.594726


In [84]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Sunnyside')]

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
140,"Sunnyside, Queens",Queens,-73.926916,40.740176
220,"Sunnyside, Staten Island",Staten Island,-74.097126,40.61276
277,Sunnyside Gardens,Queens,-73.918193,40.745652


In [85]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Murray Hill')]

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
115,"Murray Hill, Manhattan",Manhattan,-73.978332,40.748303
180,"Murray Hill, Queens",Queens,-73.812763,40.764126


In [86]:
nyc_df[nyc_df['Neighborhood'].str.startswith('Bay Terrace')]

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
175,"Bay Terrace, Queens",Queens,-73.776802,40.782843
235,"Bay Terrace, Staten Island",Staten Island,-74.139166,40.553988


In [89]:
nyc_df.isnull().sum()

Neighborhood    0
Borough         0
Latitude        0
Longitude       0
dtype: int64