In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

<h3>Loading New York Data</h3>

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [3]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    print('Data Loaded')

Data Loaded


<h3>Explore Data</h3>

In [4]:
neighborhoods_data = newyork_data['features']

In [5]:
#explore data
neighborhoods_data[100]

{'type': 'Feature',
 'id': 'nyu_2451_34572.101',
 'geometry': {'type': 'Point',
  'coordinates': [-73.99427936255978, 40.71561842231432]},
 'geometry_name': 'geom',
 'properties': {'name': 'Chinatown',
  'stacked': 1,
  'annoline1': 'Chinatown',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Manhattan',
  'bbox': [-73.99427936255978,
   40.71561842231432,
   -73.99427936255978,
   40.71561842231432]}}

<h3>Create Pandas Dataframe from Json Data</h3>

In [6]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [7]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [8]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [9]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


<h3>Display Data pionts on map using Folium</h3>

In [10]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [11]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

<h3> FourSquare Credential</h3>

In [12]:
CLIENT_ID = 'MRS1Y54TM1DJBGNJUHSVD2DBMVHGVDVBCYS00F0TGNTMFLR2' # your Foursquare ID
CLIENT_SECRET = '03DR0DS3MIDUUEEPIQSUB0X4C1SJPZNIQXBANJLFDZ2LUOOI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MRS1Y54TM1DJBGNJUHSVD2DBMVHGVDVBCYS00F0TGNTMFLR2
CLIENT_SECRET:03DR0DS3MIDUUEEPIQSUB0X4C1SJPZNIQXBANJLFDZ2LUOOI


In [13]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


<h3>Use FourSquare API for getting details of Neighbourhood of NewYork</h3>

In [16]:
LIMIT = 100
radius = 500

In [17]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:

neighborhoods_values = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )
print(neighborhoods_values.shape)

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

KeyError: 'groups'

<h3>Explore filtered data of Neighbourhood obtained through FOurSquare</h3>

In [20]:
neighborhoods_values

NameError: name 'neighborhoods_values' is not defined

In [None]:
print(neighborhoods_values.shape)
neighborhoods_values.head(120)

In [None]:
neighborhoods_values.groupby('Neighborhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(neighborhoods_values['Venue Category'].unique())))

<h3>Include Velue Category in Dataframe as one hot vector</h3>

In [None]:
# one hot encoding
neighborhoods_onehot = pd.get_dummies(neighborhoods_values[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
neighborhoods_onehot['Neighborhood'] = neighborhoods_values['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [neighborhoods_onehot.columns[-1]] + list(neighborhoods_onehot.columns[:-1])
neighborhoods_onehot = neighborhoods_onehot[fixed_columns]

neighborhoods_onehot.head()

<h3> check whether Indian Restaurant exist in Dataframe</h3>

In [None]:
neighborhoods_grouped = neighborhoods_onehot.groupby('Neighborhood').mean().reset_index()
len(neighborhoods_grouped)

In [None]:
neighborhoods_grouped['Indian Restaurant']

In [None]:
len(neighborhoods_grouped[neighborhoods_grouped["Indian Restaurant"] > 0])

<h3> filtering Indian Restaurant from data frame along with neighbourhood</h3>

In [None]:
to_indian = neighborhoods_grouped[["Neighborhood","Indian Restaurant"]]

In [None]:
to_indian[to_indian['Indian Restaurant']>=0]

In [None]:
to_indian.head()

<h3>Apply <b>k mean Algorithm</b> </h3>

In [None]:
#apply k mean to to_indian dataframe based on its mean value
from sklearn.cluster import KMeans
toclusters = 4

to_clustering = to_indian.drop(["Neighborhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=toclusters, random_state=1,precompute_distances='auto')
kmeans.fit_transform(to_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

In [None]:
#print labels
kmeans.labels_

In [None]:
#add Predicted class to the to_indian dataframe
to_merged = to_indian.copy()

# add clustering labels
to_merged["Cluster Labels"] = kmeans.labels_


In [None]:
#renaming colum  for the purpose of join
to_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
to_merged.head(5)

In [None]:
#join Dataframes
to_merged = to_merged.join(neighborhoods_values.set_index("Neighborhood"), on="Neighborhood")

print(to_merged.shape)
to_merged.head(1)

<h4>Explore joined dataframe</h4>

In [None]:
to_merged.head(100)

In [None]:
to_merged.sort_values(["Cluster Labels"], inplace=True)
to_merged.tail()

In [None]:
to_merged.tail(10)

In [None]:
to_merged.head(10)

<h4>Group joined data frame </h4>

In [26]:
#hroup by neighbourhood as it contains repeated rows becuase of join operation
Indian_restuarant_merged_data=to_merged.groupby('Neighborhood').mean().reset_index()

NameError: name 'to_merged' is not defined

In [None]:
#print length of the gruped dataframe
len(Indian_restuarant_merged_data)

<h3>display  Map Along with clusters  </h3>

In [None]:
#folium map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
# add markers to the map
markers_colors={}
markers_colors[0] = 'red'
markers_colors[1] = 'blue'
markers_colors[2] = 'green'
markers_colors[3] = 'yellow'
for lat, lon, cluster in zip(to_merged['Neighborhood Latitude'], to_merged['Neighborhood Longitude'], to_merged['Cluster Labels']):
    
    
    folium.features.CircleMarker(
        [lat, lon],
        radius=3,
       
        color =markers_colors[cluster],
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)
#display map     
map_clusters

<h3>Explore Clusters<h3>

In [None]:
#Cluster 0
Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels']==0]

In [None]:
#cluster 1
Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels']==1]

In [None]:
#cluster 2
Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels']==2]

In [None]:
#cluster 3
Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels']==3]

<h3> finding number of neighbourhood in cluster<h3>

In [None]:
cluster_0=len(Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels'] ==0])
cluster_1=len(Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels'] ==1])
cluster_2=len(Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels'] ==2])
cluster_3=len(Indian_restuarant_merged_data[Indian_restuarant_merged_data['Cluster Labels'] ==3])

In [None]:
print("Total datapionts in clusters")
print('Cluster-0:',cluster_0)
print('Cluster-1:',cluster_1)
print('Cluster-2:',cluster_2)
print('Cluster-3:',cluster_3)

above cities are most likely cities to open Indian Restuarant 