## Coursera Capstone Project 
# Segmenting & Clustering Neighborhoods in Toronto
# Section 1
In this section, the list of Canadian postal codes beginning with M is read from Wikipedia and loaded into a dataframe. The following assumptions have been made: 
1. Rows whose borough is "Not assigned" are ignored. 
2. More than one neighborhood can exist in one postal code area.
3. If a row has a borough but a "Not assigned" neighborhood, then the neighborhood is set to the same value as the borough.

In [65]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a dataframe is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle HTTP requests
# NOTE(review): newer pandas releases expose this as `pandas.json_normalize`;
# confirm this import path still works with the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

!pip install lxml
from lxml import etree

print('Libraries imported.')

Libraries imported.


In [66]:
import io

Wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page explicitly so that a network problem or an HTTP error
# fails here with a clear message instead of surfacing later as a confusing
# HTML-parsing error.
wiki_response = requests.get(Wiki_url, timeout=30)
wiki_response.raise_for_status()
html     = wiki_response.content

# Wrap the raw bytes in a file-like object: passing literal HTML directly to
# read_html is deprecated in recent pandas versions.
df_list  = pd.read_html(io.BytesIO(html)) # list of every table found on the page

# NOTE(review): column_names is not used in this cell and does not match the
# scraped headers ('Postal Code', 'Borough', 'Neighbourhood'); kept in case a
# later cell relies on it.
column_names = ['PostalCode', 'Borough' ,'Neighborhood']
df_      = df_list[0] # the first table on the page is the postal-code table
df_.shape

(180, 3)

In [67]:
# Assumption 1: drop every row whose Borough is 'Not assigned'.
df_an = df_[df_['Borough'] != 'Not assigned'].reset_index(drop=True)

# Assumption 3: a row that has a borough but a 'Not assigned' neighbourhood
# takes the borough name as its neighbourhood.
neighbourhood_missing = df_an['Neighbourhood'] == 'Not assigned'
df_an.loc[neighbourhood_missing, 'Neighbourhood'] = df_an.loc[neighbourhood_missing, 'Borough']

df_an.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [68]:
# Show the (rows, columns) of the dataframe left after dropping 'Not assigned' boroughs.
df_an.shape

(103, 3)

# Section 2

In this section, the geospatial coordinates (latitude and longitude) of each postal code are looked up and added to the dataframe.

In [72]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!pip install geocoder
import geocoder

# Upper bound on lookups per postal code, so a persistently failing code
# cannot stall the notebook forever.
MAX_GEOCODE_ATTEMPTS = 5


def geocode_postal_code(code, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up the coordinates of a Toronto postal code via the ArcGIS geocoder.

    Parameters
    ----------
    code : str
        Postal code prefix, e.g. 'M5A'.
    max_attempts : int
        Maximum number of lookups before giving up.

    Returns
    -------
    tuple
        (latitude, longitude), or (nan, nan) if no attempt returned a result.
    """
    query = '{}, Toronto, Ontario'.format(code)
    for _ in range(max_attempts):
        latlng = geocoder.arcgis(query).latlng
        # A failed lookup yields a missing/empty latlng; indexing it would crash.
        if latlng:
            return latlng[0], latlng[1]
    return np.nan, np.nan


# Collect the coordinates in the same order as the dataframe rows.
latitude = []
longitude = []
for code in df_an['Postal Code']:
    lat, lng = geocode_postal_code(code)
    latitude.append(lat)
    longitude.append(lng)

df_an['Latitude']  = latitude
df_an['Longitude'] = longitude

df_an.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66263,-79.52831
6,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
7,M3B,North York,Don Mills,43.74923,-79.36186
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


# Section 3
Explore and cluster the neighborhoods in Downtown Toronto

#### 3.1 Use geopy library to get the latitude and longitude values of Toronto, Ontario.

In [73]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() gives back no location object when the address cannot be resolved;
# fail with a clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# From here on latitude/longitude hold the scalar coordinates of the city
# centre, which the map cell uses as its starting location.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [None]:
#### 3.2 Create a map of Toronto with neighborhoods superimposed on top.

In [76]:
# folium is not imported anywhere else in the visible notebook, so import it
# here to keep this cell working after Restart & Run All.
import folium

# create map of Toronto centred on the city's latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Rows without coordinates cannot be drawn (folium rejects NaN locations),
# so leave them off the map rather than crash.
df_located = df_an.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per postal code, labelled "neighbourhood, borough"
for lat, lng, borough, neighborhood in zip(df_located['Latitude'], df_located['Longitude'], df_located['Borough'], df_located['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### 3.3 Simplify the above map and segment and cluster only the neighborhoods in boroughs that contain the word Toronto

In [83]:
# Keep only the rows whose borough name contains the word 'Toronto',
# renumbering the rows from zero.
borough_has_toronto = df_an['Borough'].str.contains('Toronto')
dt_toronto = df_an.loc[borough_has_toronto].reset_index(drop=True)
dt_toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547
5,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
6,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
7,M6G,Downtown Toronto,Christie,43.66869,-79.42071
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891


In [84]:
import os

# Credentials are read from the environment so they are never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError naming it.
# NOTE(review): the key pair that used to be hardcoded here should be
# regenerated, since it has been exposed in the notebook.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

#### 3.4 Explore the first neighborhood in dt_toronto dataframe.

In [89]:
# Pull the name and coordinates of the first row of dt_toronto.
first_row = 0
neighborhood_name = dt_toronto.loc[first_row, 'Neighbourhood'] # neighborhood name
neighborhood_latitude = dt_toronto.loc[first_row, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dt_toronto.loc[first_row, 'Longitude'] # neighborhood longitude value

summary_line = 'Latitude and longitude values of {} are {}, {}.'.format(
    neighborhood_name, neighborhood_latitude, neighborhood_longitude)
print(summary_line)

Latitude and longitude values of Regent Park, Harbourfront are 43.65512000000007, -79.36263999999994.


#### 3.5 Let's get the top 100 venues that are in Regent Park, Harbourfront within a radius of 500 meters.

In [91]:
radius = 500

# Template for the Foursquare "explore" endpoint; the placeholders are, in order:
# client id, client secret, API version, latitude, longitude, radius, limit.
EXPLORE_URL_TEMPLATE = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'

# create URL (contains the real credentials; do not display it)
url = EXPLORE_URL_TEMPLATE.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# Display a redacted copy so the credentials are not written into the
# notebook's saved output.
EXPLORE_URL_TEMPLATE.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.65512000000007,-79.36263999999994&radius=500&limit=100'

In [92]:
## Examine the result
# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP error statuses so a failed call is not mistaken for venue data.
explore_response = requests.get(url, timeout=30)
explore_response.raise_for_status()
results = explore_response.json()
results

{'meta': {'code': 200, 'requestId': '5fb4022bcce9aa167470652a'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 21,
  'suggestedBounds': {'ne': {'lat': 43.65962000450007,
    'lng': -79.3564319112327},
   'sw': {'lat': 43.650619995500065, 'lng': -79.36884808876718}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label':

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping
        A record indexable by key that holds the venue's category list under
        either 'categories' or 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category list
    is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present in `row`.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate instead of being swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']