# Segmenting and Clustering Neighborhoods in Toronto - Part 3
## Yu Deng

First import the necessary packages.

In [2]:
import pandas as pd
import numpy as np

Import the CSV file I created in the previous session, which contains the final dataframe of Toronto neighbourhoods and their geospatial information.

In [3]:
# Load the neighbourhood table from disk; the first CSV column is used as the row index.
toronto_csv_path = "toronto.csv"
toronto = pd.read_csv(toronto_csv_path, index_col=0)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Import the other necessary packages.

In [4]:
import json
import requests
from pandas.io.json import json_normalize

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

Get the latitude and longitude values of Toronto using the geopy library.

In [8]:
# Geocode the city name; the resulting point is used as the centre of the maps below.
address = 'Toronto'
geolocater = Nominatim(user_agent="my-application")
location = geolocater.geocode(address)
# geocode() returns None when the query has no match; fail with a clear message
# here rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
# Separate the two values with ", " (the previous "." separator made the output
# read like a single malformed number).
print("The geographical coordinates of Toronto are {}, {}".format(latitude, longitude))

The geographical coordinate of Toronto are 43.653963.-79.387207


Create a map of Toronto with neighborhoods and their postal codes superimposed on top.

In [23]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle marker per row of the neighbourhood table; the popup shows
# borough, postal code and neighbourhood name(s).
marker_fields = toronto[['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighbourhood']]
for lat, lng, postcode, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{},{}:\n{}'.format(borough, postcode, neighbourhood)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

Define the Foursquare credentials and API version for the next part.

In [24]:
import os

# Foursquare API credentials are read from the environment so that the secrets
# are never stored in (or committed with) the notebook. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
# Version string sent as the `v` query parameter of each Foursquare request.
VERSION = '20200315'

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

Create a function to retrieve the venues within 500 meters of each neighbourhood.

In [29]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500, LIMIT = 100):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in
        parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with a non-2xx status.
    requests.RequestException
        On connection problems or when the request times out.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of
        # formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # A timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get(explore_url, params=params, timeout=30)
        # Surface API errors (bad credentials, quota exceeded, ...) as an
        # HTTPError instead of an opaque KeyError while parsing the body.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            # A venue may come back without any category; record None for it
            # rather than failing on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps the schema intact even
    # when `rows` is empty (assigning 7 names to an empty frame raises).
    return pd.DataFrame(rows, columns=columns)

# Collect the nearby venues for every row of the neighbourhood table.
toronto_venues = getNearbyVenues(
    toronto['Neighbourhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)

Check the retrieved results.

In [82]:
# getNearbyVenues labels its location columns with the spelling
# 'Neighbourhood ...', so renaming only 'Neighborhood' matched nothing and the
# later lookups of 'Neighborhood Name' failed on a fresh kernel. Map the
# columns the function actually produces (the original key is kept as well;
# rename ignores keys that are absent). Assigning the result instead of using
# inplace=True keeps the cell safe to re-run.
toronto_venues = toronto_venues.rename(columns={
    'Neighbourhood': 'Neighborhood Name',
    'Neighbourhood Latitude': 'Neighborhood Latitude',
    'Neighbourhood Longitude': 'Neighborhood Longitude',
    'Neighborhood': 'Neighborhood Name',
})
toronto_venues.head()

Unnamed: 0,Neighborhood Name,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


Use dummy variables to analyze every venue and neighborhood.

In [90]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
category_dummies['Neighborhood Name'] = toronto_venues['Neighborhood Name']

# Rotate the last column to the front so the neighborhood label leads the table.
column_order = category_dummies.columns.tolist()
toronto_onehot = category_dummies[column_order[-1:] + column_order[:-1]]
toronto_onehot.head()

Unnamed: 0,Neighborhood Name,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group the venues by neighborhood and calculate the mean frequency of occurrence of each category.

In [91]:
# Averaging the 0/1 indicators within a neighborhood gives, for each category,
# the share of that neighborhood's venues falling in it.
venues_by_neighborhood = toronto_onehot.groupby(['Neighborhood Name'])
toronto_grouped = venues_by_neighborhood.mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood Name,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Run k-means to cluster the neighborhoods into 10 clusters.

In [103]:
# Number of clusters to fit.
kclusters = 10

# Keep only the category-frequency columns as features. The `columns=` keyword
# replaces the positional axis argument (`drop(label, 1)`), which newer pandas
# releases no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood Name')

# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# Peek at (up to) the first 100 cluster labels.
kmeans.labels_[0:100]

array([0, 5, 1, 5, 0, 0, 5, 0, 0, 5, 0, 0, 5, 5, 0, 0, 1, 0, 5, 0, 0, 5,
       0, 5, 5, 0, 9, 0, 0, 5, 0, 0, 0, 5, 5, 5, 2, 5, 4, 5, 0, 1, 2, 0,
       0, 0, 5, 5, 5, 0, 0, 0, 5, 8, 5, 0, 5, 5, 5, 5, 5, 0, 0, 5, 0, 0,
       5, 1, 0, 5, 0, 0, 5, 0, 1, 6, 7, 0, 0, 4, 3, 0, 0, 0, 0, 5, 5, 0,
       5, 5, 0, 0, 0, 5, 0, 0, 0, 5, 5, 5])

Create the final results.

In [139]:
# Pair every neighborhood name with the cluster label k-means assigned to it.
toronto_cluster = toronto_grouped[['Neighborhood Name']].assign(
    **{'Cluster Labels': kmeans.labels_}
)

# Bring in postal code, borough and coordinates from the original table, then
# discard the join key that duplicates 'Neighborhood Name'.
toronto_final = pd.merge(
    toronto_cluster,
    toronto,
    how='left',
    left_on='Neighborhood Name',
    right_on='Neighbourhood',
).drop(columns='Neighbourhood')
toronto_final

Unnamed: 0,Neighborhood Name,Cluster Labels,PostalCode,Borough,Latitude,Longitude
0,"Adelaide, King, Richmond",0,M5H,Downtown Toronto,43.650571,-79.384568
1,Agincourt,5,M1S,Scarborough,43.794200,-79.262029
2,"Agincourt North, L'Amoreaux East, Milliken, St...",1,M1V,Scarborough,43.815252,-79.284577
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",5,M9V,Etobicoke,43.739416,-79.588437
4,"Alderwood, Long Branch",0,M8W,Etobicoke,43.602414,-79.543484
...,...,...,...,...,...,...
95,Willowdale West,0,M2R,North York,43.782736,-79.442259
96,Woburn,0,M1G,Scarborough,43.770992,-79.216917
97,"Woodbine Gardens, Parkview Hill",5,M4B,East York,43.706397,-79.309937
98,Woodbine Heights,5,M4C,East York,43.695344,-79.318389


That is the final part. Let's visualize the resulting clusters!

In [144]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced over the rainbow colormap. (The
# former `x` / `ys` arrays were only ever used for their length, kclusters.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for lat, lng, postcode, borough, neighbourhood, cluster in zip(
        toronto_final['Latitude'],
        toronto_final['Longitude'],
        toronto_final['PostalCode'],
        toronto_final['Borough'],
        toronto_final['Neighborhood Name'],
        toronto_final['Cluster Labels']):
    label = '{},{}:\n{}'.format(borough, postcode, neighbourhood)
    label = folium.Popup(label, parse_html=True)
    # Labels run from 0 to kclusters - 1, so index the palette directly;
    # `cluster - 1` silently wrapped cluster 0 around to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.8).add_to(map_toronto_clusters)

map_toronto_clusters