# K-Means Clustering of Toronto Neighbourhoods (Applied Data Science course)

In [174]:
import random
from io import StringIO

import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

from geopy.geocoders import Nominatim
from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import display_html

import folium


In [175]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup=BeautifulSoup(source, "html.parser")
# Keep the first <table> element of the page as an HTML string and render it inline.
tab = str(soup.table)
display_html(tab,raw=True)

Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


In [176]:
# Parse the scraped HTML table into a DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated
# in recent pandas releases.
dfs = pd.read_html(StringIO(tab))
df_neighborhoods = dfs[0]
df_neighborhoods.head(40)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [177]:
# Load the postal-code coordinates from a local CSV file (read locally because
# of the issue mentioned in the Coursera lab notes).
# The path is relative to the notebook so the cell does not depend on one
# user's home directory: place Geospatial_Coordinates.csv next to this notebook.
GEODATA_CSV = "Geospatial_Coordinates.csv"
df_geodata = pd.read_csv(GEODATA_CSV)

# Verify the table (the default display shows the first and last rows).
df_geodata

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [178]:
# Attach latitude/longitude to each neighbourhood row by joining the two
# tables on their shared 'Postal Code' column.
df_toronto = df_neighborhoods.merge(df_geodata, on='Postal Code')
df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Restricting the data to Toronto boroughs

> "You can decide to work with only boroughs that contain the word Toronto... It is up to you."

In [179]:
# Keep only the rows whose borough name contains the word "Toronto".
# .copy() yields an independent frame, so columns added to df later (the
# cluster labels) are not written into a slice of df_toronto.
df = df_toronto[df_toronto['Borough'].str.contains('Toronto',regex=False)].copy()
# Display the reduced table; this is the data that gets clustered and mapped below.
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [180]:
# Base map centred on Toronto.
map_cluster = folium.Map(location=[43.712751,-79.390197],zoom_start=11)

# Add one marker per neighbourhood with a "Borough, Neighbourhood" popup.
# (Previously the label was built and then discarded, so the map had no markers.)
for lat,lng,borough,neighbourhood in zip(df['Latitude'],df['Longitude'],df['Borough'],df['Neighbourhood']):
    label = '{}, {}'.format(borough, neighbourhood)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_cluster)

In [181]:
# Render the neighbourhood map as this cell's output.
map_cluster

In [None]:
# Number of clusters for k-means. Several values of k were mapped; 4 was
# chosen for clarity (see the map at the bottom of this notebook).
k = 4

In [None]:
# Build the k-means input for the Toronto data: only the coordinates are clustered.
# Selecting the two columns by name replaces df.drop([...], 1), whose positional
# axis argument is rejected by pandas 2.x, and keeps the features unchanged
# if this cell is re-run after 'Cluster Labels' has been added to df.
toronto_kmeans = df[['Latitude', 'Longitude']]

In [None]:
# Fit k-means with k clusters on the coordinate table; random_state is fixed
# at 0. The cell output is the array of per-row cluster labels.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(toronto_kmeans)
kmeans.labels_

In [182]:
# Attach the k-means labels as the first column ('Cluster Labels') of the table.
# Any existing 'Cluster Labels' column is dropped first so the cell can be
# re-run: insert() raises ValueError when the column already exists.
# drop() returns a new frame, so the insert below mutates only that new frame.
df = df.drop(columns='Cluster Labels', errors='ignore')
df.insert(0, 'Cluster Labels', kmeans.labels_)
df

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,3,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,3,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [183]:
# Folium map used to present the clusters, centred on central Toronto
# at zoom level 11.
map_clusters = folium.Map(
    location=[43.712751, -79.390197],
    zoom_start=11,
)

In [184]:
# Colour scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings.
# (The earlier version built an intermediate list of arrays only to take its
# length, which is always k.)
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

In [185]:
# Draw one circle marker per neighbourhood, coloured by its cluster.
for lat, lon, neighbourhood, cluster in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'], df['Cluster Labels']):
    # The popup names the neighbourhood and its cluster, and is now attached to
    # the marker (before, the Popup was created but never passed to CircleMarker).
    label = folium.Popup('{} Cluster {}'.format(neighbourhood, cluster), parse_html=True)
    # Index the palette directly with the cluster label; the earlier
    # rainbow[cluster-1] relied on negative-index wrap-around for cluster 0.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=8,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

In [186]:
# Render the clustered map as this cell's output.
map_clusters