# Segmenting and Clustering Neighborhoods in Toronto

Loading libraries

## First part: Download the data

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np 
import pandas as pd

Download the Wikipedia page and find the table with the data

In [2]:
# Fetch the Wikipedia page listing the postal codes that start with "M".
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an exception instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# The data is taken from the first <table> element on the page.
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in a later cell.
    raise ValueError('No <table> element found in the downloaded page')

Get the names of the columns from the first row of the table

In [3]:
# The text of the first table row contains the column titles separated by
# newlines; split on them and keep only the non-empty pieces.
header_text = table.tbody.tr.text
headline = [title for title in header_text.split('\n') if title != '']
print(headline)

['Postcode', 'Borough', 'Neighbourhood']


Get all the rows in the table and store in a numpy array, ignoring the "Not assigned" ones

In [4]:
# Collect the header plus every assigned row in a plain Python list and convert
# it to a NumPy array once at the end; stacking inside the loop would copy the
# whole array again for every row.
rows = [headline]

for element in table.find_all('tr')[1:]:  # [1:] skips the header row
    el = [column.get_text().strip('\n') for column in element.find_all('td')]
    # A row whose number of <td> cells differs from the header cannot be
    # aligned with the columns (e.g. a row made only of <th> cells), so skip it
    # instead of failing on el[1] or on a ragged array.
    if len(el) != len(headline):
        continue
    # The second cell is the borough; drop rows where it is "Not assigned".
    if el[1] != 'Not assigned':
        rows.append(el)

# Always 2-D: row 0 is the header, the remaining rows are the records.
data_table = np.array(rows)

Create a pandas dataframe from the numpy array

In [5]:
# Row 0 of data_table holds the column titles; every following row is a record.
records = data_table[1:, :]
column_names = data_table[0, :]

# Build the frame, then switch two of the scraped titles to the spellings
# used in the rest of the notebook.
df = pd.DataFrame(records, columns=column_names).rename(
    columns={'Postcode': 'Postal Code', 'Neighbourhood': 'Neighborhood'}
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Shape of the dataframe

In [6]:
# (number of rows, number of columns) of the dataframe built above
df.shape

(211, 3)

## Second part: geolocalization

Download the csv file with the geospatial data

In [7]:
# Shell escape: download the CSV and save it locally as Geospatial_Coordinates.csv (-O sets the output file name)
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv

--2019-05-20 19:11:37--  https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-05-20 19:11:37--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.25.197, 107.152.24.197
Connecting to ibm.box.com (ibm.box.com)|107.152.25.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-05-20 19:11:38--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjj

In [8]:
# Load the downloaded comma-separated coordinates file into a dataframe.
geo_data = pd.read_csv("Geospatial_Coordinates.csv", sep=",")

Create a pandas dataframe with the geospatial data

In [9]:
# Combine the scraped table with the coordinates by matching rows on the
# shared 'Postal Code' column.
neighborhoods = df.merge(geo_data, on='Postal Code')
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


Merge the geospatial data with the previous dataframe

## Third part: Explore and cluster the neighborhoods in Toronto

Load more libraries

In [10]:
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Get the coordinates of Toronto

In [11]:
# Look up the coordinates of the city; they are used below as the map centre.
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; stop here with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6523873, -79.3835641.


Map of Toronto with the neighborhoods

In [12]:
# Create a map centred on the Toronto coordinates obtained above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of `neighborhoods`; the popup shows
# "<neighborhood>, <borough>".
marker_rows = zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Group rows by neighborhood and take the mean latitude and longitude of each neighborhood

In [13]:
# Average the coordinates of all rows that share a neighborhood name.
# Only the numeric coordinate columns are selected before calling mean():
# the frame also holds text columns (Postal Code, Borough), and averaging
# those raises a TypeError on pandas >= 2.0.
toronto_grouped = (
    neighborhoods
    .groupby('Neighborhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
toronto_grouped.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Adelaide,43.650571,-79.384568
1,Agincourt,43.7942,-79.262029
2,Agincourt North,43.815252,-79.284577
3,Albion Gardens,43.739416,-79.588437
4,Alderwood,43.602414,-79.543484


Run k-means to cluster the neighborhoods into 5 clusters.

In [14]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns by name replaces
# the positional-axis form drop('Neighborhood', 1), which newer pandas no
# longer accepts, and keeps the features the same even if a 'Cluster Labels'
# column has already been added to toronto_grouped by a later cell.
toronto_grouped_clustering = toronto_grouped[['Latitude', 'Longitude']]

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([4, 2, 2, 1, 3, 0, 4, 0, 1, 0], dtype=int32)

In [15]:
# Add the clustering labels as the first column of toronto_grouped.
# Any 'Cluster Labels' column left over from an earlier run of this cell is
# dropped first, because insert() refuses to add a column name that already
# exists and would make the cell fail when re-run.
toronto_grouped = toronto_grouped.drop(columns='Cluster Labels', errors='ignore')
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# Start from the per-row table without its own coordinates ...
toronto_merged = neighborhoods.drop(columns=['Latitude', 'Longitude'])

# ... and join toronto_grouped onto it by neighborhood name, which adds the
# cluster label and the averaged latitude/longitude to every row.
toronto_merged = toronto_merged.join(toronto_grouped.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Cluster Labels,Latitude,Longitude
0,M3A,North York,Parkwoods,0,43.753259,-79.329656
1,M4A,North York,Victoria Village,2,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,4,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,4,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,1,43.718518,-79.464763


In [16]:
# create map centred on the Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color
# (as a hex string) per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per neighborhood, colored by its cluster label
for lat, lon, poi, cluster in zip(toronto_grouped['Latitude'], toronto_grouped['Longitude'], toronto_grouped['Neighborhood'], toronto_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    # (no reliance on negative-index wraparound for label 0).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters