# Toronto Battle of the Neighborhoods

##### January 14, 2020

### Introduction:

# Obtaining and preparing the data

In [1]:
# import necessary libraries

import requests # to handle requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #import BeautifulSoup package
from html.parser import HTMLParser

print ('Libraries imported')

Libraries imported


### Scrape the data from the Wikipedia page to a pandas dataframe

In [2]:
# get the data from the wikipedia page

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page=requests.get(url)

print(page.status_code) # the http response status code should print 200 if correct

print(page.content) # to see what the webpage contains

200
b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xh6LeApAIDEAACUs1LEAAACJ","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications

In [3]:
# change the above to a better format using prettify()
soup=BeautifulSoup(page.content,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xh6LeApAIDEAACUs1LEAAACJ","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [4]:
# find the table and extract the data to a pandas dataframe
table = soup.find('table', class_='wikitable')
table_rows=table.find_all('tr')

data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])
    
toronto=pd.DataFrame(data, columns=['PostalCode','Borough','Neighborhood'])

# filter out rows with data quality issues
toronto=toronto[~toronto['PostalCode'].isnull()]

toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Clean up the dataset

In [5]:
# remove any rows that have Borough = 'Not assigned'

toronto=toronto[toronto.Borough != 'Not assigned']

# reset index

toronto1=toronto.reset_index(drop=True)

toronto1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [6]:
# combine neighborhoods that belong to the same postal code into one row, comma-separated values

toronto2=toronto1.groupby(['PostalCode','Borough'])['Neighborhood'].agg([('Neighborhood',','.join)]).reset_index()

toronto2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# replace any neighborhoods that are 'Not assigned' with their borough name instead

toronto2.loc[toronto2['Neighborhood'] == 'Not assigned', 'Neighborhood'] = toronto2['Borough']

toronto2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# check that the above clean up was done using a row that is known to have neighborhood 'Not assigned' in source table

toronto2.loc[toronto2['PostalCode']=='M9A']

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Queen's Park


In [9]:
toronto2.shape

(103, 3)

### In order to utilize Foursqaure location data, need to get latitude and longitude coordinates of each neighborhood.

In [10]:
# import csv file that has geographical coordinates of each postal code

geodata=pd.read_csv('http://cocl.us/Geospatial_data')

geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# merge geodata dataframe on Postal Code with the toronto2 dataframe to list PostalCode, Borough, Neighborhood, Latitude and Longitude

torontogeodata=pd.merge(toronto2,geodata, how='outer', left_on='PostalCode', right_on='Postal Code')
torontogeodata.drop('Postal Code',axis=1,inplace=True)

torontogeodata

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# Clustering the Neighborhoods

### Create a map of Toronto neighborhoods 

In [19]:
# import libraries

import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes   commented out after first install
import folium

print('Libraries imported.')

Libraries imported.


In [32]:
# create map of Toronto using latitude and longitude

# get geographical coordinates of Toronto

address='Toronto, Ontario'
geolocator=Nominatim()
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geographilca coordinates of Toronto are {}, {}'.format(latitude,longitude))

# create map

map_toronto=folium.Map(location=[latitude,longitude],zoom_start=10)

# add markers to map

for Latitude, Longitude, Neighborhood, Borough, PostalCode in zip(torontogeodata['Latitude'], torontogeodata['Longitude'], torontogeodata['Neighborhood'], torontogeodata['Borough'], torontogeodata['PostalCode']):
    label='{}, {}'.format(Neighborhood, Borough)
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [Latitude, Longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto



The geographilca coordinates of Toronto are 43.653963, -79.387207


In order to narrow down the analysis a bit, let's focus only neighborhoods within Downtown Toronto.

In [36]:
toronto_core = torontogeodata[torontogeodata['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
toronto_core.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


### Using Foursquare API

In [38]:
# The code was removed by Watson Studio for sharing.