# Segmenting and Clustering Neighbourhoods in Toronto

## PART ONE: SCRAPPING AND CLEANING THE TORONTO DATA ON WIKIPEDIA

In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 
from geopy.geocoders import Nominatim 

import requests 
from pandas.io.json import json_normalize 


import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

print('Libraries imported.')

Libraries imported.


In [4]:
from bs4 import BeautifulSoup
link = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(link,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XjaFqgpAIC8AAL5QNVEAAABD","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [5]:
table = soup.find('table',{'class':'wikitable sortable'})
print(table.tr.text)


Postcode
Borough
Neighbourhood



In [6]:
tab = table.find_all('tr')
my_table= ""
for tr in tab:
    row = ""
    for tds in tr.find_all('td'):
        row = row + "," + tds.text
    my_table= my_table + row[1:]
print(my_table)

M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Downtown Toronto,Queen's Park
M8A,Not assigned,Not assigned
M9A,Queen's Park,Not assigned
M1B,Scarborough,Rouge
M1B,Scarborough,Malvern
M2B,Not assigned,Not assigned
M3B,North York,Don Mills North
M4B,East York,Woodbine Gardens
M4B,East York,Parkview Hill
M5B,Downtown Toronto,Ryerson
M5B,Downtown Toronto,Garden District
M6B,North York,Glencairn
M7B,Not assigned,Not assigned
M8B,Not assigned,Not assigned
M9B,Etobicoke,Cloverdale
M9B,Etobicoke,Islington
M9B,Etobicoke,Martin Grove
M9B,Etobicoke,Princess Gardens
M9B,Etobicoke,West Deane Park
M1C,Scarborough,Highland Creek
M1C,Scarborough,Rouge Hill
M1C,Scarborough,Port Union
M2C,Not assigned,Not assigned
M3C,North York,Flemingdon Park
M3C,North York,Don Mills South
M4C,East York,Woodbine Heights
M5C,Downtown Toronto,St. James 

In [9]:
with open('toronto.csv', 'wb') as tor:
    tor.write(bytes(my_table, encoding='ascii',errors='ignore'))

In [10]:
tor_data = pd.read_csv("toronto.csv")
tor_data.columns= ["Postalcode","Borough","Neighbourhood"]
tor_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights


In [16]:
tor_data.shape

(286, 3)

In [17]:
tor_data.drop(tor_data[tor_data['Borough']=="Not assigned"].index, inplace=True)
tor_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor


In [27]:
tor_data.loc[tor_data['Neighbourhood'] =='Not assigned','Neighbourhood']=tor_data['Borough']
tor_data.sample(20)

Unnamed: 0,Postalcode,Borough,Neighbourhood
23,M9B,Etobicoke,Princess Gardens
92,M2K,North York,Bayview Village
243,M9W,Etobicoke,Northwest
195,M4T,Central Toronto,Summerhill East
33,M6C,York,Humewood-Cedarvale
241,M8W,Etobicoke,Alderwood
37,M9C,Etobicoke,Eringate
175,M9R,Etobicoke,Martin Grove Gardens
114,M5L,Downtown Toronto,Victoria Hotel
157,M6P,West Toronto,High Park


In [33]:
tor_df = tor_data.groupby(['Postalcode','Borough'],sort=False).agg( ','.join).reset_index()
tor_df.head(11)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [34]:
tor_df.shape

(103, 3)

# PART TW0: OBTAINING THE GEOGRAPHICAL COORDINATES OF THE NEIGHBOURHOODS IN TORONTO

In [35]:
tor_geo = pd.read_csv('Geospatial_Coordinates.csv')
tor_geo.head(11)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [37]:
tor_geo.shape

(103, 3)

In [38]:
tor_df.rename(columns={'Postalcode':'Postal Code'}, inplace=True)
tor_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [39]:
tor_df2 = pd.merge(tor_df,tor_geo[['Postal Code','Latitude','Longitude']],on='Postal Code')
tor_df2.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [40]:
tor_df2.shape

(103, 5)

## PART 3: EXPLORING AND CLUSTERING THE NEIGHBOURHOODS IN TORONTO

In [42]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent = 'Toronto')
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print("The geograpical coordinate of Toronto are {}, {}.".format(latitude_toronto, longitude_toronto))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [44]:
my_toronto_map = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=10 )

for lat, lng, borough, Neighbourhood in zip(tor_df2['Latitude'], tor_df2['Longitude'], 
                                            tor_df2['Borough'], tor_df2['Neighbourhood']):
    label = '{},{}'.format(Neighbourhood, borough)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity = 0.7,
    parse_html = False).add_to(my_toronto_map)
    
my_toronto_map

In [45]:
CLIENT_ID = 'KUYU5TIHQBZSMBG25XCU3LSOQYUERZSGMG4SEXZWRQMGWWCN' 
CLIENT_SECRET = '3H21KUGZVEHRCFQ5OXZ5435Q1MLGFVYIEPM010GHE5RVDTAV' 
VERSION = '20180605' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KUYU5TIHQBZSMBG25XCU3LSOQYUERZSGMG4SEXZWRQMGWWCN
CLIENT_SECRET:3H21KUGZVEHRCFQ5OXZ5435Q1MLGFVYIEPM010GHE5RVDTAV


In [46]:
toronto_data = tor_df2[tor_df2['Borough'] == 'Toronto'].reset_index(drop=True)
toronto_data.shape

(0, 5)

In [48]:
tor_df_lat = tor_df2.loc[0, 'Latitude'] 
tor_df_long = tor_df2.loc[0, 'Longitude']

tor_df_name = tor_df2.loc[0, 'Neighbourhood'] 

print('Latitude and longitude values of {} are {}, {}.'.format(tor_df_name, 
                                                               tor_df_lat, 
                                                               tor_df_long))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


In [51]:
LIMIT=100
radius=500

In [52]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
          
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:

tor_venues = getNearbyVenues(names=tor_df2['Neighbourhood'],
                                   latitudes=tor_df2['Latitude'],
                                   longitudes=tor_df2['Longitude']
                                  )



Parkwoods
Victoria Village
Harbourfront
Lawrence Heights,Lawrence Manor
Queen's Park
Queen's Park
Rouge,Malvern
Don Mills North
Woodbine Gardens,Parkview Hill
Ryerson,Garden District
Glencairn
Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park
Highland Creek,Rouge Hill,Port Union
Flemingdon Park,Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe
Guildwood,Morningside,West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
