# Segmenting and Clustering Neighborhoods in Toronto

## Loi Dinh

# Part 1 - Scrape Data from Wiki

In [21]:
from bs4 import BeautifulSoup
import requests
import re
import json
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import lxml.html as lh
import pandas as pd
import numpy as np

### Scrape Data from Wiki

In [22]:
# Wikipedia page that holds the postal-code table scraped below.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [23]:
# Download the page. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with the standard-library parser backend.
soup = BeautifulSoup(source, 'html.parser')

In [24]:
# Locate the table by its CSS classes. soup.find() returns None when nothing
# matches, so fail here with a clear message rather than later with an
# AttributeError inside the row loop.
my_table = soup.find("table", class_='wikitable sortable')
if my_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found; the page layout may have changed.")

In [25]:
# Column buffers filled from the table: A = first cell, B = second cell,
# C = third cell of every data row (used below as Postcode, Borough and
# Neighbourhood respectively).
A = []
B = []
C = []
for row in my_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows that do not have exactly three <td> cells are skipped.
    if len(cells) != 3:
        continue
    # Strip surrounding whitespace from every cell, not only the last one:
    # a trailing newline in the postcode or borough cell would otherwise break
    # the later 'Not assigned' comparisons and the join on postal code.
    first, second, third = (cell.text.strip() for cell in cells)
    A.append(first)
    B.append(second)
    C.append(third)

### Create a DataFrame from the scraped data

In [26]:
# Assemble the three scraped column lists into one table in a single step.
df = pd.DataFrame({
    'Postcode': A,
    'Borough': B,
    'Neighbourhood': C,
})


### Drop rows where Borough is 'Not assigned'

In [27]:

# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)

### Merge rows with the same Postcode

In [28]:
# Collapse rows sharing a (Postcode, Borough) pair into one row whose remaining
# column values are joined with commas.
df = df.groupby(['Postcode', 'Borough'], as_index=False).agg(','.join)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


### Replace 'Not assigned' neighbourhoods with the borough name

In [29]:
# Boolean mask of rows whose neighbourhood is the placeholder 'Not assigned'.
na_rows = df['Neighbourhood'].eq('Not assigned')
# For those rows, copy the borough name into the neighbourhood column.
df.loc[na_rows, 'Neighbourhood'] = df.loc[na_rows, 'Borough']
# Show the rows that were touched.
df.loc[na_rows]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Shape of Data Frame

In [30]:
# Display the (rows, columns) dimensions of the cleaned table.
df.shape

(103, 3)

# Part 2 - Geospatial

In [31]:
# Location of the CSV file with one latitude/longitude pair per postal code.
geocoder = "https://cocl.us/Geospatial_data"
# Load it straight from the URL into a DataFrame.
coords = pd.read_csv(filepath_or_buffer=geocoder)

In [32]:
# Print the dimensions of the coordinate table followed by its first rows.
print("{} \n {}".format(coords.shape, coords.head()))

(103, 3) 
   Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


### Merge the two DataFrames

In [35]:
# Index both tables by postal code so they can be aligned on it.
df_temp = df.set_index(keys='Postcode')
coords_temp = coords.set_index(keys='Postal Code')

# Place the two tables side by side, keeping only postal codes present in both.
df_coords = pd.concat(
    objs=[df_temp, coords_temp],
    axis='columns',
    join='inner',
)

In [37]:
# Label the index 'Postcode' and turn it back into an ordinary column.
df_coords = df_coords.rename_axis('Postcode').reset_index()

In [39]:
# Preview the first rows of the merged table (postal code, borough,
# neighbourhood, latitude, longitude).
df_coords.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3 - Explore and cluster the neighborhoods in Toronto

In [42]:
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\abist\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages

geographiclib-1.50   | 34 KB     |            |   0% 
geographiclib-1.50   | 34 KB     | ####7      |  47% 
geographiclib-1.50   | 34 KB

In [43]:
from geopy.geocoders import Nominatim

# Free-text address that is resolved to a single coordinate pair.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# Map centre reused by every folium map further down.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [46]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



#### Create a map of Toronto with neighborhoods superimposed on top

In [52]:
import folium

# Base map centred on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle marker per row of the merged table; the popup text reads
# "<borough> (<postcode>): <neighbourhoods>".
for area in df_coords.itertuples(index=False):
    label = "{} ({}): {}".format(area.Borough, area.Postcode, area.Neighbourhood)
    popup = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [area.Latitude, area.Longitude],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

#### Reduce the number of Boroughs to explore

In [58]:
# Boroughs retained for the venue analysis.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

# Select the rows belonging to those boroughs and renumber the index from 0.
in_selected_borough = df_coords['Borough'].isin(toronto_boroughs)
toronto_central_df = df_coords.loc[in_selected_borough].reset_index(drop=True)

print(toronto_central_df.shape)
toronto_central_df.head()

(39, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [63]:
# Fresh map, zoomed in one level further, showing only the selected boroughs.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per retained postal code; the popup text reads
# "<borough> (<postcode>): <neighbourhoods>".
for area in toronto_central_df.itertuples(index=False):
    label = "{} ({}): {}".format(area.Borough, area.Postcode, area.Neighbourhood)
    popup = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [area.Latitude, area.Longitude],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [53]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#### Using FourSquare API to explore the Boroughs

In [64]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'JK5UTFSDWAXW2WVJJKGMZG5BAC3KXFRIOYQJFH1PHQEL22CN' # your Foursquare ID
CLIENT_SECRET = 'P453DIF5VQPE5EM0DHNZL5QN3MTML1AVBIKGPL24XB1XKX4O' # your Foursquare Secret
VERSION = '20200803' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JK5UTFSDWAXW2WVJJKGMZG5BAC3KXFRIOYQJFH1PHQEL22CN
CLIENT_SECRET:P453DIF5VQPE5EM0DHNZL5QN3MTML1AVBIKGPL24XB1XKX4O


In [66]:
# Search radius and maximum number of venues requested per postal code; both
# are sent unchanged as the `radius` and `limit` query parameters.
radius = 500
LIMIT = 100

EXPLORE_ENDPOINT = "https://api.foursquare.com/v2/venues/explore"

# One tuple per venue: (postcode, borough, neighbourhood, area latitude,
# area longitude, venue name, venue latitude, venue longitude, venue category).
venues = []

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'],
                                                  toronto_central_df['Postcode'], toronto_central_df['Borough'],
                                                  toronto_central_df['Neighbourhood']):
    # Pass the query as parameters instead of formatting the secrets into a
    # URL string, and use a separate name so the earlier `url` is not overwritten.
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get(EXPLORE_ENDPOINT, params=query, timeout=30)
    # Stop on HTTP errors (bad credentials, quota, ...) rather than failing
    # later with a KeyError on the error payload.
    response.raise_for_status()

    # Only the first group of the response is read; treat a missing or empty
    # group list as "no venues" for this postal code.
    groups = response.json()['response'].get('groups', [])
    results = groups[0]['items'] if groups else []

    for item in results:
        venue = item['venue']
        categories = venue['categories']
        if not categories:
            # The analysis below is driven by the category, so a venue without
            # one is skipped instead of raising IndexError.
            continue
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name']))

In [79]:
# Column names in the same order as the fields of each tuple in `venues`.
venue_columns = ['Postcode', 'Borough', 'Neighbourhood', 'BoroughLatitude', 'BoroughLongitude',
                 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# Passing the names to the constructor also works when `venues` is empty;
# assigning `.columns` afterwards fails with a length-mismatch error then.
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()

(1729, 9)


Unnamed: 0,Postcode,Borough,Neighbourhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


#### Check how many venues were returned for each postal code

In [80]:
# Count the non-null venue names within every (postcode, borough, neighbourhood) group.
area_keys = ['Postcode', 'Borough', 'Neighbourhood']
venues_df.groupby(by=area_keys)['VenueName'].count()

Postcode  Borough           Neighbourhood                                                                                       
M4E       East Toronto      The Beaches                                                                                               5
M4K       East Toronto      The Danforth West,Riverdale                                                                              41
M4L       East Toronto      The Beaches West,India Bazaar                                                                            19
M4M       East Toronto      Studio District                                                                                          44
M4N       Central Toronto   Lawrence Park                                                                                             3
M4P       Central Toronto   Davisville North                                                                                          8
M4R       Central Toronto   North Toronto West         

#### Check how many distinct venue categories were returned

In [70]:
# Number of distinct values in the VenueCategory column.
len(pd.unique(venues_df['VenueCategory']))

233

### Analyze venues in each area

In [81]:
# One indicator column per venue category; with an empty prefix and separator
# the column name is the category name itself.
category_dummies = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Append the identifying columns (the neighbourhood column is stored under the
# name 'Neighbourhoods' in this table).
toronto_central_onehot = category_dummies.assign(
    Postcode=venues_df['Postcode'],
    Borough=venues_df['Borough'],
    Neighbourhoods=venues_df['Neighbourhood'],
)

# Rotate the last three columns to the front of the table.
all_columns = toronto_central_onehot.columns.tolist()
fixed_columns = all_columns[-3:] + all_columns[:-3]
toronto_central_onehot = toronto_central_onehot[fixed_columns]

print(toronto_central_onehot.shape)
toronto_central_onehot.head()

(1729, 236)


Unnamed: 0,Postcode,Borough,Neighbourhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Get the frequency of occurrence of each category in an area

In [82]:
# Average the indicator columns inside every area: each value becomes the share
# of that area's venues that fall into the category.
area_groups = toronto_central_onehot.groupby(by=['Postcode', 'Borough', 'Neighbourhoods'])
toronto_central_venues_freq = area_groups.mean().reset_index()

print(toronto_central_venues_freq.shape)
toronto_central_venues_freq.head()

(39, 236)


Unnamed: 0,Postcode,Borough,Neighbourhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.02439,...,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.022727
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Get the 10 most common venue types in each area

In [84]:
# How many of the most frequent categories to keep per area.
num_top_venues = 10

# Ordinal suffixes for ranks ending in 1, 2 and 3; every other rank uses 'th'.
indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return `rank` followed by its English ordinal suffix.

    Examples: 1 -> '1st', 2 -> '2nd', 4 -> '4th', 11 -> '11th', 21 -> '21st'.
    """
    # 10-20 (and 110-120, ...) always take 'th', including 11, 12 and 13.
    if 10 <= rank % 100 <= 20:
        return '{}th'.format(rank)
    last_digit = rank % 10
    suffix = indicators[last_digit - 1] if 1 <= last_digit <= 3 else 'th'
    return '{}{}'.format(rank, suffix)


# create columns according to number of top venues
areaColumns = ['Postcode', 'Borough', 'Neighbourhoods']
# The suffix is chosen explicitly instead of catching the IndexError of a
# too-short list with a bare `except`, which would also hide unrelated errors.
freqColumns = ['{} Most Common Venue'.format(_ordinal(rank))
               for rank in range(1, num_top_venues + 1)]
columns = areaColumns + freqColumns
# Create a new dataframe with the identifier columns followed by one column per
# rank, then copy the three identifier columns from the frequency table.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = toronto_central_venues_freq['Postcode']
neighborhoods_venues_sorted['Borough'] = toronto_central_venues_freq['Borough']
neighborhoods_venues_sorted['Neighbourhoods'] = toronto_central_venues_freq['Neighbourhoods']

# Fill the rank columns row by row, addressing rows by position.
for ind in np.arange(toronto_central_venues_freq.shape[0]):
    # Everything after the first three (identifier) columns is a category frequency.
    row_categories = toronto_central_venues_freq.iloc[ind, :].iloc[3:]
    # Highest frequency first.
    # NOTE(review): many categories tie (e.g. at 0.0); the relative order of
    # ties presumably depends on the sort algorithm — confirm before relying on it.
    row_categories_sorted = row_categories.sort_values(ascending=False)
    # Store the category names (the sorted index), not the frequencies.
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# Order the rows by their top categories (1st, then 2nd, ...); this sorts the
# table in place, so its row order no longer matches the frequency table.
neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
neighborhoods_venues_sorted

Unnamed: 0,Postcode,Borough,Neighbourhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Airport,Airport Food Court,Bar,Sculpture Garden,Rental Car Location,Harbor / Marina
31,M6H,West Toronto,"Dovercourt Village,Dufferin",Bakery,Pharmacy,Grocery Store,Music Venue,Café,Supermarket,Middle Eastern Restaurant,Bar,Bank,Pool
32,M6J,West Toronto,"Little Portugal,Trinity",Bar,Coffee Shop,Asian Restaurant,Restaurant,Wine Bar,Vietnamese Restaurant,Café,Pizza Place,Bakery,Men's Store
26,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",Bar,Vietnamese Restaurant,Café,Bakery,Coffee Shop,Mexican Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant,Dumpling Restaurant,Park
25,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Bakery,Italian Restaurant,Japanese Restaurant,Bookstore,Restaurant,Bar,Chinese Restaurant,Beer Bar,Beer Store
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",Café,Breakfast Spot,Coffee Shop,Grocery Store,Bar,Burrito Place,Restaurant,Climbing Gym,Pet Store,Convenience Store
3,M4M,East Toronto,Studio District,Café,Coffee Shop,Gastropub,Brewery,Italian Restaurant,Bakery,American Restaurant,Sandwich Place,Cheese Shop,Pizza Place
36,M6S,West Toronto,"Runnymede,Swansea",Café,Sushi Restaurant,Coffee Shop,Pizza Place,Italian Restaurant,Yoga Studio,Diner,Burrito Place,Restaurant,Pub
6,M4R,Central Toronto,North Toronto West,Clothing Store,Coffee Shop,Yoga Studio,Chinese Restaurant,Salon / Barbershop,Spa,Diner,Sporting Goods Shop,Fast Food Restaurant,Dessert Shop
19,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",Coffee Shop,Aquarium,Café,Hotel,Restaurant,Scenic Lookout,Italian Restaurant,Brewery,Sporting Goods Shop,Fried Chicken Joint


### Clustering area
Run k-means to group the neighborhoods into 3 clusters.

In [85]:
# Number of clusters requested from k-means.
kclusters = 3

# Feature matrix: the category frequencies only, identifier columns removed.
# `columns=` replaces the positional axis argument, which pandas 2 rejects.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(
    columns=['Postcode', 'Borough', 'Neighbourhoods'])

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_central_venues_freq_clustering)

# Key every label by the postal code of the row it was fitted on, so labels are
# attached by Postcode rather than by row position.
cluster_labels = pd.Series(
    kmeans.labels_,
    index=toronto_central_venues_freq['Postcode'],
    name='Cluster',
)

# Build a new frame (toronto_central_df itself is left untouched):
#  * the inner join keeps only postal codes that received a cluster label, which
#    also keeps 'Cluster' an integer column;
#  * the second join adds the most-common-venue columns for each postal code.
top_venues_by_postcode = (
    neighborhoods_venues_sorted
    .drop(columns=['Borough', 'Neighbourhoods'])
    .set_index('Postcode')
)
toronto_central_clustered_df = (
    toronto_central_df
    .join(cluster_labels, on='Postcode', how='inner')
    .join(top_venues_by_postcode, on='Postcode')
    .sort_values(['Cluster'] + freqColumns)
)
toronto_central_clustered_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,0,Park,Jewelry Store,Trail,Sushi Restaurant,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Swim School,Bus Line,Falafel Restaurant,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Park,Trail,Pub,Neighborhood,Health Food Store,Donut Shop,Doner Restaurant,Dog Run,Department Store,Dumpling Restaurant
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442,1,Airport Service,Airport Lounge,Airport Terminal,Boutique,Airport,Airport Food Court,Bar,Sculpture Garden,Rental Car Location,Harbor / Marina
31,M6H,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259,1,Bakery,Pharmacy,Grocery Store,Music Venue,Café,Supermarket,Middle Eastern Restaurant,Bar,Bank,Pool
32,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,1,Bar,Coffee Shop,Asian Restaurant,Restaurant,Wine Bar,Vietnamese Restaurant,Café,Pizza Place,Bakery,Men's Store
26,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049,1,Bar,Vietnamese Restaurant,Café,Bakery,Coffee Shop,Mexican Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant,Dumpling Restaurant,Park
25,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049,1,Café,Bakery,Italian Restaurant,Japanese Restaurant,Bookstore,Restaurant,Bar,Chinese Restaurant,Beer Bar,Beer Store
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191,1,Café,Breakfast Spot,Coffee Shop,Grocery Store,Bar,Burrito Place,Restaurant,Climbing Gym,Pet Store,Convenience Store


#### Show the clusters on a map

In [87]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_central_clustered_df['Latitude'], 
                                             toronto_central_clustered_df['Longitude'], 
                                             toronto_central_clustered_df['Postcode'], 
                                             toronto_central_clustered_df['Borough'], 
                                             toronto_central_clustered_df['Neighbourhood'], 
                                             toronto_central_clustered_df['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters