# Toronto Neighborhood Clustering

This notebook scrapes Toronto postal code data from Wikipedia, geocodes each postal code, and applies a clustering analysis to nearby restaurants.

Edwin Sutrisno. May, 2020.

## Part 1: Scrape Wikipedia page for Postal Code

In [1]:
import pandas as pd

## Part 1: Scrape Data from Wikipedia

In [3]:
# Wikipedia page that lists the Canadian postal codes beginning with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# pd.read_html returns a list with one DataFrame per HTML table it parses from the page
tables_list = pd.read_html(url)

# Number of tables found in the web page
len(tables_list)

3

In [4]:
# The first table is the postal-code listing we want; preview its first five rows
tables_list[0].head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Cleaning

In [5]:
# Keep only the rows of the first scraped table that have an assigned Borough
borough_assigned = tables_list[0]['Borough'].ne('Not assigned')
postal_df = tables_list[0].loc[borough_assigned]

# Sanity check: every remaining Postal Code should appear exactly once
# (evaluates to True when there is no duplicate postal code)
postal_df['Postal Code'].is_unique

True

In [6]:
# Count the rows whose Neighborhood is 'Not assigned' (compared case-insensitively);
# a result of 0 means every remaining row has a named neighborhood label
neighborhood_lower = postal_df['Neighborhood'].str.lower()
(neighborhood_lower == 'not assigned').sum()

0

In [7]:
# Show the (rows, columns) shape of the cleaned table
postal_df.shape

(103, 3)

## Part 2: Get Geo Location Data

In [8]:
import geocoder # import geocoder

In [9]:
# Renumber the rows from 0 and add placeholder coordinate columns in one step;
# both columns start at 0.0 until the geocoding lookup fills them in
postal_df = postal_df.reset_index(drop=True).assign(Latitude=0.0, Longitude=0.0)
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,0.0,0.0
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0.0,0.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0.0,0.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",0.0,0.0


In [10]:
# Query Lat and Lon Data
# A single geocoder lookup may come back without coordinates, so each postal code
# is retried up to max_attempts times before it is skipped.
max_attempts = 20

for ii in postal_df.index:
    if postal_df.loc[ii, 'Latitude'] != 0:
        # Latlon data was already obtained previously (e.g. the cell was re-run)
        continue

    pcode = postal_df.loc[ii, 'Postal Code']
    lat_lng_coords = None
    for _attempt in range(max_attempts):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pcode))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        # Give up on this postal code but keep going. The 0.0 placeholders are left
        # untouched so that re-running the cell retries the lookup, instead of
        # crashing with a TypeError when indexing into a missing result.
        print('Timeout. No coordinates found for {}'.format(pcode))
        continue

    postal_df.loc[ii, 'Latitude'] = lat_lng_coords[0]
    postal_df.loc[ii, 'Longitude'] = lat_lng_coords[1]
postal_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


In [11]:
# (Optional) Cache the geocoded table in a CSV file so future re-runs can skip the
# slow geocoding step. Uncomment the first line to save the table, or the second
# line to load a previously saved copy.
#postal_df.to_csv('PostalDF.csv', index=False)
#postal_df = pd.read_csv('PostalDF.csv')

## Part 3: Clustering with Foursquare Data

I want to find the neighborhoods that sit in or near clusters with a high number of restaurants. I like trying new restaurants, so finding a place to live around such an area would be nice for me.

In [12]:
import requests # library to handle requests
from pandas.io.json import json_normalize
import configparser
import folium

In [13]:
# Select all boroughs with name containing Toronto.
# The vectorised .str.contains treats a missing Borough as "no match" (na=False),
# whereas `'Toronto' in name` inside .apply raises a TypeError on a NaN value.
hasToronto = postal_df['Borough'].str.contains('Toronto', regex=False, na=False)

# Build the neighborhood table in a single chain so hood_df is a fresh frame with a
# clean 0..n-1 index, rather than a slice of postal_df that is then mutated in place.
hood_df = (
    postal_df
    .loc[hasToronto, ['Borough', 'Neighborhood', 'Latitude', 'Longitude']]
    .reset_index(drop=True)
)
hood_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
2,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
3,Downtown Toronto,St. James Town,43.651734,-79.375554
4,East Toronto,The Beaches,43.678148,-79.295349


In [14]:
# Foursquare Credentials
# Read from a local INI-style file so the secrets never appear in the notebook.
# Expected layout of the file:
#   [CREDENTIALS]
#   CLIENT_ID = ...
#   CLIENT_SECRET = ...
creds_file = 'foursquare_creds.conf'
config = configparser.ConfigParser()
# ConfigParser.read silently skips files it cannot open and returns the list of files
# it actually parsed, so check the result to give a clear error rather than a bare
# KeyError on the 'CREDENTIALS' section below.
if not config.read(creds_file):
    raise FileNotFoundError('Could not read Foursquare credentials file: {}'.format(creds_file))
CLIENT_ID = config['CREDENTIALS']['CLIENT_ID']
CLIENT_SECRET = config['CREDENTIALS']['CLIENT_SECRET']

In [15]:
# Search for restaurant venues around the coordinates of every selected neighborhood
keyword = 'restaurant'
radius = 1000
apiversion = '20180604'
limit = 100
search_url = 'https://api.foursquare.com/v2/venues/search'

venues = []
for lat, lon in zip(hood_df['Latitude'], hood_df['Longitude']):
    # Pass the query as `params` so that requests URL-encodes every value instead of
    # relying on hand-formatted text in the URL
    query_params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'll': '{},{}'.format(lat, lon),
        'v': apiversion,
        'query': keyword,
        'radius': radius,
        'limit': limit,
    }
    # The timeout keeps a stalled connection from hanging the notebook indefinitely
    response = requests.get(search_url, params=query_params, timeout=30)
    # Fail loudly on an HTTP error status rather than hitting a confusing KeyError
    # on the missing 'response' / 'venues' keys below
    response.raise_for_status()
    results = response.json()
    venues.extend(results['response']['venues'])

In [16]:
# Convert the list of venue dicts into a flat table; nested keys become dotted
# column names such as 'location.lat'. pd.json_normalize is the public entry point
# that replaced the deprecated pandas.io.json.json_normalize.
venue_columns = ['id', 'name', 'location.address', 'location.lat', 'location.lng']

# One chain, no inplace edits: every step returns a new frame, which avoids
# SettingWithCopyWarning from mutating a column slice.
venues_df = (
    pd.json_normalize(venues)
    .loc[:, venue_columns]                 # take only necessary columns
    .drop_duplicates(subset='id')          # duplicates come from overlapping search radii
    .dropna(subset=['location.address'])   # drop venues without a street address
    .reset_index(drop=True)
)
venues_df.head(10)

Unnamed: 0,id,name,location.address,location.lat,location.lng
0,5d3127c715cfea0007e44d52,est Restaurant,729 Queen St. East,43.658911,-79.349035
1,4ad4c05cf964a520e1f520e3,Docks Restaurant & Night Club The,11 Polson St.,43.641806,-79.354171
2,4b631e60f964a5205f642ae3,Lucky Star Restaurant,739 Queen Street East,43.659069,-79.348691
3,5b9e897ac9a5170039679335,Ryan Restaurant,356 Queen Street E,43.655724,-79.364129
4,59719d3c65211f46b1172eb2,Caribbean Sunset Restaurant and Bar,753A Queen St E,43.659348,-79.347954
5,4ae5b91ff964a520a6a121e3,Morning Glory Cafe,457 King St. E,43.653947,-79.361149
6,4ac3e6cef964a520629d20e3,Archeo,31 Trinity St.,43.650667,-79.359431
7,4ad4c05ff964a52048f720e3,Hemispheres Restaurant & Bistro,110 Chestnut Street,43.654884,-79.385931
8,5750b013498e755287c6de97,Some Time BBQ Grill Restaurant 碳烤屋,988 Baldwin Street,43.655874,-79.393826
9,4bd47e6fcfa7b7139f2924da,Studio Restaurant,389 Church St.,43.6615,-79.379319


### Visualize on Map

In [17]:
# Centre the map on the mean coordinate of all retrieved venues
center_lat, center_lng = venues_df[['location.lat', 'location.lng']].mean()
venues_map = folium.Map(location=[center_lat, center_lng], zoom_start=13)

# Shared appearance of every restaurant marker
marker_style = dict(radius=5, color='blue', fill=True, fill_color='blue', fill_opacity=0.6)

# Add one circle per restaurant; the popup shows the venue name
venue_points = zip(venues_df['location.lat'], venues_df['location.lng'], venues_df['name'])
for lat, lng, label in venue_points:
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(venues_map)
venues_map

### Create Clusters

In [18]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# Feature matrix: one (latitude, longitude) row per restaurant
X = venues_df[['location.lat', 'location.lng']].to_numpy()

# random_state is fixed so that the cluster assignment, and in particular the
# arbitrary label numbering that later cells refer to, is reproducible across runs.
k_means = KMeans(init="k-means++", n_clusters=4, n_init=12, random_state=0)
k_means.fit(X)
k_means_labels = k_means.labels_ # cluster label for each row
k_means_cluster_centers = k_means.cluster_centers_ # cluster centroids

In [20]:
colors = ['purple', 'green', 'blue', 'yellow']

# Overlay every cluster on venues_map: members in the cluster's colour, centroid in red.
# zip stops at the shorter sequence, so a cluster without a colour entry is not drawn.
for label, (cluster_center, colr) in enumerate(zip(k_means_cluster_centers, colors)):
    # Rows of X whose k-means label matches this cluster
    X_members = X[k_means_labels == label]

    # One circle per member restaurant
    for lat, lng in X_members:
        member_marker = folium.CircleMarker([lat, lng], radius=5, color=colr,
                                            fill=True, fill_color=colr, fill_opacity=0.6)
        member_marker.add_to(venues_map)

    # A slightly larger red circle marks the cluster centroid
    centroid_marker = folium.CircleMarker([cluster_center[0], cluster_center[1]], radius=6, color='red',
                                          fill=True, fill_color='red', fill_opacity=0.6)
    centroid_marker.add_to(venues_map)

In [21]:
# Display the map object (the markers added to venues_map so far are rendered)
venues_map

### Count the number of restaurants in each cluster

In [25]:
# One label per cluster id, in id order (colour names follow the plotting palette)
size_labels = (
    'Size of Cluster Purple 0 = ',
    'Size of Cluster Green 1 = ',
    'Size of Cluster Blue 2 = ',
    'Size of Cluster Yellow 3 = ',
)

# Count how many restaurants were assigned to each cluster id
for cluster_id, size_label in enumerate(size_labels):
    print(size_label, sum(k_means_labels == cluster_id))

Size of Cluster Purple 0 =  82
Size of Cluster Green 1 =  214
Size of Cluster Blue 2 =  39
Size of Cluster Yellow 3 =  38


### Get the neighborhoods closest to the largest cluster
In this run the largest cluster is Cluster 1 (214 restaurants), so let's find the neighborhoods closest to its centroid.

In [26]:
# Pick the cluster with the most restaurants instead of hard-coding its label:
# k-means label numbering is arbitrary and can differ from one fit to the next.
largest_cluster = np.bincount(k_means_labels).argmax()
cen_lat, cen_lng = k_means_cluster_centers[largest_cluster]

# Straight-line distance, in degrees, from every neighborhood to that centroid.
# NOTE: latitude and longitude degrees are treated as equal lengths here, so this is
# a rough ranking metric rather than a true ground distance.
dlat = hood_df['Latitude'] - cen_lat
dlng = hood_df['Longitude'] - cen_lng
hood_df['Distance'] = np.sqrt(dlat**2 + dlng**2)

### Final result: Sort Neighborhoods by Distance

In [27]:
# The two rows with the smallest Distance to the chosen cluster centroid
hood_df.nsmallest(2, 'Distance')

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Distance
6,Downtown Toronto,Central Bay Street,43.656072,-79.385653,0.003187
8,Downtown Toronto,"Richmond, Adelaide, King",43.650542,-79.384116,0.006169


### The top two neighborhood areas are "Central Bay Street" and "Richmond, Adelaide, King"