# Toronto neighborhoods clustering

### 1. Data scraping

Get a list of all Toronto neighborhoods and boroughs from Wikipedia. Since Wikipedia pages are intended for human readers, we have to download the corresponding page, extract the information we want with a web scraper, and convert that information to a Pandas DataFrame.

In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Download the article from Wikipedia and parse it with BeautifulSoup
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(wikipedia_url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# Find the table in the HTML
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError("Could not find the 'wikitable sortable' table in the downloaded page")
items = table.find_all('td')

# Build one record per postal code. The flat list of <td> cells is consumed
# three at a time (postal code, borough, neighborhood); this assumes every
# table row has exactly three cells.
data = {'PostalCode': [], 'Borough': [], 'Neighborhood': []}
# Maps each postal code already stored to its position in the `data` lists,
# so a repeated code updates the right record (and the lookup is O(1)).
row_of_code = {}
for i in range(len(items) // 3):
    code = items[3*i].text.rstrip()
    borough = items[3*i+1].text.rstrip()
    neighborhood = items[3*i+2].text.rstrip()

    # Postal codes without a borough are not in use: skip them
    if borough == 'Not assigned':
        continue

    if code in row_of_code:
        # Join neighborhoods belonging to the same postal code. The record to
        # extend is the one stored for this code, not the cell offset 3*i.
        if neighborhood != 'Not assigned':
            row = row_of_code[code]
            data['Neighborhood'][row] = data['Neighborhood'][row] + ", " + neighborhood
    else:
        row_of_code[code] = len(data['PostalCode'])
        data['PostalCode'].append(code)
        data['Borough'].append(borough)
        # If the neighborhood name is not specified, take the borough name as the neighborhood's
        data['Neighborhood'].append(neighborhood if neighborhood != 'Not assigned' else borough)

neighborhoods = pd.DataFrame(data)
neighborhoods.head(12)
        

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### 2. Get neighborhood coordinates

The Wikipedia page doesn't give the coordinates of the neighborhoods. Therefore, we have to use a separate dataset and join it with the previous DataFrame.

In [4]:
# Load the postal-code coordinates and align the key column name with the
# one used in the neighborhoods DataFrame
coordinates = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'PostalCode'}
)

# Attach the coordinate columns to each neighborhood by postal code
neighborhoods = neighborhoods.merge(coordinates, on='PostalCode')
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### 3. Cluster the neighborhoods

The last step is to cluster the main Toronto neighborhoods based on their most popular venue categories, using the Foursquare API and scikit-learn.

#### 3.1. Exclude the boroughs whose names don't contain "Toronto" and remove the postal code, since we don't need it

In [58]:
# Keep only the rows whose borough name mentions "Toronto", drop the postal
# code column and renumber the remaining rows from zero
is_toronto_borough = neighborhoods['Borough'].str.contains("Toronto")
neighborhoods_toronto = (
    neighborhoods.loc[is_toronto_borough]
    .drop(columns=['PostalCode'])
    .reset_index(drop=True)
)
neighborhoods_toronto.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031


#### 3.2. Define Foursquare API credentials

In [6]:
import os

# NOTE(security): API credentials should not live in a shared notebook. The
# values are now read from the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# environment variables; the literals below are kept only as a fallback so the
# notebook still runs unchanged. They have been published with this file, so
# they should be rotated and the fallbacks removed.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '5JXQFKUSHUJTB23CXAEGRPSS0WN1U3LVAHQTPKCAOBFVROP5')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'KBNU1HVJ22GBB3JBWQ2WKAQTJ42OMQZDDZDFPRF2VQ3EHGQP')
VERSION = '20200613' # sent as the `v` query parameter of each request
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius in meters

#### 3.3. Get from Foursquare the top 100 venues of each neighborhood

In [12]:
# Collect one record per venue in a plain list and build the DataFrame once at
# the end: DataFrame.append copied the whole frame on every call and is no
# longer available in pandas 2.x.
venue_rows = []
for neighborhood in neighborhoods_toronto.itertuples():
    # For each neighborhood...
    name = neighborhood.Neighborhood
    lat = neighborhood.Latitude
    lng = neighborhood.Longitude

    # ... get the closest venues (requests builds and escapes the query string)...
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Surface HTTP errors (bad credentials, quota exceeded) as such instead of
    # as a KeyError while digging into the JSON payload.
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []

    # ... and record them
    for v in results:
        categories = v['venue']['categories']
        if not categories:
            # A venue without any category cannot contribute to the clustering
            continue
        venue_rows.append({
            'Neighborhood': name,
            'Latitude': lat,
            'Longitude': lng,
            'Venue Category': categories[0]['name'],
        })

toronto_venues = pd.DataFrame(
    venue_rows, columns=['Neighborhood', 'Latitude', 'Longitude', 'Venue Category'])
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Spa
5,"Regent Park, Harbourfront",43.65426,-79.360636,Restaurant
6,"Regent Park, Harbourfront",43.65426,-79.360636,Park
7,"Regent Park, Harbourfront",43.65426,-79.360636,Gym / Fitness Center
8,"Regent Park, Harbourfront",43.65426,-79.360636,Historic Site
9,"Regent Park, Harbourfront",43.65426,-79.360636,Breakfast Spot


#### 3.4. Prepare the DataFrame for clustering

In [34]:
# One-hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# A venue category that is itself called "Neighborhood" would clash with the
# key column added below, so drop it when present. errors='ignore' keeps the
# cell working when the fetched data contains no such category.
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')
toronto_onehot = pd.concat([toronto_venues['Neighborhood'], toronto_onehot], axis=1)

# Group one-hot by neighborhood: the mean of each indicator column is the
# share of the neighborhood's venues that belong to that category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0,0.0,0.015385
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.012821,0.0,0.025641
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Build a table listing, for every neighborhood, its most frequent venue
# categories in decreasing order of frequency
num_top_venues = 10
columns = ['Neighborhood'] + [
    '{}. Most Common Venue'.format(rank) for rank in range(1, num_top_venues + 1)
]

neighborhoods_venues = pd.DataFrame(columns=columns)
neighborhoods_venues['Neighborhood'] = toronto_grouped['Neighborhood']

for position in range(len(toronto_grouped)):
    # Every column after the first holds the frequency of one venue category
    frequencies = toronto_grouped.iloc[position, 1:]
    ranked_categories = frequencies.sort_values(ascending=False).index.values
    neighborhoods_venues.iloc[position, 1:] = ranked_categories[:num_top_venues]

neighborhoods_venues.head()

Unnamed: 0,Neighborhood,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Beer Bar,Bakery,Seafood Restaurant,Restaurant,Café,Clothing Store,Irish Pub
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Bakery,Gym,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
2,"Business reply mail Processing Centre, South C...",Yoga Studio,Auto Workshop,Park,Pizza Place,Restaurant,Butcher,Burrito Place,Skate Park,Brewery,Comic Shop
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Rental Car Location,Harbor / Marina,Plane,Boat or Ferry,Boutique,Bar,Sculpture Garden
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Department Store,Salad Place,Burger Joint,Bubble Tea Shop,Indian Restaurant


#### 3.5. Cluster the neighborhoods using k-means

In [77]:
from sklearn.cluster import KMeans

# Set number of clusters
k = 5

# Keep only the numeric category-frequency columns. The keyword form is used
# because pandas 2.x no longer accepts the positional axis argument of drop().
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; a fixed random_state makes the cluster labels
# reproducible between runs, and n_init is spelled out so the number of
# restarts does not depend on the installed scikit-learn version
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [78]:
# Attach the k-means label of every neighborhood to its top-venues row.
# kmeans was fitted on toronto_grouped, whose row order neighborhoods_venues
# shares, so the labels line up positionally.
# The labels go into a copy from which any earlier 'Cluster Labels' column has
# been removed: DataFrame.insert raises ValueError when the column already
# exists, which made the original insert fail whenever the cell was re-run.
labeled_venues = neighborhoods_venues.drop(columns='Cluster Labels', errors='ignore')
labeled_venues.insert(0, 'Cluster Labels', kmeans.labels_)

# Left join on the neighborhood name: neighborhoods for which no venue was
# found keep their row but get NaN in the label and venue columns
toronto_merged = neighborhoods_toronto.join(labeled_venues.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Restaurant,Theater,Farmers Market,Chocolate Shop
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Discount Store,Sandwich Place,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop,Fried Chicken Joint
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Bubble Tea Shop,Japanese Restaurant,Italian Restaurant,Café,Middle Eastern Restaurant,Cosmetics Shop,Diner,Tea Room
3,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Cocktail Bar,Gastropub,American Restaurant,Restaurant,Hotel,Italian Restaurant,Lingerie Store,Department Store
4,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Health Food Store,Pub,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop


In [79]:
# Count how many neighborhoods were assigned to each cluster label
toronto_merged.groupby('Cluster Labels')['Neighborhood'].count()

Cluster Labels
0    34
1     1
2     1
3     2
4     1
Name: Neighborhood, dtype: int64

#### 3.6. Show the clusters in a map

In [69]:
# Install and import the needed libraries
!pip install folium geopy

import folium
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

# Get Toronto coordinates
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode('Toronto, Canada')
latitude = location.latitude
longitude = location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

