# Segmenting and Clustering Neighborhoods in Toronto

In [4]:
#import libraries

import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim, ArcGIS
import requests 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print('All libraries loaded!')

All libraries loaded!


### Getting the data from Wikipedia

    BeautifulSoup is the tool of choice for extracting the information we need, because the table's HTML format does not allow pandas to read it directly with pd.read_html().
    The main steps are
    
    1. Send a GET request to get the HTML code 
    2. Create a BeautifulSoup instance to simplify the extraction 
    3. Get the relevant table from the HTML body code
    4. Get the information inside the rows
    5. Create a dataframe with the gathered information
    6. Preprocess the dataframe to remove irrelevant data

In [5]:
from bs4 import BeautifulSoup
import requests

# Getting the HTML data
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'lxml')
tables = soup.find_all("table")
if not tables:
    raise ValueError('No <table> element found at {}'.format(url))

data_table = tables[0] # The first table is the one containing the info we want

In [19]:
# Skip the first <tr>, which holds the table header.
rows = data_table.find_all('tr')[1:]

columns = ['PostalCode','Borough', 'Neighborhood']

# One list of stripped cell texts per table row.
table_records = [
    [cell.text.rstrip() for cell in tr.find_all('td')]
    for tr in rows
]

toronto_df = pd.DataFrame(table_records, columns=columns)

# Replace empty strings with NaN. The result is assigned back explicitly:
# calling replace(..., inplace=True) on a selected column is a chained
# in-place operation and does not update the frame under copy-on-write.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].replace('', np.nan)

# Drop rows containing NaN and renumber the index from 0.
toronto_df = toronto_df.dropna().reset_index(drop=True)

In [7]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
toronto_df.shape

(103, 3)

### Adding Location Data

In [21]:
# Identifier passed to the geocoding client as its user agent.
user_agent = "toronto_n"
# Shared geocoder instance; get_ll() reads this module-level name.
geolocator = ArcGIS(user_agent=user_agent) #The ArcGIS geolocator doesn't need any credentials like Google's


def get_ll(postal_code):
    """Geocode a Toronto postal code into a (latitude, longitude) pair.

    Parameters
    ----------
    postal_code : str
        Postal code prefix; it is embedded in the query
        '<postal_code>, Toronto, Ontario'.

    Returns
    -------
    tuple
        (latitude, longitude) taken from the geocoder result, or
        (nan, nan) when the geocoder finds no match.
    """
    address = f'{postal_code}, Toronto, Ontario'
    location = geolocator.geocode(address)

    # geocode() yields None when nothing matches; return NaN coordinates
    # instead of failing with AttributeError so the caller can drop them.
    if location is None:
        return float('nan'), float('nan')

    return location.latitude, location.longitude

In [23]:
# Geocode every postal code; each cell holds the tuple returned by get_ll.
toronto_df['location'] = toronto_df['PostalCode'].apply(get_ll)

In [24]:
def lat(location):
    """Return the first element (latitude) of a location pair."""
    latitude = location[0]
    return latitude
    
def long(location):
    """Return the second element (longitude) of a location pair."""
    longitude = location[1]
    return longitude
    
# Split the location tuples into two numeric columns.
toronto_df['Latitude'] = toronto_df['location'].apply(lat)
toronto_df['Longitude'] = toronto_df['location'].apply(long)

# The tuple column is no longer needed once both coordinates are extracted.
toronto_df.drop(columns='location', inplace=True)

In [25]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


# Clustering Neighborhoods

#### Filter only the boroughs that contain 'Toronto'

In [31]:
# Keep the rows whose borough name contains 'Toronto', renumbered from 0.
toronto_mask = toronto_df['Borough'].str.contains('Toronto')
only_toronto = toronto_df[toronto_mask].reset_index(drop=True)
only_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
3,M5C,Downtown Toronto,St. James Town,43.651734,-79.375554
4,M4E,East Toronto,The Beaches,43.678148,-79.295349


#### Let's check the neighborhoods' distribution

In [60]:
# Base map centred on the coordinates used throughout this notebook.
toronto_map = folium.Map([43.67,-79.37], zoom_start=12)

# One circle marker per postal code. The loop variables are deliberately not
# called `lat`/`long`, so the module-level lat()/long() helpers defined
# earlier are not overwritten when this cell runs.
for marker_lat, marker_lng, postal_code in zip(only_toronto['Latitude'], only_toronto['Longitude'], only_toronto['PostalCode']):
    label = folium.Popup(postal_code, parse_html=True)
    # parse_html belongs to Popup (above); it is not a CircleMarker option,
    # so it is no longer passed here.
    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)


In [61]:
toronto_map

## Let's get the data from the venues

In [122]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=50, timeout=10):
    """Collect venues around each location from the Foursquare explore endpoint.

    NOTE(review): CLIENT_ID, CLIENT_SECRET and VERSION are read as module-level
    names and must be defined before calling; load them from the environment
    rather than hard-coding them in the notebook.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query; iterated together.
    radius : int, default 500
        Value sent as the `radius` query parameter.
    limit : int, default 50
        Value sent as the `limit` query parameter (previously a hard-coded
        constant inside the function).
    timeout : float, default 10
        Seconds to wait for each HTTP request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue is returned.

    Raises
    ------
    Whatever `response.raise_for_status()` raises for an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; fail loudly on HTTP errors instead of hitting
        # an opaque KeyError while digging through an error payload
        response = requests.get(explore_url, params=params, timeout=timeout)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category is kept, with a missing category.
                categories[0]['name'] if categories else None))

    # Passing columns= here keeps the schema stable for an empty result.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [152]:
# Query venues around every Toronto postal code; the postal code is used as
# the location label.
toronto_venues = getNearbyVenues(
    only_toronto['PostalCode'],
    only_toronto['Latitude'],
    only_toronto['Longitude'],
)

In [153]:
print(toronto_venues.shape)
toronto_venues.head()

(1135, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.650964,-79.353041,Souk Tabule,43.653756,-79.35439,Mediterranean Restaurant
1,M5A,43.650964,-79.353041,Young Centre for the Performing Arts,43.650825,-79.357593,Performing Arts Venue
2,M5A,43.650964,-79.353041,SOMA chocolatemaker,43.650622,-79.358127,Chocolate Shop
3,M5A,43.650964,-79.353041,BATLgrounds,43.647088,-79.351306,Athletics & Sports
4,M5A,43.650964,-79.353041,Cluny Bistro & Boulangerie,43.650565,-79.357843,French Restaurant


In [176]:
# One indicator column per venue category, plus the postal code of each venue
# (stored in the 'Neighborhood' column of toronto_venues).
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']], prefix="", prefix_sep=""
).assign(PostalCode=toronto_venues['Neighborhood'])

# Rotate the column order so the last column comes first.
fixed_columns = [toronto_onehot.columns[-1], *toronto_onehot.columns[:-1]]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories Store,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
toronto_onehot.shape

(1135, 212)

### Group the venues by postal code

In [178]:
# Mean of each indicator column per postal code, i.e. the share of that
# postal code's venues falling in each category.
postal_code_groups = toronto_onehot.groupby('PostalCode')
toronto_grouped = postal_code_groups.mean().reset_index()
toronto_grouped

Unnamed: 0,PostalCode,Accessories Store,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
toronto_grouped.shape

(39, 212)

### Select only the top 10 venues for each postal code

In [180]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped (in this notebook it holds the postal
    code); the remaining entries are sorted in descending order and the index
    labels of the first `num_top_venues` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [181]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column labels: '1st', '2nd', '3rd', then '<n>th' for the rest. The bound is
# checked explicitly rather than with a bare `except`, which would also have
# hidden unrelated errors.
columns = ['PostalCode']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of every postal code, then build the frame in one step
# instead of filling an empty frame row by row.
top_venue_rows = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venue_rows, columns=columns[1:], index=toronto_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'PostalCode', toronto_grouped['PostalCode'])

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Trail,Neighborhood,Pub,Church,Dance Studio,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,M4K,Bus Line,Park,Business Service,Grocery Store,Discount Store,Yoga Studio,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
2,M4L,Sandwich Place,Fast Food Restaurant,Park,Sushi Restaurant,Steakhouse,Food & Drink Shop,Fish & Chips Shop,Burrito Place,Liquor Store,Gym
3,M4M,Baseball Field,Business Service,Government Building,Night Market,Yoga Studio,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
4,M4N,Bus Line,Swim School,Yoga Studio,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


## Create and Train the Model

In [182]:
# set number of clusters
kclusters = 10

# Feature matrix: every column except the postal code label.
# `columns=` is used because the positional axis argument of drop()
# (drop('PostalCode', 1)) raises a TypeError on pandas >= 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# run k-means clustering
# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version's default; random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=42).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([9, 0, 1, 6, 4, 1, 2, 1, 8, 1], dtype=int32)

In [183]:
# add clustering labels
# Remove a label column left by a previous run of this cell first;
# insert() raises ValueError if the column already exists.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to each postal code's borough,
# neighborhood and coordinates. Postal codes with no venue data get NaN from
# the join and are dropped; the labels are then cast back to int because the
# join turns them into floats whenever any row is unmatched.
toronto_merged = (
    only_toronto
    .join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')
    .dropna()
    .astype({'Cluster Labels': int})
)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,1,Pub,Café,Athletics & Sports,Distribution Center,Bank,Mexican Restaurant,Coffee Shop,Food Truck,Chocolate Shop,Tech Startup
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,1,Coffee Shop,Café,Wings Joint,Bank,Restaurant,Sandwich Place,Burrito Place,Creperie,Smoothie Shop,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529,1,Coffee Shop,Café,Cosmetics Shop,Middle Eastern Restaurant,Ramen Restaurant,Restaurant,Japanese Restaurant,Clothing Store,Tea Room,Theater
3,M5C,Downtown Toronto,St. James Town,43.651734,-79.375554,1,Café,Gastropub,Coffee Shop,Seafood Restaurant,Creperie,Cosmetics Shop,Cocktail Bar,Grocery Store,Camera Store,Fountain
4,M4E,East Toronto,The Beaches,43.678148,-79.295349,9,Health Food Store,Trail,Neighborhood,Pub,Church,Dance Studio,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


In [188]:
## Draw the clusters in the map

In [189]:
# create map
toronto_clusters = folium.Map([43.67,-79.37], zoom_start=12)

# set color scheme for the clusters: one colour per cluster label
palette = ['#5856d6','#337aff','#ff9500','#4cd964','#ff2d55','#ffcc00','#ff3b30','#f5a7f7','#67d54c','#c85025']

# add markers to the map
# The loop variables are not called `lat`/`long`, so the module-level helpers
# with those names are left intact.
for marker_lat, marker_lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    cluster_id = int(cluster)
    # Index the palette by the label itself (the previous `cluster - 1` sent
    # label 0 to palette[-1]); the modulo keeps the lookup valid if there are
    # ever more clusters than colours.
    cluster_color = palette[cluster_id % len(palette)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster_id), parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(toronto_clusters)
       
toronto_clusters

It seems that downtown Toronto has a fairly even distribution of venues; however, a few places on the outskirts stand out as different.