# Segmenting and Clustering Neighborhoods in Toronto

## Creating a dataframe of neighborhoods in Toronto

Import the necessary libraries for this part

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

Load the table from the URL page into a pandas dataframe

In [2]:
# Download the Wikipedia page and load its first table into a dataframe.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # Define the URL to get the data from
postalHTML = get(url, timeout=30) # Get the data (time out instead of hanging on a stalled connection)
postalHTML.raise_for_status() # Stop on an HTTP error rather than parsing an error page
postalSoup = BeautifulSoup(postalHTML.text, 'html.parser') # Get the soup
postalTable = postalSoup.find('table') # Find the first table inside the soup
if postalTable is None:
    # Without this check str(None) would be handed to read_html and fail with a confusing message
    raise ValueError('No <table> element found at ' + url)
# Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated in recent pandas
postal_df_orig = pd.read_html(StringIO(str(postalTable)), header=0)[0] # Convert to a pandas dataframe
postal_df_orig.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Clean up the dataframe

In [3]:
# Build the cleaned frame in one non-mutating chain so the cell can be re-run safely.
postal_df = (
    postal_df_orig[postal_df_orig['Borough'] != 'Not assigned']  # drop postal codes with no borough assigned
    .replace([r'\W* /'], ',', regex=True)  # "A / B" -> "A, B"; raw string avoids the invalid "\W" escape
    .sort_values(by='Postal code')
    .reset_index(drop=True)
)

# A neighborhood can be missing either as the text 'Not assigned' or as NaN
# (the raw table shows empty cells); in both cases fall back to the borough name.
neighborhood_missing = postal_df['Neighborhood'].isna() | (postal_df['Neighborhood'] == 'Not assigned')
postal_df.loc[neighborhood_missing, 'Neighborhood'] = postal_df.loc[neighborhood_missing, 'Borough']

postal_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Print the shape of the cleaned dataframe

In [4]:
# Report the size of the cleaned dataframe as a (rows, columns) tuple
n_rows, n_cols = postal_df.shape
print((n_rows, n_cols))

(103, 3)


End of this part

***
## Getting the latitude and longitude of each neighborhood

Import additional necessary libraries for this part

In [5]:
import pgeocode # the suggested geocoder requires an API key, so I'm using the pgeocode
import urllib.request

Obtain the latitude and longitude values for each postal code

In [6]:
print('Obtaining latitude and longitude for the postal codes:')

nomi = pgeocode.Nominatim('ca')

# Three parallel lists, one entry per postal code, filled in dataframe order
postalCodes, latitudes, longitudes = [], [], []

for code in postal_df['Postal code']:
    print(code, end =" ")

    # One lookup per postal code; the result exposes .latitude / .longitude
    result = nomi.query_postal_code(code)

    postalCodes.append(code)
    latitudes.append(result.latitude)
    longitudes.append(result.longitude)

    print('... done!')

Obtaining latitude and longitude for the postal codes:
M1B ... done!
M1C ... done!
M1E ... done!
M1G ... done!
M1H ... done!
M1J ... done!
M1K ... done!
M1L ... done!
M1M ... done!
M1N ... done!
M1P ... done!
M1R ... done!
M1S ... done!
M1T ... done!
M1V ... done!
M1W ... done!
M1X ... done!
M2H ... done!
M2J ... done!
M2K ... done!
M2L ... done!
M2M ... done!
M2N ... done!
M2P ... done!
M2R ... done!
M3A ... done!
M3B ... done!
M3C ... done!
M3H ... done!
M3J ... done!
M3K ... done!
M3L ... done!
M3M ... done!
M3N ... done!
M4A ... done!
M4B ... done!
M4C ... done!
M4E ... done!
M4G ... done!
M4H ... done!
M4J ... done!
M4K ... done!
M4L ... done!
M4M ... done!
M4N ... done!
M4P ... done!
M4R ... done!
M4S ... done!
M4T ... done!
M4V ... done!
M4W ... done!
M4X ... done!
M4Y ... done!
M5A ... done!
M5B ... done!
M5C ... done!
M5E ... done!
M5G ... done!
M5H ... done!
M5J ... done!
M5K ... done!
M5L ... done!
M5M ... done!
M5N ... done!
M5P ... done!
M5R ... done!
M5S ... done!
M5T ...

Convert to a pandas dataframe

In [7]:
# Index the pgeocode results by postal code and order them by that index
lat_long_df = pd.DataFrame(
    {'Latitude': latitudes, 'Longitude': longitudes},
    index=postalCodes,
).sort_index()
lat_long_df.head()

Unnamed: 0,Latitude,Longitude
M1B,43.8113,-79.193
M1C,43.7878,-79.1564
M1E,43.7678,-79.1866
M1G,43.7712,-79.2144
M1H,43.7686,-79.2389


Download the .csv with the latitudes and longitudes

In [8]:
# Local file name first, then the remote location it is fetched from
fileName = 'Geospatial_Coordinates.csv'
url = 'http://cocl.us/Geospatial_data'

# Save the remote file locally; the cell displays urlretrieve's (filename, headers) result
urllib.request.urlretrieve(url, filename=fileName)

('Geospatial_Coordinates.csv', <http.client.HTTPMessage at 0xaad9988>)

Converting the data from the .csv into a pandas dataframe

In [9]:
# Read the coordinates file with the postal code as the index right away
lat_long_csv_df = pd.read_csv(fileName, index_col='Postal Code')
lat_long_csv_df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [10]:
# Comparative statistics between the pgeocode results and the downloaded .csv.
# Both frames are indexed by postal code, so the subtraction aligns row by row on that index.
dif = pd.DataFrame({
    'Dif in Latitude': lat_long_df['Latitude'] - lat_long_csv_df['Latitude'],
    'Dif in Longitude': lat_long_df['Longitude'] - lat_long_csv_df['Longitude'],
})

# Largest per-column NaN count of each source. .max() replaces the positional [0]/[1]
# lookups, which are deprecated on a label-indexed Series.
nan_pgeocode = int(lat_long_df.isna().sum().max())
nan_csv = int(lat_long_csv_df.isna().sum().max())
print('The dataframe created using pgeocode has %d NaN values, while the downloaded .csv has %d NaN values.' % (nan_pgeocode, nan_csv))

# Mean absolute difference per coordinate, selected by column label instead of position
mean_abs_dif = dif.abs().mean()
print('The dataframe created using pgeocode has average differences in latitude and longitude of %.4f and %.4f, respectively' % (mean_abs_dif['Dif in Latitude'], mean_abs_dif['Dif in Longitude']))

The dataframe created using pgeocode has 1 NaN values, while the downloaded .csv has 0 NaN values.
The dataframe created using pgeocode has average differences in latitude and longitude of 0.0034 and 0.0035, respectively


Due to the NaN value in the dataframe obtained through the pgeocode library, I opted to use the provided .csv file

Since `postal_df` and `lat_long_csv_df` are both ordered the same way, drop the postal-code index of `lat_long_csv_df` so that the two dataframes can be joined side by side by row position.

In [11]:
# Order the coordinates by their index (the postal code on the first run) and then discard it,
# leaving a plain 0..n-1 index for the positional join that follows.
# Unlike the reset_index/drop pair, this is idempotent: re-running the cell on the
# already-reset frame is a no-op instead of a KeyError on 'Postal Code'.
lat_long_csv_df = lat_long_csv_df.sort_index().reset_index(drop=True)
lat_long_csv_df.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


Merging the postal codes with the latitudes and longitudes

In [12]:
# The two frames are joined purely by row position, so a differing row count would
# silently produce rows with NaN on one side; fail loudly instead.
if len(postal_df) != len(lat_long_csv_df):
    raise ValueError('postal_df has %d rows but lat_long_csv_df has %d; cannot join them by position'
                     % (len(postal_df), len(lat_long_csv_df)))

postalLatLng_df = pd.concat([postal_df, lat_long_csv_df], axis=1)
postalLatLng_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


End of this part

***
## Exploring and clustering the neighborhoods in Toronto

Import additional necessary libraries for this part

In [13]:
import json
import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

Visualization of the map of Toronto with all the neighborhoods

In [25]:
# Centre the map on Leaside. The coordinates are reduced to plain floats: passing the
# filtered Series themselves relies on an implicit one-element Series -> float conversion
# and breaks when zero or several rows match.
centre_rows = postalLatLng_df[postalLatLng_df['Neighborhood'] == 'Leaside']
if centre_rows.empty:
    # Fall back to the centroid of all postal codes if Leaside is not in the data
    centre_rows = postalLatLng_df

map_TO = folium.Map(
    location=[
        float(centre_rows['Latitude'].mean()),
        float(centre_rows['Longitude'].mean())
    ],
    zoom_start=11)

# add one marker per postal code, labelled with its neighborhood name(s)
for lat, lng, label in zip(postalLatLng_df['Latitude'], postalLatLng_df['Longitude'], postalLatLng_df['Neighborhood']):
    # parse_html is set here, on the popup; it was also passed to CircleMarker below, where it does not belong
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_TO)

display(map_TO)

Foursquare credentials

In [15]:
VERSION = '20180605' # Foursquare API version

# Credentials are read from a local text file: first line is the ID, second line the secret.
try:
    with open('FS_cred.txt', 'r') as file:
        # strip() removes the trailing newline kept on every line, which would otherwise
        # end up inside the request URL
        creds = [line.strip() for line in file]

    CLIENT_ID = creds[0] # your Foursquare ID
    CLIENT_SECRET = creds[1] # your Foursquare Secret

    print('Credentials loaded successfully!')

except (OSError, IndexError):
    # OSError: file missing or unreadable; IndexError: fewer than two lines in the file.
    # Anything else is a real bug and is no longer swallowed by a bare except.
    print('Unable to access file containing the credentials.')

Credentials loaded successfully!


Define a function to get the nearby venues of all the neighborhoods

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare venues/explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped together, so one request is made per
        (name, latitude, longitude) triple.
    radius : int, optional
        Value sent as the ``radius`` query parameter (presumably metres - confirm
        against the Foursquare documentation).
    limit : int, optional
        Value sent as the ``limit`` query parameter. When omitted, the notebook-level
        ``LIMIT`` constant is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. Empty, but with the same columns, if no
        venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    Venues returned without any category are skipped, because everything downstream
    is based on the venue category.
    """
    if limit is None:
        limit = LIMIT  # backward-compatible fallback to the global set before the call

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; requests builds and escapes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)
        response.raise_for_status()

        # tolerate a reply without groups/items instead of failing with KeyError/IndexError
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                continue  # no category to analyse
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # passing the columns here keeps the frame well-formed even when no venue was found
    return pd.DataFrame(rows, columns=columns)

Get the venues (limited to 100 per neighborhood)

In [17]:
LIMIT = 100  # read by getNearbyVenues as the per-request venue limit

# One request per row of postalLatLng_df, using the function's default radius
TO_venues = getNearbyVenues(
    postalLatLng_df['Neighborhood'],
    postalLatLng_df['Latitude'],
    postalLatLng_df['Longitude'],
)

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence P

In [18]:
# Total number of venue rows and number of distinct venue categories
n_venues = len(TO_venues)
n_categories = TO_venues['Venue Category'].unique().size
print('Found a total of %d venues in %d different categories!' % (n_venues, n_categories))

Found a total of 2124 venues in 268 different categories!


Preparing the dataframe for the analysis

One-hot encoding

In [19]:
# One indicator column per venue category, named after the category itself
TO_onehot = pd.get_dummies(TO_venues['Venue Category'])

# If a category happens to be called 'Neighborhood' its indicator column is replaced by the
# neighborhood names (the same outcome as assigning over it and moving it to the front).
TO_onehot = TO_onehot.drop(columns='Neighborhood', errors='ignore')
TO_onehot.insert(0, 'Neighborhood', TO_venues['Neighborhood'])  # neighborhood name as first column

TO_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Average occurrence of each venue category near each neighborhood

In [20]:
# Mean of every indicator column per neighborhood, i.e. the share of each venue category;
# as_index=False keeps 'Neighborhood' as an ordinary first column
TO_grouped = TO_onehot.groupby('Neighborhood', as_index=False).mean()
TO_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.052632,0.000,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.025,0.0,0.0,0.0,0.0,0.0
90,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
91,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0
92,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0


K-means clustering

In [21]:
kclusters = 5 # set number of clusters

# Features are the category frequencies only; the neighborhood name is not a feature.
# (The axis must be given by keyword: the positional form drop('Neighborhood', 1) is rejected by pandas 2.x.)
TO_grouped_clustering = TO_grouped.drop(columns='Neighborhood')

# run k-means clustering; a fixed random_state makes the cluster assignment reproducible between runs
kmeans = KMeans(n_clusters=kclusters, n_init=50, random_state=0).fit(TO_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print('There are %d neighborhoods and %d labels:' % (TO_grouped.shape[0], kclusters))
# bincount gives the size of every label 0..kclusters-1 in a fixed order (a set has no guaranteed order)
label_counts = np.bincount(kmeans.labels_, minlength=kclusters)
for label, count in enumerate(label_counts):
    print('The label %d was applied to %.2f%% of the neighborhoods' % (label, 100*count/len(kmeans.labels_)))

There are 94 neighborhoods and 5 labels:
The label 0 was applied to 82.98% of the neighborhoods
The label 1 was applied to 11.70% of the neighborhoods
The label 2 was applied to 2.13% of the neighborhoods
The label 3 was applied to 2.13% of the neighborhoods
The label 4 was applied to 1.06% of the neighborhoods


Ordering the most popular venues in each neighborhood to compare with the cluster choices

In [22]:
# Function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighborhood name). The remaining values are sorted in descending order and
    the index labels of the first ``num_top_venues`` entries are returned as an array.
    """
    frequencies = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = frequencies.index.values
    return ranked_labels[:num_top_venues]

# Number of top venues to order
num_top_venues = 10

# Column names: 'Neighborhood', '1st Most Common Venue', '2nd ...', '3rd ...', then 'Nth ...'.
# The suffix is chosen with an explicit bounds check instead of a bare except around an IndexError.
indicators = ['st', 'nd', 'rd']
columns = ['Neighborhood'] + [
    '{}{} Most Common Venue'.format(rank + 1, indicators[rank] if rank < len(indicators) else 'th')
    for rank in range(num_top_venues)
]

# Rank the categories of every neighborhood, then build the dataframe in one go
# rather than filling an empty frame row by row.
top_venues = [
    return_most_common_venues(TO_grouped.iloc[ind, :], num_top_venues)
    for ind in range(TO_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', TO_grouped['Neighborhood'].values)

# add clustering labels (kmeans was fitted on TO_grouped, so the labels are in the same row order)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# create a dataframe with each neighborhood, latitude, longitude, cluster label, and most popular venues
TO_merged = (
    postalLatLng_df
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .dropna()  # rows with no match in neighborhoods_venues_sorted get NaN from the join and are dropped
)
TO_merged['Cluster Labels'] = TO_merged['Cluster Labels'].astype('int8')  # the join with NaN made the labels float
TO_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,2,Fast Food Restaurant,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Bar,Golf Course,Yoga Studio,Dumpling Restaurant,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Breakfast Spot,Rental Car Location,Bank,Electronics Store,Mexican Restaurant,Intersection,Medical Center,Donut Shop,Dog Run,Doner Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Convenience Store,Korean Restaurant,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Hakka Restaurant,Gas Station,Fried Chicken Joint,Bank,Bakery,Thai Restaurant,Athletics & Sports,Caribbean Restaurant,Cupcake Shop,Distribution Center


Visualize the cluster choices

In [24]:
# create map, centred on Leaside. The coordinates are reduced to plain floats: passing the
# filtered Series themselves relies on an implicit one-element Series -> float conversion
# and breaks when zero or several rows match.
centre_rows = postalLatLng_df[postalLatLng_df['Neighborhood'] == 'Leaside']
if centre_rows.empty:
    # Fall back to the centroid of all postal codes if Leaside is not in the data
    centre_rows = postalLatLng_df

map_clusters = folium.Map(
    location=[
        float(centre_rows['Latitude'].mean()),
        float(centre_rows['Longitude'].mean())
    ],
    zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(TO_merged['Latitude'], TO_merged['Longitude'], TO_merged['Neighborhood'], TO_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the palette directly; the previous cluster-1
    # only worked for label 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

display(map_clusters)