# Segmenting and Clustering Neighborhoods in Toronto

## Import the required packages

In [201]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

## Load the data from Wikipedia
We will use the default `requests` package for this.

In [202]:
# Set the URL of the Wikipedia page for the postal codes of Canada
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Get the data from the URL. The timeout keeps a stalled connection from
# blocking the notebook indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page later on.
response.raise_for_status()

## Scrape the data from the page using BeautifulSoup

In [203]:
# Parse the HTML text of the downloaded Wikipedia page with BeautifulSoup
soup = BeautifulSoup(response.text, features='html.parser')

# Keep the first <table> element found on the page (the postal code table)
portal_code_table = soup.find(name='table')

## Create Dataframe from the HTML content of the table

In [204]:
from io import StringIO

# Serialise the BeautifulSoup table back to an HTML string for pandas
portal_code_table = str(portal_code_table)

# read_html returns a list of DataFrames, one per table in the markup.
# The string is wrapped in StringIO because passing literal HTML to
# read_html is deprecated in recent pandas releases.
tables = pd.read_html(StringIO(portal_code_table))

# Keep the first (here: the only expected) table as the working DataFrame
df = tables[0]

## Clean the data
We will apply two transformations here:
* Remove all items where Borough is 'Not assigned'
* If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough

No postal code appears more than once in the table, so no rows need to be combined and I will skip that transformation.

In [205]:
# Remove all rows where Borough is 'Not assigned'. The explicit copy makes the
# result an independent frame, so the column assignment below does not write
# to a slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# If a row has a borough but a 'Not assigned' neighbourhood, use the borough
# as the neighbourhood
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])

# Print shape as it's part of the instructions 8)
df.shape

(103, 3)

## Get the location of the postal codes

### First option: use the Google geocoder
Although this solution works perfectly fine and would be the way forward in a real-world project, it is not used here because the geocoder requires a private, paid API token that I would have needed to share with my peers.

In [206]:
import geocoder

def get_latlong(a, max_attempts=10):
    """Look up the coordinates of a row's postal code with the Google geocoder.

    Args:
        a: DataFrame row (or any mapping) with a 'Postal Code' entry.
        max_attempts: Number of geocoding requests to make before giving up.

    Returns:
        pd.Series built from the ``latlng`` value reported by the geocoder.

    Raises:
        RuntimeError: If no coordinates were returned after ``max_attempts``
            requests.
    """
    # Imported locally because the notebook's import cell does not import os.
    import os

    postal_code = a['Postal Code']

    # for the geocoder you need a Google API key
    secret_google_key = os.getenv('GOOGLE_SECRET_KEY')
    query = '{}, Toronto, Ontario'.format(postal_code)

    # Retry a bounded number of times: an unbounded loop never terminates when
    # the key is missing or the lookup keeps failing.
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query, key=secret_google_key).latlng
        if lat_lng_coords is not None:
            return pd.Series(lat_lng_coords)

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts))

# df[['Latitude', 'Longitude']] = df.apply(get_latlong, axis=1)

### Second option: use the given CSV
This solution uses the given CSV and joins the CSV location data with the original dataframe. 

In [207]:
# Load the provided coordinates CSV; the lookup function below reads its
# 'Postal Code', 'Latitude' and 'Longitude' columns.
location_df = pd.read_csv('Geospatial_Coordinates.csv')

def get_lat_long_from_location_data_set(row):
    """Return the coordinates stored in ``location_df`` for a row's postal code.

    Args:
        row: DataFrame row (or any mapping) with a 'Postal Code' entry.

    Returns:
        pd.Series holding the 'Latitude' and 'Longitude' values, in that
        order, of the first ``location_df`` row with a matching 'Postal Code'.

    Raises:
        IndexError: If ``location_df`` has no row for the postal code.
    """
    postal_code = row['Postal Code']
    matches = location_df.loc[
        location_df['Postal Code'] == postal_code, ['Latitude', 'Longitude']]
    if matches.empty:
        # Same exception type that indexing the empty result would raise, but
        # with a message that names the postal code that could not be found.
        raise IndexError(
            'No coordinates found for postal code {!r}'.format(postal_code))
    return pd.Series(matches.values[0])

# Look up the coordinates for every row and store them in two new columns
df[['Latitude', 'Longitude']] = df.apply(get_lat_long_from_location_data_set, axis=1)
# Display the resulting DataFrame
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Exploring the neighbourhoods

In [208]:
import folium
from geopy.geocoders import Nominatim

In [209]:
# Resolve the city name to coordinates with the Nominatim geocoder
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches the query; stop with a clear
# message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Show map
First I will plot the neighbourhoods on a map to see how they look and whether I can already spot anything interesting.

In [210]:
# Base map centred on the Toronto coordinates obtained above
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one blue circle per row of df, using the neighbourhood name as popup
neighbourhood_points = zip(df['Latitude'], df['Longitude'], df['Neighbourhood'])
for lat, lng, label in neighbourhood_points:
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html looks like a Popup option rather than a
    # CircleMarker one - confirm whether it has any effect on the marker.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Leave the map as the last expression so the notebook renders it
map_toronto

Nothing very interesting to see here, only that a lot of neighbourhoods are concentrated around the harbour region.

As a coffee lover, I'd like to find and cluster neighbourhoods by the number of coffee places. First, I need to pass in my Foursquare credentials.

In [68]:
import os

# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook (the Google geocoder key above is read the same way).
# NOTE: the credentials that used to be hard-coded here have been published
# with the notebook and should be revoked.
CLIENT_ID = os.getenv('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.getenv('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180323' # Foursquare API version
LIMIT = 100 # sent as the `limit` query parameter of each venue search

Now, I will create a method that will get the coffee places near a given location.

In [211]:
def getCoffeePlacesNearby(names, latitudes, longitudes, radius=500, timeout=10):
    """Search Foursquare for coffee venues around each given location.

    Args:
        names: Iterable of location names. Each name is printed as it is
            processed and stored in the 'Neighbourhood' column.
        latitudes: Iterable of latitudes, aligned with ``names``.
        longitudes: Iterable of longitudes, aligned with ``names``.
        radius: Value forwarded as the ``radius`` query parameter.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pd.DataFrame with one row per venue returned by the API and the
        columns 'Neighbourhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude' and
        'Venue Longitude'. The frame is empty (but has these columns) when no
        venue was returned.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude']
    search_url = 'https://api.foursquare.com/v2/venues/search'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string
        params = {
            'query': 'Coffee',
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; a timeout avoids hanging on a stalled
        # connection and an error status is reported right away
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        venues = response.json()["response"]['venues']

        # keep only the relevant information for each nearby venue
        rows.extend(
            (name, lat, lng, v['name'], v['location']['lat'], v['location']['lng'])
            for v in venues)

    # Passing the column names to the constructor also works for zero rows,
    # whereas assigning them afterwards fails on an empty frame.
    return pd.DataFrame(rows, columns=columns)

In [212]:
# Collect the coffee venues around every neighbourhood in df
places = getCoffeePlacesNearby(
    names=df['Neighbourhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

Let's show these coffee places on a map to see how they are distributed. I will also show the neighbourhoods on the same map.

In [265]:
# Base map centred on the Toronto coordinates obtained above
map_toronto_coffee_places = folium.Map(location=[latitude, longitude], zoom_start=11)

# Each entry describes one set of markers, drawn in this order:
# (latitudes, longitudes, popup texts, radius, outline colour, fill, fill opacity)
marker_layers = [
    # neighbourhoods: larger, unfilled red circles
    (df['Latitude'], df['Longitude'], df['Neighbourhood'], 5, 'red', False, 0.4),
    # coffee places: smaller, filled blue circles labelled with their neighbourhood
    (places['Venue Latitude'], places['Venue Longitude'], places['Neighbourhood'], 3, 'blue', True, 0.7),
]

for lats, lngs, texts, radius, outline, filled, opacity in marker_layers:
    for lat, lng, text in zip(lats, lngs, texts):
        folium.CircleMarker(
            [lat, lng],
            radius=radius,
            popup=folium.Popup(text, parse_html=True),
            color=outline,
            fill=filled,
            fill_color='#3186cc',
            fill_opacity=opacity,
            parse_html=False,
        ).add_to(map_toronto_coffee_places)

# Leave the map as the last expression so the notebook renders it
map_toronto_coffee_places

Okay, so as a coffee lover, there are a lot of neighbourhoods that will not make me very happy. Let's see the top 5 neighbourhoods when it comes to the number of coffee places.

In [266]:
# Count the venues found per neighbourhood, kept as a one-column DataFrame
places_grouped_with_count = places.groupby('Neighbourhood')[['Venue']].count()
# Display the neighbourhoods ordered from most to fewest coffee places
places_grouped_with_count.sort_values(by='Venue', ascending=False)

Unnamed: 0_level_0,Venue
Neighbourhood,Unnamed: 1_level_1
"Richmond, Adelaide, King",43
"First Canadian Place, Underground city",40
"Commerce Court, Victoria Hotel",39
"Toronto Dominion Centre, Design Exchange",35
Central Bay Street,31
"Garden District, Ryerson",30
Stn A PO Boxes,26
St. James Town,26
Berczy Park,14
"Queen's Park, Ontario Provincial Government",13


In [267]:

# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Cluster on an independent copy of the venue counts only. A plain assignment
# would alias places_grouped_with_count, so any column added to that frame
# later would also become part of the clustering input when this cell is
# run again.
toronto_coffee_places_grouped_clustering = places_grouped_with_count[['Venue']].copy()

# run k-means clustering; n_init is given explicitly because its default
# differs between scikit-learn releases
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_coffee_places_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 3, 0, 4, 4, 4, 2, 3, 1, 0], dtype=int32)

In [268]:
# Store each neighbourhood's k-means label in a new first column
places_grouped_with_count.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
# Display the labelled counts
places_grouped_with_count

Unnamed: 0_level_0,Cluster Labels,Venue
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
"Alderwood, Long Branch",4,2
Berczy Park,3,14
"Brockton, Parkdale Village, Exhibition Place",0,7
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",4,1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",4,1
Cedarbrae,4,1
Central Bay Street,2,31
Church and Wellesley,3,10
"Commerce Court, Victoria Hotel",1,39
Davisville,0,4


In [276]:
# Attach the cluster label and venue count to every neighbourhood row, then
# drop the rows that contain missing values: neighbourhoods without coffee
# places have no match in the join
toronto_neightbourhoods = df.join(places_grouped_with_count, on='Neighbourhood').dropna()
toronto_neightbourhoods


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Venue
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3.0,12.0
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,3.0,13.0
11,M3B,North York,Don Mills,43.745906,-79.352188,4.0,1.0
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,4.0,1.0
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2.0,30.0
14,M6B,North York,Glencairn,43.709577,-79.445073,4.0,1.0
20,M3C,North York,Don Mills,43.7259,-79.340923,4.0,1.0
21,M4C,East York,Woodbine Heights,43.695344,-79.318389,4.0,1.0
22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2.0,26.0
30,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.0,3.0


## Name the clusters
The clustering is very simple at the current stage. It is based on only one parameter: the number of coffee places.

In [291]:
# Display name for each cluster, indexed by the integer k-means cluster label
cluster_names = [
    'Quite some coffee places',
    'A lot of coffee places',
    'Quite a lot coffee places',
    'Not too many coffee places',
    'Very little coffee places',
]

## Draw the clusters on a map

In [293]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_neightbourhoods['Latitude'], toronto_neightbourhoods['Longitude'], toronto_neightbourhoods['Neighbourhood'], toronto_neightbourhoods['Cluster Labels']):
    # The labels are stored as floats (e.g. 3.0), so cast before indexing.
    # The same index selects both the name and the colour, so cluster 0 uses
    # the first colour instead of wrapping around to the last one.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + cluster_names[cluster], parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Not a lot of big surprises here, but I love to see there are a few places away from downtown that offer enough coffee places.