Import pandas

In [1]:
import pandas as pd
import numpy as np

Scrape tables from html to dataframe and select the one we are looking for

In [2]:
#scraping

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)
toronto_fsa_df = tables[0]

Check the results of scraping

In [3]:
# see if it worked correctly

print(toronto_fsa_df.head())
print(toronto_fsa_df.shape)

  Postal Code           Borough               Neighborhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront
(180, 3)


Data cleaning and transformation

In [4]:
# transfomation

# drop rows where Borough is Not assigned
toronto_fsa_df.drop(toronto_fsa_df.loc[toronto_fsa_df['Borough']=="Not assigned"].index, inplace=True)
# there are no neigborhoods with Not assigned value so there is no need to fill in the cells with the asociated Borough name
# leave only one row for each Postal Code and combine the associated neigborhoods into one in that row
toronto_fsa_df.groupby(['Postal Code','Borough'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


Check the shape of the clean dataset

In [5]:
toronto_fsa_df.shape

(103, 3)

Write a function that returns longitude and latitude values for a given Postal Code

In [6]:
import pgeocode # import pgeocode. decided to use this since it seems to be reliable and easy to use
from geopy.geocoders import Nominatim # pgeocode fails to return latlong values for some of the items so we need a fallback
import math

def pc2ll(postalcode,country='CA'):
    nomo = pgeocode.Nominatim(country)
    a = nomo.query_postal_code(postalcode)
    return a[['latitude','longitude']]

def adr2ll(address):
    geolocator = Nominatim(user_agent="my-application")
    a = geolocator.geocode(address)
    return [a.latitude, a.longitude]

def gen0(s):
    a = pc2ll(s['Postal Code'])[0]
    if (np.isnan(a)):
        return adr2ll(s['Borough'])[0]
    else:
        return a

def gen1(s):
    a = pc2ll(s['Postal Code'])[1]
    if (np.isnan(a)):
        return adr2ll(s['Borough'])[1] 
    else:
        return a

Add longitude and latitude columns to the dataframe

In [7]:
toronto_fsa_df['Latitude'] = toronto_fsa_df.apply(gen0, axis=1)
toronto_fsa_df['Longitude'] = toronto_fsa_df.apply(gen1, axis=1)

Check if se got what we expected

In [9]:
toronto_fsa_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7545,-79.33
3,M4A,North York,Victoria Village,43.7276,-79.3148
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


In [9]:
toronto_fsa_df.shape

(103, 5)

Everything seems to be OK

From now I'm goint to replicate the same clustering analisys that we've done in the LAB session on NYC data.

Loading Foursqure data

In [10]:
import requests
import json
import time
from pandas.io.json import json_normalize

CLIENT_ID = 'BB2USBOQK0P0BCBMGAEHCS2NPWXK23D4NSWVNCJJVU3WT02I' # your Foursquare ID
CLIENT_SECRET = '2VVDCFZMRVPA01SIWXT5I1HXA0K34VLYQHTZTJQVZ0HVNJJT' # your Foursquare Secret
VERSION = '20180530'
LIMIT=100

def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        # give it a second try in case it fails
        except:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'PC Latitude', 
                  'PC Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=toronto_fsa_df['Postal Code'],
                                   latitudes=toronto_fsa_df['Latitude'],
                                   longitudes=toronto_fsa_df['Longitude']
                                  )
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code'] 

toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


Perform the very same K-means clustering as in the labs session.

In [11]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([5, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 1, 3, 3, 2, 1, 0, 0, 0, 4,
       4, 0, 0, 1, 0, 4, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 4, 1, 4, 4, 0, 4,
       4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 1, 4, 4,
       4, 4, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 4, 1, 1, 0, 4, 4, 4, 4, 4, 4,
       0, 0, 0, 1, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0], dtype=int32)

Create the dataframe for the visualization

In [12]:
toronto_grouped.insert(0,'Cluster Labels',kmeans.labels_)

In [13]:
toronto_map = pd.merge(toronto_fsa_df, toronto_grouped, on='Postal Code')

Visualizing the clusters on the map of toronto

In [16]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

latitude, longitude = adr2ll('Toronto')

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_map['Latitude'], toronto_map['Longitude'], toronto_map['Postal Code'], toronto_map['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

For the k=6 parameter the algorithm returned tree larger clusters and tree smaller ones with only a few items in them.The disctrics in each cluster tend to be close as well.