Toronto Neighborhood Clustering

Importing the relevant libraries and reading the HTML table from Wikipedia

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans

!conda install -c conda-forge geopy --yes
!conda install -c conda-forge lxml --yes

from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import lxml

# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns every table found on the page; the first one is used,
# with its first row taken as the header. The Neighborhood column is cast
# to str in the same step.
df = pd.read_html(url, header=0)[0].astype({'Neighborhood': str})
df.head()

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libxslt-1.1.33             |       h7d1a2b0_0         426 KB
    lxml-3.8.0                 |           py36_0         3.8 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.2 MB

The following NEW packages will be INSTALLED:

  libxslt            pkgs/main/linux-64::libxslt-1.1.33-h7d1a2b0_0
  lxml               conda-forge/linux-64::lxml-3.8.0-py36_0



Downloading and Extracting Packages
lxml-3.8.0           | 3.8 MB    | 

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Formatting the dataset

In [2]:
# Clean the postal-code table:
#   1. drop rows whose Borough is 'Not assigned';
#   2. where the Neighborhood is missing, fall back to the Borough name;
#   3. turn the ' /' separators between neighborhood names into ','.
# Columns are addressed by name rather than by position, and the work is
# vectorized instead of looping over rows.
assigned = df[df['Borough'] != 'Not assigned'].copy()

# A missing neighborhood shows up as the string 'nan' once the column has been
# cast with astype(str); real NaN and the literal 'NaN' are accepted as well.
neighborhood = assigned['Neighborhood']
neighborhood_missing = neighborhood.isna() | neighborhood.isin(['nan', 'NaN'])

assigned['Neighborhood'] = (
    neighborhood
    .where(~neighborhood_missing, assigned['Borough'])
    .str.replace(' /', ',', regex=False)
)

# Rebinding df (instead of mutating it in place) keeps this cell safe to re-run.
df = assigned.reset_index(drop=True)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
# Show the (rows, columns) dimensions of df.
print(df.shape)

(103, 3)


Calling the geospatial data and merging the two dataframes

In [4]:
# CSV with one latitude/longitude pair per postal code.
link = 'https://cocl.us/Geospatial_data'

# Rename the key column while loading so it matches the 'Postal code'
# spelling used by df.
geo = pd.read_csv(link).rename(columns={'Postal Code': 'Postal code'})
geo.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
# Attach the coordinates to every postal code (inner join on 'Postal code').
df_toronto = df.merge(geo, on='Postal code')
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


I shall select only the central boroughs of Toronto for the neighborhood clustering

In [6]:
# Boroughs kept for the clustering.
selection = ['North York','East York','York', 'Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

# Build the mask directly on the Borough column so it shares df_toronto's index.
# The earlier version wrapped the values in a new DataFrame and called .any(1),
# which depends on a positional axis argument that pandas 2.0 no longer accepts.
in_selection = df_toronto['Borough'].isin(selection)

# reset_index() (without drop) keeps the former row labels in an 'index'
# column. Chaining it returns a new frame instead of mutating a slice in place.
central_toronto = df_toronto[in_selection].reset_index()
central_toronto.head()

Unnamed: 0,index,Postal code,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Define a function that calls the Foursquare API once per row, using a for loop

In [7]:
# Foursquare API settings read as module-level names by NearbyVenues.
#
# The client id and secret are taken from the environment so they are never
# stored in the notebook. NOTE(review): the pair that used to be hardcoded here
# has been published with the notebook and should be rotated.
id = os.environ.get('FOURSQUARE_CLIENT_ID', '')
pw = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not id or not pw:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before requesting venues.')

version = '20180605'  # sent as the `v` query parameter
radius = 500          # sent as `radius`; presumably metres -- confirm against the Foursquare docs
limit = 100           # sent as `limit` (cap on venues returned per request)

def NearbyVenues(postal_code, boroughs, neighborhoods, latitudes, longitudes):
    """Collect the venues Foursquare reports around each location.

    The five arguments are parallel iterables (one entry per location); they
    are walked together with zip(). For every location one request is sent to
    the Foursquare ``venues/explore`` endpoint, using the module-level
    ``id``, ``pw``, ``version``, ``radius`` and ``limit`` settings.

    Returns:
        pandas.DataFrame with one row per venue and the columns
        'Postal code', 'Borough', 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude', 'Venue Category'. The frame is empty, but still
        has these columns, when no venue was collected.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if a request does not complete in time.
    """
    columns = ['Postal code',
               'Borough',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for pc, bor, name, lat, lng in zip(postal_code, boroughs, neighborhoods, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': id,
            'client_secret': pw,
            'v': version,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        # requests waits indefinitely unless a timeout is given.
        response = requests.get(endpoint, params=params, timeout=30)
        # Fail here on an HTTP error instead of with a KeyError further down.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            if not categories:
                # Without a category the venue cannot contribute to the
                # category-based analysis; indexing [0] would raise IndexError.
                continue
            rows.append((pc, bor, name, lat, lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         categories[0]['name']))

    # Passing columns= to the constructor also works when `rows` is empty,
    # whereas assigning .columns afterwards raises on an empty frame.
    return pd.DataFrame(rows, columns=columns)

# Query the venues around every selected postal code.
toronto_venues = NearbyVenues(
    postal_code=central_toronto['Postal code'],
    boroughs=central_toronto['Borough'],
    neighborhoods=central_toronto['Neighborhood'],
    latitudes=central_toronto['Latitude'],
    longitudes=central_toronto['Longitude'],
)

Grouping the venues with one hot encoding and groupby

In [8]:
# One indicator column per venue category (no prefix on the column names).
onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the postal code, then move that last column to the front.
onehot['Postal code'] = toronto_venues['Postal code']
*leading_cols, last_col = onehot.columns
first_col = [last_col, *leading_cols]
onehot = onehot[first_col]

# The mean of the indicators gives the share of each category per postal code.
toronto_grouped = onehot.groupby('Postal code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M2H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014286,0.0,...,0.0,0.0,0.014286,0.0,0.0,0.0,0.0,0.0,0.014286,0.0
2,M2K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M2N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0
4,M2P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


For each postal code find the top 5 most common venues and put them in a dataframe

In [9]:
def common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``.

    The first element of ``row`` is skipped (it is not a category value); the
    remaining entries are sorted in descending order and the index labels of
    the first ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# Build a table with, for every postal code, its most common venue categories.
num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then 'Nth' for every later rank.
# The suffix is chosen with an explicit bounds check rather than by catching
# the IndexError with a bare `except:`, which would also hide unrelated errors.
columns = ['Postal code']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Assemble all rows first and create the frame once, instead of writing into
# an empty frame row by row.
top_rows = [
    [toronto_grouped['Postal code'].iloc[pos],
     *common_venues(toronto_grouped.iloc[pos, :], num_top_venues)]
    for pos in range(toronto_grouped.shape[0])
]
top_venues = pd.DataFrame(top_rows, columns=columns)

top_venues.head()

Unnamed: 0,Postal code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M2H,Pool,Dog Run,Golf Course,Athletics & Sports,Mediterranean Restaurant
1,M2J,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Convenience Store
2,M2K,Café,Chinese Restaurant,Bank,Japanese Restaurant,Electronics Store
3,M2N,Ramen Restaurant,Sandwich Place,Pizza Place,Café,Restaurant
4,M2P,Park,Bank,Bar,Convenience Store,Doner Restaurant


Using KMeans clustering to create five clusters and add the labels to the dataframe

In [10]:
# Cluster the postal codes on their venue-category frequencies.
k = 5

# Drop the key column by keyword; the positional form drop('Postal code', 1)
# is rejected by pandas 2.0.
toronto_clustering = toronto_grouped.drop(columns='Postal code')

# A fixed random_state makes the cluster ids reproducible between runs, so the
# written interpretation of "cluster N" stays valid after a re-run.
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=12, random_state=0)
kmeans.fit(toronto_clustering)

# insert() raises if the column already exists, so overwrite it when this cell
# is executed a second time.
if 'Cluster Labels' in top_venues.columns:
    top_venues['Cluster Labels'] = kmeans.labels_
else:
    top_venues.insert(0, 'Cluster Labels', kmeans.labels_)
top_venues.head()

Unnamed: 0,Cluster Labels,Postal code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,1,M2H,Pool,Dog Run,Golf Course,Athletics & Sports,Mediterranean Restaurant
1,1,M2J,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Convenience Store
2,1,M2K,Café,Chinese Restaurant,Bank,Japanese Restaurant,Electronics Store
3,1,M2N,Ramen Restaurant,Sandwich Place,Pizza Place,Café,Restaurant
4,2,M2P,Park,Bank,Bar,Convenience Store,Doner Restaurant


Creating a dataset with cluster label and top 5 venues for each postal code

In [33]:
# Join locations with their cluster label and top venues, then discard the
# leftover 'index' column.
toronto_merged = (
    central_toronto
    .merge(top_venues, on='Postal code')
    .drop(columns=['index'])
)
toronto_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Park,Food & Drink Shop,Farmers Market,Event Space,Ethiopian Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,French Restaurant,Portuguese Restaurant,Coffee Shop,Pizza Place,Intersection
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Park,Pub,Bakery,Breakfast Spot
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,Clothing Store,Furniture / Home Store,Accessories Store,Shoe Store,Boutique
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,Mexican Restaurant


Grouping the 5 clusters by the most frequently occurring top-5 venues to understand the significance of each cluster

In [34]:
# For every cluster, keep the most frequent value of each column
# (value_counts() sorts by frequency, so its first index entry is that value),
# then discard the 'Postal code' column.
clusters = (
    top_venues
    .groupby('Cluster Labels')
    .agg(lambda x: x.value_counts().index[0])
    .drop(columns=['Postal code'])
)
clusters.head()

Unnamed: 0_level_0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Pizza Place,Yoga Studio,Deli / Bodega,Event Space,Ethiopian Restaurant
1,Coffee Shop,Coffee Shop,Café,Restaurant,Japanese Restaurant
2,Park,Playground,Spa,Event Space,Ethiopian Restaurant
3,Park,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store
4,Garden,Yoga Studio,Deli / Bodega,Event Space,Ethiopian Restaurant


To center a map we find Latitude and Longitude with Geopy

In [35]:
# Look up the coordinates of Toronto; they are used to centre the map.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent='toronto_neighborhoods')
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print(latitude, longitude)

43.6534817 -79.3839347


Creating a folium map

In [36]:
# Base map centred on the geocoded coordinates.
toronto_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=11,
)

Color-coding to visualize the different clusters

In [37]:
# One hex colour per cluster, sampled evenly from matplotlib's rainbow colormap.
# The earlier x / ys helper arrays were only used for len(ys), which equals k,
# so the palette is derived from k directly.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

Adding the labels to the map and displaying it

In [39]:
# Add one circle marker per postal code, coloured by its cluster label.
for lat, lon, pc, nei, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postal code'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(pc) + ': ' + str(nei) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the palette directly. The
    # previous rainbow[cluster-1] sent label 0 to rainbow[-1] through
    # negative-index wraparound.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map

The central boroughs of Toronto seem to be populated mainly by cluster 1 type neighborhoods, which means that coffee shops and restaurants dominate there. We can also find parks, gardens and leisure activities (clusters 3 and 4).