# Toronto Battle of the Neighborhoods

##### January 14, 2020

# Obtaining and preparing the data

In [1]:
# import necessary libraries

import requests # to handle requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #import BeautifulSoup package
from html.parser import HTMLParser

print ('Libraries imported')

Libraries imported


### Scrape the data from the Wikipedia page to a pandas dataframe

In [2]:
# get the data from the wikipedia page

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page=requests.get(url)

print(page.status_code) # the http response status code should print 200 if correct

print(page.content) # to see what the webpage contains

200
b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XoFTzgpAICoAAD@fNTsAAABO","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":948084252,"wgRevisionId":948084252,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario-rela

In [3]:
# change the above to a better format using prettify()
soup=BeautifulSoup(page.content,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XoFTzgpAICoAAD@fNTsAAABO","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":948084252,"wgRevisionId":948084252,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario

In [4]:
# find the table and extract the data to a pandas dataframe
table = soup.find('table', class_='wikitable')
table_rows=table.find_all('tr')

data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])
    
toronto=pd.DataFrame(data, columns=['PostalCode','Borough','Neighborhood'])

# filter out rows with data quality issues
toronto=toronto[~toronto['PostalCode'].isnull()]

toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


### Clean up the dataset

In [5]:
# remove any rows that have Borough = 'Not assigned'

toronto=toronto[toronto.Borough != 'Not assigned']

# reset index

toronto1=toronto.reset_index(drop=True)

toronto1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
# combine neighborhoods that belong to the same postal code into one row, comma-separated values

toronto2=toronto1.groupby(['PostalCode','Borough'])['Neighborhood'].agg([('Neighborhood',','.join)]).reset_index()

toronto2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# replace any neighborhoods that are 'Not assigned' with their borough name instead

toronto2.loc[toronto2['Neighborhood'] == 'Not assigned', 'Neighborhood'] = toronto2['Borough']

toronto2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# check that the above clean up was done using a row that is known to have neighborhood 'Not assigned' in source table

toronto2.loc[toronto2['PostalCode']=='M9A']

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Etobicoke,Islington Avenue


In [9]:
toronto2.shape

(103, 3)

### In order to utilize Foursqaure location data, need to get latitude and longitude coordinates of each neighborhood.

In [10]:
# import csv file that has geographical coordinates of each postal code

geodata=pd.read_csv('http://cocl.us/Geospatial_data')

geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# merge geodata dataframe on Postal Code with the toronto2 dataframe to list PostalCode, Borough, Neighborhood, Latitude and Longitude

torontogeodata=pd.merge(toronto2,geodata, how='outer', left_on='PostalCode', right_on='Postal Code')
torontogeodata.drop('Postal Code',axis=1,inplace=True)

torontogeodata

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


# Clustering the Neighborhoods

### Create a map of Toronto neighborhoods 

In [14]:
# import libraries

import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes   #commented out after first install
import folium

print('Libraries imported.')

Libraries imported.


In [15]:
# create map of Toronto using latitude and longitude

# get geographical coordinates of Toronto

address='Toronto, Ontario'
geolocator=Nominatim()
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geographilca coordinates of Toronto are {}, {}'.format(latitude,longitude))

# create map

map_toronto=folium.Map(location=[latitude,longitude],zoom_start=10)

# add markers to map

for Latitude, Longitude, Neighborhood, Borough, PostalCode in zip(torontogeodata['Latitude'], torontogeodata['Longitude'], torontogeodata['Neighborhood'], torontogeodata['Borough'], torontogeodata['PostalCode']):
    label='{}, {}'.format(Neighborhood, Borough)
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [Latitude, Longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto



The geographilca coordinates of Toronto are 43.6534817, -79.3839347


In order to narrow down the analysis a bit, let's focus only neighborhoods within Downtown Toronto.

In [17]:
toronto_core = torontogeodata[torontogeodata['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
toronto_core.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,St. James Town / Cabbagetown,43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Using Foursquare API

In [18]:
# define Foursquare credentials and version in hidden cell below

In [19]:
# The code was removed by Watson Studio for sharing.

Exploring the first neighborhood in the dataframe.

In [20]:
neighborhood_name = toronto_core.loc[0,'Neighborhood']

neighborhood_latitude = toronto_core.loc[0, 'Latitude']
neighborhood_longitude = toronto_core.loc[0,'Longitude']

print('The latitude and longitude of {} are {} and {}.'.format(neighborhood_name,latitude,longitude))

The latitude and longitude of Rosedale are 43.6534817 and -79.3839347.


Exploring the top 100 venus that are within 500m of 

In [21]:
limit = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{},&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude, neighborhood_longitude, radius, limit)

# send the get request

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e84e29f71c428001b861700'},
 'response': {'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.6840626045, 'lng': -79.37131878274371},
   'sw': {'lat': 43.675062595499995, 'lng': -79.38374001725632}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4aff2d47f964a520743522e3',
       'name': 'Rosedale Park',
       'location': {'address': '38 Scholfield Ave.',
        'crossStreet': 'at Edgar Ave.',
        'lat': 43.68232820227814,
        'lng': -79.37893434347683,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.68232820227814,
          'lng': -79.37893434347683}],
        'distance': 32

In [22]:
# extract the category of each venue

def get_category_type(row):
    try:
        categories_list=row['categories']
    except:
        categories_list=row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
# clean the JSON and structure it to a datafrmae

venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:,filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()
#print('Foursquare returned {} venues.'.format(nearby_venues.shape[0]))

Unnamed: 0,name,categories,lat,lng
0,Rosedale Park,Playground,43.682328,-79.378934
1,Whitney Park,Park,43.682036,-79.373788
2,Alex Murray Parkette,Park,43.6783,-79.382773
3,Milkman's Lane,Trail,43.676352,-79.373842


Explore all venues in Downtown Toronto.

In [23]:
# find the types of venues in all neighborhoods in Downtown Toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, latitude, longitude in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude, longitude, radius, limit)
        
        # make the get request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby neighborhood
        venues_list.append([(name, latitude, longitude, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    for neighbourhood in venues_list:
        for venue in neighbourhood:
            venue
    
    nearby_venues = pd.DataFrame([venue for neighborhood in venues_list for venue in neighborhood])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
   
    return(nearby_venues)

In [24]:
# run the getNearbyVenues function on each neighborhood and create the downtown_venues dataframe

downtown_venues = getNearbyVenues(names=toronto_core['Neighborhood'], latitudes=toronto_core['Latitude'], longitudes=toronto_core['Longitude'])
downtown_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.678300,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,St. James Town / Cabbagetown,43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner
5,St. James Town / Cabbagetown,43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
6,St. James Town / Cabbagetown,43.667967,-79.367675,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
7,St. James Town / Cabbagetown,43.667967,-79.367675,Merryberry Cafe + Bistro,43.666630,-79.368792,Café
8,St. James Town / Cabbagetown,43.667967,-79.367675,F'Amelia,43.667536,-79.368613,Italian Restaurant
9,St. James Town / Cabbagetown,43.667967,-79.367675,Murgatroid,43.667381,-79.369311,Restaurant


## Analyzing the Neighborhoods in Downtown Toronto

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = downtown_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]  
                                                    
toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
toronto_onehot.shape

(1282, 206)

In [27]:
# group rows by neighborhood and by the mean frequency of each category

downtown_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
1,CN Tower / King and Spadina / Railway Lands / ...,0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,...,0.0,0.0,0.0,0.012987,0.0,0.0,0.012987,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.024691,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,0.012346,0.0
5,Commerce Court / Victoria Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
6,First Canadian Place / Underground city,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0
8,Harbourfront East / Union Station / Toronto Is...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
9,Kensington Market / Chinatown / Grange Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.038961,0.0,0.064935,0.012987,0.0,0.0,0.0


In [38]:
# print each neighborhood and its top 5 most common venues

num_top_venues = 5

for hood in downtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    
# sort values in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2                Café  0.04
3  Seafood Restaurant  0.04
4         Cheese Shop  0.04


----CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst
 Quay / South Niagara / Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3     Boat or Ferry  0.06
4          Boutique  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.18
1   Italian Restaurant  0.05
2  Japanese Restaurant  0.04
3      Thai Restaurant  0.04
4         Burger Joint  0.04


----Christie----
           venue  freq
0  Grocery Store  0.22
1           Café  0.17
2           Park  0.11
3          Diner  0.06
4    Candy Store  0.06


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.06
1  Japanese Restaurant  0.06
2              Gay Bar  0.05
3           Restaurant  0

In [39]:
# create a dataframe and display top 10 venues for each neighborhood

num_top_venues = 10

indicators = ['st','nd','rd']

# create columns accordingn to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venues'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
# create new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Café,Greek Restaurant
1,CN Tower / King and Spadina / Railway Lands / ...,Airport Lounge,Airport Service,Airport Terminal,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Rental Car Location,Harbor / Marina,Boutique
2,Central Bay Street,Coffee Shop,Italian Restaurant,Japanese Restaurant,Sandwich Place,Burger Joint,Thai Restaurant,Salad Place,Middle Eastern Restaurant,Spa,Ice Cream Shop
3,Christie,Grocery Store,Café,Park,Athletics & Sports,Gas Station,Italian Restaurant,Diner,Nightclub,Candy Store,Restaurant
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Yoga Studio,Hotel,Café,Men's Store,Smoke Shop,Mediterranean Restaurant


## Clustering Neighborhoods in Downtown Toronto

In [41]:
# run k-means to cluster the neighborhoods into 5 clusters

kclusters = 5

downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 3, 4, 2, 1, 1, 1, 1, 1, 1], dtype=int32)

In [52]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'clusters', kmeans.labels_)

downtown_merged = torontogeodata

# merge toronto_grouped with torontogeodata to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtown_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,,,,,,,,,,,,,,,
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,,,,,,,,,,,,,,,
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,,,,,,,,,,,,,,,
3,M1G,Scarborough,Woburn,43.770992,-79.216917,,,,,,,,,,,,,,,
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,,,,,,,,,,,,,,,


In [61]:
# visualize the resulting clusters

# create a map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighborhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters

## Examine the Clusters

In [62]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 0, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Downtown Toronto,0.0,0.0,0.0,0.0,0.0,Park,Playground,Trail,Dance Studio,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


In [63]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 1, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
51,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Restaurant,Pub,Italian Restaurant,Pizza Place,Bakery,Café,Chinese Restaurant,Breakfast Spot,Plaza
52,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Yoga Studio,Hotel,Café,Men's Store,Smoke Shop,Mediterranean Restaurant
53,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Pub,Park,Mexican Restaurant,Breakfast Spot,Restaurant,Bakery,Café,Theater,Yoga Studio
54,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Clothing Store,Coffee Shop,Bubble Tea Shop,Cosmetics Shop,Japanese Restaurant,Café,Restaurant,Electronics Store,Diner,Italian Restaurant
55,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Café,Italian Restaurant,Restaurant,Breakfast Spot,American Restaurant,Cosmetics Shop,Diner,Bakery,Japanese Restaurant
56,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Café,Greek Restaurant
58,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Restaurant,Coffee Shop,Café,Bar,Bakery,Thai Restaurant,Concert Hall,Hotel,Cosmetics Shop,Pizza Place
59,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Aquarium,Hotel,Italian Restaurant,Café,Restaurant,Sporting Goods Shop,Brewery,Fried Chicken Joint,Scenic Lookout
60,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Café,Hotel,Restaurant,Bar,American Restaurant,Bakery,Seafood Restaurant,Italian Restaurant,Gastropub
61,Downtown Toronto,1.0,1.0,1.0,1.0,1.0,Coffee Shop,Restaurant,Café,Hotel,American Restaurant,Gym,Seafood Restaurant,Japanese Restaurant,Gastropub,Italian Restaurant


In [64]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 2, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
75,Downtown Toronto,2.0,2.0,2.0,2.0,2.0,Grocery Store,Café,Park,Athletics & Sports,Gas Station,Italian Restaurant,Diner,Nightclub,Candy Store,Restaurant


In [65]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 3, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,Downtown Toronto,3.0,3.0,3.0,3.0,3.0,Airport Lounge,Airport Service,Airport Terminal,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Rental Car Location,Harbor / Marina,Boutique


In [66]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 4, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,clusters,cluster labels,cluster label,cluster,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
57,Downtown Toronto,4.0,4.0,4.0,4.0,4.0,Coffee Shop,Italian Restaurant,Japanese Restaurant,Sandwich Place,Burger Joint,Thai Restaurant,Salad Place,Middle Eastern Restaurant,Spa,Ice Cream Shop
85,Downtown Toronto,4.0,4.0,4.0,4.0,4.0,Coffee Shop,Yoga Studio,Bar,Boutique,Burger Joint,Burrito Place,Café,College Auditorium,Nightclub,Music Venue


### Observations

- Cluster 3 only has one neighborhood because it is very unique in that it contains the Toronto Island Airport.
- Much of downtown Toronto falls into cluster 1, meaning that many downtown neighborhoods have similar distribution of venues, with coffee shops being the most frequent venues downtown.