# Segmenting and Clustering Neighborhoods in the City of Toronto, Canada

Import the requests library to download the postal-code page from Wikipedia

In [1]:
import requests

We pull the website as raw data

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
raw_data=requests.get(url).text

We import BeautifulSoup to arrange the data we downloaded as HTML

In [4]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(raw_data,'lxml')
#print(soup.prettify())

In [5]:
my_table = soup.find('table',{'class':'wikitable sortable'})
#my_table

Finding the headings of the table 

In [6]:
my_table.findAll('th')

[<th>Postcode</th>, <th>Borough</th>, <th>Neighbourhood
 </th>]

Creating a list with the headings

In [7]:
# Text of every <th> cell in the table, with any trailing newline removed
headings = [header_cell.text.rstrip('\n') for header_cell in my_table.findAll('th')]
headings

['Postcode', 'Borough', 'Neighbourhood']

Creating three lists for the postcodes, the boroughs and the neighborhoods

In [8]:
postcodes=[]
boroughs=[]
neighborhoods=[]

# Collect the table rows once; the previous version re-ran findAll('tr')
# three times for every row. The first <tr> holds the <th> headings, so it
# is skipped.
for table_row in my_table.findAll('tr')[1:]:
    cells = table_row.findAll('td')
    if len(cells) < 3:
        # Not a complete data row: there is nothing to extract from it.
        continue
    postcodes.append(cells[0].text.rstrip('\n'))
    boroughs.append(cells[1].text.rstrip('\n'))
    neighborhoods.append(cells[2].text.rstrip('\n'))
        
#postcodes

Importing pandas library to transform it to a data frame

In [9]:
import pandas as pd

In [10]:
data={headings[0]:postcodes,headings[1]:boroughs,headings[2]:neighborhoods}

In [11]:
df = pd.DataFrame(data)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


Dropping boroughs with "Not assigned" and resetting the index

In [12]:
df=df[df.Borough != 'Not assigned'].reset_index(drop=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
...,...,...,...
206,M8Z,Etobicoke,Kingsway Park South West
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West


Where the Neighbourhood is "Not assigned", replace it with the Borough name, using numpy's `where`

In [13]:
import numpy as np 

df['Neighbourhood']=np.where(df['Neighbourhood']=='Not assigned',df['Borough'],df['Neighbourhood'])
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
...,...,...,...
206,M8Z,Etobicoke,Kingsway Park South West
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West


We import the csv file with the coordinates as the Geocoder didn't work

In [14]:
path='http://cocl.us/Geospatial_data'
coord = pd.read_csv(path)
coord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Assigning the coordinates to the correct postcode with a pandas merge (numpy was already imported above, so the import below is redundant)

In [15]:
import numpy as np

In [16]:
df=pd.merge(df,coord.iloc[:,:3],how='left',left_on='Postcode',right_on='Postal Code').drop(columns=['Postal Code'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


Keeping only the Boroughs that contain the word Toronto in their name

In [17]:
df=df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


## Starting the same analysis as we did for New York

Importing the required libraries

In [18]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [19]:

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Use the geopy library to get the latitude and longitude values of Toronto.

In [20]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when the service finds no match; fail with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a map of Toronto using Folium and the latitude and longitude values

In [21]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of folium.Popup, so it is passed here only;
    # it is not a CircleMarker option and was dropped from the call below.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### Define Foursquare Credentials and Version

In [22]:
import os
from getpass import getpass

# Credentials must not be stored in the notebook: they are taken from the
# environment when available, otherwise typed in without being echoed.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Let's explore the first location

In [23]:
neighbourhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


### Now, let's get the top 100 venues that are in Harbourfront within a radius of 500 meters.

First, let's create the GET request URL. Name your URL url.

In [24]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [25]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d8d6b8b396de0002c6915a3'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 50,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

From the Foursquare lab in the previous module, we know that all the information is in the items key. Before we proceed, let's borrow the get_category_type function from the Foursquare lab.

In [26]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from ``row['categories']``; when that key is
    missing it is read from ``row['venue.categories']`` instead.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide one of the two keys above, holding a sequence of
        dicts that each have a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a
        # real problem and must not be swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [27]:
# pd.json_normalize is the public entry point in pandas >= 1.0; the
# pandas.io.json import path is kept only as a fallback for older versions,
# because current pandas no longer exposes it.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last dot ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [28]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

50 venues were returned by Foursquare.


## Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Zipped together; one API request is made per (name, lat, lng) triple.
    radius : number, default 500
        Sent to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue
        was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighbourhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the whole notebook
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # passing the column names up front also works when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

### Now write the code to run the above function on each neighborhood and create a new dataframe called toronto_venues.

In [32]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

toronto_venues


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.654260,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
...,...,...,...,...,...,...,...
3306,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park
3307,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,The Ten Spot,43.664815,-79.324213,Spa
3308,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,Toronto Yoga Mamas,43.664824,-79.324335,Yoga Studio
3309,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,Revolution Recording,43.662561,-79.326940,Recording Studio


Let's check how many venues were returned for each neighborhood

In [33]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,15,15,15,15,15,15
Berczy Park,57,57,57,57,57,57
Brockton,22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
...,...,...,...,...,...,...
Underground city,100,100,100,100,100,100
Union Station,100,100,100,100,100,100
University of Toronto,33,33,33,33,33,33
Victoria Hotel,100,100,100,100,100,100


#### Let's find out how many unique categories can be curated from all the returned venues


In [34]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 235 uniques categories.


# Analyze Each Neighborhood

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
toronto_onehot.shape

(3311, 236)

##### Below we check whether any venue category is itself called "Neighborhood". One is, which is why we use the spelling "Neighbourhood" for our own column throughout the project.

In [37]:
toronto_onehot.columns!="Neighborhood"

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [38]:
toronto_grouped=toronto_onehot.groupby("Neighbourhood").mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Adelaide,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030000,0.0,...,0.0,0.0,0.0,0.00,0.020000,0.000000,0.0,0.01,0.0,0.000000
1,Bathurst Quay,0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.066667,0.000000,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.000000
2,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.00,0.017544,0.000000,0.0,0.00,0.0,0.000000
3,Brockton,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.000000
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Underground city,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030000,0.0,...,0.0,0.0,0.0,0.01,0.010000,0.000000,0.0,0.01,0.0,0.000000
69,Union Station,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.01,0.010000,0.000000,0.0,0.01,0.0,0.000000
70,University of Toronto,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.030303,0.0,0.00,0.0,0.000000
71,Victoria Hotel,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040000,0.0,...,0.0,0.0,0.0,0.00,0.020000,0.000000,0.0,0.01,0.0,0.000000


#### Let's print each neighborhood along with the top 5 most common venues

In [39]:
# Print, for every neighbourhood, its most frequent venue categories.
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Take this neighbourhood's row and transpose it so each column becomes a row
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # The first transposed row is the 'Neighbourhood' name itself, not a category
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
             venue  freq
0      Coffee Shop  0.08
1             Café  0.05
2       Steakhouse  0.04
3              Bar  0.04
4  Thai Restaurant  0.03


----Bathurst Quay----
             venue  freq
0   Airport Lounge  0.13
1  Airport Service  0.13
2         Boutique  0.07
3    Boat or Ferry  0.07
4              Bar  0.07


----Berczy Park----
            venue  freq
0     Coffee Shop  0.07
1    Cocktail Bar  0.05
2        Beer Bar  0.04
3  Farmers Market  0.04
4          Bakery  0.04


----Brockton----
               venue  freq
0     Breakfast Spot  0.09
1               Café  0.09
2        Coffee Shop  0.09
3  Convenience Store  0.05
4                Gym  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0           Yoga Studio  0.06
1  Gym / Fitness Center  0.06
2    Light Rail Station  0.06
3               Brewery  0.06
4            Restaurant  0.06


----CN Tower----
             venue  freq
0   Airport Lounge  0.13
1  

                venue  freq
0                Café  0.08
1         Coffee Shop  0.08
2    Sushi Restaurant  0.05
3         Pizza Place  0.05
4  Italian Restaurant  0.05


----Ryerson----
                       venue  freq
0                Coffee Shop  0.08
1             Clothing Store  0.08
2  Middle Eastern Restaurant  0.03
3             Cosmetics Shop  0.03
4                       Café  0.03


----South Hill----
          venue  freq
0           Pub  0.13
1   Coffee Shop  0.13
2  Liquor Store  0.07
3   Supermarket  0.07
4    Restaurant  0.07


----South Niagara----
             venue  freq
0   Airport Lounge  0.13
1  Airport Service  0.13
2         Boutique  0.07
3    Boat or Ferry  0.07
4              Bar  0.07


----St. James Town----
                venue  freq
0         Coffee Shop  0.07
1                Café  0.05
2          Restaurant  0.05
3  Italian Restaurant  0.04
4           Gastropub  0.03


----Stn A PO Boxes 25 The Esplanade----
                venue  freq
0         Coff

#### Let's put that into a pandas dataframe

In [40]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in `row`, ignoring its first entry.

    Everything after the first element of `row` is sorted in descending
    order, and the labels of the leading `num_top_venues` entries are
    returned as an array (fewer if the row is shorter).
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [41]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'.
    # An explicit bounds check replaces the former bare `except:`, which
    # would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill the ranking columns row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Bar,Steakhouse,Burger Joint,Hotel,Cosmetics Shop,Restaurant,Thai Restaurant,American Restaurant
1,Bathurst Quay,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
2,Berczy Park,Coffee Shop,Cocktail Bar,Steakhouse,Cheese Shop,Café,Farmers Market,Italian Restaurant,Beer Bar,Bakery,Seafood Restaurant
3,Brockton,Café,Breakfast Spot,Coffee Shop,Gym,Restaurant,Caribbean Restaurant,Bar,Bakery,Italian Restaurant,Intersection
4,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Garden Center,Garden,Light Rail Station,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Gym / Fitness Center


# Cluster Neighborhoods

Run k-means to cluster the neighborhoods into 3 clusters.

In [42]:
# set number of clusters
kclusters = 3

# Keep only the per-category frequencies; the neighbourhood name is not a feature.
# `columns=` replaces the positional axis argument of drop('Neighbourhood', 1),
# which newer pandas versions no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [43]:
# add clustering labels
# insert() raises if the column already exists, so a stale copy is dropped
# first; this keeps the cell safe to run more than once.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [44]:
toronto_merged = df

# join the cluster labels and top venues onto df (which holds latitude/longitude), matching rows by neighbourhood name
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Bakery,Breakfast Spot,Gastropub,Pizza Place,Park


Finally, let's visualize the resulting clusters

In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster. 

Cluster 0: Airport

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels']==0,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
59,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
60,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
61,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
62,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
63,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
64,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court
65,Downtown Toronto,0,Airport Lounge,Airport Service,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Food Court


Cluster 1: Nights out

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels']==1,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
1,Downtown Toronto,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
2,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
3,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
4,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Bakery,Breakfast Spot,Gastropub,Pizza Place,Park
...,...,...,...,...,...,...,...,...,...,...,...,...
69,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Bakery,Breakfast Spot,Gastropub,Pizza Place,Park
70,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,Bar,Gastropub,Burger Joint,Gym,Asian Restaurant
71,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,Bar,Gastropub,Burger Joint,Gym,Asian Restaurant
72,Downtown Toronto,1,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Hotel,Gym,Bubble Tea Shop,Burger Joint,Café


Cluster 2:

In [48]:
toronto_merged.loc[toronto_merged['Cluster Labels']==2,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
34,Central Toronto,2,Park,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
35,Central Toronto,2,Park,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
49,Central Toronto,2,Playground,Gym,Park,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
50,Central Toronto,2,Playground,Gym,Park,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
66,Downtown Toronto,2,Park,Playground,Trail,Building,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop


Cluster 2: Shopping

In [49]:
# NOTE(review): this filters on cluster label 1, exactly like cell In [47], although the heading above says "Cluster 2" — confirm which cluster was intended
toronto_merged.loc[toronto_merged['Cluster Labels']==1,toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
1,Downtown Toronto,1,Coffee Shop,Pub,Bakery,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Theater,Gym / Fitness Center
2,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
3,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Bubble Tea Shop,Pizza Place,Italian Restaurant,Japanese Restaurant
4,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Bakery,Breakfast Spot,Gastropub,Pizza Place,Park
...,...,...,...,...,...,...,...,...,...,...,...,...
69,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Bakery,Breakfast Spot,Gastropub,Pizza Place,Park
70,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,Bar,Gastropub,Burger Joint,Gym,Asian Restaurant
71,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,Bar,Gastropub,Burger Joint,Gym,Asian Restaurant
72,Downtown Toronto,1,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Hotel,Gym,Bubble Tea Shop,Burger Joint,Café


# THE END