# Segmenting and Clustering Neighborhoods in Toronto

## I. Data manging

In [200]:
# Data / analytic
import pandas as pd
import numpy as np
import lxml.html as LH

# Scrap / IO
import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# ML
from sklearn.cluster import KMeans

# Cartography
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

# Visualization
import matplotlib.cm as cm
import matplotlib.colors as colors

### I.1 Fetch raw data

In [35]:
# scrap data and save the result as a dataframe
url_wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

raw_data = pd.read_html(url_wiki, header=0)[0]
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [36]:
print("raw data shape:", raw_data.shape)

raw data shape: (288, 3)


### I.2 Clean raw data

In [37]:
raw_data = raw_data.replace({"Not assigned": np.nan})
raw_data = raw_data.dropna(axis=0)
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [38]:
print("cleaned raw data shape:", raw_data.shape)

cleaned raw data shape: (210, 3)


### I.3 Append locations

#### 1ST OPTION (very long...)

In [63]:
#!conda install -c conda-forge geocoder --yes
import geocoder

def getLoc(x=""):
    # initialize your variable to None
    lat_lng_coords = None
    
    adress = 'Toronto, Ontario'
    if x:
        adress = '{}, Toronto, Ontario'.format(x)
    g = geocoder.google(adress)

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longitude

In [None]:
tmp = raw_data.Postcode.map(getLoc).map(pd.Series)

In [None]:
raw_data = raw_data.merge(tmp, left_index=True, right_index=True)
raw_data.columns = ["Postcode", "Borough", "Neighbourhood", "Latitude", "Longitude"]
raw_data.head()

*1st option was aborted because the calculation is very very long...*

#### 2ND OPTION (fast)

In [39]:
url_loc = "https://cocl.us/Geospatial_data"

In [40]:
tmp = pd.read_csv(url_loc, sep=",", encoding="utf-8")
tmp.columns = ["Postcode", "Latitude", "Longitude"]
raw_data = raw_data.merge(tmp, how="left", on="Postcode")
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [41]:
print("data shape:", raw_data.shape)

data shape: (210, 5)


## II. Quick overview

### II.1 Few stats

In [80]:
boroughs = raw_data.Borough.unique()
boroughs

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [81]:
print("They are {0} distincts boroughs.".format(boroughs.size))

They are 10 distincts boroughs.


In [84]:
neighbourhoods = raw_data.Neighbourhood.unique()
print("They are {0} distincts neighbourhoods.".format(neighbourhoods.size))

They are 208 distincts neighbourhoods.


In [86]:
tmp = raw_data.groupby(by="Borough")[["Neighbourhood"]].count()
tmp.columns = ["Neighbourhood Count"]
tmp

Unnamed: 0_level_0,Neighbourhood Count
Borough,Unnamed: 1_level_1
Central Toronto,17
Downtown Toronto,37
East Toronto,7
East York,6
Etobicoke,45
Mississauga,1
North York,38
Scarborough,37
West Toronto,13
York,9


### II.2 Create a map of Toronto with neighborhoods superimposed on top.

In [77]:
def getLocation(x=""):
    
    adress = 'Toronto, Ontario'
    if x:
        adress = '{}, Toronto, Ontario'.format(x)

    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude, longitude

In [79]:
latitude, longitude = getLocation()
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [212]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(raw_data['Latitude'], raw_data['Longitude'], raw_data['Borough'], raw_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## III Filter data - create a subset

**Focus on "%Toronto%" boroughs**

### III.1 Filter data

In [95]:
toronto_data = raw_data[raw_data['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [96]:
print("Toronto data shape:", toronto_data.shape)

Toronto data shape: (74, 5)


### III.2 Draw Toronto map

In [101]:
# get the geographical coordinates of Toronto borough.
latitude, longitude = getLocation("Toronto")
print('The geograpical coordinate of Toronto borough are {}, {}.'.format(latitude, longitude))

# create map
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto borough are 43.653963, -79.387207.


## IV. Data enriching (foursquare)

*Define Foursquare Credentials and Version*

In [1]:
# Removed by author

### IV.1 First exploration

*Get the top 100 venues that are in the first neighborhood within a radius of 500 meters*

#### Fetch data

In [108]:
# Identify the first neighborhood and get its attributes
first_neigh = toronto_data.loc[0, 'Neighbourhood']       # neighborhood name
first_neigh_latitude = toronto_data.loc[0, 'Latitude']   # neighborhood latitude value
first_neigh_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(first_neigh, 
                                                               first_neigh_latitude, 
                                                               first_neigh_longitude))

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


In [119]:
# Get the top 100 venues

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    first_neigh_latitude, 
    first_neigh_longitude, 
    radius, 
    LIMIT)

print("url:\n", url, "\n") # display URL

# execute the query and display results
results = requests.get(url).json()
results

url:
 https://api.foursquare.com/v2/venues/explore?&client_id=VYRSRVUVIY4DJ4B1Y2LFY5IWSPP25CNSZKP5IJTAXNK4XUXO&client_secret=4OUSV2ADEEKHFO2HGLBBKRC5SOP0OCZWW4SE5FG3ZUUK0HT3&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100 



{'meta': {'code': 200, 'requestId': '5d2ce14d531593002c0af069'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 49,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

#### Reformat results

In [124]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [125]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


In [126]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

49 venues were returned by Foursquare.


### IV.2 Explore Neighborhoods in Toronto

*Get the top 100 venues that are in **all neighborhoods** within a radius of 500 meters*

In [134]:
# Function to repeat the same process to all the neighborhoods in Toronto : concatenate toronto_data and venues information (name, category)
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [135]:
# Enrich toronto data with foursquare API
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                 latitudes=toronto_data['Latitude'],
                                 longitudes=toronto_data['Longitude'])

print("toronto venues shape:", toronto_venues.shape)
toronto_venues.head()

toronto venues shape: (3347, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [140]:
toronto_venues.groupby('Neighborhood')["Venue"].count()

Neighborhood
Adelaide                                             100
Bathurst Quay                                         16
Berczy Park                                           57
Brockton                                              27
Business Reply Mail Processing Centre 969 Eastern     19
CN Tower                                              16
Cabbagetown                                           45
Central Bay Street                                    89
Chinatown                                            100
Christie                                              16
Church and Wellesley                                  88
Commerce Court                                       100
Davisville                                            32
Davisville North                                      10
Deer Park                                             15
Design Exchange                                      100
Dovercourt Village                                    21
Dufferin          

In [141]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 239 uniques categories.


### IV.3 Analyze Each Neighborhood

#### Feature engineering

*a. Transform column of qualitative variable to multiple binary variables*

In [153]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print("toronto_onehot shape:", toronto_onehot.shape, "\n")
toronto_onehot.head()

toronto_onehot shape: (3347, 239) 



Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


*b. Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category*

In [156]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print("toronto grouped shape:", toronto_grouped.shape, "\n")
toronto_grouped.head()

toronto grouped shape: (73, 239) 



Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.074074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


*c. Print each neighborhood along with the top 5 most common venues*

In [168]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]  # remove the neighborhood name
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2              Bar  0.04
3       Steakhouse  0.04
4  Thai Restaurant  0.04


----Bathurst Quay----
              venue  freq
0   Airport Service  0.19
1  Airport Terminal  0.12
2    Airport Lounge  0.12
3       Coffee Shop  0.06
4   Harbor / Marina  0.06


----Berczy Park----
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.05
2        Bakery  0.05
3    Steakhouse  0.04
4          Café  0.04


----Brockton----
                   venue  freq
0            Yoga Studio  0.07
1         Breakfast Spot  0.07
2                   Café  0.07
3            Coffee Shop  0.07
4  Performing Arts Venue  0.04


----Business Reply Mail Processing Centre 969 Eastern----
           venue  freq
0    Yoga Studio  0.05
1     Restaurant  0.05
2     Skate Park  0.05
3    Pizza Place  0.05
4  Burrito Place  0.05


----CN Tower----
              venue  freq
0   Airport Service  0.19
1  Airport Terminal  0.12

*d. Print each neighborhood along with the top 10 most common venues and save the results in a dataFrame*

In [170]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [196]:
num_top_venues = 10

# for column labelling
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Gym,Sushi Restaurant,Bakery,Restaurant
1,Bathurst Quay,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
2,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Farmers Market,Beer Bar,Steakhouse,Cheese Shop,Café,Seafood Restaurant,Basketball Stadium
3,Brockton,Yoga Studio,Breakfast Spot,Café,Coffee Shop,Bakery,Stadium,Burrito Place,Restaurant,Caribbean Restaurant,Climbing Gym
4,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Smoke Shop,Skate Park,Brewery,Burrito Place,Butcher,Restaurant,Recording Studio,Park


## V. Cluster Neighborhoods

### V.1 Cluster Toronto data with k-means

*Run k-means to cluster the neighborhood into 5 clusters.*

In [197]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print("groups:", np.unique(kmeans.labels_))
print("\nFirst 10 groups:", kmeans.labels_[0:10])

groups: [0 1 2 3 4]

First 10 groups: [1 3 1 1 1 3 1 1 1 1]


In [198]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data
toronto_merged.rename({"Neighbourhood":"Neighborhood"}, axis=1, inplace=True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
print("toronto merged columns:", list(toronto_merged.columns))

toronto_merged.head() # check the last columns!

toronto merged columns: ['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Cluster Labels', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue', '6th Most Common Venue', '7th Most Common Venue', '8th Most Common Venue', '9th Most Common Venue', '10th Most Common Venue']


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Mexican Restaurant,Theater,Restaurant,Gym / Fitness Center,Café
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Mexican Restaurant,Theater,Restaurant,Gym / Fitness Center,Café
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Fast Food Restaurant,Diner,Plaza,Pizza Place,Ramen Restaurant
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Fast Food Restaurant,Diner,Plaza,Pizza Place,Ramen Restaurant
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Bakery,Hotel,Breakfast Spot,Pizza Place,Gastropub,Park


In [204]:
# Distribution of the clusters
toronto_merged["Cluster Labels"].value_counts().sort_index()

0     2
1    60
2     4
3     7
4     1
Name: Cluster Labels, dtype: int64

### V.2 Visualization

In [211]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### V.3 Examine Clusters

#### Cluster 1

In [205]:
# Display data whose cluster label = 0 and don't show columns: Postcode, Borough, Neighborhood, Latitude, Longitude
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
49,Central Toronto,0,Playground,Restaurant,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
50,Central Toronto,0,Playground,Restaurant,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


#### Cluster 2

In [206]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Mexican Restaurant,Theater,Restaurant,Gym / Fitness Center,Café
1,Downtown Toronto,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Mexican Restaurant,Theater,Restaurant,Gym / Fitness Center,Café
2,Downtown Toronto,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Fast Food Restaurant,Diner,Plaza,Pizza Place,Ramen Restaurant
3,Downtown Toronto,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Fast Food Restaurant,Diner,Plaza,Pizza Place,Ramen Restaurant
4,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Italian Restaurant,Bakery,Hotel,Breakfast Spot,Pizza Place,Gastropub,Park
5,East Toronto,1,Health Food Store,Other Great Outdoors,Trail,Pub,Wings Joint,Donut Shop,Discount Store,Dive Bar,Dog Run,Doner Restaurant
6,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Bakery,Farmers Market,Beer Bar,Steakhouse,Cheese Shop,Café,Seafood Restaurant,Basketball Stadium
7,Downtown Toronto,1,Coffee Shop,Café,Ice Cream Shop,Italian Restaurant,Chinese Restaurant,Burger Joint,Sandwich Place,Salad Place,Indian Restaurant,Spa
8,Downtown Toronto,1,Grocery Store,Café,Park,Baby Store,Convenience Store,Italian Restaurant,Coffee Shop,Athletics & Sports,Diner,Restaurant
9,Downtown Toronto,1,Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Gym,Sushi Restaurant,Bakery,Restaurant


#### Cluster 3

In [207]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,Central Toronto,2,Gym / Fitness Center,Park,Swim School,Bus Line,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
34,Central Toronto,2,Bus Line,Sushi Restaurant,Park,Jewelry Store,Trail,Doner Restaurant,Discount Store,Dive Bar,Dog Run,Wings Joint
35,Central Toronto,2,Bus Line,Sushi Restaurant,Park,Jewelry Store,Trail,Doner Restaurant,Discount Store,Dive Bar,Dog Run,Wings Joint
66,Downtown Toronto,2,Park,Playground,Trail,Building,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


#### Cluster 4

In [208]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
59,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
60,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
61,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
62,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
63,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
64,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina
65,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Bar,Boutique,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Harbor / Marina


#### Cluster 5

In [209]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,Central Toronto,4,Garden,Home Service,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
