<h1><center>Segmenting and Clustering Neighborhoods in Toronto</center></h1>

In [1]:
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

#!conda install -c conda-forge geopy --yes # uncomment this line if geocoder is not installed
from geopy.geocoders import Nominatim# import geocoder

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if folium is not installed
import folium

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Scraping Wikipedia

In [2]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))[0]

Look at the DataFrame

In [3]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Cleaning the data

In [4]:
df.columns=['PostalCode','Borough','Neighborhood']
# Deleting row with a Borough not assigned
df = df[df['Borough']!='Not assigned']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


Verify that no value in the Neighborhood columns is NA or unassigned

In [5]:
print(df[df['Neighborhood'].isna()].empty)
print(df[df['Neighborhood']=='Not assigned'].empty)

True
True


Resetting the index

In [6]:
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


Verifying that there is no duplicate of Postal code (i.e PostalCode is the primary key)

In [7]:
df.shape[0]==df['PostalCode'].unique().size

True

Print information about the dataframe

In [8]:
print('The dataframe has a shape of:', df.shape)
print('Toronto has {} Postal Code, {} Borough, and {} Neighborhood.'.format(df['PostalCode'].unique().size, \
                                                                            df['Borough'].unique().size, \
                                                                            df['Neighborhood'].unique().size))

The dataframe has a shape of: (103, 3)
Toronto has 103 Postal Code, 10 Borough, and 98 Neighborhood.


# Getting the Toronto's Postal Code coordinates

## Via geopy geocoders

In [9]:
def get_coordinate(postal_code):
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent="toronto_explorer")   
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
    except:
        latitude = None
        longitude = None
        
    return latitude, longitude

In [10]:
from tqdm import tqdm # to time for loop

coord_dic = {'PostalCode':[],'Latitude': [], 'Longitude': []}

for postal_code in tqdm(df['PostalCode']):
    lat, long = get_coordinate(postal_code)
    if lat != None or long != None:
        coord_dic['PostalCode'].append(postal_code)
        coord_dic['Latitude'].append(lat)
        coord_dic['Longitude'].append(long)

100%|██████████| 103/103 [02:30<00:00,  1.46s/it]


In [11]:
coord_df = pd.DataFrame(coord_dic)
print('Shape of the obtained dataframe',coord_df.shape)
if coord_df.shape[0]==df.shape[0]:
    print('Collection of coordinate via Geopy successfull!')
else:
    print('Collection of coordinate via Geopy failed!')
coord_df.head()

Shape of the obtained dataframe (22, 3)
Collection of coordinate via Geopy failed!


Unnamed: 0,PostalCode,Latitude,Longitude
0,M3A,43.653482,-79.383935
1,M7A,43.653482,-79.383935
2,M1B,43.653482,-79.383935
3,M1C,43.653482,-79.383935
4,M3C,43.732822,-79.346961


## Via a csv file

If the geocoders didn't manage to retrieve all the coordinate linked to the postal code, use the following data frame

In [12]:
coord_df2 = pd.read_csv('https://cocl.us/Geospatial_data')
print('Shape of the csv dataframe ', coord_df2.shape)
coord_df2.head()

Shape of the csv dataframe  (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Fusing with previous dataframe

In [13]:
coord_df2.columns=['PostalCode','Latitude','Longitude']
toronto_neigh = df.join(coord_df2.set_index("PostalCode"),on="PostalCode")
toronto_neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


# Explore and cluster the neighborhoods in Toronto via Foursquare API

## Find coordinate of Toronto and display a map of Toronto

In [14]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = None

while location is None:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [15]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, postal_code in zip(toronto_neigh['Latitude'], toronto_neigh['Longitude'], toronto_neigh['Borough'], toronto_neigh['Neighborhood'],toronto_neigh['PostalCode']):
    label = '{}, {}, {}'.format(postal_code,neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Let's retrieve the popular venues for each neoghborhood using Foursquare API

In [16]:
# Foursquare API credential
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes)):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
toronto_venues = getNearbyVenues(names=toronto_neigh['PostalCode'],
                                   latitudes=toronto_neigh['Latitude'],
                                   longitudes=toronto_neigh['Longitude']
                                  )
print('Venues Collected!')

103it [01:08,  1.50it/s]

Venues Collected!





Let's check the resulting dataframe

In [23]:
print('Number of venues retrieved: ', toronto_venues.shape)
toronto_venues.head()

Number of venues retrieved:  (2147, 7)


Unnamed: 0,PostalCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


Let's look to the Venue Category

In [24]:
toronto_venues[['Venue Category']].describe()

Unnamed: 0,Venue Category
count,2147
unique,266
top,Coffee Shop
freq,183


Coffee Shop is the most frequent popular category of venue in Toronto

## Let's describe the dataset in function of what category of venues each PostalCode has

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group by postal code

In [26]:
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's verify that every Postal Code exists in this dataframe

In [27]:
print(toronto_grouped.shape)
print(toronto_neigh.shape)

(100, 267)
(103, 5)


Let's find the Postal codes with no venues:

In [28]:
postalcode_wo_venues_truth_table = ~toronto_neigh['PostalCode'].isin(toronto_grouped['PostalCode']).to_frame()
toronto_neigh[postalcode_wo_venues_truth_table['PostalCode']]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
11,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...,43.650943,-79.554724
95,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


 We need to add these postal codes to our category table 

To do a successfull join between toronto_neigh and toronto_grouped, we need only one column in common

In [29]:
columns = toronto_neigh.columns.values
for c in columns[1:]:
    if c in toronto_grouped:
        print(c, 'listed as a category in toronto_grouped dataframe')

Neighborhood listed as a category in toronto_grouped dataframe


Let's rename the 'Neighborhood' column of toronto_grouped

In [30]:
toronto_grouped.rename(columns={'Neighborhood':'Neighborhood_'}, inplace=True)

Now, we can join the two datasets to include all the neighborhodd

In [31]:
toronto_neigh_w_venues = toronto_neigh[["PostalCode"]].join(toronto_grouped.set_index("PostalCode"), on="PostalCode", how='left')

print(toronto_neigh_w_venues.shape)
toronto_neigh_w_venues.head()

(103, 267)


Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
3,M6A,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0
4,M7A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025


Let's look at the column associated with the neighborhood with no venues

In [32]:
toronto_neigh_w_venues[postalcode_wo_venues_truth_table['PostalCode']]

Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
5,M9A,,,,,,,,,,...,,,,,,,,,,
11,M9B,,,,,,,,,,...,,,,,,,,,,
95,M1X,,,,,,,,,,...,,,,,,,,,,


Let's replace the NaN

In [33]:
toronto_neigh_w_venues.replace(np.nan, 0.0, inplace = True)
toronto_neigh_w_venues[postalcode_wo_venues_truth_table['PostalCode']]

Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
5,M9A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,M9B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,M1X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finally, we have our desire dataframe with every postal code

In [34]:
toronto_neigh_w_venues.head()

Unnamed: 0,PostalCode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
3,M6A,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0
4,M7A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025


## Let's find the most common venues per Postal Code

In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_neigh_w_venues['PostalCode']

for ind in np.arange(toronto_neigh_w_venues.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_neigh_w_venues.iloc[ind, :], num_top_venues)

print('Shape is',neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

Shape is (103, 11)


Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Park,Bus Stop,Food & Drink Shop,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Electronics Store
1,M4A,Coffee Shop,Grocery Store,Hockey Arena,Portuguese Restaurant,Yoga Studio,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,M5A,Coffee Shop,Pub,Park,Bakery,Theater,Restaurant,Café,Breakfast Spot,Hotel,Beer Store
3,M6A,Clothing Store,Furniture / Home Store,Accessories Store,Vietnamese Restaurant,Carpet Store,Miscellaneous Shop,Event Space,Coffee Shop,Boutique,Drugstore
4,M7A,Coffee Shop,Sushi Restaurant,Diner,Juice Bar,Bank,Bar,Café,Spa,Beer Bar,Yoga Studio


## Let's cluster the postal code

#### K-Means Clustering

In [37]:
# set number of clusters
kclusters = 6

toronto_clustering = toronto_neigh_w_venues.drop(['PostalCode'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 1, 1, 1, 1, 3, 1, 1, 1], dtype=int32)

#### Add these label to above dataframe

In [38]:
toronto_clust_neigh_venues_sorted = neighborhoods_venues_sorted.copy()
toronto_clust_neigh_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [39]:
toronto_clust_neigh_venues_sorted.head()

Unnamed: 0,Cluster Labels,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,M3A,Park,Bus Stop,Food & Drink Shop,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Electronics Store
1,1,M4A,Coffee Shop,Grocery Store,Hockey Arena,Portuguese Restaurant,Yoga Studio,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,1,M5A,Coffee Shop,Pub,Park,Bakery,Theater,Restaurant,Café,Breakfast Spot,Hotel,Beer Store
3,1,M6A,Clothing Store,Furniture / Home Store,Accessories Store,Vietnamese Restaurant,Carpet Store,Miscellaneous Shop,Event Space,Coffee Shop,Boutique,Drugstore
4,1,M7A,Coffee Shop,Sushi Restaurant,Diner,Juice Bar,Bank,Bar,Café,Spa,Beer Bar,Yoga Studio


#### Retrieve Neighborhood and Location for the cluster

Let's merge the above dataframe with the toronto_neigh one

In [40]:
toronto_merged = toronto_neigh

# merge toronto_grouped with toronto_data to add Borough/Neighborhood/latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_clust_neigh_venues_sorted.set_index('PostalCode'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Bus Stop,Food & Drink Shop,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Electronics Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Grocery Store,Hockey Arena,Portuguese Restaurant,Yoga Studio,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636,1,Coffee Shop,Pub,Park,Bakery,Theater,Restaurant,Café,Breakfast Spot,Hotel,Beer Store
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763,1,Clothing Store,Furniture / Home Store,Accessories Store,Vietnamese Restaurant,Carpet Store,Miscellaneous Shop,Event Space,Coffee Shop,Boutique,Drugstore
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Diner,Juice Bar,Bank,Bar,Café,Spa,Beer Bar,Yoga Studio


#### Visualize the resulting clusters

In [41]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, pc, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(pc)+', ' + str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Cluster

### Cluster 1

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,0,Park,Bus Stop,Food & Drink Shop,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Electronics Store
16,M6C,York,Humewood-Cedarvale,0,Park,Hockey Arena,Field,Trail,Yoga Studio,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
21,M6E,York,Caledonia-Fairbanks,0,Park,Women's Store,Pool,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
35,M4J,East York,East Toronto,0,Park,Coffee Shop,Metro Station,Convenience Store,Yoga Studio,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
40,M3K,North York,Downsview,0,Park,Airport,Business Service,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dance Studio,Distribution Center
49,M6L,North York,North Park / Maple Leaf Park / Upwood Park,0,Park,Construction & Landscaping,Bakery,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dance Studio
61,M4N,Central Toronto,Lawrence Park,0,Swim School,Park,Bus Line,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Yoga Studio
64,M9N,York,Weston,0,Park,Convenience Store,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center,Curling Ice
66,M2P,North York,York Mills West,0,Park,Bank,Convenience Store,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center,Dance Studio
85,M1V,Scarborough,Milliken / Agincourt North / Steeles East / L'...,0,Coffee Shop,Park,Playground,Discount Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner


### Cluster 2

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4A,North York,Victoria Village,1,Coffee Shop,Grocery Store,Hockey Arena,Portuguese Restaurant,Yoga Studio,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,M5A,Downtown Toronto,Regent Park / Harbourfront,1,Coffee Shop,Pub,Park,Bakery,Theater,Restaurant,Café,Breakfast Spot,Hotel,Beer Store
3,M6A,North York,Lawrence Manor / Lawrence Heights,1,Clothing Store,Furniture / Home Store,Accessories Store,Vietnamese Restaurant,Carpet Store,Miscellaneous Shop,Event Space,Coffee Shop,Boutique,Drugstore
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,1,Coffee Shop,Sushi Restaurant,Diner,Juice Bar,Bank,Bar,Café,Spa,Beer Bar,Yoga Studio
5,M9A,Etobicoke,Islington Avenue,1,Yoga Studio,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,M5X,Downtown Toronto,First Canadian Place / Underground city,1,Coffee Shop,Café,Hotel,Gym,Restaurant,Japanese Restaurant,Salad Place,American Restaurant,Steakhouse,Asian Restaurant
99,M4Y,Downtown Toronto,Church and Wellesley,1,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Men's Store,Hotel,Mediterranean Restaurant
100,M7Y,East Toronto,Business reply mail Processing CentrE,1,Light Rail Station,Yoga Studio,Comic Shop,Spa,Farmers Market,Pizza Place,Fast Food Restaurant,Restaurant,Park,Auto Workshop
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,1,Construction & Landscaping,Baseball Field,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Yoga Studio,Deli / Bodega


### Cluster 3

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,M1J,Scarborough,Scarborough Village,2,Convenience Store,Playground,Yoga Studio,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center
83,M4T,Central Toronto,Moore Park / Summerhill East,2,Playground,Yoga Studio,Discount Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center


### Cluster 4

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M1B,Scarborough,Malvern / Rouge,3,Fast Food Restaurant,Yoga Studio,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run,Distribution Center


### Cluster 5

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
52,M2M,North York,Willowdale / Newtonbrook,4,Home Service,Yoga Studio,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center,Curling Ice


### Cluster 6

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 5, toronto_merged.columns[[0,1,2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,M6B,North York,Glencairn,5,Pizza Place,Pub,Japanese Restaurant,Yoga Studio,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
50,M9L,North York,Humber Summit,5,Pizza Place,Yoga Studio,Discount Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center
63,M6N,York,Runnymede / The Junction North,5,Grocery Store,Brewery,Bus Line,Pizza Place,Yoga Studio,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
70,M9P,Etobicoke,Westmount,5,Pizza Place,Coffee Shop,Sandwich Place,Intersection,Chinese Restaurant,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Curling Ice
77,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,5,Mobile Phone Shop,Bus Line,Pizza Place,Sandwich Place,Yoga Studio,Dim Sum Restaurant,Deli / Bodega,Department Store,Dessert Shop,Diner
