# Segmenting and Clustering Neighborhoods in Toronto

   ## 1. Get Toronto Data from Wiki and build Data Frame

In [1]:
import urllib.request
import bs4 as bs
import pandas as pd

import numpy as np # library to handle data in a vectorized manner
import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [2]:
CLIENT_ID = 'IU3Z4AGNKVWG3JSLOZDHZXN5NRE4VWJOB51CUDT532CQETUI' # your Foursquare ID
CLIENT_SECRET = 'L1GYNUZG2DKGWLZEPUZ5LA5O3MGE5MTHBBCVNSXCDK32BIH0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 30

In [3]:
# Read the wiki page
# create the above dataframe:
# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["PostalCode", "Borough","Neighborhood"])
# df.head()
# print(  "1. count: " , df.shape[0])

In [4]:
# Read the wiki page
# create the above dataframe:
# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["PostalCode", "Borough","Neighborhood"])
# df.head()
# print(  "1. count: " , df.shape[0])

In [5]:
# Remove 1st row whic is all None
df = df[~df.isna().any(axis=1)]
# print(  "2. count: " , df.shape[0] )
# df.head()

In [6]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df.loc[df['Borough'].str.contains('Not assigned'), 'Borough'] = None
#df = df[~df.isna().any(axis=1)]
df = df.dropna(subset=['Borough'])
# print(  "3. count: " , df.shape[0] )
# df.head()

In [7]:
# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. 
# These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
# But, becuase the wiki alredy remvoved duplicate postal codes we are only going to cleanup the data in the Neighborhood column
# Also remove new line from HTML
for i in df.index: 
     df['PostalCode'][i] = df['PostalCode'][i].replace('\n','')
     df['Borough'][i] = df['Borough'][i].replace('\n','')
     df['Neighborhood'][i] = df['Neighborhood'][i].replace('\n','')
     df['Neighborhood'][i] = df['Neighborhood'][i].replace(' / ',', ')

In [8]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df.loc[df['Neighborhood'].str.contains('Not assigned'), 'Neighborhood'] 

Series([], Name: Neighborhood, dtype: object)

In [9]:
# In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.
print(  "3. count: " , df.shape[0] )

3. count:  103


In [10]:
!wget -q -O 'Geospatial_Coordinates.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')


Data downloaded!


In [11]:
header_list = ["PostalCode", "Latitude", "Longitude"]
ll_df = pd.read_csv('Geospatial_Coordinates.csv', names=header_list, skiprows=1)
ll_df.set_index('PostalCode')
ll_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Function to get Latitude from dataframe created from Geospatial_Coordinates.csv 
def find_latitude(postal_code):
    return ll_df.loc[ll_df['PostalCode'] == postal_code]['Latitude'].item()

In [13]:
# Function to get Longitude from dataframe created from Geospatial_Coordinates.csv 
def find_longitude(postal_code):
    return ll_df.loc[ll_df['PostalCode'] == postal_code]['Longitude'].item()

In [14]:
df['Latitude'] = None
df['Longitude'] = None
for i in df.index: 
     df['Latitude'][i] = find_latitude( df['PostalCode'][i] )
     df['Longitude'][i] = find_longitude( df['PostalCode'][i] )

In [15]:
some_values = ['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V', 'M9L','M5V','M1B','M5A']
df.loc[df['PostalCode'].isin(some_values)]
#df.loc[df['PostalCode'] == 'M9W']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
10,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
13,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7064,-79.3099
40,M4G,East York,Leaside,43.7091,-79.3635
41,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
47,M2H,North York,Hillcrest Village,43.8038,-79.3635
55,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
81,M9L,North York,Humber Summit,43.7563,-79.566
85,M4M,East Toronto,Studio District,43.6595,-79.3409
109,M1R,Scarborough,"Wexford, Maryvale",43.7501,-79.2958


In [16]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [17]:
df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [18]:
toronto_df = df.loc[df['Borough'].str.contains("Toronto") ].sort_values('Borough')
print( '{} Boroughs and {} Neighborhoods in {} Postal Codes'.format(
    len(toronto_df['Borough'].unique()), 
    toronto_df.shape[0],
    len(toronto_df['PostalCode'].unique()), 
    )
)
toronto_df

4 Boroughs and 39 Neighborhoods in 39 Postal Codes


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
95,M5N,Central Toronto,Roselawn,43.7117,-79.4169
103,M4P,Central Toronto,Davisville North,43.7128,-79.3902
104,M5P,Central Toronto,Forest Hill North & West,43.6969,-79.4113
112,M4R,Central Toronto,North Toronto West,43.7154,-79.4057
113,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.6727,-79.4057
121,M4S,Central Toronto,Davisville,43.7043,-79.3888
130,M4T,Central Toronto,"Moore Park, Summerhill East",43.6896,-79.3832
139,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.6864,-79.4
94,M4N,Central Toronto,Lawrence Park,43.728,-79.3888
131,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.6532,-79.4


   ## 2. Explore Neighborhoods in Toronto

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
toranto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )



Roselawn
Davisville North
Forest Hill North & West
North Toronto West
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Lawrence Park
Kensington Market, Chinatown, Grange Park
Church and Wellesley
University of Toronto, Harbord
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Regent Park, Harbourfront
Commerce Court, Victoria Hotel
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
Toronto Dominion Centre, Design Exchange
St. James Town
Berczy Park
Central Bay Street
Harbourfront East, Union Station, Toronto Islands
Christie
Richmond, Adelaide, King
The Beaches
Business reply mail Processing CentrE
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Dufferin, Dovercourt Village
Parkdale, Roncesvalles
Little P

In [22]:
print(toranto_venues.shape)
toranto_venues.head()

(863, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden
1,Roselawn,43.711695,-79.416936,Menchie's St. Clair West,43.707664,-79.414301,Ice Cream Shop
2,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop
3,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park
4,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot


In [23]:
toranto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,30,30,30,30,30,30
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
Business reply mail Processing CentrE,17,17,17,17,17,17
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,30,30,30,30,30,30
Christie,18,18,18,18,18,18
Church and Wellesley,30,30,30,30,30,30
"Commerce Court, Victoria Hotel",30,30,30,30,30,30
Davisville,30,30,30,30,30,30
Davisville North,7,7,7,7,7,7


In [24]:
print('There are {} uniques categories.'.format(len(toranto_venues['Venue Category'].unique())))

There are 184 uniques categories.


## 3. Analyze Each Neighborhood

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toranto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toranto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Art Museum,...,Thai Restaurant,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
toronto_onehot.shape

(863, 184)

In [27]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,...,Thai Restaurant,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
toronto_grouped.shape

(39, 184)

#### Let's print each neighborhood along with the top 5 most common venues

In [29]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.10
1              Bakery  0.07
2                Café  0.07
3  Seafood Restaurant  0.07
4      Farmers Market  0.07


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.13
1     Coffee Shop  0.09
2       Nightclub  0.09
3  Breakfast Spot  0.09
4             Gym  0.04


----Business reply mail Processing CentrE----
            venue  freq
0      Smoke Shop  0.06
1   Auto Workshop  0.06
2         Brewery  0.06
3      Skate Park  0.06
4  Farmers Market  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3       Coffee Shop  0.06
4          Boutique  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.30
1                Café  0.07
2  Italian Restaurant  0.07
3      

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [30]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [31]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Café,Seafood Restaurant,Beer Bar,Bakery,Farmers Market,Park,Fish Market,Concert Hall
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Nightclub,Coffee Shop,Pet Store,Restaurant,Intersection,Burrito Place,Bar,Stadium
2,Business reply mail Processing CentrE,Light Rail Station,Burrito Place,Farmers Market,Fast Food Restaurant,Auto Workshop,Recording Studio,Gym / Fitness Center,Pizza Place,Spa,Smoke Shop
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Coffee Shop,Boutique,Rental Car Location,Plane,Harbor / Marina,Sculpture Garden
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Yoga Studio,Steakhouse,Middle Eastern Restaurant,Comic Shop,Park,Ramen Restaurant,Bubble Tea Shop


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [32]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [33]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
95,M5N,Central Toronto,Roselawn,43.7117,-79.4169,2,Garden,Ice Cream Shop,Wine Bar,Cuban Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
103,M4P,Central Toronto,Davisville North,43.7128,-79.3902,0,Park,Gym,Breakfast Spot,Sandwich Place,Department Store,Hotel,Food & Drink Shop,Comic Shop,Comfort Food Restaurant,Diner
104,M5P,Central Toronto,Forest Hill North & West,43.6969,-79.4113,3,Jewelry Store,Bus Line,Trail,Sushi Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
112,M4R,Central Toronto,North Toronto West,43.7154,-79.4057,0,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Fast Food Restaurant,Diner,Dessert Shop,Cosmetics Shop,Mexican Restaurant,Chinese Restaurant
113,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.6727,-79.4057,0,Café,Sandwich Place,Coffee Shop,Pizza Place,Burger Joint,Indian Restaurant,Donut Shop,History Museum,Pub,BBQ Joint


Finally, let's visualize the resulting clusters

In [34]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
103,Central Toronto,0,Park,Gym,Breakfast Spot,Sandwich Place,Department Store,Hotel,Food & Drink Shop,Comic Shop,Comfort Food Restaurant,Diner
112,Central Toronto,0,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Fast Food Restaurant,Diner,Dessert Shop,Cosmetics Shop,Mexican Restaurant,Chinese Restaurant
113,Central Toronto,0,Café,Sandwich Place,Coffee Shop,Pizza Place,Burger Joint,Indian Restaurant,Donut Shop,History Museum,Pub,BBQ Joint
121,Central Toronto,0,Dessert Shop,Gym,Sushi Restaurant,Italian Restaurant,Sandwich Place,Coffee Shop,Café,Pizza Place,Diner,Brewery
139,Central Toronto,0,Coffee Shop,Pub,Pizza Place,Restaurant,Bank,Sports Bar,Bagel Shop,Light Rail Station,Liquor Store,Sushi Restaurant
131,Downtown Toronto,0,Café,Mexican Restaurant,Vietnamese Restaurant,Bakery,Comfort Food Restaurant,Record Shop,Caribbean Restaurant,Cheese Shop,Pizza Place,Cocktail Bar
166,Downtown Toronto,0,Coffee Shop,Burger Joint,Men's Store,Bubble Tea Shop,Pizza Place,Smoke Shop,Creperie,Diner,Beer Bar,Martial Arts Dojo
122,Downtown Toronto,0,Café,Bakery,Restaurant,Japanese Restaurant,Italian Restaurant,Bookstore,Bar,Yoga Studio,Nightclub,Noodle House
140,Downtown Toronto,0,Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Coffee Shop,Boutique,Rental Car Location,Plane,Harbor / Marina,Sculpture Garden
149,Downtown Toronto,0,Café,Farmers Market,Cocktail Bar,Beer Bar,Bakery,Fish Market,Restaurant,Japanese Restaurant,Pub,Seafood Restaurant


#### Cluster 2

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
130,Central Toronto,1,Park,Trail,Playground,Summer Camp,Coworking Space,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
148,Downtown Toronto,1,Park,Trail,Playground,Cosmetics Shop,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega,Dance Studio


#### Cluster 3

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
95,Central Toronto,2,Garden,Ice Cream Shop,Wine Bar,Cuban Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


#### Cluster 4

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
104,Central Toronto,3,Jewelry Store,Bus Line,Trail,Sushi Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


#### Cluster 5

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
94,Central Toronto,4,Bus Line,Swim School,Park,College Gym,Creperie,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
