# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

The needed libraries are imported, among them requests to fetch the file and BeautifulSoup to parse it

In [31]:
# installing geocoder, geopy and folium
!pip3 install geocoder --user
!pip3 install geopy --user
!pip3 install folium --user

# importing os for reading the Foursquare credentials from the environment
import os

# importing math for NaN processing
import math

# importing pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# importing requests for web page retrieval
import requests

# importing Beautiful soup for web page parsing
from bs4 import BeautifulSoup as BS

# importing matplotlib for map colors
import matplotlib.cm as cm
import matplotlib.colors as colors

# importing geocoder, Nominatim and folium for map generation
import geocoder 
from geopy.geocoders import Nominatim 
import folium

# importing KMeans for clustering
from sklearn.cluster import KMeans

You should consider upgrading via the 'pip install --upgrade pip' command.
You should consider upgrading via the 'pip install --upgrade pip' command.
You should consider upgrading via the 'pip install --upgrade pip' command.


## First Part: Retrieving the Table and Processing it

Using requests to fetch the Wikipedia page

In [2]:
# loading text from the web page

# source url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# performing the request; the timeout keeps the cell from hanging forever and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting the following cells parse an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
file = response.text

Parsing the text with Beautiful Soup and retrieving the data table

In [3]:
# building a navigable tree from the downloaded HTML with the lxml parser
parsable_file = BS(markup=file, features='lxml')

# picking the first <table> element found in the page
data_table = parsable_file.find(name='table')

Converting the table into a dataframe

In [4]:
# read_html returns a list of dataframes, one per table found in the HTML
# (first row used as header)
# NOTE(review): the name `list` shadows the Python builtin; it is kept because
# the next cell reads `list[0]`
list = pd.read_html(io=str(data_table), header=0)
list

[    Postcode           Borough  \
 0        M1A      Not assigned   
 1        M2A      Not assigned   
 2        M3A        North York   
 3        M4A        North York   
 4        M5A  Downtown Toronto   
 5        M5A  Downtown Toronto   
 6        M6A        North York   
 7        M6A        North York   
 8        M7A      Queen's Park   
 9        M8A      Not assigned   
 10       M9A         Etobicoke   
 11       M1B       Scarborough   
 12       M1B       Scarborough   
 13       M2B      Not assigned   
 14       M3B        North York   
 15       M4B         East York   
 16       M4B         East York   
 17       M5B  Downtown Toronto   
 18       M5B  Downtown Toronto   
 19       M6B        North York   
 20       M7B      Not assigned   
 21       M8B      Not assigned   
 22       M9B         Etobicoke   
 23       M9B         Etobicoke   
 24       M9B         Etobicoke   
 25       M9B         Etobicoke   
 26       M9B         Etobicoke   
 27       M1C       

In [5]:
# the first entry of the list returned by read_html is the table of interest
df = list[0]
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Ignoring the rows with a not assigned borough

In [6]:
# keeping only the rows whose Borough differs from "Not assigned";
# reset_index() keeps the old row labels in an extra "index" column
df = df.loc[df['Borough'] != "Not assigned"].reset_index()
df.head(5)

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


Combining in the same row the neighbourhoods corresponding to the same postcode

In [7]:
# one row per (Postcode, Borough) pair, with its neighbourhood names
# concatenated into a single comma-separated string
df = (
    df.groupby(by=['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ','.join(names))
    .reset_index()
)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Managing the not assigned neighbourhoods

In [8]:
# substituting the corresponding borough value into the not assigned neighbourhoods
for i, neighbourhood in enumerate(df.Neighbourhood):
    if neighbourhood == 'Not assigned':
        df.Neighbourhood[i] = df.Borough[i]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Showing the final dataframe shape

In [9]:
# (number of rows, number of columns) of the cleaned dataframe
df.shape

(103, 3)

## Second Part: Retrieving the coordinates of each Postal Code

Trying to use geocoder

In [None]:
# initializing coordinate variables 
latitude = []
longitude = []

# maximum number of geocoding attempts per postal code; without a bound the
# retry loop never ends when the service keeps returning no result
MAX_ATTEMPTS = 5

# number of postal codes to geocode
length = len(df)

# initializing postal code
postal_code = df.loc[:,'Postcode']

# asking the geocoder for each postal code, retrying a bounded number of times
for i in range(length):
    lat_lng_coords = None
    attempts = 0
    print("entered the for loop "+str(i))
    while lat_lng_coords is None and attempts < MAX_ATTEMPTS:
        print("entered the while loop "+str(i))
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[i]))
        print("went past geocoder "+str(i))
        lat_lng_coords = g.latlng
        attempts += 1
    if lat_lng_coords is None:
        # no answer after MAX_ATTEMPTS: record missing values so both lists
        # stay aligned with the rows of df
        latitude.append(np.nan)
        longitude.append(np.nan)
    else:
        latitude.append(lat_lng_coords[0])
        longitude.append(lat_lng_coords[1])
    
print(latitude)
print(longitude)

**Unfortunately, the geocoder gets stuck and the code never reaches the print("went past geocoder "+str(i)) instruction**

Using the csv instead to retrieve coordinates

In [10]:
# reading the postal code coordinates straight from the remote CSV file
geosp_url = 'https://cocl.us/Geospatial_data'
df_geosp = pd.read_csv(filepath_or_buffer=geosp_url)
df_geosp.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# sanity check: the coordinates table should have as many rows as df
# (103, one per postal code)
df_geosp.shape

(103, 3)

Joining the two dataframes on the postal code

In [12]:
# indexing the coordinates by postal code, then attaching Latitude/Longitude
# to every row of df whose Postcode matches that index
coords_by_postcode = df_geosp.set_index('Postal Code')
df_final = df.join(coords_by_postcode, on='Postcode')

In [13]:
# checking the final result: first ten rows of the joined dataframe
df_final.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Third Part: Clustering Neighborhoods in Toronto

Retrieving Toronto latitude and longitude

In [14]:
# free-text address resolved through Nominatim
address = 'Toronto, ON'

# Nominatim requires a user agent identifying the application
geolocator = Nominatim(user_agent="trnt_explorer")
location = geolocator.geocode(address)

# coordinates used to centre the maps below
latitude, longitude = location.latitude, location.longitude

print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


Creating a map of Toronto with all neighbourhoods

In [15]:
# base map centred on the Toronto coordinates retrieved above
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# one circle marker per row of df_final, with "<neighbourhoods>, <borough>" as popup
marker_rows = zip(df_final['Latitude'], df_final['Longitude'],
                  df_final['Borough'], df_final['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_Toronto)

map_Toronto

Storing my FourSquare credentials in variables

In [42]:
# Foursquare credentials are read from environment variables so that they are
# never stored in the notebook or shown in its outputs.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # my Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

# reporting only whether the credentials were found, never their values
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

My credentials:
CLIENT_ID: [redacted]
CLIENT_SECRET:[redacted]


Creating a function that retrieves up to 100 venues within a radius of 500 (meters, per the Foursquare API) around each given point

In [17]:
# default search radius around each point
radius = 500

# maximum number of venues requested from the Foursquare API per point
LIMIT = 100

# function retrieving the closer and most active venues for a list of points
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # creating the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # making the request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # returning the relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    #inserting the venues into a dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Applying the function to all the neighbourhoods stored in my dataframe

In [20]:
# collecting, for every row of df_final, the venues found around its coordinates
# (one output row per venue)
toronto_venues = getNearbyVenues(
    df_final['Neighbourhood'],
    df_final['Latitude'],
    df_final['Longitude'],
)

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
The Beaches West,Indi

In [21]:
# checking the dataframe was correctly created: first five venue rows
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,"Guildwood,Morningside,West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood,Morningside,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


As a preparatory step for clustering, the venue categories are transformed in one hot encoded features, and the neighborhood column is added

In [23]:
# one indicator column per venue category (no prefix on the column names),
# plus the neighbourhood each venue row belongs to
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']], prefix="", prefix_sep=""
).assign(Neighborhood=toronto_venues['Neighborhood'])

toronto_onehot.head(5)

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The one hot encoded table is grouped by neighborhood so that, for each neighborhood, the mean of each feature (i.e. the relative frequency of each venue category) is reported

In [25]:
# averaging the indicator columns per neighbourhood; 'Neighborhood' stays a
# regular column instead of becoming the index
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.030000,...,0.0,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.0,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,"Alderwood,Long Branch",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
5,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
7,"Bedford Park,Lawrence Manor East",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.043478,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.017857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
9,"Birch Cliff,Cliffside West",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


Clustering is performed on the neighborhoods by features

In [26]:
# setting number of clusters
kclusters = 5

# dropping the label column so that only the numeric features are clustered;
# the `columns=` keyword is used because passing the axis positionally
# (`drop('Neighborhood', 1)`) raises a TypeError from pandas 2.0 onwards
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# running k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# checking cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 4, 3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int32)

A function returning the top venues for a row

In [27]:
# ranks the categories of one row and returns the names of the top ones
def return_most_common_venues(row, num_top_venues): 
    """Return the labels of the `num_top_venues` largest values in `row`.

    The first element of `row` is skipped (in this notebook it holds the
    neighbourhood name); the remaining values are sorted in descending order
    and the index labels of the leading entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

For each neighborhood (listed in alphabetical order), the 10 most common venue categories are reported

In [28]:
# number of top venues to be reported
num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# creating columns according to number of top venues
# (an explicit bounds check replaces the former bare `except:`, which would
# also have hidden unrelated errors)
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))
        
# creating a new dataframe with one row per neighbourhood
Neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
Neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# filling each row with the names of its most frequent venue categories
for ind in np.arange(toronto_grouped.shape[0]): 
    Neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

Neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,Gym,Asian Restaurant,Hotel,Restaurant,American Restaurant
1,Agincourt,Breakfast Spot,Lounge,Skating Rink,Clothing Store,Dumpling Restaurant,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Playground,Yoga Studio,Drugstore,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Fast Food Restaurant,Pizza Place,Sandwich Place,Beer Store,Coffee Shop,Fried Chicken Joint,Pharmacy,Electronics Store,Empanada Restaurant
4,"Alderwood,Long Branch",Pizza Place,Gym,Sandwich Place,Dance Studio,Pharmacy,Coffee Shop,Skating Rink,Pub,Doner Restaurant,Discount Store
5,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Deli / Bodega,Pizza Place,Middle Eastern Restaurant,Sushi Restaurant,Bank,Bridal Shop,Diner,Fried Chicken Joint,Frozen Yogurt Shop
6,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio
7,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Sushi Restaurant,Greek Restaurant,Thai Restaurant,Liquor Store,Comfort Food Restaurant,Fast Food Restaurant,Sandwich Place,Juice Bar
8,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Bakery,Café,Beer Bar,Seafood Restaurant,Steakhouse,Farmers Market,Jazz Club
9,"Birch Cliff,Cliffside West",College Stadium,Café,Skating Rink,General Entertainment,Donut Shop,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Drugstore


Cluster labels and latitude and longitude are added

In [29]:
# adding clustering labels as the first column; a 'Cluster Labels' column left
# over from a previous run of this cell is dropped first, so re-running the
# cell never produces duplicated columns
if 'Cluster Labels' in Neighborhoods_venues_sorted.columns:
    Neighborhoods_venues_sorted = Neighborhoods_venues_sorted.drop(columns='Cluster Labels')
Neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# joining df_final (postcode, borough, coordinates) with the cluster label and
# top venues of each neighbourhood; rows without venues get NaN in the new columns
toronto_merged = df_final.join(Neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,3.0,Fast Food Restaurant,Dumpling Restaurant,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Harbor / Marina
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,0.0,Moving Target,Bar,Yoga Studio,Diner,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,0.0,Breakfast Spot,Electronics Store,Rental Car Location,Mexican Restaurant,Medical Center,Intersection,Pizza Place,Eastern European Restaurant,Empanada Restaurant,Diner
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Pharmacy,Korean Restaurant,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Lounge,Bakery,Caribbean Restaurant,Fried Chicken Joint,Athletics & Sports,Thai Restaurant,Bank,Hakka Restaurant,Empanada Restaurant,Electronics Store


The map showing the clusters is created

In [41]:
# creating map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# setting color scheme: one colour per cluster plus one extra colour for the
# rows that have no cluster label (NaN after the join)
unassigned_index = kclusters
colors_array = cm.rainbow(np.linspace(0, 1, kclusters + 1))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# adding markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # indexing by the label itself: the former `cluster-1` sent cluster 0 to
    # index -1, i.e. the same colour used for the unassigned rows
    color_index = unassigned_index if math.isnan(cluster) else int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label, 
        color=rainbow[color_index],
        fill=True,
        fill_color=rainbow[color_index],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
 