# Capstone Week3 : Segmenting and Clustering Neighborhoods in Toronto

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import numpy as np

## Web scrapping the wikipedia page:https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

#### Read the Wikipedia page directly wti the pandas read_html module .. and keep first table of that page as this is the one we want 

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(url) # Returns list of all tables on page
# First table of that page is what we need .. 
toronto_neigh = tables[0]

# rename the column Neighbourhood to Neighborhood for consistency with other resources later in this lab
toronto_neigh.rename(columns={"Neighbourhood":"Neighborhood"},inplace=True)

print (toronto_neigh.dtypes, toronto_neigh.shape)


Postal Code     object
Borough         object
Neighborhood    object
dtype: object (180, 3)


#### Clean the data frame by removing the Borought with 'not assigned' value then grouping based on Postal Code and Borough to aggregate the Neighborhood in a single line

In [4]:
# clean and format data frame :
# First remove spaces in the column name, as a good pratice 
toronto_neigh.rename(index=str, columns={"Postal Code": "PostalCode"}, inplace = True)
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
toronto_neigh = toronto_neigh.loc[toronto_neigh['Borough']!='Not assigned']
#More than one neighborhood can exist in one postal code area ...  These two rows will be combined into one row with the neighborhoods separated with a comma 
toronto_grouped = toronto_neigh.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
mask = toronto_grouped['Neighborhood'] == "Not assigned"
toronto_grouped.loc[mask, 'Neighborhood'] = toronto_grouped.loc[mask, 'Borough']

print ('Dataframe shape:',toronto_grouped.shape)
display (toronto_grouped.head())

Dataframe shape: (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Work with the coordinate Latitude Longitude, based on csv file : https://cocl.us/Geospatial_data and merge dataframes

In [5]:
# read the csv file cintaing Coordinates and convert to dataframe + merge to our previous df from wikipedia
import io
import requests
url = "https://cocl.us/Geospatial_data"
src = requests.get(url).content  # get raw formatted content data from that url
coord = pd.read_csv(io.StringIO(src.decode('utf-8'))) # pass to pandas read_csv a 'file-like' object the raw we got from prev step

# again rename header to remove space 
coord.rename(index=str, columns={"Postal Code": "PostalCode"}, inplace = True)

# mergde our datafile wth the coordinate dataframe
neigh = pd.merge(toronto_grouped, coord, on='PostalCode', how='inner') # Join info on PostalCode which is our unique value, common to both dataframe

display (neigh.head())

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Explore and cluster the neighborhoods in Toronto

#### We keep only the Borough having 'Downtown Toronto' in their name . This sublist will be our focus for the coming investigations

In [14]:
# filter our df only on the Borough having Toronto in their name 
toronto_data = neigh[neigh['Borough'].str.contains('Downtown Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


#### Visualize our selected borough in a map using Folium

In [17]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# visualize the Tornonto neighbour map whenever there is Toronto in Borough 
# create map of Manhattan using latitude and longitude values. centered  on first Borough , to start
latitude = toronto_data.Latitude[0]
longitude = toronto_data.Longitude[0]
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add Borough long/lat markers to map
for postcode, lat, lng, label in zip(toronto_data['PostalCode'], toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(str(postcode)+ str(' | ') + label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [12]:
##My foursquar credentials 

CLIENT_ID = ' ' # your Foursquare ID
CLIENT_SECRET = ' ' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
#print('Your credentails:')
#print('CLIENT_ID: ' + CLIENT_ID)
#print('CLIENT_SECRET:' + CLIENT_SECRET)

### Explore the Toronto Neighborhood

#### Neighborhood exploration will rely on FourSquare API, with a given central coordinate we are going to ask what kind of venue there are around that point, limiting to a given radius and up to 100 results per query

borrowed from Coursera Lab that function will help to explore - based on coordinates being pushed - what are the interesting venues around. To do that we're querying the foursquare API passing along the coordinate of each of our Downtown Toronto places from our list, with a relatively low radius to get meaningful clustering afterwards as we are focusing on downtown narrowed place.

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=300):
    LIMIT = 100 # limit of number of venues returned by Foursquare API
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
# Get the list of venues along with their coordinates, for our selected borough from our dataframe toronto_data
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

#### Look at Neighborhood having the most Venues , by Category .. 

In [21]:
display (toronto_venues.groupby(['Neighborhood','Venue Category']).count().sort_values(['Venue'], ascending=False).head(15))  

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Neighborhood,Venue Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Toronto Dominion Centre, Design Exchange",Coffee Shop,13,13,13,13,13
Central Bay Street,Coffee Shop,12,12,12,12,12
"Commerce Court, Victoria Hotel",Coffee Shop,11,11,11,11,11
"Harbourfront East, Union Station, Toronto Islands",Coffee Shop,9,9,9,9,9
"Garden District, Ryerson",Coffee Shop,8,8,8,8,8
"First Canadian Place, Underground city",Coffee Shop,8,8,8,8,8
"Kensington Market, Chinatown, Grange Park",Chinese Restaurant,5,5,5,5,5
"Kensington Market, Chinatown, Grange Park",Café,5,5,5,5,5
Church and Wellesley,Gay Bar,4,4,4,4,4
"First Canadian Place, Underground city",Salad Place,4,4,4,4,4


#### How many venue's categories do we have and which category is the most seen  

List the top 15 Venues we observed from our investigation .. There a lot of Coffee shop around :-) . first observation is that there is not much shops in downtown, but rather restaurant, coffee and a few Gym places

In [22]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

print (toronto_venues.groupby(['Venue Category']).count().sort_values(['Venue'], ascending=False)['Venue'].head(15))


There are 168 uniques categories.
Venue Category
Coffee Shop            84
Café                   39
Restaurant             27
Japanese Restaurant    16
Italian Restaurant     15
Deli / Bodega          15
Sandwich Place         13
Bank                   12
Salad Place            12
Hotel                  12
Thai Restaurant        12
Bar                    11
Pizza Place            11
Gym                    11
American Restaurant    10
Name: Venue, dtype: int64


#### Let's the One Hot Encoding to extract the dummy variables of those venues. We do that to help understanding what is the most frequent venue in each neighborhood

In [23]:
pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

Unnamed: 0,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Analyse each neighborhood

In [24]:
# one hot encoding
toronto_dum = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_dum['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column to make easier to read for us as we have many Venue category
colname = "Neighborhood"
neighcol = toronto_dum.pop(colname)
toronto_dum.insert(0, colname, neighcol)

toronto_dum.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,...,Theme Restaurant,Thrift / Vintage Store,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [25]:
toronto_grouped = toronto_dum.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,...,Theme Restaurant,Thrift / Vintage Store,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.018868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018868,0.0,0.0,0.0,0.018868,0.0,0.0,0.0,0.018868,0.0


#### Let's see what are the top5 most common venue for each Neighborhood 

In [26]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0             Pub  0.12
1        Beer Bar  0.12
2    Concert Hall  0.12
3   Grocery Store  0.12
4  Breakfast Spot  0.12


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0         Coffee Shop   0.2
1        Airport Gate   0.2
2      Airport Lounge   0.2
3    Airport Terminal   0.2
4  Airport Food Court   0.2


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.36
1                Café  0.09
2          Restaurant  0.06
3      Sandwich Place  0.06
4  Italian Restaurant  0.06


----Christie----
                 venue  freq
0        Grocery Store  0.25
1          Flower Shop  0.12
2                 Café  0.12
3  American Restaurant  0.12
4          Candy Store  0.12


----Church and Wellesley----
                 venue  freq
0              Gay Bar  0.08
1          Coffee Shop  0.06
2  Japanese Restaurant  0.04
3         Dess

In [27]:
np.arange(toronto_grouped.shape[0])
toronto_grouped.shape

(18, 168)

#### Build a dataframe out of this  and get the top5 Most common Venue

In [41]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]



In [42]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
#del neighborhoods_venues_sorted
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Concert Hall,Pub,Breakfast Spot,Liquor Store,Beer Bar
1,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal
2,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Restaurant,Sandwich Place
3,Christie,Grocery Store,Flower Shop,American Restaurant,Café,Candy Store
4,Church and Wellesley,Gay Bar,Coffee Shop,Burger Joint,Dessert Shop,Japanese Restaurant


## Clustering  Neighborhoods

In [46]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(100) # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,,,,,,
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,1.0,Café,Coffee Shop,Restaurant,Pizza Place,Indian Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1.0,Gay Bar,Coffee Shop,Burger Joint,Dessert Shop,Japanese Restaurant
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1.0,Park,History Museum,Bakery,Italian Restaurant,Food Truck
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1.0,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Movie Theater
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Gastropub,Coffee Shop,Restaurant,Japanese Restaurant,BBQ Joint
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,4.0,Concert Hall,Pub,Breakfast Spot,Liquor Store,Beer Bar
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0.0,Coffee Shop,Café,Italian Restaurant,Restaurant,Sandwich Place
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,1.0,Coffee Shop,Steakhouse,Asian Restaurant,Seafood Restaurant,Thai Restaurant
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,1.0,Coffee Shop,Café,Plaza,Sandwich Place,Boat or Ferry


#### It appears that for Rosedal we don't have any information back from FourSquare , we need to remove that from dataframe


In [58]:
print(toronto_venues.shape)
print (toronto_venues.loc[(toronto_venues.Neighborhood == 'Rosedale')])

(698, 7)
Empty DataFrame
Columns: [Neighborhood, Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude, Venue Longitude, Venue Category]
Index: []


In [69]:
toronto_merged.dropna(inplace=True)
# for some reason the Cluster labels is sometime showing as float which is a problem for the colouring for Folium map
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged.head(100) 
#toronto_merged.dtypes

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,1,Café,Coffee Shop,Restaurant,Pizza Place,Indian Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,Gay Bar,Coffee Shop,Burger Joint,Dessert Shop,Japanese Restaurant
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Park,History Museum,Bakery,Italian Restaurant,Food Truck
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Movie Theater
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Gastropub,Coffee Shop,Restaurant,Japanese Restaurant,BBQ Joint
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,4,Concert Hall,Pub,Breakfast Spot,Liquor Store,Beer Bar
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Café,Italian Restaurant,Restaurant,Sandwich Place
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,1,Coffee Shop,Steakhouse,Asian Restaurant,Seafood Restaurant,Thai Restaurant
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,1,Coffee Shop,Café,Plaza,Sandwich Place,Boat or Ferry
10,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,1,Coffee Shop,Deli / Bodega,Restaurant,Café,Salad Place


In [68]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, postal, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(postal) + str('-') + str(poi) + ' | Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine the clusters, report back the Postal code as it is more meaninfgul than Borough .. as there is multiple Postal code per Borough, Postal gives us a more consumable information
From below observation on each cluster, it appears that some of the Venue Category as we got them from FourSquare API could be groupped to have another cluster construction. e.g Café to be considered as CoffeeShope maybe? Depending on what are our criteria we may want to rework also the categories from FourSquare to group them by theme before doing the clustering.

In [70]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
7,M5G,Downtown Toronto,0,Coffee Shop,Café,Italian Restaurant,Restaurant,Sandwich Place
18,M7A,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Bubble Tea Shop,Park,Café


In [71]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M4X,Downtown Toronto,1,Café,Coffee Shop,Restaurant,Pizza Place,Indian Restaurant
2,M4Y,Downtown Toronto,1,Gay Bar,Coffee Shop,Burger Joint,Dessert Shop,Japanese Restaurant
3,M5A,Downtown Toronto,1,Park,History Museum,Bakery,Italian Restaurant,Food Truck
4,M5B,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Movie Theater
5,M5C,Downtown Toronto,1,Gastropub,Coffee Shop,Restaurant,Japanese Restaurant,BBQ Joint
8,M5H,Downtown Toronto,1,Coffee Shop,Steakhouse,Asian Restaurant,Seafood Restaurant,Thai Restaurant
9,M5J,Downtown Toronto,1,Coffee Shop,Café,Plaza,Sandwich Place,Boat or Ferry
10,M5K,Downtown Toronto,1,Coffee Shop,Deli / Bodega,Restaurant,Café,Salad Place
11,M5L,Downtown Toronto,1,Coffee Shop,Deli / Bodega,Restaurant,Café,Hotel
12,M5S,Downtown Toronto,1,Sandwich Place,Café,Bookstore,Yoga Studio,Pub


In [72]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
14,M5V,Downtown Toronto,2,Coffee Shop,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal


In [73]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
17,M6G,Downtown Toronto,3,Grocery Store,Flower Shop,American Restaurant,Café,Candy Store


In [74]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,M5E,Downtown Toronto,4,Concert Hall,Pub,Breakfast Spot,Liquor Store,Beer Bar
