# Clustering the city of Toronto

Install and import needed packages

In [None]:
!conda install -c conda-forge folium

In [258]:
# import needed packages
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

## Reading dataset

Let's read the dataset created by the previous notebook (part 2) and take a quick look at it

In [20]:
# load the postcode table (borough, neighbourhood and coordinates) from the CSV file
df = pd.read_csv('toPostalDf2_ll.csv')

In [22]:
# rebuild a clean 0..n-1 index (the old one is discarded) and preview the first rows
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"HighlandCreek, RougeHill, PortUnion",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, WestHill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [23]:
# (number of rows, number of columns) of the postcode table
df.shape

(103, 5)

## Generating the Toronto Map

Let's get the latitude and longitude of the city of Toronto

In [24]:
# getting Toronto latitude and longitude with the Nominatim geocoder
geolocator = Nominatim(user_agent='foursquare_agent')
location = geolocator.geocode('Toronto, CA')

# geocode() returns None when the query matches nothing; stop here with a clear
# message instead of failing below with an AttributeError on None
if location is None:
    raise ValueError("Nominatim returned no result for 'Toronto, CA'")

print('Toronto latitude/longitude: ', location.latitude, '/', location.longitude)

Toronto latitude/longitude:  43.653963 / -79.387207


Now, let's plot the map

In [30]:
# generating the map, centered on the coordinates of Toronto
map_Toronto = folium.Map(location = (location.latitude, location.longitude), zoom_start=11)

# plotting one circle marker per postcode; the popup shows the neighbourhood
# name(s) followed by the postcode in parentheses
for lat, lng, name, postcode in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'], df['Postcode']):
    label = '{} ({})'.format(name, postcode)
    folium.CircleMarker(
        [lat, lng],
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = 'blue',
        # opacity is a number (as in the cluster map of this notebook), not a string
        fill_opacity = 0.3
    ).add_to(map_Toronto)

In [31]:
# render the map with the neighbourhood markers as the cell output
map_Toronto

## Skipping boroughs without the keyword 'Toronto'

Let's keep only the rows whose Borough field contains the word **Toronto**

In [284]:
# using just boroughs that contain the word Toronto
# - regex=False: 'Toronto' is a plain word, not a pattern
# - na=False: a missing Borough counts as "no match" instead of putting a NaN
#   into the mask, which boolean indexing rejects
# - copy(): the result is an independent frame rather than a slice of df
is_toronto = df['Borough'].str.contains('Toronto', regex=False, na=False)
df_TorontoOnly = df[is_toronto].copy()

## Getting data from Foursquare API

Setting the FourSquare API variables

**<font color="red">FOR SECURITY REASONS, THE FOURSQUARE CLIENT_ID AND CLIENT_SECRET WERE REMOVED FROM THE SHARED NOTEBOOK</font>**

In [46]:
# setting the Foursquare variables
# The credentials are read from the environment so they never have to be typed
# into (and shared with) the notebook; the '***' placeholders remain the
# fallback when the variables are not set.
fs_clientid = os.environ.get('FOURSQUARE_CLIENT_ID', '***')
fs_clientsecret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '***')
fs_version = '20190505'  # sent as the `v` query parameter
fs_limit = 50            # sent as the `limit` query parameter
fs_radius = 500          # sent as the `radius` query parameter

Getting the venues using the explore action. For each *Postcode*, query the Foursquare explore endpoint and save the venue information (name, category and location) into a list

In [None]:
# url template of the Foursquare "explore" request
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&limit={}&radius={}'

# one entry per postcode; each entry is the list of
# (postcode, venue name, category, latitude, longitude) tuples found around it
venues_list = []
for pcode, lat, lng in zip(df_TorontoOnly['Postcode'], df_TorontoOnly['Latitude'], df_TorontoOnly['Longitude']):

    # fetching foursquare: the timeout keeps a stalled connection from hanging
    # the notebook, and raise_for_status() turns an HTTP error response into an
    # explicit exception instead of a KeyError while reading the body below
    url_ = url.format(fs_clientid, fs_clientsecret, lat, lng, fs_version, fs_limit, fs_radius)
    response = requests.get(url_, timeout=30)
    response.raise_for_status()
    venues = response.json()['response']['groups'][0]['items']

    # append venues to list; a venue with an empty category list is skipped
    # because categories[0] would raise IndexError for it
    venues_list.append([(
        pcode,
        v['venue']['name'],
        v['venue']['categories'][0]['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng']
    ) for v in venues if v['venue']['categories']])

# show only how many venues were collected instead of dumping the raw list
sum(len(postcode_venues) for postcode_venues in venues_list)
        

Creating a new dataframe to hold the venue information, together with the postcode field

In [147]:
# create a new df in one step: flatten the per-postcode lists into a single
# list of (postcode, venue, category, latitude, longitude) tuples.
# Appending row by row copies the whole frame on every iteration, and
# DataFrame.append is no longer available in pandas 2.x.
venue_rows = [venue for postcode_venues in venues_list for venue in postcode_venues]
df_venues = pd.DataFrame(venue_rows, columns=['Postcode', 'Venue', 'Category', 'Latitude', 'Longitude'])

Let's take a quick look at this dataset

In [148]:
# preview the first venues
df_venues.head()

Unnamed: 0,Postcode,Venue,Category,Latitude,Longitude
0,M4E,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,M4E,Grover Pub and Grub,Pub,43.679181,-79.297215
2,M4E,St-Denis Studios Inc.,Music Venue,43.675031,-79.288022
3,M4E,Upper Beaches,Neighborhood,43.680563,-79.292869
4,M4K,Pantheon,Greek Restaurant,43.677621,-79.351434


In [150]:
# (number of venues, number of columns)
df_venues.shape

(1160, 5)

In [153]:
# non-null entries per column for each postcode, i.e. how many venues were collected per postcode
df_venues.groupby('Postcode').count()

Unnamed: 0_level_0,Venue,Category,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M4E,4,4,4,4
M4K,44,44,44,44
M4L,22,22,22,22
M4M,37,37,37,37
M4N,4,4,4,4
M4P,7,7,7,7
M4R,21,21,21,21
M4S,32,32,32,32
M4T,3,3,3,3
M4V,14,14,14,14


Now, let's one-hot encode the category to prepare the dataset for KMeans clustering

In [156]:
# one indicator column per venue category (column names are the bare category names)
torontoVenues_dummy = pd.get_dummies(df_venues[['Category']], prefix='', prefix_sep='')

# attach the postcode of every venue; the new column lands in the last position
torontoVenues_dummy['Postcode'] = df_venues['Postcode']

# reorder so that the last column (Postcode) comes first, followed by the categories
*category_columns, postcode_column = torontoVenues_dummy.columns
fixed_columns = [postcode_column, *category_columns]
torontoVenues_dummy = torontoVenues_dummy[fixed_columns]
torontoVenues_dummy.head()

Unnamed: 0,Postcode,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [157]:
# (number of venues, Postcode column + one column per category)
torontoVenues_dummy.shape

(1160, 217)

Grouping and getting the mean of each category, for each postcode

In [159]:
# average every indicator column per postcode, i.e. the share of each category
# among the venues of that postcode; Postcode stays a regular column
torontoVenues_grp = torontoVenues_dummy.groupby('Postcode', as_index=False).mean()
torontoVenues_grp.head()

Unnamed: 0,Postcode,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,...,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
# (number of postcodes with venues, Postcode column + one column per category)
torontoVenues_grp.shape

(38, 217)

Printing the top 5 common venues for each postcode

In [182]:
# number of categories printed per postcode
top_venues = 5
for postcode in torontoVenues_grp['Postcode']:
    print('---'+postcode+'---')
    # transpose the single row of this postcode into a (category, frequency) table
    freq_table = torontoVenues_grp[torontoVenues_grp['Postcode'] == postcode].T.reset_index()
    freq_table.columns = ['Venue', 'Freq']
    # the first row holds the Postcode itself, not a category; copy() so the
    # assignment below works on an independent frame, not on a slice
    freq_table = freq_table.iloc[1:].copy()
    freq_table['Freq'] = freq_table['Freq'].astype(float)
    freq_table = freq_table.round({'Freq':2})
    # honor top_venues instead of a hard-coded 5
    print(freq_table.sort_values('Freq', ascending=False).reset_index(drop=True).head(top_venues))
    print()


---M4E---
                     Venue  Freq
0        Health Food Store  0.25
1                      Pub  0.25
2              Music Venue  0.25
3             Neighborhood  0.25
4  New American Restaurant  0.00

---M4K---
                Venue  Freq
0    Greek Restaurant  0.18
1         Coffee Shop  0.09
2      Ice Cream Shop  0.07
3           Bookstore  0.05
4  Italian Restaurant  0.05

---M4L---
                Venue  Freq
0      Sandwich Place  0.09
1                Park  0.09
2  Light Rail Station  0.05
3             Brewery  0.05
4   Fish & Chips Shop  0.05

---M4M---
                 Venue  Freq
0                 Café  0.11
1          Coffee Shop  0.08
2   Italian Restaurant  0.05
3  American Restaurant  0.05
4               Bakery  0.05

---M4N---
                        Venue  Freq
0  Construction & Landscaping  0.25
1                        Park  0.25
2                 Swim School  0.25
3                    Bus Line  0.25
4              Adult Boutique  0.00

---M4P---
           

Generating the **Top 10 Common Venue DataFrame**. It will later hold the cluster labels as well

In [223]:
# function to sort venues
def sortVenues(row, topVenues=10):
    """Return the labels of the largest values of a row, best first.

    The first element of ``row`` is skipped (callers pass a row whose first
    entry is the Postcode). The remaining values are sorted in descending
    order and the index labels of the first ``topVenues`` of them are
    returned as a NumPy array.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.index.values[:topVenues]
    

In [276]:
# creating a dataset with the 10 most popular venue categories of each postcode
topVenues_ = 10

# 'Postcode' followed by one column per rank (1 .. topVenues_)
columns = ['Postcode'] + ['{} Most Common Avenue'.format(rank + 1) for rank in range(topVenues_)]

# create the new dataframe
df_commonVenues = pd.DataFrame(columns=columns)
df_commonVenues['Postcode'] = torontoVenues_grp['Postcode']

# fill the ranked categories row by row; topVenues_ is passed explicitly so the
# number of returned categories always matches the number of rank columns
for index in range(df_commonVenues.shape[0]):
    df_commonVenues.iloc[index, 1:] = sortVenues(torontoVenues_grp.iloc[index, :], topVenues_)
df_commonVenues.head()

Unnamed: 0,Postcode,1 Most Common Avenue,2 Most Common Avenue,3 Most Common Avenue,4 Most Common Avenue,5 Most Common Avenue,6 Most Common Avenue,7 Most Common Avenue,8 Most Common Avenue,9 Most Common Avenue,10 Most Common Avenue
0,M4E,Neighborhood,Music Venue,Health Food Store,Pub,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Furniture / Home Store,Italian Restaurant,Yoga Studio,Cosmetics Shop,Brewery,Bubble Tea Shop
2,M4L,Park,Sandwich Place,Italian Restaurant,Pet Store,Coffee Shop,Pub,Movie Theater,Burrito Place,Burger Joint,Brewery
3,M4M,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Yoga Studio,Fish Market,Pizza Place,Park,Neighborhood
4,M4N,Bus Line,Park,Construction & Landscaping,Swim School,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


## Clustering the dataset

Preparing the dataframe for the KMeans algorithm

In [277]:
# feature matrix for kmeans: the grouped frequencies without the Postcode column
torontoVenues_grp2 = torontoVenues_grp.drop(columns='Postcode')
torontoVenues_grp2.head()

Unnamed: 0,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,...,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Running the KMeans algorithm with 12 clusters

In [278]:
# number of clusters
k=12

# random_state fixes the centroid initialisation; n_init is given explicitly
# because its default differs between scikit-learn releases (10 vs 'auto'),
# which would otherwise make the labels depend on the installed version
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(torontoVenues_grp2)

# cluster labels of the first ten postcodes
kmeans.labels_[0:10]

array([0, 1, 1, 1, 6, 4, 1, 1, 8, 1], dtype=int32)

Inserting the cluster labels into the top 10 common venues dataframe

In [279]:
# insert labels into common venues df
# DataFrame.insert raises ValueError when the column already exists, so drop a
# label column left over from a previous run to keep this cell re-runnable
if 'Cluster Label' in df_commonVenues.columns:
    df_commonVenues = df_commonVenues.drop(columns='Cluster Label')
df_commonVenues.insert(0, 'Cluster Label', kmeans.labels_)
df_commonVenues.head()

Unnamed: 0,Cluster Label,Postcode,1 Most Common Avenue,2 Most Common Avenue,3 Most Common Avenue,4 Most Common Avenue,5 Most Common Avenue,6 Most Common Avenue,7 Most Common Avenue,8 Most Common Avenue,9 Most Common Avenue,10 Most Common Avenue
0,0,M4E,Neighborhood,Music Venue,Health Food Store,Pub,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
1,1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Furniture / Home Store,Italian Restaurant,Yoga Studio,Cosmetics Shop,Brewery,Bubble Tea Shop
2,1,M4L,Park,Sandwich Place,Italian Restaurant,Pet Store,Coffee Shop,Pub,Movie Theater,Burrito Place,Burger Joint,Brewery
3,1,M4M,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Yoga Studio,Fish Market,Pizza Place,Park,Neighborhood
4,6,M4N,Bus Line,Park,Construction & Landscaping,Swim School,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


Merging the other fields (neighbourhood, latitude and longitude) into the processed dataframe

In [280]:
# attach the cluster label and ranked venues to each neighbourhood row,
# matching on the postcode
venues_by_postcode = df_commonVenues.set_index('Postcode')
df_TorontoOnly2 = df_TorontoOnly.join(venues_by_postcode, on='Postcode')
df_TorontoOnly2.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Label,1 Most Common Avenue,2 Most Common Avenue,3 Most Common Avenue,4 Most Common Avenue,5 Most Common Avenue,6 Most Common Avenue,7 Most Common Avenue,8 Most Common Avenue,9 Most Common Avenue,10 Most Common Avenue
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Neighborhood,Music Venue,Health Food Store,Pub,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
1,41,M4K,East Toronto,"TheDanforthWest, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Furniture / Home Store,Italian Restaurant,Yoga Studio,Cosmetics Shop,Brewery,Bubble Tea Shop
2,42,M4L,East Toronto,"TheBeachesWest, IndiaBazaar",43.668999,-79.315572,1,Park,Sandwich Place,Italian Restaurant,Pet Store,Coffee Shop,Pub,Movie Theater,Burrito Place,Burger Joint,Brewery
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Yoga Studio,Fish Market,Pizza Place,Park,Neighborhood
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,6,Bus Line,Park,Construction & Landscaping,Swim School,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


## Plotting the map with clusters

In [281]:
# plot the cluster map, centered on the coordinates of Toronto
map_Toronto = folium.Map(location = (location.latitude, location.longitude), zoom_start=12)

# color scheme: one evenly spaced rainbow color per cluster, as hex strings
c_ = cm.rainbow(np.linspace(0, 1, k))
colorsCm = [colors.rgb2hex(i) for i in c_]

for lat, lng, nei, cluster in zip(df_TorontoOnly2['Latitude'], df_TorontoOnly2['Longitude'], df_TorontoOnly2['Neighbourhood'], df_TorontoOnly2['Cluster Label']):
    # the join keeps postcodes that have no row in the venues table; they
    # carry no cluster label (NaN) and therefore no color, so they are not drawn
    if pd.isna(cluster):
        continue
    # a NaN anywhere in the column turns the labels into floats, and list
    # indices must be integers
    cluster = int(cluster)
    label = folium.Popup(nei + '(cluster '+ str(cluster) + ')')
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color=colorsCm[cluster],
        fill=True,
        fill_color=colorsCm[cluster],
        fill_opacity=0.5
    ).add_to(map_Toronto)

In [282]:
# render the map with the cluster-colored markers as the cell output
map_Toronto