## Importing the libraries needed

In [1]:
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # converts an address into latitude and longitude values

!pip install geocoder
import geocoder

import json
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests 
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
from sklearn import preprocessing

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    attrs-21.2.0               |     pyhd8ed1ab_0          44 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    ca-certificates-2020.12.5  |       ha878542_0         137 KB  conda-forge
    entrypoints-0.3            |  pyhd8ed1ab_1003           8 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    jsonschema-3.2.0           |     pyhd8

# Part 1 
### Retrieving, cleaning and organizing the data from the Wikipedia table

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r=requests.get(url)
toronto_list= pd.read_html(r.text)
toronto=toronto_list[0]
print("Shape of data frame:", toronto.shape)
toronto.head()

Shape of data frame: (20, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park(Ontario Provincial Government),M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [3]:
# Iterating through every column and row from the toronto dataframe
#then appending all the values to an empty list and transforming the list
#into a dataframe with a single column
Zlist=[]
cols=list(toronto)
for k, row in toronto.iterrows():
    for j in cols:
        colsObject=toronto[j][k]
        Zlist.append(colsObject)
dfToronto=pd.DataFrame(Zlist)

# Removing the rows with "Not assigned" boroughs and renaming the only column
dfToronto=dfToronto[~dfToronto[0].str.contains("Not assigned")]
dfToronto=dfToronto.reset_index(drop=True)
dfToronto=dfToronto.rename(columns={ 0:'Information'})

# Iterating through the rows of the Toronto dataframe, then creating a list with
#the first 3 characters of the string in the "Information" column,
#transforming the list to a dataframe and assigning the values to a new column
#of the dfToronto called "PostalCode"
Klist=[]
for k, row in dfToronto.iterrows():
        colsObject=row['Information'][:3]
        Klist.append(colsObject)
result1=pd.DataFrame(Klist)
dfToronto["PostalCode"]=result1

# Iterating through the rows of the Toronto dataframe again, then creating a list with
#beginning at the 4th character of the string in the "Information" column,
#transforming the list to a dataframe and assigning the values to a new column
#of the dfToronto called "Hood"
Wlist=[]
for k, row in dfToronto.iterrows():
        colsObject=row['Information'][3:]
        Wlist.append(colsObject)
result2=pd.DataFrame(Wlist)
dfToronto["Hood"]=result2

# Splitting the string in the "Hood" column based on the "(" character to two other
#columns named "Borough" and "Neighborhood". The part before the bracket will be in
#the "Borough" column and then in the "Neighborhood" column the ")" character was removed
dfToronto[['Borough','Neighborhood']]=dfToronto.Hood.str.split("(",1,expand=True)
dfToronto['Neighborhood']=dfToronto['Neighborhood'].str.replace(')','')
dfToronto['Neighborhood']=dfToronto['Neighborhood'].str.replace(' /',',')

# Replacing the name of some of the boroughs
dfToronto['Borough']=dfToronto['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                     'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                     'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                     'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

# Removing the columns that are not needed from the dataframe
dfToronto.drop(['Information', 'Hood'], axis='columns', inplace=True)
dfToronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [4]:
print("Shape of the dataframe:", dfToronto.shape)

Shape of the dataframe: (103, 3)


# Part 2
### Getting the latitude and longitude for each neighborhood by using the geocoder

In [5]:
postal_code = dfToronto['PostalCode']

# Empty lists that will store the coordinates information from the geocoder
latitude = []
longitude = []
n = 0

# Loop that keeps on trying to get the latitude and longitude for each neighborhood until it succeedes
while n < len(postal_code):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code[n]))
    lat_lng_coords = g.latlng
    print('The geograpical coordinate of {} are {}, {}.'.format(postal_code[n], lat_lng_coords[0], lat_lng_coords[1]))
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
    n = n + 1

dfToronto['Longitude'] = longitude
dfToronto['Latitude'] = latitude
dfToronto.head()

The geograpical coordinate of M3A are 43.75245000000007, -79.32990999999998.
The geograpical coordinate of M4A are 43.73057000000006, -79.31305999999995.
The geograpical coordinate of M5A are 43.65512000000007, -79.36263999999994.
The geograpical coordinate of M6A are 43.72327000000007, -79.45041999999995.
The geograpical coordinate of M7A are 43.66253000000006, -79.39187999999996.
The geograpical coordinate of M9A are 43.662630000000036, -79.52830999999998.
The geograpical coordinate of M1B are 43.811390000000074, -79.19661999999994.
The geograpical coordinate of M3B are 43.74923000000007, -79.36185999999998.
The geograpical coordinate of M4B are 43.70718000000005, -79.31191999999999.
The geograpical coordinate of M5B are 43.65739000000008, -79.37803999999994.
The geograpical coordinate of M6B are 43.70687000000004, -79.44811999999996.
The geograpical coordinate of M9B are 43.65034000000003, -79.55361999999997.
The geograpical coordinate of M1C are 43.78574000000003, -79.1587499999999

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-79.32991,43.75245
1,M4A,North York,Victoria Village,-79.31306,43.73057
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.36264,43.65512
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.45042,43.72327
4,M7A,Queen's Park,Ontario Provincial Government,-79.39188,43.66253


In [6]:
# Saving the dataframe to a CSV since the geocoder doesn't always return all the coordinates
dfToronto.to_csv("TorontoDfWithCoordinates")

# Part 3
### Analysing and clustering the neighborhoods of Toronto
####     I have decided to use Foursquare categories IDs that would return different dataframes for each of the categories Arts&Entertainment, Colleges, Restaurants, Night Life, Outdoors&Recreational, Government Buildings, Medical Centers, Spiritual Centers and Grocery Shops in the radius of 500m from the coordinates of each Toronto neighborhood.  

In [7]:
# Using the geopy library to find the coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="yyz_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347.


In [8]:
# Map of Toronto with the neighborhoods superimposed on it
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfToronto['Latitude'], dfToronto['Longitude'], dfToronto['Borough'], dfToronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color='mediumorchid',
        fill=True,
        fill_color='#66CDAA',
        fill_opacity=0.6,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [9]:
borough_count=dfToronto['Borough'].value_counts()
borough_count
print("Number of boroughs in Toronto:", len(borough_count))
print("Neighborhood count for each borough:", borough_count)

Number of boroughs in Toronto: 15
Neighborhood count for each borough: North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East Toronto               4
East York                  4
East Toronto Business      1
Mississauga                1
East York/East Toronto     1
Downtown Toronto Stn A     1
Etobicoke Northwest        1
Queen's Park               1
Name: Borough, dtype: int64


In [10]:
# Foursquare credentials
CLIENT_ID = 'T3THHYDPV3NSFOCGTSFMG2REPZK2L1FWNLZPUUTYHEIJDGIF' # your Foursquare ID
CLIENT_SECRET = 'UPBT2EZKN4L054EO2NR2I0RD5EHHZ1LMCOMXKF4WOUJCZHBW' # your Foursquare Secret
ACCESS_TOKEN = 'YZHHK4EQ4RNDT14K5ISYM4DBXSYJRO0C3HT0H0RDU2OVV1WA'
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: T3THHYDPV3NSFOCGTSFMG2REPZK2L1FWNLZPUUTYHEIJDGIF
CLIENT_SECRET:UPBT2EZKN4L054EO2NR2I0RD5EHHZ1LMCOMXKF4WOUJCZHBW


In [11]:
# IDs from Foursquare for the places that the analysis of the neighborhoods is based
id_ArtsEntertainment='4d4b7104d754a06370d81259'
id_Colleges='4d4b7105d754a06372d81259'
id_Restaurants='4d4b7105d754a06374d81259'
id_NightLife='4d4b7105d754a06376d81259'
id_OutdoorsRecreation='4d4b7105d754a06377d81259'
id_GovernmentBuilding='4bf58dd8d48988d126941735'
id_MedicalCenter='4bf58dd8d48988d104941735'
id_SpiritualCenter='4bf58dd8d48988d131941735'
id_FoodShop='4bf58dd8d48988d1f9941735'

In [12]:
# Function for retrieving the venues of each neighborhood, based on the coordinates of the 
#neighborhood and category ID of the venues, on a radius of 500m
def getNearbyVenues(names, latitudes, longitudes, categoryID, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        #API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng,
            categoryID,
            radius, 
            LIMIT)
            
        #GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #Appending to the venues list only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
# Dataframe consisting of the count of Arts&Entertainment venues in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Arts&Entertainment
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_ArtsEntertainment=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_ArtsEntertainment)
print("Shape of Toronto_ArtsEntertainment", Toronto_ArtsEntertainment.shape)

df_ArtsEntertainment=pd.DataFrame()
df_ArtsEntertainment['ArtsEntertainmentCount']=Toronto_ArtsEntertainment['Neighborhood'].value_counts()
df_ArtsEntertainment=df_ArtsEntertainment.reset_index()
df_ArtsEntertainment=df_ArtsEntertainment.rename(columns={'index':'Neighborhood'})
df_ArtsEntertainment.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,ArtsEntertainmentCount
0,Enclave of M4L,36
1,Enclave of L4W,36
2,Enclave of M5E,36
3,"Kensington Market, Chinatown, Grange Park",19
4,Studio District,16


In [14]:
# Dataframe consisting of the count of College venues in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Colleges
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_Colleges=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_Colleges)
print("Shape of Toronto_Colleges", Toronto_Colleges.shape)

df_Colleges=pd.DataFrame()
df_Colleges['CollegesCount']=Toronto_Colleges['Neighborhood'].value_counts()
df_Colleges=df_Colleges.reset_index()
df_Colleges=df_Colleges.rename(columns={'index':'Neighborhood'})
df_Colleges.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,CollegesCount
0,"University of Toronto, Harbord",81
1,Ontario Provincial Government,72
2,"Garden District, Ryerson",64
3,Central Bay Street,47
4,Enclave of M4L,15


In [15]:
# Dataframe consisting of the count of Restaurant venues in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Restaurants
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_Restaurants=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_Restaurants)
print("Shape of Toronto_Restaurants", Toronto_Restaurants.shape)

df_Restaurants=pd.DataFrame()
df_Restaurants['RestaurantsCount']=Toronto_Restaurants['Neighborhood'].value_counts()
df_Restaurants=df_Restaurants.reset_index()
df_Restaurants=df_Restaurants.rename(columns={'index':'Neighborhood'})
df_Restaurants.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,RestaurantsCount
0,"Richmond, Adelaide, King",100
1,"Commerce Court, Victoria Hotel",100
2,"First Canadian Place, Underground city",100
3,Enclave of M4L,100
4,Enclave of L4W,100


In [16]:
# Dataframe consisting of the count of Night Life venues in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Night Life
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_NightLife=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_NightLife)
print("Shape of Toronto_NightLife", Toronto_NightLife.shape)

df_NightLife=pd.DataFrame()
df_NightLife['NightLifeCount']=Toronto_NightLife['Neighborhood'].value_counts()
df_NightLife=df_NightLife.reset_index()
df_NightLife=df_NightLife.rename(columns={'index':'Neighborhood'})
df_NightLife.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,NightLifeCount
0,"Commerce Court, Victoria Hotel",58
1,Enclave of L4W,50
2,Enclave of M5E,50
3,Enclave of M4L,50
4,"Toronto Dominion Centre, Design Exchange",48


In [17]:
# Dataframe consisting of the count of Outdoors&Recreational venues in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Outdoors&Recreation venues
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_OutdoorsRecreation=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_OutdoorsRecreation)
print("Shape of Toronto_OutdoorsRecreation", Toronto_OutdoorsRecreation.shape)

df_OutdoorsRecreation=pd.DataFrame()
df_OutdoorsRecreation['OutdoorsRecreationCount']=Toronto_OutdoorsRecreation['Neighborhood'].value_counts()
df_OutdoorsRecreation=df_OutdoorsRecreation.reset_index()
df_OutdoorsRecreation=df_OutdoorsRecreation.rename(columns={'index':'Neighborhood'})
df_OutdoorsRecreation.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,OutdoorsRecreationCount
0,"Harbourfront East, Union Station, Toronto Islands",40
1,Enclave of M4L,39
2,Enclave of M5E,39
3,Enclave of L4W,39
4,"Richmond, Adelaide, King",37


In [18]:
# Dataframe consisting of the count of Government Buildings in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Government Buildings
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_GovernmentBuilding=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_GovernmentBuilding)
print("Shape of Toronto_GovernmentBuilding", Toronto_GovernmentBuilding.shape)

df_GovernmentBuilding=pd.DataFrame()
df_GovernmentBuilding['GovernmentBuildingCount']=Toronto_GovernmentBuilding['Neighborhood'].value_counts()
df_GovernmentBuilding=df_GovernmentBuilding.reset_index()
df_GovernmentBuilding=df_GovernmentBuilding.rename(columns={'index':'Neighborhood'})
df_GovernmentBuilding.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,GovernmentBuildingCount
0,Central Bay Street,30
1,"Richmond, Adelaide, King",23
2,Enclave of L4W,21
3,Enclave of M5E,21
4,Enclave of M4L,21


In [19]:
# Dataframe consisting of the count of Medical Centers in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Medical Center venues
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_MedicalCenter=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_MedicalCenter)
print("Shape of Toronto_MedicalCenter", Toronto_MedicalCenter.shape)

df_MedicalCenter=pd.DataFrame()
df_MedicalCenter['MedicalCenterCount']=Toronto_MedicalCenter['Neighborhood'].value_counts()
df_MedicalCenter=df_MedicalCenter.reset_index()
df_MedicalCenter=df_MedicalCenter.rename(columns={'index':'Neighborhood'})
df_MedicalCenter.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,MedicalCenterCount
0,"First Canadian Place, Underground city",44
1,"Richmond, Adelaide, King",44
2,"Commerce Court, Victoria Hotel",42
3,"Toronto Dominion Centre, Design Exchange",41
4,St. James Town,39


In [20]:
# Dataframe consisting of the count of Spiritual Centers in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Spiritual Center venues
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_SpiritualCenter=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_SpiritualCenter)
print("Shape of Toronto_SpiritualCenter", Toronto_SpiritualCenter.shape)

df_SpiritualCenter=pd.DataFrame()
df_SpiritualCenter['SpiritualCenterCount']=Toronto_SpiritualCenter['Neighborhood'].value_counts()
df_SpiritualCenter=df_SpiritualCenter.reset_index()
df_SpiritualCenter=df_SpiritualCenter.rename(columns={'index':'Neighborhood'})
df_SpiritualCenter.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,SpiritualCenterCount
0,St. James Town,8
1,"Garden District, Ryerson",7
2,WillowdaleSouth,7
3,"Brockton, Parkdale Village, Exhibition Place",6
4,"Little Portugal, Trinity",6


In [21]:
# Dataframe consisting of the count of Grocery Stores in the neighborhoods, resulted 
#by using the getNearbyVenues function with the Foursquare category ID for Grocery Stores
#then assigning to a new column the number of venues returned for each neighborhood
Toronto_FoodShop=getNearbyVenues(names=dfToronto['Neighborhood'], latitudes=dfToronto['Latitude'], longitudes=dfToronto['Longitude'], categoryID=id_FoodShop)
print("Shape of Toronto_FoodShop", Toronto_FoodShop.shape)

df_FoodShop=pd.DataFrame()
df_FoodShop['FoodShopCount']=Toronto_FoodShop['Neighborhood'].value_counts()
df_FoodShop=df_FoodShop.reset_index()
df_FoodShop=df_FoodShop.rename(columns={'index':'Neighborhood'})
df_FoodShop.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don MillsNorth
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don MillsSouth(Flemingdon Park
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
DownsviewEast 

Unnamed: 0,Neighborhood,FoodShopCount
0,Berczy Park,36
1,"Kensington Market, Chinatown, Grange Park",32
2,Central Bay Street,30
3,"First Canadian Place, Underground city",26
4,"Richmond, Adelaide, King",26


In [22]:
# Merging the categories dataframes with the Toronto dataframe and replacing the NaN values with 0
merged_dfToronto=pd.merge(left=dfToronto, right=df_ArtsEntertainment, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_Colleges, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_Restaurants, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_NightLife, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_OutdoorsRecreation, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_GovernmentBuilding, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_MedicalCenter, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_SpiritualCenter, how='left', left_on='Neighborhood', right_on='Neighborhood')
merged_dfToronto=pd.merge(left=merged_dfToronto, right=df_FoodShop, how='left', left_on='Neighborhood', right_on='Neighborhood')

merged_dfToronto=merged_dfToronto.fillna(0)
merged_dfToronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,M3A,North York,Parkwoods,-79.32991,43.75245,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0,1.0
1,M4A,North York,Victoria Village,-79.31306,43.73057,0.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.36264,43.65512,4.0,4.0,13.0,6.0,8.0,3.0,1.0,5.0,5.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.45042,43.72327,8.0,1.0,32.0,3.0,2.0,1.0,5.0,1.0,4.0
4,M7A,Queen's Park,Ontario Provincial Government,-79.39188,43.66253,4.0,72.0,16.0,4.0,7.0,17.0,20.0,3.0,2.0


In [23]:
# Creating a new dataframe that will have only the neighborhoods and categories columns
new_Torontodf=merged_dfToronto
new_Torontodf.drop(['PostalCode', 'Borough', 'Longitude', 'Latitude'], axis='columns', inplace=True)
new_Torontodf.head()

Unnamed: 0,Neighborhood,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,Parkwoods,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0,1.0
1,Victoria Village,0.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0
2,"Regent Park, Harbourfront",4.0,4.0,13.0,6.0,8.0,3.0,1.0,5.0,5.0
3,"Lawrence Manor, Lawrence Heights",8.0,1.0,32.0,3.0,2.0,1.0,5.0,1.0,4.0
4,Ontario Provincial Government,4.0,72.0,16.0,4.0,7.0,17.0,20.0,3.0,2.0


In [24]:
# Creating a dataframe that will store the neighborhood values from the Toronto dataframe
df_neigh=pd.DataFrame()
df_neigh['Neighborhood']=new_Torontodf['Neighborhood']
df_neigh.head()

Unnamed: 0,Neighborhood
0,Parkwoods
1,Victoria Village
2,"Regent Park, Harbourfront"
3,"Lawrence Manor, Lawrence Heights"
4,Ontario Provincial Government


In [25]:
# Creating a new dataframe that has only the columns with category counts
df_cat=pd.DataFrame()
df_cat=new_Torontodf
df_cat.drop(['Neighborhood'], axis='columns', inplace=True)
df_cat.head()

Unnamed: 0,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0,1.0
1,0.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0
2,4.0,4.0,13.0,6.0,8.0,3.0,1.0,5.0,5.0
3,8.0,1.0,32.0,3.0,2.0,1.0,5.0,1.0,4.0
4,4.0,72.0,16.0,4.0,7.0,17.0,20.0,3.0,2.0


In [26]:
# Normalizing the values of the venue category counts for more accurate clustering results
norm=pd.DataFrame(df_cat)
min_max_scaler=preprocessing.MinMaxScaler()
x_scaled=min_max_scaler.fit_transform(norm)
Cat_normalized=pd.DataFrame(x_scaled, columns=norm.columns)
Cat_normalized.head()

Unnamed: 0,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,0.0,0.0,0.02,0.0,0.05,0.0,0.022727,0.0,0.027778
1,0.0,0.0,0.02,0.0,0.025,0.0,0.045455,0.125,0.027778
2,0.111111,0.049383,0.13,0.103448,0.2,0.1,0.022727,0.625,0.138889
3,0.222222,0.012346,0.32,0.051724,0.05,0.033333,0.113636,0.125,0.111111
4,0.111111,0.888889,0.16,0.068966,0.175,0.566667,0.454545,0.375,0.055556


In [27]:
# Remaking the dataframe of Toronto neighborhoods and the normalized venue categories columns
merged_dfT=pd.merge(left=df_neigh, right=Cat_normalized, how='left', left_on=df_neigh.index, right_on=Cat_normalized.index)
merged_dfT.drop(['key_0'], axis='columns', inplace=True)
merged_dfT.head()

Unnamed: 0,Neighborhood,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,Parkwoods,0.0,0.0,0.02,0.0,0.05,0.0,0.022727,0.0,0.027778
1,Victoria Village,0.0,0.0,0.02,0.0,0.025,0.0,0.045455,0.125,0.027778
2,"Regent Park, Harbourfront",0.111111,0.049383,0.13,0.103448,0.2,0.1,0.022727,0.625,0.138889
3,"Lawrence Manor, Lawrence Heights",0.222222,0.012346,0.32,0.051724,0.05,0.033333,0.113636,0.125,0.111111
4,Ontario Provincial Government,0.111111,0.888889,0.16,0.068966,0.175,0.566667,0.454545,0.375,0.055556


In [28]:
# Function for most common venues
def return_most_common_venues(row, top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:top_venues]

In [29]:
# Sorting the venue categories in each neighborhood
top_venues = 9
indicators = ['st', 'nd', 'rd']

# Create columns according to the number of venue categories
columns = ['Neighborhood']
for ind in np.arange(top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# New dataframe for sorted venues
Toronto_venues_sorted = pd.DataFrame(columns=columns)
Toronto_venues_sorted['Neighborhood'] = merged_dfT['Neighborhood']

for ind in np.arange(merged_dfT.shape[0]):
    Toronto_venues_sorted.iloc[ind, 1:] = return_most_common_venues(merged_dfT.iloc[ind, :], top_venues)

Toronto_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
0,Parkwoods,OutdoorsRecreationCount,FoodShopCount,MedicalCenterCount,RestaurantsCount,SpiritualCenterCount,GovernmentBuildingCount,NightLifeCount,CollegesCount,ArtsEntertainmentCount
1,Victoria Village,SpiritualCenterCount,MedicalCenterCount,FoodShopCount,OutdoorsRecreationCount,RestaurantsCount,GovernmentBuildingCount,NightLifeCount,CollegesCount,ArtsEntertainmentCount
2,"Regent Park, Harbourfront",SpiritualCenterCount,OutdoorsRecreationCount,FoodShopCount,RestaurantsCount,ArtsEntertainmentCount,NightLifeCount,GovernmentBuildingCount,CollegesCount,MedicalCenterCount
3,"Lawrence Manor, Lawrence Heights",RestaurantsCount,ArtsEntertainmentCount,SpiritualCenterCount,MedicalCenterCount,FoodShopCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,CollegesCount
4,Ontario Provincial Government,CollegesCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,OutdoorsRecreationCount,RestaurantsCount,ArtsEntertainmentCount,NightLifeCount,FoodShopCount


In [30]:
# Setting the number of clusters and performing the clustering operation
kclusters = 4
Toronto_grouped_clustering = merged_dfT.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=20).fit(Toronto_grouped_clustering)

In [31]:
# Adding clustering labels to the dataframe and merging dfToronto so we can add
#the coordinates of each neighborhood
merged_dfT.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_merged = dfToronto
Toronto_merged = Toronto_merged.join(merged_dfT.set_index('Neighborhood'), on='Neighborhood')
Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude,Cluster Labels,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,M3A,North York,Parkwoods,-79.32991,43.75245,2,0.0,0.0,0.02,0.0,0.05,0.0,0.022727,0.0,0.027778
1,M4A,North York,Victoria Village,-79.31306,43.73057,2,0.0,0.0,0.02,0.0,0.025,0.0,0.045455,0.125,0.027778
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.36264,43.65512,0,0.111111,0.049383,0.13,0.103448,0.2,0.1,0.022727,0.625,0.138889
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.45042,43.72327,2,0.222222,0.012346,0.32,0.051724,0.05,0.033333,0.113636,0.125,0.111111
4,M7A,Queen's Park,Ontario Provincial Government,-79.39188,43.66253,0,0.111111,0.888889,0.16,0.068966,0.175,0.566667,0.454545,0.375,0.055556


In [32]:
# Cheching how many neighborhoods are in each cluster
Toronto_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood,Longitude,Latitude,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,26,26,26,26,26,26,26,26,26,26,26,26,26,26
1,9,9,9,9,9,9,9,9,9,9,9,9,9,9
2,59,59,59,59,59,59,59,59,59,59,59,59,59,59
3,9,9,9,9,9,9,9,9,9,9,9,9,9,9


In [33]:
# Creating a label dataframe for each cluster and viewing the data in the cluster
Label0=Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]
Label0.head()

Unnamed: 0,Neighborhood,Cluster Labels,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
2,"Regent Park, Harbourfront",0,0.111111,0.049383,0.13,0.103448,0.2,0.1,0.022727,0.625,0.138889
4,Ontario Provincial Government,0,0.111111,0.888889,0.16,0.068966,0.175,0.566667,0.454545,0.375,0.055556
10,Glencairn,0,0.0,0.0,0.13,0.051724,0.0,0.033333,0.090909,0.375,0.055556
14,Woodbine Heights,0,0.111111,0.0,0.08,0.051724,0.1,0.0,0.204545,0.625,0.111111
17,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",0,0.0,0.012346,0.03,0.0,0.125,0.0,0.113636,0.5,0.027778


In [34]:
# Creating a label dataframe for each cluster and viewing the data in the cluster
Label1=Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]
Label1.head()

Unnamed: 0,Neighborhood,Cluster Labels,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
15,St. James Town,1,0.333333,0.160494,0.6,0.637931,0.75,0.433333,0.886364,1.0,0.583333
24,Central Bay Street,1,0.333333,0.580247,0.49,0.310345,0.8,1.0,0.840909,0.375,0.833333
30,"Richmond, Adelaide, King",1,0.277778,0.160494,1.0,0.706897,0.925,0.766667,1.0,0.25,0.722222
42,"Toronto Dominion Centre, Design Exchange",1,0.305556,0.08642,1.0,0.827586,0.825,0.333333,0.931818,0.25,0.694444
48,"Commerce Court, Victoria Hotel",1,0.277778,0.135802,1.0,1.0,0.75,0.433333,0.954545,0.375,0.694444


In [35]:
# Creating a label dataframe for each cluster and viewing the data in the cluster
Label2=Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]
Label2.head()

Unnamed: 0,Neighborhood,Cluster Labels,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
0,Parkwoods,2,0.0,0.0,0.02,0.0,0.05,0.0,0.022727,0.0,0.027778
1,Victoria Village,2,0.0,0.0,0.02,0.0,0.025,0.0,0.045455,0.125,0.027778
3,"Lawrence Manor, Lawrence Heights",2,0.222222,0.012346,0.32,0.051724,0.05,0.033333,0.113636,0.125,0.111111
5,Islington Avenue,2,0.055556,0.0,0.01,0.0,0.1,0.033333,0.045455,0.25,0.055556
6,"Malvern, Rouge",2,0.027778,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Creating a label dataframe for each cluster and viewing the data in the cluster
Label3=Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]
Label3.head()

Unnamed: 0,Neighborhood,Cluster Labels,ArtsEntertainmentCount,CollegesCount,RestaurantsCount,NightLifeCount,OutdoorsRecreationCount,GovernmentBuildingCount,MedicalCenterCount,SpiritualCenterCount,FoodShopCount
9,"Garden District, Ryerson",3,0.305556,0.790123,0.9,0.5,0.575,0.266667,0.545455,0.875,0.472222
20,Berczy Park,3,0.25,0.061728,0.55,0.5,0.325,0.133333,0.181818,0.0,1.0
36,"Harbourfront East, Union Station, Toronto Islands",3,0.333333,0.024691,0.38,0.465517,1.0,0.133333,0.272727,0.125,0.5
37,"Little Portugal, Trinity",3,0.361111,0.061728,0.41,0.37931,0.325,0.0,0.045455,0.75,0.305556
43,"Brockton, Parkdale Village, Exhibition Place",3,0.305556,0.074074,0.51,0.482759,0.35,0.133333,0.136364,0.75,0.222222


In [37]:
# Creating dataframes with the mean category values of each cluster
L0=Label0.mean()
L0=L0.to_frame().reset_index().rename(columns={0:'Label 0'}, inplace=False)

L1=Label1.mean()
L1=L1.to_frame().reset_index().rename(columns={0:'Label 1'}, inplace=False)

L2=Label2.mean()
L2=L2.to_frame().reset_index().rename(columns={0:'Label 2'}, inplace=False)

L3=Label3.mean()
L3=L3.to_frame().reset_index().rename(columns={0:'Label 3'}, inplace=False)

In [38]:
# Merging the mean values dataframes so the cluster information can be easily compared
labels_df=pd.merge(left=L0, right=L1, how='left', left_on='index', right_on='index')
labels_df=pd.merge(left=labels_df, right=L2, how='left', left_on='index', right_on='index')
labels_df=pd.merge(left=labels_df, right=L3, how='left', left_on='index', right_on='index')
labels_df=labels_df.drop(index=0)
labels_df

Unnamed: 0,index,Label 0,Label 1,Label 2,Label 3
1,ArtsEntertainmentCount,0.08547,0.537037,0.020245,0.290123
2,CollegesCount,0.05603,0.198903,0.007951,0.253772
3,RestaurantsCount,0.12,0.898889,0.038475,0.526667
4,NightLifeCount,0.047082,0.766284,0.010228,0.425287
5,OutdoorsRecreationCount,0.110577,0.863889,0.058898,0.494444
6,GovernmentBuildingCount,0.061538,0.62963,0.017514,0.118519
7,MedicalCenterCount,0.113636,0.881313,0.053159,0.239899
8,SpiritualCenterCount,0.471154,0.402778,0.088983,0.5
9,FoodShopCount,0.097222,0.62037,0.035311,0.459877


In [39]:
# Neighborhoods with significant amounts of Spiritual Centers, Outdoors&Recreational and
#Medical Center venues, few NighLife and College venues.
# I would categorize this cluster as Suburban areas/neighborhoods
labels_df.sort_values('Label 0', ascending=False)

Unnamed: 0,index,Label 0,Label 1,Label 2,Label 3
8,SpiritualCenterCount,0.471154,0.402778,0.088983,0.5
3,RestaurantsCount,0.12,0.898889,0.038475,0.526667
7,MedicalCenterCount,0.113636,0.881313,0.053159,0.239899
5,OutdoorsRecreationCount,0.110577,0.863889,0.058898,0.494444
9,FoodShopCount,0.097222,0.62037,0.035311,0.459877
1,ArtsEntertainmentCount,0.08547,0.537037,0.020245,0.290123
6,GovernmentBuildingCount,0.061538,0.62963,0.017514,0.118519
2,CollegesCount,0.05603,0.198903,0.007951,0.253772
4,NightLifeCount,0.047082,0.766284,0.010228,0.425287


In [40]:
# Neighborhoods with many Restaurants, NightLife and Outdoors&Recreational
#venues, also significant amounts of Government Buildings
# I would categorize this cluster as City Center/Downtown neighborhoods
labels_df.sort_values('Label 1', ascending=False)

Unnamed: 0,index,Label 0,Label 1,Label 2,Label 3
3,RestaurantsCount,0.12,0.898889,0.038475,0.526667
7,MedicalCenterCount,0.113636,0.881313,0.053159,0.239899
5,OutdoorsRecreationCount,0.110577,0.863889,0.058898,0.494444
4,NightLifeCount,0.047082,0.766284,0.010228,0.425287
6,GovernmentBuildingCount,0.061538,0.62963,0.017514,0.118519
9,FoodShopCount,0.097222,0.62037,0.035311,0.459877
1,ArtsEntertainmentCount,0.08547,0.537037,0.020245,0.290123
8,SpiritualCenterCount,0.471154,0.402778,0.088983,0.5
2,CollegesCount,0.05603,0.198903,0.007951,0.253772


In [41]:
# Neighborhoods with significant amounts of Spiritual Centers, Restaurants and
#Outdoors&Recreational venues, few NighLife and College venues.
# I would categorize this cluster as dense Residential areas in the city
labels_df.sort_values('Label 2', ascending=False)

Unnamed: 0,index,Label 0,Label 1,Label 2,Label 3
8,SpiritualCenterCount,0.471154,0.402778,0.088983,0.5
5,OutdoorsRecreationCount,0.110577,0.863889,0.058898,0.494444
7,MedicalCenterCount,0.113636,0.881313,0.053159,0.239899
3,RestaurantsCount,0.12,0.898889,0.038475,0.526667
9,FoodShopCount,0.097222,0.62037,0.035311,0.459877
1,ArtsEntertainmentCount,0.08547,0.537037,0.020245,0.290123
6,GovernmentBuildingCount,0.061538,0.62963,0.017514,0.118519
4,NightLifeCount,0.047082,0.766284,0.010228,0.425287
2,CollegesCount,0.05603,0.198903,0.007951,0.253772


In [42]:
# Neighborhoods with many Restaurants, NightLife and Outdoors&Recreational
#venues, not significant amounts of Government Buildings
# I would categorize this cluster as close to Downtown neighborhoods
labels_df.sort_values('Label 3', ascending=False)

Unnamed: 0,index,Label 0,Label 1,Label 2,Label 3
3,RestaurantsCount,0.12,0.898889,0.038475,0.526667
8,SpiritualCenterCount,0.471154,0.402778,0.088983,0.5
5,OutdoorsRecreationCount,0.110577,0.863889,0.058898,0.494444
9,FoodShopCount,0.097222,0.62037,0.035311,0.459877
4,NightLifeCount,0.047082,0.766284,0.010228,0.425287
1,ArtsEntertainmentCount,0.08547,0.537037,0.020245,0.290123
2,CollegesCount,0.05603,0.198903,0.007951,0.253772
7,MedicalCenterCount,0.113636,0.881313,0.053159,0.239899
6,GovernmentBuildingCount,0.061538,0.62963,0.017514,0.118519


In [43]:
#Creating the Toronto map with the neighborhoods clusters imposed on top
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Markers for the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters