This note book is to analyse the neighborhoods of Toronto.

# Part 1

In [1]:
# importing Libraries
import pandas as pd
import numpy as np

import urllib # Open URL and to fetch the HTML from the URL specified
from bs4 import BeautifulSoup # Extract data from the HTML, parse HTML and XML documents

In [2]:
# specify URL of the page
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# open the url using urllib.request and save the HTML into the page variable
page = urllib.request.urlopen(url)

In [3]:
# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(page, "lxml")

# Contents of the page
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XptG7wpAMNAAAUaw2@sAAAES","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":951325562,"wgRevisionId":951325562,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario

In [4]:
# get title of the table
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [5]:
#Get only the sting part of the title
table_title = soup.title.string

In [6]:
# use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and save 
all_tables=soup.find_all("table")
all_tables

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postal code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park / Harbourfront
 </td></tr>
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor / Lawrence Heights
 </td></tr>
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park / Ontario Provincial Government
 </td></tr>
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue
 </td></tr>
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malvern / Rouge
 </td></tr>
 <tr>
 <td>M2B
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>
 <tr>

In [7]:
pcode_table=soup.find('table', class_='wikitable sortable')
pcode_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>Ea

In [8]:
# Find the th tag to obtain the headers
H = []

for head in pcode_table.find_all("th"):
    H.append(head.find(text=True).strip())


In [9]:
# Set up 3 empty lists since there are 3 coulumns
P = [] # postal code
B = [] # borough
N = [] # neighborhood

# Populate the lists with the available respective data
for rdata in pcode_table.find_all("tr"):
    cdata = rdata.find_all("td")
    if len(cdata)==3:
        P.append(cdata[0].find(text=True).strip())
        B.append(cdata[1].find(text=True).strip())
        N.append(cdata[2].find(text=True).strip())



In [10]:
df_TOR_comp = pd.DataFrame([P,B,N],index=H).transpose()
df_TOR_comp.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


If more than one neighborhood has the same postal code, it is already indicated in the table in wikipedia with a "/" .

In [11]:
#formatting the neighborhood column
df_TOR_comp["Neighborhood"] = df_TOR_comp["Neighborhood"].str.replace("/",",")
df_TOR_comp.head(10)


Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern , Rouge"


In [12]:
# checking total number of rows/postal codes
df_TOR_comp.shape

(180, 3)

In [13]:
# verifying boroughs not assigned and assigned
df_TOR_comp["Borough"].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Name: Borough, dtype: int64

Looking at the value counts of Borough column, 77 are not assigned, which means total assigned = 180-77= 103.

In [14]:
# ignoring boroughs not assigned 
df_TOR_B = df_TOR_comp[df_TOR_comp.Borough != "Not assigned"].reset_index(drop=True)

# reset index for modified data frame
#df_TOR_B.drop(["index"], axis=1, inplace=True)
df_TOR_B.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [15]:
#checking if any assigned boroughs have no neighborhood assigned
df_TOR_noN = df_TOR_B[df_TOR_B.Neighborhood == "Not assigned"] | df_TOR_B[df_TOR_B.Neighborhood == " "] | df_TOR_B[df_TOR_B.Neighborhood == ""]

# checking the number of unassigned neighborhoods
df_TOR_noN.head()

Unnamed: 0,Postal code,Borough,Neighborhood


So it is seen that there are 0 boroughs with no neighborhoods assigned.

In [16]:
# creating a toronto borough dataframe
df_TOR = df_TOR_B

df_TOR.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [17]:
df_TOR.shape

(103, 3)

# Part 2

In [18]:
postal_code = df_TOR["Postal code"]

import geocoder # import geocoder

#initialize your variable to None
lat_lng_coords = None

#loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

The geocoder didn't fetch the required co-ordinates after many attempts. so taking data from csv file.

In [19]:
import wget as wget
url = "https://cocl.us/Geospatial_data"

wget.download(url)

'Geospatial_Coordinates (10).csv'

In [20]:
df_TOR_latlon = pd.read_csv("Geospatial_Coordinates.csv")
df_TOR_latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Create a latitude and longitude list
lat = list(np.zeros((103,1)))
long = list(np.zeros((103,1)))

# Assign appropriate latitude and logitude depending on postal code in main Toronto dataframe
for id1, PCode in enumerate(df_TOR_latlon["Postal Code"]):
    for id2, pcode in enumerate(df_TOR["Postal code"]):
        if pcode == PCode:
            lat[id2]=(df_TOR_latlon.loc[id1,"Latitude"])
            long[id2]=(df_TOR_latlon.loc[id1,"Longitude"])

In [22]:
df_TOR["Latitude"]=lat
df_TOR["Longitude"]=long
df_TOR.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Part 3

In [23]:
# importing libraries

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [24]:
# grouping boroughs
df_boroughs = df_TOR.sort_values(["Borough"], ascending=True)
df_boroughs.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
86,M4V,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049
79,M4S,Central Toronto,Davisville,43.704324,-79.38879
83,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
68,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
74,M5R,Central Toronto,"The Annex , North Midtown , Yorkville",43.67271,-79.405678


In [25]:
print("The dataframe has {} boroughs".format(len(df_boroughs["Borough"].unique()) ))

The dataframe has 10 boroughs


In [26]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [27]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_boroughs['Latitude'], df_boroughs['Longitude'], df_boroughs['Borough'], df_boroughs['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Segmenting and clustering of neighborhoods only in Central Toronto will be done first and later the same analysis will be applied to all boroughs containing "Toronto". I am choosing the same as in the assignment because this gives a lager data frame to get better understanding.

In [28]:
# Dataframe for Central Toronto
df_cenTor = df_boroughs[df_boroughs["Borough"]=="Central Toronto"].reset_index(drop=True)
df_cenTor

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M4V,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049
1,M4S,Central Toronto,Davisville,43.704324,-79.38879
2,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
4,M5R,Central Toronto,"The Annex , North Midtown , Yorkville",43.67271,-79.405678
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [29]:
# geographical co-ordinates of Central Toronto
address = 'Central Toronto, ON'

geolocator = Nominatim(user_agent="centor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.


In [30]:
#creating a map of Central Toronto Borough
map_centoronto = folium.Map(location=[latitude+0.02, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_cenTor['Latitude'], df_cenTor['Longitude'], df_cenTor['Borough'], df_cenTor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_centoronto)  
    
map_centoronto

Define Foursquare Credentials and Versions

In [31]:
CLIENT_ID = 'xxx' # your Foursquare ID
CLIENT_SECRET = 'xxx' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: xxx
CLIENT_SECRET:Pxxx


Exploring first neighborhood in Central Toronto. 

In [32]:
nh_name = df_cenTor.loc[0,"Neighborhood"] # Postal Code of the first neighborhood
nh_latitude = df_cenTor.loc[0,"Latitude"] # Latitude of first neighborhood
nh_longitude = df_cenTor.loc[0,"Longitude"] # Longitude of first neighborhood

print("The {} neighborhood in Central Toronto has a latitude {} and longitude {}".format(nh_name, nh_latitude, nh_longitude))

The Summerhill West , Rathnelly , South Hill , Forest Hill SE , Deer Park neighborhood in Central Toronto has a latitude 43.68641229999999 and longitude -79.4000493


To get the top 10 venues within a radius of 500m:

In [33]:
# Creating the get request URL 
radius=500
LIMIT=100
url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
CLIENT_ID,CLIENT_SECRET,VERSION,nh_latitude, nh_longitude, radius,LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?client_id=xxx&client_secret=xxx&v=20180605&ll=43.68641229999999,-79.4000493&radius=500&limit=100'

In [34]:
# obtaining the json file
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ea399d1bae9a2001bd17999'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Deer Park',
  'headerFullLocation': 'Deer Park, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 16,
  'suggestedBounds': {'ne': {'lat': 43.690912304499996,
    'lng': -79.39383797359734},
   'sw': {'lat': 43.68191229549999, 'lng': -79.40626062640267}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5a67afb973fe2528841f60f3',
       'name': 'The Market By Longo’s',
       'location': {'address': '111 St Clair Ave W',
        'lat': 43.686711,
        'lng': -79.399536,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.686711,


In [35]:
# required info is in the ITEMS key. Using get_category_type function from foursquare lab
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Extracting required data from json

In [36]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,The Market By Longo’s,Supermarket,43.686711,-79.399536
1,LCBO,Liquor Store,43.686991,-79.399238
2,Daeco Sushi,Sushi Restaurant,43.687838,-79.395652
3,Mary Be Kitchen,Restaurant,43.687708,-79.395062
4,Union Social Eatery,American Restaurant,43.687895,-79.394916
5,Starbucks,Coffee Shop,43.686756,-79.398292
6,Tim Hortons,Coffee Shop,43.687682,-79.39684
7,Fionn MacCool's,Pub,43.687921,-79.394783
8,RBC Royal Bank,Bank,43.688062,-79.395001
9,Raiders E-Sports Centre,Sports Bar,43.687683,-79.395944


In [37]:
# Repeating the process for all neighborhoods (postal codes) containing "Toronto"
df_boroughsTor = df_boroughs[df_boroughs["Borough"].str.contains("Toronto")]
df_boroughsTor.reset_index(drop=True)
df_boroughsTor["Borough"].unique()

array(['Central Toronto', 'Downtown Toronto', 'East Toronto',
       'West Toronto'], dtype=object)

In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [39]:
boroughsTor_venues = getNearbyVenues(names=df_boroughsTor['Neighborhood'],
                                   latitudes=df_boroughsTor['Latitude'],
                                   longitudes=df_boroughsTor['Longitude']
                                  )

Summerhill West , Rathnelly , South Hill , Forest Hill SE , Deer Park
Davisville
Moore Park , Summerhill East
Forest Hill North & West
The Annex , North Midtown , Yorkville
Davisville North
Roselawn
North Toronto West
Lawrence Park
Berczy Park
Christie
Central Bay Street
First Canadian Place , Underground city
Toronto Dominion Centre , Design Exchange
Church and Wellesley
University of Toronto , Harbord
Queen's Park , Ontario Provincial Government
Rosedale
Harbourfront East , Union Station , Toronto Islands
Stn A PO Boxes
St. James Town
Richmond , Adelaide , King
Regent Park , Harbourfront
Commerce Court , Victoria Hotel
St. James Town , Cabbagetown
Garden District, Ryerson
CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport
Kensington Market , Chinatown , Grange Park
The Beaches
The Danforth West , Riverdale
India Bazaar , The Beaches West
Studio District
Business reply mail Processing CentrE
Brockton , Parkdale Village , Ex

Map of Toronto to view the boroughs of Central, Downtown, East and West Toronto

In [40]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [41]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude+0.01, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_boroughsTor['Latitude'], df_boroughsTor['Longitude'], df_boroughsTor['Borough'], df_boroughsTor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [42]:
#size of resulting dataframe
print(boroughsTor_venues.shape)
boroughsTor_venues.head()

(1612, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,The Market By Longo’s,43.686711,-79.399536,Supermarket
1,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,LCBO,43.686991,-79.399238,Liquor Store
2,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,Daeco Sushi,43.687838,-79.395652,Sushi Restaurant
3,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,Mary Be Kitchen,43.687708,-79.395062,Restaurant
4,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,Union Social Eatery,43.687895,-79.394916,American Restaurant


Venues in each Neighborhood

In [43]:
boroughsTor_venues.groupby("Neighborhood").count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
"Brockton , Parkdale Village , Exhibition Place",22,22,22,22,22,22
Business reply mail Processing CentrE,18,18,18,18,18,18
"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",18,18,18,18,18,18
Central Bay Street,63,63,63,63,63,63
Christie,18,18,18,18,18,18
Church and Wellesley,72,72,72,72,72,72
"Commerce Court , Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,6,6,6,6,6,6


Unique categories from all the returned vanues

In [44]:
print('There are {} uniques categories.'.format(len(boroughsTor_venues['Venue Category'].unique())))

There are 231 uniques categories.


Analyze each neighborhood

In [45]:
# extracting the different types of venues using one hot coding
df_boroughsTor_ohc = pd.get_dummies(boroughsTor_venues["Venue Category"])

#adding neighborhood column
df_boroughsTor_ohc["Neighborhood"] = boroughsTor_venues["Neighborhood"]

#moving neighborhood to 1st column
df_boroughsTor_ohc.set_index("Neighborhood", inplace=True)
df_boroughsTor_ohc.reset_index(inplace=True)

df_boroughsTor_ohc.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Summerhill West , Rathnelly , South Hill , For...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Summerhill West , Rathnelly , South Hill , For...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Summerhill West , Rathnelly , South Hill , For...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Summerhill West , Rathnelly , South Hill , For...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Summerhill West , Rathnelly , South Hill , For...",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Checking the size of the dataframe

In [46]:
df_boroughsTor_ohc.shape

(1612, 231)

In [47]:
# Grouping the rows by neighborhood and taking the mean of frequency of occurence of each category
df_boroughsTor_ohc_grp = df_boroughsTor_ohc.groupby("Neighborhood").mean().reset_index()

In [48]:
df_boroughsTor_ohc_grp.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
3,"CN Tower , King and Spadina , Railway Lands , ...",0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.015873


In [49]:
df_boroughsTor_ohc_grp.shape

(39, 231)

In [50]:
# Sort venues in descending order of occurance
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Dataframe to display top 10 venues in each neighborhood

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
boroughsTor_venues_sorted = pd.DataFrame(columns=columns)
boroughsTor_venues_sorted['Neighborhood'] = df_boroughsTor_ohc_grp['Neighborhood']

for ind in np.arange(df_boroughsTor_ohc_grp.shape[0]):
    boroughsTor_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_boroughsTor_ohc_grp.iloc[ind, :], num_top_venues)

boroughsTor_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Italian Restaurant,Beer Bar,Restaurant,Café,Cheese Shop,Bakery,Seafood Restaurant,Farmers Market
1,"Brockton , Parkdale Village , Exhibition Place",Café,Breakfast Spot,Coffee Shop,Nightclub,Stadium,Bar,Intersection,Bakery,Italian Restaurant,Climbing Gym
2,Business reply mail Processing CentrE,Light Rail Station,Yoga Studio,Spa,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Pizza Place,Recording Studio
3,"CN Tower , King and Spadina , Railway Lands , ...",Airport Service,Airport Lounge,Airport Terminal,Airport,Harbor / Marina,Coffee Shop,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Bubble Tea Shop,Burger Joint,Middle Eastern Restaurant,Japanese Restaurant,Salad Place,Ice Cream Shop


In [52]:
# For clustering, drop the Neighborhood column
df_boroughsTor_cluster = df_boroughsTor_ohc_grp.drop("Neighborhood", axis=1)
df_boroughsTor_cluster.head()

Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017544,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
3,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.015873


In [53]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_boroughsTor_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [54]:
# add clustering labels
boroughsTor_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

boroughsTor_merged = df_boroughsTor

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
boroughsTor_merged = boroughsTor_merged.join(boroughsTor_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

boroughsTor_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
86,M4V,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049,0,Coffee Shop,Pub,Bagel Shop,Supermarket,Light Rail Station,Vietnamese Restaurant,Sushi Restaurant,Liquor Store,American Restaurant,Restaurant
79,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Sandwich Place,Dessert Shop,Coffee Shop,Italian Restaurant,Café,Pizza Place,Sushi Restaurant,Gym,Diner,Farmers Market
83,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316,1,Playground,Restaurant,Yoga Studio,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
68,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,3,Park,Jewelry Store,Trail,Bus Line,Sushi Restaurant,Department Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
74,M5R,Central Toronto,"The Annex , North Midtown , Yorkville",43.67271,-79.405678,0,Sandwich Place,Café,Coffee Shop,Middle Eastern Restaurant,Pizza Place,Burger Joint,Liquor Store,Indian Restaurant,Pub,Flower Shop


Creating cluster map

In [55]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(boroughsTor_merged['Latitude'], boroughsTor_merged['Longitude'], boroughsTor_merged['Neighborhood'], boroughsTor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters