# 1. To get the postal code, borough, and neighborhoods in Toronto

In [1]:
#Import necessary modules and libraries
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # install folium 
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
#Get the webpage from Wikipedia - List of postal codes of Canada: M

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)

#Create a Beautiful Soup object from the html
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [3]:
# Print the first 5 rows of the neighbourhood table
rows = soup.find_all('tr')
print(rows[:5])

[<tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>, <tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>, <tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>, <tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>]


In [4]:
# Find all the cells of the rows, remove html tags using Beautiful Soup get_text() method and append to a list
list_rows = []
for row in rows:
    row_td = row.find_all('td')
    str_cells = str(row_td)
    cleantext = BeautifulSoup(str_cells, "lxml").get_text(strip=True)
    list_rows.append(cleantext)
print(list_rows)
type(list_rows)

['[]', '[M1A,Not assigned,]', '[M2A,Not assigned,]', '[M3A,North York,Parkwoods]', '[M4A,North York,Victoria Village]', '[M5A,Downtown Toronto,Regent Park / Harbourfront]', '[M6A,North York,Lawrence Manor / Lawrence Heights]', "[M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government]", '[M8A,Not assigned,]', '[M9A,Etobicoke,Islington Avenue]', '[M1B,Scarborough,Malvern / Rouge]', '[M2B,Not assigned,]', '[M3B,North York,Don Mills]', '[M4B,East York,Parkview Hill / Woodbine Gardens]', '[M5B,Downtown Toronto,Garden District, Ryerson]', '[M6B,North York,Glencairn]', '[M7B,Not assigned,]', '[M8B,Not assigned,]', '[M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale]', '[M1C,Scarborough,Rouge Hill / Port Union / Highland Creek]', '[M2C,Not assigned,]', '[M3C,North York,Don Mills]', '[M4C,East York,Woodbine Heights]', '[M5C,Downtown Toronto,St. James Town]', '[M6C,York,Humewood-Cedarvale]', '[M7C,Not assigned,]', '[M8C,Not assigned,]', '[M9C,Eto

list

In [5]:
#Convert the list into a dataframe and drop the first row which is empty
df = pd.DataFrame(list_rows)
df.drop(df.index[0], inplace=True)
df.head(10)

Unnamed: 0,0
1,"[M1A,Not assigned,]"
2,"[M2A,Not assigned,]"
3,"[M3A,North York,Parkwoods]"
4,"[M4A,North York,Victoria Village]"
5,"[M5A,Downtown Toronto,Regent Park / Harbourfront]"
6,"[M6A,North York,Lawrence Manor / Lawrence Heig..."
7,"[M7A,Downtown Toronto,Queen's Park / Ontario P..."
8,"[M8A,Not assigned,]"
9,"[M9A,Etobicoke,Islington Avenue]"
10,"[M1B,Scarborough,Malvern / Rouge]"


In [6]:
# Split the "0" column into multiple columns at the comma position
df1 = df[0].str.split(',', expand=True)
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
1,[M1A,Not assigned,],,,,,,,,...,,,,,,,,,,
2,[M2A,Not assigned,],,,,,,,,...,,,,,,,,,,
3,[M3A,North York,Parkwoods],,,,,,,,...,,,,,,,,,,
4,[M4A,North York,Victoria Village],,,,,,,,...,,,,,,,,,,
5,[M5A,Downtown Toronto,Regent Park / Harbourfront],,,,,,,,...,,,,,,,,,,
6,[M6A,North York,Lawrence Manor / Lawrence Heights],,,,,,,,...,,,,,,,,,,
7,[M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government],,,,,,,,...,,,,,,,,,,
8,[M8A,Not assigned,],,,,,,,,...,,,,,,,,,,
9,[M9A,Etobicoke,Islington Avenue],,,,,,,,...,,,,,,,,,,
10,[M1B,Scarborough,Malvern / Rouge],,,,,,,,...,,,,,,,,,,


In [7]:
# We only need the first three columns
df1 = df1.iloc[:,[0,1,2]]
df1.head(5)

Unnamed: 0,0,1,2
1,[M1A,Not assigned,]
2,[M2A,Not assigned,]
3,[M3A,North York,Parkwoods]
4,[M4A,North York,Victoria Village]
5,[M5A,Downtown Toronto,Regent Park / Harbourfront]


In [8]:
#Strip square brackets
df1[0] = df1[0].str.strip('[')
df1[2] = df1[2].str.strip(']')
df1.head(5)

Unnamed: 0,0,1,2
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


In [9]:
#Get table header
col_labels = soup.find_all('th')
all_header = []
col_str = str(col_labels)
headertext = BeautifulSoup(col_str, "lxml").get_text(strip=True)
all_header.append(headertext)
print(all_header)

['[Postal code,Borough,Neighborhood,Canadian postal codes]']


In [10]:
#Covert list to dataframe, we only need the first 3 columns
df2 = pd.DataFrame(all_header)
df2 = df2[0].str.split(',', expand=True)
df2 = df2.iloc[:,[0,1,2]]
df2[0] = df2[0].str.strip('[')
df2

Unnamed: 0,0,1,2
0,Postal code,Borough,Neighborhood


In [11]:
#Cancat the two dataframes
frames = [df2, df1]

df3 = pd.concat(frames)
df3.head(5)

Unnamed: 0,0,1,2
0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [12]:
#Replace column names with first row
df3.rename(columns=df3.iloc[0], inplace=True)
df3.drop(df3.index[0], inplace=True)
df3.head()

Unnamed: 0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


In [13]:
#Drop all rows where Borough is Not assigned
df4 = df3.drop(df3[df3['Borough'] == "Not assigned"].index, axis=0)
#Drop last 4 rows which are irrelervant
df4 = df4[:-4]
#Replace '/' with ',' in 
df4.replace(to_replace=r'/', value=',', regex=True, inplace=True)
df4.rename(columns={'Postal code': 'Postal Code'}, inplace=True)
df4

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park , Harbourfront"
6,M6A,North York,"Lawrence Manor , Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,"Malvern , Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill , Woodbine Gardens"
14,M5B,Downtown Toronto,Garden District


In [14]:
df_toronto = df4
df_toronto.shape

(103, 3)

# 2. To get the latitude and the longitude coordinates of each neighborhood

In [15]:
#!conda install -c conda-forge geocoder
"""
import geocoder # import geocoder

# initialize variable to None
lat_lng_coords = None

latitude = []
longitude = []

for postal_code in df_toronto['Postal code']:
    # loop until you get the coordinates
    while(lat_lng_coords is None):
          print("Get lat, long of postal code: " + postal_code) 
          g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
          lat_lng_coords = g.latlng
        
    print(lat_lng_coords[0])    
    latitude.append(lat_lng_coords[0])
    print(lat_lng_coords[1])
    longitude.append(lat_lng_coords[1])
        
df_toronto['Latitude'] = latitude
df_toronto['Longitude'] = longitude
df_toronto
"""

'\nimport geocoder # import geocoder\n\n# initialize variable to None\nlat_lng_coords = None\n\nlatitude = []\nlongitude = []\n\nfor postal_code in df_toronto[\'Postal code\']:\n    # loop until you get the coordinates\n    while(lat_lng_coords is None):\n          print("Get lat, long of postal code: " + postal_code) \n          g = geocoder.google(\'{}, Toronto, Ontario\'.format(postal_code))\n          lat_lng_coords = g.latlng\n        \n    print(lat_lng_coords[0])    \n    latitude.append(lat_lng_coords[0])\n    print(lat_lng_coords[1])\n    longitude.append(lat_lng_coords[1])\n        \ndf_toronto[\'Latitude\'] = latitude\ndf_toronto[\'Longitude\'] = longitude\ndf_toronto\n'

In [16]:
url = "http://cocl.us/Geospatial_data"
df_lat_long = pd.read_csv(url)
df_lat_long.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Merge lat, long fields to neighbourhood dataframe to create new dataframe 
toronto_data = pd.merge(df_toronto, df_lat_long, on='Postal Code')
toronto_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [18]:
toronto_data.shape

(103, 5)

# 3. To explore and cluster the neighborhoods in the city of Toronto

#### Define Foursquare Credentials and Version

In [19]:
CLIENT_ID = 'WWOQYEWDNSEYML2UYM5CS1NJMQWJWP00DM1SMHTP43LGBOE1' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WWOQYEWDNSEYML2UYM5CS1NJMQWJWP00DM1SMHTP43LGBOE1
CLIENT_SECRET:XGWUZGDH04OA3MJ21NAGRNY0JDM42XBGUPC21LAPQLX3S1GN


#### Create function to repeat the same process to all the neighborhoods in Toronto

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called toronto_venues

In [21]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park , Harbourfront
Lawrence Manor , Lawrence Heights
Queen's Park , Ontario Provincial Government
Islington Avenue
Malvern , Rouge
Don Mills
Parkview Hill , Woodbine Gardens
Garden District
Glencairn
West Deane Park , Princess Gardens , Martin Grove , Islington , Cloverdale
Rouge Hill , Port Union , Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate , Bloordale Gardens , Old Burnhamthorpe , Markland Wood
Guildwood , Morningside , West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor , Wilson Heights , Downsview North
Thorncliffe Park
Richmond , Adelaide , King
Dufferin , Dovercourt Village
Scarborough Village
Fairview , Henry Farm , Oriole
Northwood Park , York University
East Toronto
Harbourfront East , Union Station , Toronto Islands
Little Portugal , Trinity
Kennedy Park , Ionview , East Birchmount Park
Bayview Village
Downsview
T

#### Let's check the size of the resulting dataframe

In [22]:
print(toronto_venues.shape)
toronto_venues.head()

(2147, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [23]:
#Let's check how many venues were returned for each neighborhood

toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood , Long Branch",10,10,10,10,10,10
"Bathurst Manor , Wilson Heights , Downsview North",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park , Lawrence Manor East",23,23,23,23,23,23
Berczy Park,58,58,58,58,58,58
"Birch Cliff , Cliffside West",4,4,4,4,4,4
"Brockton , Parkdale Village , Exhibition Place",24,24,24,24,24,24
Business reply mail Processing CentrE,17,17,17,17,17,17
"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",16,16,16,16,16,16


#### Let's find out how many unique categories can be curated from all the returned venues

In [24]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 266 uniques categories.


### Let's Analyze Each Neighbourhood

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
toronto_onehot.shape

(2147, 266)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [27]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
1,"Alderwood , Long Branch",0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
2,"Bathurst Manor , Wilson Heights , Downsview North",0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.052632,0.0000,0.00,0.000000,0.0,0.000000
3,Bayview Village,0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
4,"Bedford Park , Lawrence Manor East",0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.043478,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
5,Berczy Park,0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.017241,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
6,"Birch Cliff , Cliffside West",0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
7,"Brockton , Parkdale Village , Exhibition Place",0.000000,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
8,Business reply mail Processing CentrE,0.058824,0.0,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000
9,"CN Tower , King and Spadina , Railway Lands , ...",0.000000,0.0,0.062500,0.0625,0.0625,0.125,0.1875,0.125,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0000,0.00,0.000000,0.0,0.000000


#### Let's confirm the new size

In [28]:
toronto_grouped.shape

(95, 266)

#### Let's print each neighborhood along with the top 5 most common venues

In [29]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.25
1                     Lounge  0.25
2             Breakfast Spot  0.25
3         Chinese Restaurant  0.25
4                     Market  0.00


----Alderwood , Long Branch----
            venue  freq
0     Pizza Place   0.2
1    Skating Rink   0.1
2  Sandwich Place   0.1
3             Gym   0.1
4    Dance Studio   0.1


----Bathurst Manor , Wilson Heights , Downsview North----
              venue  freq
0              Bank  0.11
1       Coffee Shop  0.11
2  Sushi Restaurant  0.05
3       Gas Station  0.05
4    Sandwich Place  0.05


----Bayview Village----
                      venue  freq
0        Chinese Restaurant  0.25
1                      Café  0.25
2                      Bank  0.25
3       Japanese Restaurant  0.25
4  Mediterranean Restaurant  0.00


----Bedford Park , Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1  Italian Restaurant  0.09
2          Restaura

#### Let's write a function to sort the venues in descending order.

In [30]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [63]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Chinese Restaurant,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
1,"Alderwood , Long Branch",Pizza Place,Skating Rink,Pharmacy,Sandwich Place,Dance Studio,Pub,Athletics & Sports,Coffee Shop,Gym,Comfort Food Restaurant
2,"Bathurst Manor , Wilson Heights , Downsview North",Bank,Coffee Shop,Fried Chicken Joint,Ice Cream Shop,Pizza Place,Pharmacy,Restaurant,Middle Eastern Restaurant,Bridal Shop,Supermarket
3,Bayview Village,Japanese Restaurant,Café,Bank,Chinese Restaurant,Women's Store,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
4,"Bedford Park , Lawrence Manor East",Pizza Place,Coffee Shop,Sandwich Place,Italian Restaurant,Restaurant,Thai Restaurant,Indian Restaurant,Liquor Store,Pub,Butcher


### Let's cluster the neighbourhood

#### Run k-means to cluster the neighborhood into 5 clusters.

In [64]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

#print(toronto_grouped_clustering.isna().sum())
      
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [65]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!
toronto_merged['Cluster Labels'].fillna(toronto_merged['Cluster Labels'].mean(), inplace=True)

print(toronto_merged['Cluster Labels'].isna().sum())
print(toronto_merged['Cluster Labels'])

0
0      0.00
1      1.00
2      1.00
3      1.00
4      1.00
5      0.99
6      1.00
7      1.00
8      1.00
9      1.00
10     2.00
11     0.99
12     1.00
13     1.00
14     1.00
15     1.00
16     0.00
17     1.00
18     1.00
19     1.00
20     1.00
21     0.00
22     1.00
23     1.00
24     1.00
25     1.00
26     1.00
27     1.00
28     1.00
29     1.00
       ... 
73     1.00
74     1.00
75     1.00
76     1.00
77     1.00
78     1.00
79     1.00
80     1.00
81     1.00
82     1.00
83     4.00
84     1.00
85     0.00
86     1.00
87     1.00
88     1.00
89     1.00
90     1.00
91     0.00
92     1.00
93     1.00
94     1.00
95     0.99
96     1.00
97     1.00
98     0.00
99     1.00
100    1.00
101    1.00
102    1.00
Name: Cluster Labels, Length: 103, dtype: float64


#### Finally, let's visualize the resulting clusters

##### Let's get the geographical coordinates of Toronto.

In [66]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [67]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Let's Examine the Clusters

#### Cluster 1: 

In [68]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0.0,Park,Food & Drink Shop,Bus Stop,Women's Store,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
16,York,0.0,Hockey Arena,Park,Field,Trail,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Donut Shop,Cuban Restaurant
21,York,0.0,Park,Women's Store,Pool,Dim Sum Restaurant,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
35,East York,0.0,Coffee Shop,Park,Metro Station,Convenience Store,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
49,North York,0.0,Construction & Landscaping,Park,Bakery,Women's Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
61,Central Toronto,0.0,Park,Bus Line,Swim School,Women's Store,Dessert Shop,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Diner
64,York,0.0,Park,Convenience Store,Women's Store,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
66,North York,0.0,Park,Bank,Convenience Store,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
85,Scarborough,0.0,Coffee Shop,Park,Playground,Diner,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
91,Downtown Toronto,0.0,Park,Trail,Playground,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store


#### Cluster 2: 

In [69]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1.0,Coffee Shop,Hockey Arena,Grocery Store,Portuguese Restaurant,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
2,Downtown Toronto,1.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Restaurant,Café,Theater,Yoga Studio,Cosmetics Shop
3,North York,1.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Vietnamese Restaurant,Boutique,Miscellaneous Shop,Event Space,Carpet Store,Women's Store
4,Downtown Toronto,1.0,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,Beer Bar,Japanese Restaurant,Café,Hobby Shop,Bank,Bar
6,Scarborough,1.0,Fast Food Restaurant,Women's Store,Cupcake Shop,Drugstore,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
7,North York,1.0,Asian Restaurant,Japanese Restaurant,Coffee Shop,Beer Store,Gym,Restaurant,Clothing Store,Italian Restaurant,Bike Shop,Discount Store
8,East York,1.0,Pizza Place,Breakfast Spot,Intersection,Café,Athletics & Sports,Fast Food Restaurant,Pharmacy,Bank,Pet Store,Gym / Fitness Center
9,Downtown Toronto,1.0,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Japanese Restaurant,Middle Eastern Restaurant,Cosmetics Shop,Restaurant,Hotel,Tea Room
12,Scarborough,1.0,History Museum,Bar,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Curling Ice
13,North York,1.0,Asian Restaurant,Japanese Restaurant,Coffee Shop,Beer Store,Gym,Restaurant,Clothing Store,Italian Restaurant,Bike Shop,Discount Store


#### Cluster 3:

In [70]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,North York,2.0,Pizza Place,Japanese Restaurant,Pub,Women's Store,Dim Sum Restaurant,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
50,North York,2.0,Pizza Place,Women's Store,Diner,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store


#### Cluster 4: 

In [71]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
62,Central Toronto,3.0,Garden,Women's Store,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Cuban Restaurant


Cluster 5: 

In [72]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,Scarborough,4.0,Playground,Convenience Store,Diner,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cupcake Shop
83,Central Toronto,4.0,Playground,Diner,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cuban Restaurant
