## Segmenting and Clustering Neighborhoods in Toronto City

### Technologies used: Wikipedia scrapig using beautifulsoup in python, Foursquare API, Folium


#### Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.11.0          

AttributeError: module 'numpy.linalg.lapack_lite' has no attribute '_ilp64'

#### Now lets set up the enviorement for web scraping and start scraping the Wikipedia page were is the list of Toronto Postalcodes

In [2]:
import requests
from bs4 import BeautifulSoup
import csv

req = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(req,'lxml')

In [3]:
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YDeAj8sayba4-hwRD6ixBwAAAIc","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1008658788,"wgRevisionId":1008658788,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Ontario","Postal c

#### After studying the structure of the webpage we locate the table of the postal codes and use the html tags to find the postal code as well as Borough and Neighbourhood

In [11]:
table_pc = soup.find('table')
fields = table_pc.find_all('td')

fields

[<td>M1A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M2A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M3A
 </td>,
 <td>North York
 </td>,
 <td>Parkwoods
 </td>,
 <td>M4A
 </td>,
 <td>North York
 </td>,
 <td>Victoria Village
 </td>,
 <td>M5A
 </td>,
 <td>Downtown Toronto
 </td>,
 <td>Regent Park, Harbourfront
 </td>,
 <td>M6A
 </td>,
 <td>North York
 </td>,
 <td>Lawrence Manor, Lawrence Heights
 </td>,
 <td>M7A
 </td>,
 <td>Downtown Toronto
 </td>,
 <td>Queen's Park, Ontario Provincial Government
 </td>,
 <td>M8A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M9A
 </td>,
 <td>Etobicoke
 </td>,
 <td>Islington Avenue, Humber Valley Village
 </td>,
 <td>M1B
 </td>,
 <td>Scarborough
 </td>,
 <td>Malvern, Rouge
 </td>,
 <td>M2B
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M3B
 </td>,
 <td>North York
 </td>,
 <td>Don Mills
 </td>,
 <td>M4B
 </td>,
 <td>East York
 </td>,
 <td>Parkview Hill, Woodbine Gardens
 </td>,


In [46]:
post_code = []
borough = []
neighbourhood = []

for i in range(0, len(fields), 3):
    post_code.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
df_pc = pd.DataFrame(data=[post_code, borough, neighbourhood]).transpose()
df_pc.columns = ['Postal codes', 'Borough', 'Neighbourhood']
df_pc.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postal codes           Borough              Neighbourhood
0          M1A      Not assigned               Not assigned
1          M2A      Not assigned               Not assigned
2          M3A        North York                  Parkwoods
3          M4A        North York           Victoria Village
4          M5A  Downtown Toronto  Regent Park, Harbourfront

In [47]:
df_pc['Borough'].replace('Not assigned', np.nan, inplace=True)
df_pc.dropna(subset=['Borough'], inplace=True)

df_pc.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postal codes           Borough                                Neighbourhood
2          M3A        North York                                    Parkwoods
3          M4A        North York                             Victoria Village
4          M5A  Downtown Toronto                    Regent Park, Harbourfront
5          M6A        North York             Lawrence Manor, Lawrence Heights
6          M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government

In [18]:
dataset['Borough'].replace('Not assigned', np.nan, inplace=True)
dataset.dropna(subset=['Borough'], inplace=True)

dataset.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postalcode           Borough                                Neighbourhood
2        M3A        North York                                    Parkwoods
3        M4A        North York                             Victoria Village
4        M5A  Downtown Toronto                    Regent Park, Harbourfront
5        M6A        North York             Lawrence Manor, Lawrence Heights
6        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government

In [61]:
df_pcn = df_pc.groupby(['Postal codes', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_pcn.columns = ['Postalcodes', 'Borough', 'Neighbourhood']
df_pcn


AttributeError: 'NoneType' object has no attribute 'items'

    Postalcodes           Borough  \
0           M1B       Scarborough   
1           M1C       Scarborough   
2           M1E       Scarborough   
3           M1G       Scarborough   
4           M1H       Scarborough   
5           M1J       Scarborough   
6           M1K       Scarborough   
7           M1L       Scarborough   
8           M1M       Scarborough   
9           M1N       Scarborough   
10          M1P       Scarborough   
11          M1R       Scarborough   
12          M1S       Scarborough   
13          M1T       Scarborough   
14          M1V       Scarborough   
15          M1W       Scarborough   
16          M1X       Scarborough   
17          M2H        North York   
18          M2J        North York   
19          M2K        North York   
20          M2L        North York   
21          M2M        North York   
22          M2N        North York   
23          M2P        North York   
24          M2R        North York   
25          M3A        North York   
2

In [62]:
df_pcn['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)

df_pcn

AttributeError: 'NoneType' object has no attribute 'items'

    Postalcodes           Borough  \
0           M1B       Scarborough   
1           M1C       Scarborough   
2           M1E       Scarborough   
3           M1G       Scarborough   
4           M1H       Scarborough   
5           M1J       Scarborough   
6           M1K       Scarborough   
7           M1L       Scarborough   
8           M1M       Scarborough   
9           M1N       Scarborough   
10          M1P       Scarborough   
11          M1R       Scarborough   
12          M1S       Scarborough   
13          M1T       Scarborough   
14          M1V       Scarborough   
15          M1W       Scarborough   
16          M1X       Scarborough   
17          M2H        North York   
18          M2J        North York   
19          M2K        North York   
20          M2L        North York   
21          M2M        North York   
22          M2N        North York   
23          M2P        North York   
24          M2R        North York   
25          M3A        North York   
2

In [63]:
df_pcn.shape

(103, 3)

## Part 2 Retrieve geographical information (Latitude/Longitude)

In [74]:
import geopy

geo_lat = {}
geo_lon = {}

geo = geopy.Nominatim(user_agent="Detector", timeout=20)

for x in np.unique(df_pcn.Borough):
    
    loc = geo.geocode("{}, Toronto, Ontario".format(x))
    
    geo_lat[x] = loc.latitude
    geo_lon[x] = loc.longitude

#### Check the latitude and longitudes

In [65]:
geo_lat

{'Central Toronto': 43.6449033,
 'Downtown Toronto': 43.6563221,
 'East Toronto': 43.6261221,
 'East York': 43.699971000000005,
 'Etobicoke': 43.6435559,
 'Mississauga': 43.6677248,
 'North York': 43.7543263,
 'Scarborough': 43.7729744,
 'West Toronto': 43.6449033,
 'York': 43.6896191}

In [66]:
geo_lon

{'Central Toronto': -79.3818364,
 'Downtown Toronto': -79.3809161,
 'East Toronto': -79.3950351,
 'East York': -79.33251996261595,
 'Etobicoke': -79.5656326,
 'Mississauga': -79.586436,
 'North York': -79.44911696639593,
 'Scarborough': -79.2576479,
 'West Toronto': -79.3818364,
 'York': -79.479188}

 ### extract latitude & longitutde based on "Postcode" using Geospatial_Coordinates.csv file provided in coursera and uploaded into ibm  watson studio {ink - http://cocl.us/Geospatial_data}, 

In [79]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_b7d3eb2783bb4f66af58234ec40da39b = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='tVR1U6vmiNUpAFcn697gkhi78HaVSFX9huihulhl',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

body = client_b7d3eb2783bb4f66ho98u9jiikjlkm;.get_object(Bucket='clusteringneighbourhoodsintoronto-donotdelete-pr-gl3iwwpg4wpky7',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_geo = pd.read_csv(body)
df_geo.columns = ['Postcode', 'Latitude', 'Longitude']
df_geo.head()


AttributeError: 'NoneType' object has no attribute 'items'

  Postcode   Latitude  Longitude
0      M1B  43.806686 -79.194353
1      M1C  43.784535 -79.160497
2      M1E  43.763573 -79.188711
3      M1G  43.770992 -79.216917
4      M1H  43.773136 -79.239476

In [80]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Postcode   103 non-null    object 
 1   Latitude   103 non-null    float64
 2   Longitude  103 non-null    float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [82]:

# check that all postcodes in Geospatial_Coordinates.csv match with postcodes in the dataframe we created df_pcn
# both files have 103 records

df_geo = df_geo.loc[df_geo["Postcode"].isin(df_pcn.Postalcodes.values), :]
df_geo.shape

(103, 3)

In [84]:
# combine 2 datasets into one df_pcn and df_geo using .merge() method on Postalcodes/Postcode columns

dfinal_geo = df_pcn.merge(df_geo, left_on="Postalcodes", right_on="Postcode")
dfinal_geo.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postalcodes      Borough                           Neighbourhood Postcode  \
0         M1B  Scarborough                          Malvern, Rouge      M1B   
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek      M1C   
2         M1E  Scarborough       Guildwood, Morningside, West Hill      M1E   
3         M1G  Scarborough                                  Woburn      M1G   
4         M1H  Scarborough                               Cedarbrae      M1H   

    Latitude  Longitude  
0  43.806686 -79.194353  
1  43.784535 -79.160497  
2  43.763573 -79.188711  
3  43.770992 -79.216917  
4  43.773136 -79.239476  

In [85]:
#remove the first column so Postalcodes
df_toronto = dfinal_geo.iloc[:,[3,1,2,4,5]] # exclude the first column with index = 0 ("Postalcodes") and re-arrange column names
df_toronto.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postcode      Borough                           Neighbourhood   Latitude  \
0      M1B  Scarborough                          Malvern, Rouge  43.806686   
1      M1C  Scarborough  Rouge Hill, Port Union, Highland Creek  43.784535   
2      M1E  Scarborough       Guildwood, Morningside, West Hill  43.763573   
3      M1G  Scarborough                                  Woburn  43.770992   
4      M1H  Scarborough                               Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  

In [86]:
df_toronto.shape

(103, 5)

## Part 3: Show Toronto map and venues and proceed to clustering

In [87]:
# use geopy to obtain latitude/longitude of Toronto.
# coordinates will be used for map visualization

address = 'Toronto, Ontario'

geolocator = geopy.Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto 43.6534817, -79.3839347.


In [95]:
#install folium
! python -m pip install folium
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium
# import map rendering library
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

  altair             co

In [96]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map based on coordinates of postcode
for lat, lng, neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [124]:
CLIENT_ID = 'HKVBCJFHLJNGFLKJVJITGKVDCXJNVJCFLKBMJVKL' # your Foursquare ID
CLIENT_SECRET = 'HKVBCJFHLJNGFLKJVJITGKVDCXJNVJCFLKBMJVKLGFKJPFDLDKDS' # your Foursquare Secret
VERSION = '20161225' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HKVBCJFHLJNGFLKJVJITGKVDCXJNVJCFLKBMJVKL
CLIENT_SECRET:HKVBCJFHLJNGFLKJVJITGKVDCXJNVJCFLKBMJVKLGFKJPFDLDKDS


### Explore venues in a radius of 1000m

In [98]:

# define limit = 5 (limit to 5 venues only) & radius = 1000 (meters)
LIMIT = 5
radius = 1000

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(df_toronto.Neighbourhood, df_toronto.Latitude, df_toronto.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [99]:
temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
temp.head()

AttributeError: 'NoneType' object has no attribute 'items'

                            Neighbourhood  N_Latitude  N_Longitude  \
0                          Malvern, Rouge   43.806686   -79.194353   
1  Rouge Hill, Port Union, Highland Creek   43.784535   -79.160497   
2       Guildwood, Morningside, West Hill   43.763573   -79.188711   
3                                  Woburn   43.770992   -79.216917   
4                               Cedarbrae   43.773136   -79.239476   

                       Venue  V_Latitude  V_Longitude             category  
0         Images Salon & Spa   43.802283   -79.198565                  Spa  
1  Fratelli Village Pizzeria   43.784008   -79.169787   Italian Restaurant  
2                Chick-N-Joy   43.768752   -79.187982  Fried Chicken Joint  
3                  Starbucks   43.770037   -79.221156          Coffee Shop  
4            CANBE Foods Inc   43.773546   -79.246082    Indian Restaurant  

In [101]:
print("{} nearby locations downloaded for {} neighbourhood.".format(len(temp.Venue), len(df_toronto.Neighbourhood)))

102 nearby locations downloaded for 103 neighbourhood.


In [103]:

cat = pd.get_dummies(temp.category) # one hot encoding with get_dummies()
df_test = pd.concat([temp[['Neighbourhood']], cat], axis=1) # combine neighbourhood & category tables
df_test.shape

(102, 56)

## Now lets Cluster the neighbourhood

In [106]:
!pip uninstall -y numpy
!pip uninstall -y setuptools
!pip install setuptools
!pip install numpy


Found existing installation: numpy 1.20.1
Uninstalling numpy-1.20.1:
  Successfully uninstalled numpy-1.20.1
Found existing installation: setuptools 49.6.0.post20210108
Uninstalling setuptools-49.6.0.post20210108:
  Successfully uninstalled setuptools-49.6.0.post20210108
Collecting setuptools
  Downloading setuptools-54.1.0-py3-none-any.whl (784 kB)
[K     |████████████████████████████████| 784 kB 15.7 MB/s eta 0:00:01
[?25hInstalling collected packages: setuptools
Successfully installed setuptools-54.1.0


In [108]:
!pip install scipy



In [113]:
!pip install numpy --upgrade
!pip install scipy --upgrade 
!pip install pandas==1.0.5

Collecting pandas==1.0.5
  Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 12.0 MB/s eta 0:00:01
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.3
    Uninstalling pandas-1.2.3:
      Successfully uninstalled pandas-1.2.3
Successfully installed pandas-1.0.5


In [119]:
!pip install -U scikit-learn



In [121]:
try:
    print("Installing BeautifulSoup4...\n")
    !conda install -c conda-forge beautifulsoup4 --yes
    print("BeautifulSoup4 has been successfully installed!\n")
except:
    print("ERROR: could not install BeautifulSoup4!\n")

try:
    print("Installing ProgressBar...\n")
    !conda install -c conda-forge ProgressBar2 --yes
    print("ProgressBar has been successfully installed!\n")
except:
    print("ERROR: could not install ProgressBar!\n")

try:
    print("Installing lxml...\n")
    !conda install -c conda-forge lxml --yes
    print("lxml has been successfully installed!\n")
except:
    print("ERROR: could not install lxml!\n")

try:
    print("Installing GeoPy...\n")
    !conda install -c conda-forge geopy --yes
    print("GeoPy has been successfully installed!\n")
except:
    print("ERROR: could not install GeoPy!\n")

try:
    print("Installing Folium...\n")
    !conda install -c conda-forge folium=0.5.0 --yes
    print("Folium has been successfully installed!\n")
except:
    print("ERROR: could not install Folium!\n")

try:
    print("Importing libraries...\n")
    from progressbar import ProgressBar
    from bs4 import BeautifulSoup as bts # library for web scraping
    import numpy as np # library to handle data in a vectorized manner
    import pandas as pd # library for data analysis
    from pandas.io.json import json_normalize
    import matplotlib.cm as cm
    import matplotlib.colors as colors
    import requests # library to handle requests
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    import matplotlib as mp # library for visualization
    from sklearn.cluster import KMeans # import k-means from clustering stage
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    import folium # map rendering library
    import lxml
    import re
    from time import sleep
    print("All libraries imported successfully!\n")
except:
    print("ERROR: Could not import all libraries!\n")

%matplotlib inline

Installing BeautifulSoup4...

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

BeautifulSoup4 has been successfully installed!

Installing ProgressBar...

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - progressbar2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    progressbar2-3.53.1        |     pyh9f0ad1d_0          25 KB  conda-forge
    python-utils-2.5.5         |     pyh44b312d_0          15 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          40 KB

The following NEW packages will be INSTALLED:

  progressbar2       conda-forge/noarch::progressbar2-3.53.1-pyh9f0ad1d_0
  py

In [122]:
df_02 = df_test.drop('Neighbourhood', axis=1)

n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df_02)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
df_test.insert(1, 'label', kmeans.labels_)
df_test.head()

NameError: name 'KMeans' is not defined

In [117]:
# merge 'df_toronto' & df_test

toronto_merged = pd.merge(df_toronto, df_test, on='Neighbourhood', how='right')
toronto_merged.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postcode      Borough                           Neighbourhood   Latitude  \
0      M1B  Scarborough                          Malvern, Rouge  43.806686   
1      M1C  Scarborough  Rouge Hill, Port Union, Highland Creek  43.784535   
2      M1E  Scarborough       Guildwood, Morningside, West Hill  43.763573   
3      M1G  Scarborough                                  Woburn  43.770992   
4      M1H  Scarborough                               Cedarbrae  43.773136   

   Longitude  Airport  Arts & Crafts Store  Bakery  Bank  Bar  Beach  \
0 -79.194353        0                    0       0     0    0      0   
1 -79.160497        0                    0       0     0    0      0   
2 -79.188711        0                    0       0     0    0      0   
3 -79.216917        0                    0       0     0    0      0   
4 -79.239476        0                    0       0     0    0      0   

   Beer Bar  Boutique  Brewery  Burger Joint  Café  Caribbean Restaurant  \
0         0         0 

In [123]:
# create map
map_toronto_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(n_group)
ys = [i + x + (i*x)**2 for i in range(n_group)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_toronto_clusters

KeyError: 'label'