<center>
    <img src="https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/Logos/organization_logo/organization_logo.png" width="300" alt="cognitiveclass.ai logo"  />
</center>


<h1>Segmenting and Clustering Neighborhoods in Toronto </h1>

<h2 id="Section_1"> Part I: </h2>
<h4 id="Section_1"> Find the data source on the web, scrape the page and load the postal code / borough / neighbourhood data to a dataframe. </h4>


In [1]:
import pandas as pd
import requests

# Set options to display all rows and all columns without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Get the contents of the Wikipedia webpage; fail fast on a timeout or an HTTP error
# instead of handing an error page to the HTML parser
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
df_list = pd.read_html(html)

# Load the table with Postal Codes, Boroughs and Neighbourhoods to the dataframe
#  (two tables on the page, get the first one)
df = df_list[0]

# Stack the table's columns on top of each other, left to right, into a single Series
# of cell strings. Selecting columns by position works for any number of columns, so
# no column names have to be hard-coded.
cells = pd.concat([df.iloc[:, i] for i in range(df.shape[1])], ignore_index=True)

# Split every cell string at the left parenthesis:
#   part 0 -> postal code followed by the borough, part 1 -> neighbourhoods + ')'
cell_parts = cells.str.split('(')

df = pd.DataFrame({
    # Postal code: the first 3 characters of the cell string
    'PostalCode': cells.str[:3],
    # Borough: from the 4th character up to (not including) the left parenthesis
    'Borough': cell_parts.str[0].str[3:],
    # Neighbourhoods: the text between the parentheses, with the ' / ' delimiter
    # replaced by ', ' (regex=False: the delimiter is a literal string, not a pattern)
    'Neighbourhood': cell_parts.str[1].str.split(')').str[0].str.replace(' / ', ', ', regex=False),
})

# Remove rows where the postal code is not assigned to a Borough, then renumber the rows.
# The filter result is bound to a new frame rather than modified in place.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

<h4 id="Section_1"> Use the .shape attribute to show the number of rows and columns in the dataframe: </h4>

In [2]:
# shape is a (rows, columns) tuple, so the first number is the row count
df.shape

(103, 3)

<h4 id="Section_1"> Display the dataframe: </h4>

In [3]:
# Display the cleaned dataframe (PostalCode, Borough, Neighbourhood)
display(df)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


<h2 id="Section_1"> Part II: </h2>
<h4 id="Section_1"> Get the latitude and the longitude coordinates of each postal code and append them to the dataframe.  </h4>

In [4]:
# For some reason I could not make geocoder.google to work, so I used "pgeocode" instead"
!pip install pgeocode

import pgeocode
nomi = pgeocode.Nominatim('ca')

latitude = []
longitude = []
for value in df["PostalCode"]:
    location = nomi.query_postal_code(value)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
df["Latitude"] = latitude
df["Longitude"] = longitude




<h4 id="Section_1"> Display the dataframe, now with latitude and longitude: </h4>

In [5]:
# Show the dataframe with the Latitude and Longitude columns added in the previous cell
display(df)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.7298,-79.2639
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.7122,-79.2843
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.7247,-79.2312
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6952,-79.2646


<h2 id="Section_1"> Part III: </h2>
<h3 id="Section_1"> Explore and cluster the neighborhoods in Toronto.  </h3>

In [6]:
# Download all the dependencies:

# NOTE(review): geopy is installed twice in this cell (pip here, conda further down)
!pip install geopy

import numpy as np 

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

!conda install -c conda-forge geopy --yes
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

import requests # library to handle requests
# NOTE(review): the cells visible in this notebook call pd.json_normalize rather than
# this imported name - confirm whether the import is still needed
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# folium is pinned to version 0.5.0
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



<h4 id="Section_1"> Create URL using Foursquare client_id and client_secret, and download the data in json format: </h4>

In [7]:

import json
import os
import urllib.parse
import urllib.request

import pandas as pd

# Foursquare credentials are read from the environment so that they are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running this
# cell; a missing variable raises KeyError. Any credentials that were previously
# committed in this notebook should be rotated.
foursquare_params = {
    'client_id': os.environ['FOURSQUARE_CLIENT_ID'],
    'client_secret': os.environ['FOURSQUARE_CLIENT_SECRET'],
    'v': '20210327',
    # NOTE(review): the M9A lookup later in this notebook prints latitude 43.6662, not
    # 43.6672 - confirm which value is intended
    'll': '43.6672,-79.5282',
    'radius': 30000,
    'limit': 10000,
}

# safe=',' keeps the comma inside the 'll' value unescaped, as in a hand-written URL
explore_url = 'https://api.foursquare.com/v2/venues/explore?' + urllib.parse.urlencode(foursquare_params, safe=',')

# Download the response and parse it as JSON. The context-manager target is named
# `response` so it does not overwrite the `url` string defined earlier in the notebook.
with urllib.request.urlopen(explore_url) as response:
    toronto_data = json.loads(response.read().decode())

df_toronto_data = pd.DataFrame(toronto_data)

# The raw payload is large, so it is not displayed here

In [8]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    Parameters
    ----------
    row : mapping-like (for example a dict or a pandas Series)
        Must provide the category list under the key 'categories' or, failing
        that, under the key 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category value is
    empty or has no length at all (for example None or NaN).

    Raises
    ------
    KeyError
        If the row has neither 'categories' nor 'venue.categories'.
    """
    # Only a missing key should trigger the fallback; any other error is a real problem
    # and must not be swallowed
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    # A missing value (None / NaN) has no len(); treat it like an empty category list
    try:
        has_categories = len(categories_list) > 0
    except TypeError:
        return None

    if not has_categories:
        return None
    return categories_list[0]['name']

In [9]:
# Clean the JSON and structure it into a pandas dataframe

# The venue records sit under response -> groups[0] -> items in the downloaded payload
venues = toronto_data['response']['groups'][0]['items']

# Flatten the nested JSON into columns, then keep only the name, category list and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = pd.json_normalize(venues).loc[:, filtered_columns]

# Reduce each row's category list to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Clean the column names by keeping only the part after the last dot
# ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

# Only the last expression of a cell is rendered automatically, so the shape has to be
# displayed explicitly to be visible
display(nearby_venues.shape)

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Indie Alehouse,Gastropub,43.665475,-79.46529
1,High Park,Park,43.646479,-79.463425
2,The Good Neighbour,Café,43.662578,-79.470986
3,Cheese Boutique,Deli / Bodega,43.638466,-79.475258
4,Waterfront Trail,Trail,43.635859,-79.467529


<h4 id="Section_1"> Look up the coordinates of an Etobicoke postal code and map the venues retrieved above: </h4>


In [10]:
# NOTE(review): this repeats the folium install and import already performed in the
# dependency cell earlier in the notebook
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [11]:
import pgeocode
nomi = pgeocode.Nominatim('ca')

# One postal code picked arbitrarily from those assigned to Etobicoke
post_code = 'M9A'

# Fetch the record for that code and pull out its coordinates
location = nomi.query_postal_code(post_code)
latitude, longitude = location.latitude, location.longitude
print(latitude, longitude)

43.6662 -79.5282


In [12]:
# create map of Etobicoke venues using latitude and longitude values
map_etobicoke = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one circle marker per venue, using the venue name as the popup text
for lat, lng, name in zip(nearby_venues['lat'], nearby_venues['lng'], nearby_venues['name']):
    # parse_html belongs to the Popup; it is not a CircleMarker option, so it is no
    # longer passed to the marker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_etobicoke)

map_etobicoke