Title: Segmenting and Clustering Neighbourhoods in Toronto

In [2]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup

!python -m pip install folium

#!conda install -c conda-forge folium=0.5.0 --yes
import folium
print('Folium installed and imported!')

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 3.7 MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Folium installed and imported!


Retrieving Data from Wikipedia URL and Creating Pandas Dataframe

In [3]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" 
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class':'wikitable sortable'}).tbody 

rows = table.find_all('tr')
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]
columns

df = pd.DataFrame(columns=columns)

# populate dataframe with the table created
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    if len(tds) == 3:
        values = [tds[0].text.replace('\n',''), tds[1].text.replace('\n',''), tds[2].text.replace('\n','')]
    else:
        values = [td.text.replace('\n','') for td in tds]
    #print(values)    
    df = df.append(pd.Series(values, index=columns), ignore_index=True)

df = df.sort_values(by ='Postal Code' ) #sort values by postal code just to compare when joining data frames
df.reset_index(drop=True, inplace=True) #reset index values
df.head(10) 


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
8,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
9,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"


In [4]:
df.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
170,M9N,York,Weston
171,M9P,Etobicoke,Westmount
172,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
173,M9S,Not assigned,Not assigned
174,M9T,Not assigned,Not assigned
175,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
176,M9W,Etobicoke,"Northwest, West Humber - Clairville"
177,M9X,Not assigned,Not assigned
178,M9Y,Not assigned,Not assigned
179,M9Z,Not assigned,Not assigned


Cleaning Dataframe Created Based on Assignment Instructions 

In [5]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
#Note: all not assigned neighbourhood don't have borough
df['Borough'].replace("Not assigned", np.nan, inplace = True)
df.dropna(subset=['Borough'], axis=0, inplace=True)

#In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.
print(df.shape)

df = df.sort_values(by ='Postal Code' ) #sort values by postal code just to compare when joining data frames
df.reset_index(drop=True, inplace=True) #reset index values
df.head(10)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
df.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
94,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov..."
95,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Humberlea, Emery"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


Getting Latitude and the Longitude Coordinates of each Neighbourhood 

In [7]:
wiki_url = "https://github.com/cneves20/Coursera_Capstone/blob/main/Geospatial_Coordinates.csv" 
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

table2 = soup.find('table', {'class':'js-csv-data csv-data js-file-line-container'}).tbody 

rows2 = table2.find_all('tr')

values2_list = [] # create empty list to store final values

#populate dataframe with the table created
for k in range(0, len(rows2)):
    tds2 = rows2[k].find_all('td')
    if len(tds2) == 3:
        values2 = [tds2[0].text, tds2[1].text, tds2[2].text, tds2[3].text]
    else:
        values2 = [td.text for td in tds2]
    values2_list.append(values2)      

df2 = pd.DataFrame(values2_list, columns = ['Number','Postal Code','Latitude','Longitude'])

del df2['Number']
df2 = df2.sort_values(by ='Postal Code' ) #sort values by postal code just to compare when joining data frames
df2.reset_index(drop=True, inplace=True) #reset index values
df2.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.8066863,-79.1943534
1,M1C,43.7845351,-79.1604971
2,M1E,43.7635726,-79.1887115
3,M1G,43.7709921,-79.2169174
4,M1H,43.773136,-79.2394761
5,M1J,43.7447342,-79.2394761
6,M1K,43.7279292,-79.2620294
7,M1L,43.7111117,-79.2845772
8,M1M,43.716316,-79.2394761
9,M1N,43.692657,-79.2648481


In [8]:
df2.tail(10)

Unnamed: 0,Postal Code,Latitude,Longitude
93,M9A,43.6678556,-79.5322424
94,M9B,43.6509432,-79.5547244
95,M9C,43.6435152,-79.5772008
96,M9L,43.7563033,-79.5659633
97,M9M,43.7247659,-79.5322424
98,M9N,43.706876,-79.5181884
99,M9P,43.696319,-79.5322424
100,M9R,43.6889054,-79.5547244
101,M9V,43.7394164,-79.5884369
102,M9W,43.7067483,-79.5940544


Joining Both Dataframes

In [9]:
merged_inner = pd.merge(left=df, right=df2, left_on='Postal Code', right_on='Postal Code')
merged_inner.reset_index(drop=True, inplace=True)
merged_inner.shape
merged_inner = merged_inner.sort_values(by ='Postal Code' ) #sort values by postal code just to compare when joining data frames
merged_inner.reset_index(drop=True, inplace=True) #reset index values
merged_inner.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8066863,-79.1943534
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.2648481


In [35]:
merged_inner.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6678556,-79.5322424
94,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.6509432,-79.5547244
95,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.6435152,-79.5772008
96,M9L,North York,Humber Summit,43.7563033,-79.5659633
97,M9M,North York,"Humberlea, Emery",43.7247659,-79.5322424
98,M9N,York,Weston,43.706876,-79.5181884
99,M9P,Etobicoke,Westmount,43.696319,-79.5322424
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.6889054,-79.5547244
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.7394164,-79.5884369
102,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.7067483,-79.5940544


Creating Map of Toronto with Neighbourhoods Superimposed on Top

In [58]:
# create map of Toronto (around Kensington) Market area using latitude and longitude values
df = merged_inner

locations = df[['Latitude', 'Longitude']]
locationlist = locations.values.tolist()
len(locationlist)
print(locationlist[67]) #use it as reference - Downtown Borough (Kensington) 

map_toronto = folium.Map(location=[43.6532057, -79.4000493], zoom_start=16)

# add markers to map
for lat, lng, borough, postalcode, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Postal Code'], df['Neighbourhood']):
    label = '{}, {}, {}'.format(neighbourhood, borough, postalcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

['43.6532057', '-79.4000493']


Utilizing the Foursquare API to Explore the Neighborhoods and Segment them

In [59]:
|---
CLIENT_ID = 'FJFH4EL0RMQS3CDTZQLK1DRG3MFE5IAE2TV4NW0PQHSSCHPB' # your Foursquare ID
CLIENT_SECRET = 'WSKGNLKYASDJZVPGP0BAOYEL4AJUA0GQTWNCHCGIFX12JWYQ' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FJFH4EL0RMQS3CDTZQLK1DRG3MFE5IAE2TV4NW0PQHSSCHPB
CLIENT_SECRET:WSKGNLKYASDJZVPGP0BAOYEL4AJUA0GQTWNCHCGIFX12JWYQ


Exploring Selected Neighbourhood (Index 94) in our Dataframe.

In [66]:
df.loc[67, 'Neighbourhood'] #Get the neighborhood's name.

neighbourhood_latitude = df.loc[67, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = df.loc[67, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = df.loc[67, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

# Send the GET request and examine the results
results = requests.get(url).json()
results

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

nearby_venues.head(74)


Latitude and longitude values of Kensington Market, Chinatown, Grange Park are 43.6532057, -79.4000493.
74 venues were returned by Foursquare.




Unnamed: 0,name,categories,lat,lng
0,Seven Lives - Tacos y Mariscos,Mexican Restaurant,43.654418,-79.400545
1,Essence of Life Organics,Organic Grocery,43.654111,-79.400431
2,Kid Icarus,Arts & Crafts Store,43.653933,-79.401719
3,Blackbird Baking Co,Bakery,43.654764,-79.400566
4,Jimmy's Coffee,Café,43.654493,-79.401311
...,...,...,...,...
69,KOS,Breakfast Spot,43.655026,-79.403358
70,Shoppers Drug Mart,Pharmacy,43.653702,-79.406093
71,Pho Tien,Vietnamese Restaurant,43.652832,-79.406242
72,The Supermarket,Bar,43.656680,-79.402954
