# Segmenting and Clustering Neighborhoods in Toronto - Exploring and clustering

Prepare the dataframe with geodata for each neighborhood

In [17]:
import re
import urllib.request
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
# Wikipedia page listing the Canadian postal codes that start with "M".
POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. The context manager closes the HTTP response as soon as
# the body has been read, and the timeout keeps a stalled connection from
# hanging the notebook indefinitely.
with urllib.request.urlopen(POSTAL_CODES_URL, timeout=30) as page_response:
    contents = page_response.read()

# Parse the raw HTML so the cells below can locate the postal-code table.
soup = BeautifulSoup(contents, 'html.parser')
# Grid version
# cells = soup.select('td > p')
# span_parser = re.compile('(.*)\(+(.*)\)+')
# dataframe_data = []
# for cell in cells:    
#     if cell.span.i == None:
#         postal_code = cell.b.string
#         span_text = cell.span.getText()
#         match = span_parser.match(span_text)
#         if match == None:
#             borough = span_text
#             neighborhood = span_text
#         else:
#             borough = match.group(1)
#             neighborhood = match.group(2)
            
#         dataframe_data.append({
#             'Postal Code': postal_code, 
#             'Borough': ','.join(x.strip() for x in borough.split('/')),
#             'Neighborhood': ','.join(x.strip() for x in neighborhood.split('/'))
#         })
# df = pd.DataFrame(dataframe_data)

# Table view: locate the first "wikitable" on the page and parse it with pandas.
table = soup.find("table", {"class": "wikitable"})
if table is None:
    # Without this guard str(None) would be handed to read_html and fail
    # with a misleading parser error.
    raise ValueError("No table with class 'wikitable' was found on the postal codes page")

# read_html expects a file-like object; passing literal HTML is deprecated
# in recent pandas versions, so wrap the markup in StringIO.
postal_df = pd.read_html(StringIO(str(table)))[0]

# Process dataframe:
#   * drop postal codes whose borough is 'Not assigned'
#   * turn "A / B / C" neighborhood lists into "A,B,C"
#   * normalise the postal-code column name so it matches the geodata file
# Chaining builds a new frame at each step, so no value is ever written into
# a filtered view of the parsed table (avoids SettingWithCopyWarning).
df = (
    postal_df[postal_df.Borough != 'Not assigned']
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood'].apply(
            lambda x: ','.join(y.strip() for y in x.split('/'))
        )
    )
    .rename(columns={"Postal code": "Postal Code"})
)

# Attach latitude/longitude to every postal code (inner join on 'Postal Code').
geo_df = pd.read_csv("Geospatial_Coordinates.csv")
result_df = pd.merge(df, geo_df, on='Postal Code')

# Display copy indexed by postal code; result_df keeps 'Postal Code' as a column.
df = result_df.set_index('Postal Code')
df

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park,Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...
M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.636258,-79.498509


Let's go and explore these neighborhoods. First, we should import the necessary libraries:

In [2]:
import requests # library to handle requests
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from IPython.display import Image 
from IPython.core.display import HTML 
from pandas.io.json import json_normalize
import folium # plotting library

Then load the credentials for the Foursquare API from the environment variables

In [3]:
%reload_ext dotenv
%dotenv
import os
CLIENT_ID = os.getenv("FOURSQUARE_CLIENT_ID") # your Foursquare ID
CLIENT_SECRET = os.getenv("FOURSQUARE_CLIENT_SECRET") # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
if CLIENT_ID and CLIENT_SECRET:
    print('Credentials loaded')

Credentials loaded


Let's take all boroughs whose name contains the word "Toronto" and keep one neighborhood from each

In [30]:
# Select the rows whose borough name mentions "Toronto", then keep only the
# first row encountered for each distinct borough.
is_toronto_borough = result_df['Borough'].str.contains('Toronto')
df = result_df.loc[is_toronto_borough].drop_duplicates(subset=['Borough'])
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
31,M6H,West Toronto,"Dufferin,Dovercourt Village",43.669005,-79.442259
61,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Display all these boroughs on the map

In [41]:
# Base map for the selected boroughs, starting at the given centre and zoom.
map_center = [43.696782, -79.371767]
venues_map = folium.Map(location=map_center, zoom_start=12)

# One green marker per selected borough; the popup shows the borough name.
borough_points = zip(df.Latitude, df.Longitude, df.Borough)
for borough_lat, borough_lng, borough_name in borough_points:
    borough_marker = folium.Marker(
        [borough_lat, borough_lng],
        popup=borough_name,
        icon=folium.Icon(color='green'),
    )
    borough_marker.add_to(venues_map)

venues_map

Show the different venues within a 500 m radius of each borough

In [42]:
radius = 500  # search radius (500 m) passed to the venue query for each borough


def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping or pandas.Series
        A venue record exposing its category list under the key
        ``'categories'`` or, failing that, ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the record has
        no categories (empty list, ``None`` or NaN).

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    # A missing value (None/NaN) is treated like an empty list: no category.
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
# For every selected borough, ask the Foursquare "explore" endpoint for venues
# around its coordinates and draw each venue as a blue circle on venues_map.
url = 'https://api.foursquare.com/v2/venues/explore'

for latitude, longitude in zip(df.Latitude, df.Longitude):
    # Let requests build and URL-encode the query string instead of
    # formatting the credentials into the URL by hand.
    response = requests.get(
        url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(latitude, longitude),
            'v': VERSION,
            'radius': radius,
            # Kept at 50 as in the original request; the notebook-level LIMIT
            # constant (30) is not used here.
            'limit': 50,
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors (bad credentials, quota, ...) rather than
    # with a KeyError while digging through an error payload.
    response.raise_for_status()
    results = response.json()

    groups = results.get('response', {}).get('groups', [])
    items = groups[0]['items'] if groups else []
    if not items:
        # json_normalize([]) yields a frame without columns, so the column
        # selection below would raise KeyError; nothing to draw here anyway.
        continue

    dataframe = pd.json_normalize(items)
    location_columns = [col for col in dataframe.columns if col.startswith('venue.location.')]
    filtered_columns = ['venue.name', 'venue.categories'] + location_columns + ['venue.id']

    # .copy() gives an independent frame, so the column assignment below
    # cannot trigger SettingWithCopyWarning.
    dataframe_filtered = dataframe.loc[:, filtered_columns].copy()
    # Collapse each venue's category list to the name of its first category.
    dataframe_filtered['venue.categories'] = dataframe_filtered.apply(get_category_type, axis=1)
    # Keep only the last dotted component of each column name ('venue.location.lat' -> 'lat').
    dataframe_filtered.columns = [col.split('.')[-1] for col in dataframe_filtered.columns]

    for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            fill=True,
            color='blue',
            fill_color='blue',
            fill_opacity=0.6
        ).add_to(venues_map)

venues_map

On this map we can see that Downtown Toronto has the most venues