Import necessary libraries

In [20]:
#!pip install bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import wget
import os
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

Parse the HTML document from the URL using Beautiful Soup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page with a bounded wait and stop on an HTTP error status, so that
# an error page is never handed to the parser as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()

# soup_parse holds the raw HTML text; soup is the parsed document tree.
soup_parse = response.text
soup = BeautifulSoup(soup_parse, 'html.parser')

Create empty Dataframe using pandas

In [3]:
# Accumulator for the scraped rows, plus an empty frame that declares the
# three target columns.
table_contents = list()
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)

If Neighborhood is Not Assigned but has a Borough, then the Neighborhood is the Borough<br>
If Neighborhood and Borough are both Not Assigned, skip that row<br>
If more than one neighborhood is in a postal area, the neighborhoods are listed in the same column separated by a comma

In [4]:
# Build one record per populated cell of the first table on the page.
#   * a cell whose text is exactly 'Not assigned' is skipped;
#   * the text before the first '(' is the borough;
#   * the text inside the parentheses is the neighbourhood list, with ' /'
#     separators turned into commas;
#   * a cell that has a borough but no parenthesised list uses the borough
#     as its neighbourhood.
table = soup.find('table')

# Rebuilt here so that re-running this cell does not append duplicate rows.
table_contents = []

for row in table.find_all('td'):
    cell_text = row.span.text
    if cell_text == 'Not assigned':
        continue

    parts = cell_text.split('(')
    cell = {}
    cell['PostalCode'] = row.p.text[:3]  # the postal code is the first three characters
    cell['Borough'] = parts[0]
    if len(parts) > 1:
        cell['Neighborhood'] = (
            parts[1]
            .strip(')')
            .replace(' /', ',')
            .replace(')', ' ')
            .strip(' ')
        )
    else:
        # No neighbourhood list present: fall back to the borough name.
        cell['Neighborhood'] = parts[0]
    table_contents.append(cell)

In [5]:
# Borough strings that were scraped with several text fragments run together,
# mapped to the readable names used in the rest of the notebook.
borough_renames = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# One row per scraped record, then normalise the borough column.
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace(borough_renames)

In [6]:
# Display the cleaned postal-code table (the rendered output is truncated in the middle).
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# (rows, columns) of the cleaned postal-code table.
df.shape

(103, 3)

In [8]:
# Disabled on purpose: the geocoder install/import is wrapped in a string
# literal, so the cell only echoes the text and executes none of it.
'''#!pip install geocoder
import geocoder # import geocoder'''

'#!pip install geocoder\nimport geocoder # import geocoder'

In [9]:
# Disabled on purpose: the per-postal-code geocoder lookup below is wrapped in
# a string literal, so the cell only echoes the text and executes none of it.
'''values = {}
for postal_code in df['PostalCode']:
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    values[postal_code] = [latitude, longitude]'''
# Reason for disabling: the geocoder API call takes too long to return a result.

"values = {}\nfor postal_code in df['PostalCode']:\n    lat_lng_coords = None\n    # loop until you get the coordinates\n    while(lat_lng_coords is None):\n      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n      lat_lng_coords = g.latlng\n\n    latitude = lat_lng_coords[0]\n    longitude = lat_lng_coords[1]\n    \n    values[postal_code] = [latitude, longitude]"

Download location CSV file to notebook for use

In [10]:
# Fetch the coordinates CSV only when no local copy is present.
coordinates_csv_present = os.path.exists('./Geospatial_Coordinates.csv')
if not coordinates_csv_present:
    wget.download('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv')
    print('file downloaded')
else:
    print('file already exists')

file already exists


In [11]:
# Load the coordinates file and give its key column the same name as the
# scraped table's key ('PostalCode').
new_csv = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'PostalCode'})

In [12]:
# Join the scraped table with the coordinates on the shared postal-code key.
new_df = df.merge(new_csv, on='PostalCode')
new_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Data encoding and Data wrangling

In [13]:
# One-hot encode the borough of every row; the empty prefix keeps the plain
# borough names as column names.
for_encoding = new_df
borough_dummies = pd.get_dummies(for_encoding[['Borough']], prefix="", prefix_sep="")

# Append the coordinates and the neighbourhood name after the indicator columns.
toronto_onehot = borough_dummies.assign(
    Longitude=new_df['Longitude'],
    Latitude=new_df['Latitude'],
    Neighborhood=new_df['Neighborhood'],
)

# Move the last column (Neighborhood) to the front, keeping the rest in order.
last_column = toronto_onehot.columns[-1]
fixed_columns = [last_column] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York,Longitude,Latitude
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.329656,43.753259
1,Victoria Village,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.315572,43.725882
2,"Regent Park, Harbourfront",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-79.360636,43.65426
3,"Lawrence Manor, Lawrence Heights",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.464763,43.718518
4,Ontario Provincial Government,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-79.389494,43.662301


In [14]:
# (rows, columns) of the encoded table: Neighborhood, the borough indicators and the two coordinates.
toronto_onehot.shape

(103, 18)

Clustering

In [15]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_onehot.drop(columns = ['Neighborhood'])

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 4, 2, 3, 0, 1, 2, 3, 4], dtype=int32)

In [16]:
# Attach the k-means label of every row as the first column.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the labels are overwritten in place instead of failing.
if 'Cluster Labels' in toronto_grouped_clustering.columns:
    toronto_grouped_clustering['Cluster Labels'] = kmeans.labels_
else:
    toronto_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

In [17]:
# Re-attach the neighbourhood names (dropped before clustering) so each labelled row can be identified.
toronto_grouped_clustering['Neighborhood'] = new_df['Neighborhood']

In [18]:
# Display the feature table together with its cluster labels and neighbourhood names.
toronto_grouped_clustering

Unnamed: 0,Cluster Labels,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York,Longitude,Latitude,Neighborhood
0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.329656,43.753259,Parkwoods
1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.315572,43.725882,Victoria Village
2,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-79.360636,43.654260,"Regent Park, Harbourfront"
3,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-79.464763,43.718518,"Lawrence Manor, Lawrence Heights"
4,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-79.389494,43.662301,Ontario Provincial Government
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-79.506944,43.653654,"The Kingsway, Montgomery Road, Old Mill North"
99,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-79.383160,43.665860,Church and Wellesley
100,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-79.321558,43.662744,Enclave of M4L
101,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-79.498509,43.636258,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Map Rendering/Visualization

In [19]:
address = 'Toronto'

# Resolve the map centre from the address. geocode() yields None when nothing
# is found, so fail with a readable message rather than an AttributeError.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: ' + address)
latitude = location.latitude
longitude = location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    toronto_grouped_clustering['Latitude'],
    toronto_grouped_clustering['Longitude'],
    toronto_grouped_clustering['Neighborhood'],
    toronto_grouped_clustering['Cluster Labels'],
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous `cluster-1` sent label 0 to index -1, i.e. the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters