# This notebook contains the code for my Capstone Project

In [97]:
from io import StringIO

import bs4 as bs
import numpy as np
import pandas as pd
import requests
# Let pandas render up to 500 rows and 500 columns before truncating output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)
print("Package Installed")

Package Installed


### Using the requests and bs4 (BeautifulSoup) modules, I pull the HTML of the Canada postal code Wikipedia page, extract the table from it, and then convert that table into a DataFrame.

In [98]:
# Download a pinned revision (oldid) of the Wikipedia page so the table layout stays stable.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() fails loudly
# on an HTTP error instead of handing an error page to the parser.
response = requests.get(
    "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969",
    timeout=30,
)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page's HTML text, not a URL

In [99]:
# Parse the downloaded HTML with the lxml parser; the bare name echoes the parsed document.
soup = bs.BeautifulSoup(markup=website_url, features="lxml")
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YE5xPzggcRGantoZJcxXzAAAARg","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1012118802,"wgRevisionId":1011037969,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Ontario","Postal c

In [100]:
# Grab the first <table> whose class attribute is "wikitable sortable".
table_attrs = {'class': 'wikitable sortable'}
my_table = soup.find(name='table', attrs=table_attrs)

In [101]:
# pd.read_html returns a list of DataFrames, one per <table> it finds; my_table holds a
# single table, so take the first entry. Wrapping the markup in StringIO avoids the
# FutureWarning newer pandas emits when literal HTML is passed as a plain string.
df = pd.read_html(StringIO(str(my_table)))[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### We then remove every row whose Borough is "Not assigned".

In [102]:
# Find the rows whose Borough is 'Not assigned' and remove them from df in place.
unassigned_rows = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_rows, inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [103]:
# Work on a copy of df renumbered 0..n-1, so the gaps left by the dropped rows disappear.
clean_df = df.copy()
clean_df.index = pd.RangeIndex(len(clean_df))
clean_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [104]:
# Shape after dropping the unassigned rows (clean_df was built from this same frame).
df.shape

(103, 3)

### We opted to use the pgeocode package to retrieve latitude and longitude coordinates - METHOD 1

In [105]:
!pip install pgeocode
print("Package Downloaded!")

Package Downloaded!


In [106]:
import pgeocode as pgcd

In [135]:
# Practice code to see if the pgeocode package worked
nomi = pgcd.Nominatim('ca')  # lookup object for the 'ca' country code
lat = nomi.query_postal_code("M3A")  # NOTE: holds the full record for M3A, not just a latitude
lat

postal_code                                                     M3A
country_code                                                     CA
place_name        North York (York Heights / Victoria Village / ...
state_name                                                  Ontario
state_code                                                       ON
county_name                                             North York 
county_code                                                     NaN
community_name                                                  NaN
community_code                                                  NaN
latitude                                                    43.7545
longitude                                                    -79.33
accuracy                                                          1
Name: 0, dtype: object

In [108]:
# Practice code to see if the pgeocode package worked
nomi = pgcd.Nominatim('ca')
# Query once and read both coordinates from the same record instead of looking up M8W twice.
m8w_record = nomi.query_postal_code("M8W")
lat = m8w_record["latitude"]
long = m8w_record["longitude"]
print(lat, ",",long)

43.6021 , -79.5402


### This part of the code looks up each row's postal code and adds the corresponding latitude and longitude.

In [109]:
# Look up every postal code in a single batched call: query_postal_code accepts a list
# of codes and returns a DataFrame with one row per input code. This replaces two
# separate lookups per row inside an iterrows() loop.
geo_lookup = nomi.query_postal_code(clean_df["Postal Code"].tolist())
# .to_numpy() assigns by position, so clean_df's own index labels do not matter.
clean_df["Latitude"] = geo_lookup["latitude"].to_numpy()
clean_df["Longitude"] = geo_lookup["longitude"].to_numpy()
clean_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


In [125]:
# Dropping the one row that pertains to M7R, which seems to belong to a sole warehouse, for Amazon.
# Restricting dropna to the coordinate columns removes only rows that have no latitude or
# longitude, rather than any row that happens to have a missing value in another column.
clean_df.dropna(subset=["Latitude", "Longitude"], inplace=True)

In [127]:
# Confirm the row count after removing rows with missing values.
clean_df.shape

(102, 5)

### Visualizing all the Neighborhoods and clustering them

In [115]:
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
print("Package Downloaded!")
!pip install folium
import folium
print("Complete!")

Package Downloaded!
Complete!


In [116]:
address = 'Toronto, CA'

# Resolve the address to coordinates that will be used as the centre of the maps.
geolocator = Nominatim(user_agent="toronto_explorer")
tor_location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear message
# instead of an AttributeError on the attribute access below.
if tor_location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
tor_latitude = tor_location.latitude
tor_longitude = tor_location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(tor_latitude, tor_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [128]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# Add one circle marker per neighbourhood, with the neighbourhood name as its popup.
for lat, lng, label in zip(clean_df['Latitude'], clean_df['Longitude'], clean_df['Neighbourhood']):
    # parse_html is a Popup option; it is not one of CircleMarker's styling options,
    # so it is passed only here.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

## Initializing the K-means algorithm to cluster neighbourhoods based on their distance from each other.

#### Keeping only the numeric coordinate columns, then running the K-means algorithm

In [129]:
# Select the coordinate columns explicitly rather than dropping the categorical ones:
# once a "Label" column has been added to clean_df, re-running a drop-based selection
# would feed the old cluster labels back into K-means as a feature.
clustering_table = clean_df[['Latitude', 'Longitude']]
clustering_table

Unnamed: 0,Latitude,Longitude
0,43.7545,-79.33
1,43.7276,-79.3148
2,43.6555,-79.3626
3,43.7223,-79.4504
4,43.6641,-79.3889
5,43.6662,-79.5282
6,43.8113,-79.193
7,43.745,-79.359
8,43.7063,-79.3094
9,43.6572,-79.3783


In [130]:
k = 7
# Fix the seed so the centroid initialisation, and therefore the cluster labels and map
# colours, are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=k, n_init=12, random_state=0)
k_means.fit(clustering_table)
clustering_labels = k_means.labels_
clustering_labels

array([5, 3, 1, 4, 1, 2, 0, 5, 3, 1, 4, 2, 0, 3, 3, 1, 4, 2, 0, 3, 1, 4,
       0, 1, 1, 1, 0, 5, 4, 3, 1, 1, 0, 5, 6, 3, 1, 1, 3, 5, 4, 1, 1, 1,
       3, 5, 6, 3, 1, 6, 6, 0, 4, 6, 1, 4, 6, 6, 3, 4, 6, 4, 4, 2, 6, 0,
       4, 4, 4, 2, 6, 3, 4, 4, 1, 2, 6, 0, 1, 1, 2, 5, 1, 1, 5, 1, 1, 2,
       6, 5, 1, 1, 2, 6, 0, 1, 1, 2, 1, 0, 2, 2], dtype=int32)

In [131]:
# Attach each row's cluster label. labels_ follows the row order of clustering_table,
# which was derived from clean_df, so the assignment lines up by position.
clean_df["Label"] = clustering_labels
clean_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Label
0,M3A,North York,Parkwoods,43.7545,-79.33,5
1,M4A,North York,Victoria Village,43.7276,-79.3148,3
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,1
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,4
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,1
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282,2
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193,0
7,M3B,North York,Don Mills,43.745,-79.359,5
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094,3
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,1


In [132]:
# To add color for identifying clusters easier
import matplotlib.cm as cm
import matplotlib.colors as colors

In [134]:
map_toronto_cluster = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings for folium.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per neighbourhood, coloured by the cluster it was assigned to.
for lat, lng, label, cluster in zip(clean_df['Latitude'], clean_df['Longitude'], clean_df['Neighbourhood'], clean_df['Label']):
    popup = folium.Popup(str(label) + ' Cluster ' + str(cluster), parse_html=True)
    # K-means labels run from 0 to k-1, so index the palette directly; indexing with
    # cluster-1 would send cluster 0 to the last colour via negative-index wraparound.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_toronto_cluster)
map_toronto_cluster

Thanks!