In [25]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup    # for web scraping
import urllib.request

import folium

In [2]:
# Download the Wikipedia page and keep a local copy so later cells can
# parse it from disk.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Use the response as a context manager so the connection is always closed.
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

# The text was decoded as UTF-8 above; write it back with the same encoding
# instead of relying on the platform default.
with open("List_of_postal_codes_of_Canada:_M", 'w', encoding="utf-8") as fo:
    fo.write(article)

In [3]:
# Load the saved article, turn it into soup and collect the sortable <table>s.
# The context manager closes the file handle; the encoding is explicit so
# decoding does not depend on the platform default.
with open("List_of_postal_codes_of_Canada:_M", encoding="utf-8") as fi:
    article = fi.read()

soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

In [4]:
# Search through the tables for the one with the headings we want.
# Later cells index the frame by these column names, so a table without them
# cannot be the right one.
wanted_headings = {"Postal Code", "Borough", "Neighborhood"}

for table in tables:
    headings = [th.text.strip() for th in table.find_all('th')]
    if wanted_headings <= set(headings):
        # Obtain the text of every cell of the matching table, in document
        # order (row by row), as one flat list.
        cot = [td.text.strip() for td in table.find_all('td')]
        break
else:
    # No break: either there were no tables or none had the expected header.
    raise ValueError(
        "No sortable table with headings %s was found" % sorted(wanted_headings)
    )

In [5]:
# `cot` is a flat list of cell texts; fold it into rows with one value per
# heading. The row count is inferred (-1) rather than hard-coded, so the cell
# keeps working when the number of postal codes on the page changes.
n_cols = len(headings)
if n_cols == 0 or len(cot) % n_cols != 0:
    raise ValueError(
        "Cannot arrange %d cells into rows of %d columns" % (len(cot), n_cols)
    )

# Build the dataframe and name the columns after the table headings.
toronto = pd.DataFrame(
    np.array(cot, dtype=object).reshape(-1, n_cols),
    columns=headings,
)

In [6]:
# Preview the first rows of the scraped table.
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Keep only the rows whose Borough is assigned. A single boolean filter
# replaces the earlier lookup + in-place drop: the mask is computed once and
# the cell stays idempotent on re-run. The original row labels are preserved.
toronto = toronto.loc[toronto["Borough"] != "Not assigned"].copy()

In [8]:
# Display the frame after the "Not assigned" boroughs were dropped
# (pandas truncates the middle rows in the rendered output).
toronto

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
# Sanity check: list any remaining row whose Neighborhood is still
# "Not assigned". An empty frame means there is nothing left to clean.
neighborhood_unassigned = toronto["Neighborhood"].eq("Not assigned")
toronto[neighborhood_unassigned]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [10]:
# Check the shape of the cleaned dataframe: (number of rows, number of columns).
toronto.shape

(103, 3)

In [11]:
# Load the geographical coordinates of each postal code from the local CSV
# file (path is relative to the notebook's working directory).
geo = pd.read_csv("Geospatial_Coordinates.csv")

# Look at the shape of the coordinates dataframe so it can be compared with
# the shape of the scraped table.
geo.shape

(103, 3)

The shape matches that of the dataframe we scraped from the wiki page.

In [12]:
# Attach the coordinates to each scraped postal code.
# A left join keeps exactly the rows of `toronto`, in their original order;
# an outer join could add coordinate-only rows with a missing Borough (which
# breaks later string filters) and may reorder rows by sorting the keys.
# `validate` makes the merge fail loudly if a postal code is duplicated on
# either side instead of silently multiplying rows.
toronto_geo = pd.merge(
    toronto,
    geo,
    on="Postal Code",
    how="left",
    validate="one_to_one",
)

In [13]:
# Preview the first five rows of the merged dataframe to confirm that the
# Latitude and Longitude columns were attached.
toronto_geo.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Explore and cluster the neighborhoods

In [23]:
# Mark the rows that have "Toronto" in the Borough column. `na=False` treats a
# missing Borough as "no match", so the mask is always strictly boolean and can
# be used for indexing.
is_toronto_borough = toronto_geo['Borough'].str.contains("Toronto", na=False)

# Slice the merged dataframe into a new dataframe of the Toronto data, with a
# fresh 0..n-1 index.
toronto_data = toronto_geo.loc[is_toronto_borough].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [27]:
# Base map, centred on the fixed [latitude, longitude] pair below.
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Appearance shared by every marker.
# NOTE(review): parse_html looks like a Popup option rather than a
# CircleMarker one; it is kept here unchanged — confirm whether it can go.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Add one circle marker per row, using the neighborhood text as the popup.
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighborhood'],
)
for latitude, longitude, neighborhood in marker_rows:
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker([latitude, longitude], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto