This notebook:
- scrapes the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
- creates a dataframe from the scraped table

In [2]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd 

In [3]:
# Download the Wikipedia page and parse it with the lxml parser.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
source = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')

# Locate the first element carrying the 'wikitable' class.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' found at " + url)

# Collect the text of every header/data cell, one list per table row.
# rstrip() removes the trailing whitespace/newline from each cell's text.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row supplies the column names; the remaining rows are the records.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the scraped rows into a pandas DataFrame and preview it.
df_canada = pd.DataFrame(data=data, columns=columns)
df_canada.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [5]:
# Number of distinct postal codes in the scraped table (missing values counted too).
df_canada['Postal code'].nunique(dropna=False)

180

In [6]:
# Show the (rows, columns) shape of the dataset before any filtering.
df_canada.shape

(180, 3)

In [7]:
# Keep only the rows whose Borough is assigned.  The explicit .copy() makes the
# result an independent frame, so the column assignment performed further down
# the notebook does not trigger pandas' SettingWithCopyWarning.
df_canada = df_canada[df_canada.Borough != 'Not assigned'].copy()
# Show the shape of the dataset once the 'Not assigned' boroughs are gone.
df_canada.shape

(103, 3)

In [8]:
# Number of distinct postal codes left after filtering (missing values counted too).
df_canada['Postal code'].nunique(dropna=False)

103

In [9]:
# Per-column count of non-null entries among the rows whose Neighborhood
# equals 'Not assigned'.
not_assigned_mask = df_canada['Neighborhood'] == 'Not assigned'
df_canada[not_assigned_mask].count()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [10]:
# Preview the first rows of the filtered dataset.
df_canada.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


It turns out that no Neighborhood value is 'Not assigned', and there are no duplicated postal codes in the dataset.
Instead, some Neighborhood values are separated with '/', whereas the required separator is ',', so '/' has to be replaced with ','.

In [11]:
def replace_function(x):
    """Turn a '/'-separated string into a ', '-separated one.

    The input is split on '/', surrounding whitespace is stripped from each
    piece, and the pieces are joined back together with ', '.

    Args:
        x: string to convert; must support ``str.split``.

    Returns:
        The converted string. A string without '/' comes back stripped of
        leading and trailing whitespace.
    """
    return ', '.join(piece.strip() for piece in x.split('/'))

In [12]:
# Rewrite every Neighborhood value so its parts are joined by ', ' instead of '/'.
df_canada['Neighborhood'] = df_canada['Neighborhood'].apply(replace_function)

In [13]:
# Renumber the rows from 0.  drop=True discards the old index directly instead
# of materialising it as an 'index' column that then has to be dropped again,
# and rebinding the name avoids mutating the frame in place.
df_canada = df_canada.reset_index(drop=True)

In [30]:
# Relabel the three columns by position; the first one becomes 'PostalCode'.
# NOTE(review): positional assignment assumes the column order is unchanged — confirm.
df_canada.columns=['PostalCode','Borough','Neighborhood']

In [31]:
# Preview the first rows after the separator fix, index reset and column rename.
df_canada.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# Show the (rows, columns) shape of the cleaned dataset.
df_canada.shape

(103, 3)

In [25]:
# Load 'Geospatial_Coordinates.csv'; the relative path is resolved against the
# current working directory.
geo_data = pd.read_csv('Geospatial_Coordinates.csv')

In [26]:
# Preview the first rows of the coordinates table.
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [32]:
# Relabel the columns by position so the key column is named 'PostalCode',
# the name used as the merge key below.
geo_data.columns=['PostalCode', 'Latitude', 'Longitude']

In [33]:
# Attach Latitude/Longitude to each row by joining on 'PostalCode'.  This is an
# inner join, so only postal codes present in both frames are kept.
merged_df = df_canada.merge(geo_data, on='PostalCode', how='inner')

In [36]:
# Preview the first 12 rows of the merged dataset.
merged_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [35]:
# Show the (rows, columns) shape of the merged dataset.
merged_df.shape

(103, 5)