# Scrape the Wikipedia page for Canadian postal codes: M

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Get page and parse into soup.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# find() returns the first <table> on the page, or None when there is none
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found on the postal code page")

# Convert soup into dataframe.
# read_html returns a list of frames; the markup is wrapped in StringIO because
# newer pandas versions deprecate passing a literal HTML string.
df = pd.read_html(StringIO(str(table)))[0]

df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [2]:
# Promote the first row (index label 0) to column headers, then drop it from the data
header_row = df.iloc[0]
df = df.drop(index=0)
df.columns = header_row
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [3]:
# Ignore rows where Borough is not assigned.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so assigning a column to it afterwards does not raise SettingWithCopyWarning.
df = df[df['Borough'] != "Not assigned"].copy()
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [4]:
# Copy the Borough value into Neighbourhoods that are "Not assigned".
# assign() builds a new frame instead of writing into the filtered one, which
# avoids pandas' SettingWithCopyWarning; the column keeps its position.
df = df.assign(
    Neighbourhood=np.where(
        df['Neighbourhood'] == "Not assigned",
        df['Borough'],
        df['Neighbourhood'],
    )
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [5]:
# Keep one row per distinct Postcode-Borough pair (first occurrence wins)
postcode_borough = (
    df.loc[:, ['Postcode', 'Borough']]
    .drop_duplicates(keep='first')
)
postcode_borough.head()

Unnamed: 0,Postcode,Borough
3,M3A,North York
4,M4A,North York
5,M5A,Downtown Toronto
7,M6A,North York
9,M7A,Queen's Park


In [6]:
# Collapse all Neighbourhoods sharing a Postcode into one comma-separated string,
# then turn the Postcode group keys back into a regular column
df = (
    df.groupby('Postcode')['Neighbourhood']
    .apply(", ".join)
    .reset_index()
)
df.head()

Unnamed: 0,Postcode,Neighbourhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [7]:
# Merge both dataframes back together.
# The left join keeps the row order of postcode_borough. Only Postcode and
# Neighbourhood are taken from df, so re-running this cell does not produce
# duplicated Borough_x / Borough_y columns.
df = postcode_borough.merge(
    df[['Postcode', 'Neighbourhood']],
    how='left',
    on='Postcode',
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [8]:
# Show dataframe shape as a (rows, columns) tuple
df.shape

(103, 3)