In [82]:
import pandas as pd
import numpy as np

Importing the Wikipedia page into a BeautifulSoup document:

In [85]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia article. The timeout stops the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() aborts on an
# HTTP error instead of silently parsing an error page as if it were the article.
html = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
html.raise_for_status()

# Parse the response body with the standard-library HTML parser.
page = BeautifulSoup(html.text, 'html.parser')
page.title

<title>List of postal codes of Canada: M - Wikipedia</title>

Parsing the HTML table into a list of lists of strings, and using it to create a Pandas DataFrame:

In [86]:
# Every <tr> inside an element with class "wikitable": the first row holds the
# column titles, the remaining rows hold the data.
lines = page.select('.wikitable tr')

# Collect the whitespace-stripped text fragments of each data row.
body_cells = [tuple(row.stripped_strings) for row in lines[1:]]
df = pd.DataFrame(body_cells)

# Name the columns after the text fragments of the header row.
header_cells = list(lines[0].stripped_strings)
df.columns = header_cells
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping the rows whose borough is **Not assigned**. If a row has a borough but its neighbourhood is **Not assigned**, the neighbourhood is set to the borough name:

In [87]:
# Keep only the rows that have a borough. The explicit .copy() makes the result
# an independent frame, so the column assignment below does not write into a
# slice of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# Where the neighbourhood is 'Not assigned', fall back to the borough name;
# otherwise keep the neighbourhood as it is.
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Grouping all neighbourhoods that share a postal code into a single row, with the neighbourhood names separated by commas:

In [88]:
# One group per (Postcode, Borough) pair.
by_postcode = df.groupby(['Postcode', 'Borough'])

# Join the remaining column's strings within each group with ', ', then turn
# the group keys back into ordinary columns.
joined = by_postcode.agg(', '.join)
dfg = joined.reset_index()
dfg.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [89]:
# Show the (rows, columns) dimensions of the grouped frame.
dfg.shape

(103, 3)