In [1]:
import urllib3
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# fire GET request to fetch postal code data
POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

http = urllib3.PoolManager()
response = http.request('GET', POSTAL_CODES_URL)

# urllib3 returns 4xx/5xx responses as ordinary response objects instead of
# raising, so fail fast here rather than handing an error page to the parser
if response.status != 200:
    raise RuntimeError(
        f"GET {POSTAL_CODES_URL} failed with HTTP status {response.status}"
    )

# raw response body (bytes) consumed by the parsing cell
page = response.data



In [3]:
# use BeautifulSoup to parse HTML page and locate the postal code table
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("table", class_="wikitable sortable")
if table is None:
    # find() returns None when nothing matches; raise a readable error here
    # instead of an AttributeError on the next line
    raise ValueError(
        "no <table class=\"wikitable sortable\"> found in the fetched page"
    )

# extract column headers (trailing whitespace/newlines stripped)
columns = [header.text.rstrip() for header in table.find_all("th")]

# extract postcodes, boroughs and neighbourhoods from rows/columns
# (the first three <td> cells of each row, in that order)
rows = table.find_all("tr")
records = []
for row in rows:
    cols = row.find_all("td")
    if len(cols) < 3:
        # header row (only <th> cells) or an incomplete row: nothing to extract
        continue
    records.append(tuple(col.text.rstrip() for col in cols[:3]))

# build the dataframe once from the collected row tuples
df = pd.DataFrame(records, columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [8]:
## clean data and create final dataframe

# handle invalid values for Borough: keep only rows with an assigned borough.
# reset_index returns a new frame (it does not modify in place), so the result
# has to be bound to df_clean for the renumbering to take effect
df_clean = df[df["Borough"] != 'Not assigned'].reset_index(drop=True)

# handle invalid values for Neighbourhood - no rows
print(df_clean[df_clean["Neighbourhood"] == 'Not assigned'])

# group by Postcode
df_grp = df_clean.groupby("Postcode")

# one row per postcode: take the group's first borough and join all of its
# neighbourhoods into a single comma-separated string
df_final = df_grp.agg({"Borough": "first", "Neighbourhood": ",".join}).reset_index()
df_final

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [6]:
# (rows, columns) of the aggregated dataframe
df_final.shape

(103, 3)