1. Create a new notebook

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

2. Scrape a [Wikipedia page](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Parse the downloaded HTML using the lxml parser.
soup = BeautifulSoup(source, 'lxml')

In [3]:
# Locate the first <table> whose class attribute is 'wikitable sortable'.
wikitable = soup.find('table', class_='wikitable sortable')

# find() returns None when nothing matches; fail here with a clear message rather
# than with an AttributeError when the table is used in a later cell.
if wikitable is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [4]:
# First row of `table`: the stripped text of every <th> cell in the table.
header_row = [heading.text.strip() for heading in wikitable.find_all('th')]

# Remaining rows: the stripped text of the <td> cells of every <tr> after the
# first one (the first <tr> is skipped).
body_rows = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in wikitable.find_all('tr')[1:]
]

table = [header_row] + body_rows

In [5]:
# Write the scraped rows (header first) to canada.csv.
# encoding='utf-8' is given explicitly: open() otherwise uses the platform's
# default encoding, while pd.read_csv() reads UTF-8 by default, so non-ASCII
# text could fail to write or be misread on systems whose default is not UTF-8.
# newline='' lets the csv module control line endings itself.
with open('canada.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(table)

3. Create the dataframe

In [6]:
# Load canada.csv into a DataFrame; the first line of the file supplies the
# column names.
canada = pd.read_csv('canada.csv')
# A bare expression on the last line is displayed as the cell's output.
canada

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [7]:
# The dataframe consists of three columns: PostalCode, Borough, and Neighborhood.
# rename() returns a new frame; re-binding the name instead of using inplace=True
# leaves the object displayed by the earlier cell untouched.
canada = canada.rename(
    columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
)

In [8]:
# Keep only the rows whose Borough is something other than "Not assigned".
has_borough = canada['Borough'].ne('Not assigned')
canada = canada.loc[has_borough]

In [9]:
# More than one neighborhood can exist in one postal code area: collapse the rows
# of each PostalCode into a single row, joining the distinct values of every
# other column with ", ".
# dict.fromkeys() removes duplicates while keeping first-seen order. A set() would
# also de-duplicate, but its iteration order for strings can differ from one
# interpreter run to the next, making the joined text non-reproducible.
canada = (
    canada
    .groupby(['PostalCode'])
    .agg(lambda values: ', '.join(dict.fromkeys(values)))
    .reset_index()
)

In [10]:
# If a cell has a borough but a "Not assigned" neighborhood, then the
# neighborhood will be the same as the borough.
# One .loc[rows, column] assignment writes directly into `canada`. Chained
# indexing (canada['Neighborhood'].loc[...] = ...) assigns through an
# intermediate object, which triggers SettingWithCopyWarning and leaves the
# frame unchanged when pandas copy-on-write is enabled.
unassigned = canada['Neighborhood'] == 'Not assigned'
canada.loc[unassigned, 'Neighborhood'] = canada.loc[unassigned, 'Borough']

In [11]:
# Show the first 12 rows (the same number of rows as the example).
canada.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Kennedy Park, Ionview"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Scarborough Village West, Cliffcrest, Cliffside"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


In [12]:
# Use the .shape attribute (it is not a method) to show the dataframe's
# dimensions as a (number of rows, number of columns) tuple.
canada.shape

(103, 3)