In [7]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [10]:
# Download the Wikipedia page listing the Canadian postal codes that start
# with "M" and parse it into a BeautifulSoup tree.

# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open until garbage collection.
with urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    html = response.read()

soup = BeautifulSoup(html, features='lxml')

In [12]:
# Locate the table carrying the class "wikitable sortable" in the parsed page

My_table = soup.find('table', {'class': 'wikitable sortable'})

# find() returns None when nothing matches; stop here with a clear message
# rather than with an AttributeError on the first use of My_table.
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

In [24]:
# Collect the text of every <td> cell of the table into one flat list

# find_all is the current BeautifulSoup name; findAll is a deprecated alias.
data = [cell.text for cell in My_table.find_all('td')]

# Preview the first nine cells
data[0:9]

['M1A\n',
 'Not assigned\n',
 'Not assigned\n',
 'M2A\n',
 'Not assigned\n',
 'Not assigned\n',
 'M3A\n',
 'North York\n',
 'Parkwoods\n']

In [30]:
# Split the flat cell list into its three columns. Cells are stored row by
# row as (postal code, borough, neighborhood), so every third entry belongs
# to the same column; the trailing newline is trimmed while each column is built.

PostalCode = [cell.rstrip('\n') for cell in data[0::3]]
Borough = [cell.rstrip('\n') for cell in data[1::3]]
Neighborhood = [cell.rstrip('\n') for cell in data[2::3]]

In [31]:
# Assemble the PostalCode, Borough and Neighborhood lists into the dataframe df,
# one column per list, in that order

df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [32]:

# Keep only the rows whose Borough is something other than "Not assigned"

df = df.loc[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [33]:
# Resolve "Not assigned" neighborhoods, then merge rows sharing a postal code.

# A row that has a borough but no neighborhood takes the borough's name as its
# neighborhood (e.g. borough "Queen's Park" -> neighborhood "Queen's Park").
# Filling from the row's own Borough, rather than a hard-coded name, keeps
# this correct for every borough. It runs before the join so the placeholder
# can never end up embedded inside a combined neighborhood string.
df = df.assign(
    Neighborhood=df['Neighborhood'].where(
        df['Neighborhood'] != 'Not assigned', df['Borough']
    )
)

# Combine the Neighborhood values of rows that share the same postal code
# (and borough) into one comma-separated row; sort=False keeps the groups in
# order of first appearance.
df = df.groupby(['PostalCode','Borough'], as_index=False, sort=False).agg(','.join)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [34]:
# Show the dimensions of df as a (rows, columns) tuple
df.shape

(103, 3)