In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 # I am importing this as part of instructions that I will need BeautifulSoup()

In [2]:
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
res.raise_for_status()
soupObject = bs4.BeautifulSoup(res.text, "lxml")

In [3]:
soupObject.select('td')

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [4]:
# now I have to make a list will all cells in the tables
cells = []

# this is used to get what is contained between all the <td> </td>
for index, value in enumerate(soupObject.select('td')):
    cells.append(str(soupObject.select('td')[index])[4:-5]) # we slice the string first four characters (<td>) and the last five characters (</td>)

# remove lines    
for index, value in enumerate(cells):
    cells[index] = cells[index].rstrip('\n')

# I will keep keep only the title for the Boroughs and Neighbourhoods
for index, value in enumerate(cells):
    cells[index] = cells[index].split('>')
    try:
        cells[index] = cells[index][-2].rstrip('</a')
    except IndexError:
        cells[index] = cells[index]
cells

[['M1A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M2A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M3A'],
 'North York',
 'Parkwoods',
 ['M4A'],
 'North York',
 'Victoria Village',
 ['M5A'],
 'Downtown Toronto',
 'Harbourfront',
 ['M5A'],
 'Downtown Toronto',
 'Regent Park',
 ['M6A'],
 'North York',
 'Lawrence Heights',
 ['M6A'],
 'North York',
 'Lawrence Manor',
 ['M7A'],
 "Queen's Park",
 ['Not assigned'],
 ['M8A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M9A'],
 'Etobicoke',
 'Islington Avenue',
 ['M1B'],
 'Scarborough',
 'Rouge',
 ['M1B'],
 'Scarborough',
 'Malvern',
 ['M2B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M3B'],
 'North York',
 ['Don Mills North'],
 ['M4B'],
 'East York',
 'Woodbine Gardens',
 ['M4B'],
 'East York',
 'Parkview Hill',
 ['M5B'],
 'Downtown Toronto',
 'Ryerson',
 ['M5B'],
 'Downtown Toronto',
 ['Garden District'],
 ['M6B'],
 'North York',
 'Glencairn',
 ['M7B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M8B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M9B']

In [5]:
# this helps me get rid of the sublists
for index, value in enumerate(cells):
    if type(cells[index]) == list:
        cells[index] = cells[index][0]

cells

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M5A',
 'Downtown Toronto',
 'Regent Park',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 "Queen's Park",
 'Not assigned',
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 'Etobicoke',
 'Islington Avenue',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',
 

# Now I can create my dataframe

In [7]:
# I will first create an empty dataframe with my headers
column_names = ['Postalcode','Borough','Neighbourhood']
df = pd.DataFrame(columns=column_names)

# we have three values of the list to fill per row of the dataframe
x = 0
for index, value in enumerate(cells):
    try:
        df.loc[index] = [cells[x], cells[x+1], cells[x+2]]
        x = x + 3
    except IndexError:
        break
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [8]:
# This will get rid of the postal codes for which the borough are unavailable
for row in range(len(df)):
    if df.loc[row]['Borough'] == 'Not assigned':
        df.drop(row, axis = 0, inplace = True)

In [9]:
# This will put Neighbourhood, Boroughm when Neighbourhood is not assigned
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [10]:
# this will combine the neighbourhoods for each postal code
df = df.groupby(['Postalcode','Borough'])['Neighbourhood'].apply(', '.join)
df = pd.DataFrame(df.reset_index(name = ''))
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,,,
1,</table,NL,NS
2,AB,BC,NT
3,C,E,G
4,H,J,K
5,L,M,N
6,M1B,Scarborough,"Rouge, Malvern"
7,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
8,M1E,Scarborough,"Guildwood, Morningside, West Hill"
9,M1G,Scarborough,Woburn


# Lets get the shape using .shape

In [11]:
df.shape

(114, 3)