# Web scraping a table from Wikipedia

### Exercise from the *Applied Data Science Capstone* course on Coursera

In [2]:
#pip install beautifulsoup4
#!pip install lxml

from bs4 import BeautifulSoup

In [3]:
import requests
import urllib.request
import numpy as np
import pandas as pd
from urllib.request import urlopen

In [4]:
# Download the Wikipedia page named in `url` and parse it with BeautifulSoup.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the document has been read, instead of being left open.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Collect every <table> element on the page, not only the postal-code table;
# rows picked up from the other tables are removed from the DataFrame later on.
tables = soup.find_all('table')

In [6]:
# Walk every row of every table and keep the stripped text of the first three
# <td> cells, but only for rows that have at least three such cells.
scraped_rows = [
    [cell.text.strip() for cell in tds[:3]]
    for tbl in tables
    for tds in (tr.find_all('td') for tr in tbl.find_all('tr'))
    if len(tds) > 2
]

# Split the row-wise records into the three column lists used further down:
# first cell -> postal code, second cell -> borough, third cell -> neighborhood.
postal_codes = [fields[0] for fields in scraped_rows]
boroughs = [fields[1] for fields in scraped_rows]
neighborhoods = [fields[2] for fields in scraped_rows]
            
            

In [64]:
# Assemble the scraped lists into one DataFrame (column order: Postal Code,
# Neighborhood, Borough) and preview its first ten rows.
df1 = pd.DataFrame({
    'Postal Code': postal_codes,
    'Neighborhood': neighborhoods,
    'Borough': boroughs,
})
df1.head(10)


Unnamed: 0,Postal Code,Neighborhood,Borough
0,M1A,,Not assigned
1,M2A,,Not assigned
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,"Regent Park, Harbourfront",Downtown Toronto
5,M6A,"Lawrence Manor, Lawrence Heights",North York
6,M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto
7,M8A,,Not assigned
8,M9A,Islington Avenue,Etobicoke
9,M1B,"Malvern, Rouge",Scarborough


In [65]:
# (rows, columns) of the frame as scraped, before any rows are removed.
df1.shape

(185, 3)

In [66]:
# Remove, in place, every row whose Borough is the placeholder 'Not assigned'.
unassigned_rows = df1['Borough'] == 'Not assigned'
df1.drop(index=df1.index[unassigned_rows], inplace=True)


In [68]:
# Renumber the remaining rows from 0; drop=True discards the old index
# instead of keeping it as an extra column.
df1.reset_index(inplace=True,drop = True)

In [71]:
# Inspect the last ten rows of the frame.
df1.tail(10)

Unnamed: 0,Postal Code,Neighborhood,Borough
98,M8X,"The Kingsway, Montgomery Road, Old Mill North",Etobicoke
99,M4Y,Church and Wellesley,Downtown Toronto
100,M7Y,Business reply mail Processing Centre,East Toronto
101,M8Y,"Old Mill South, King's Mill Park, Sunnylea, Hu...",Etobicoke
102,M8Z,"Mimico NW, The Queensway West, South of Bloor,...",Etobicoke
103,NL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n...,NS,NL
104,NL,PE,NS
105,A,C,B
106,NL,PE,NS
107,A,C,B


The last 5 rows contain data that isn't part of the table we are interested in.

In [74]:
# Remove the rows that were scraped from other tables on the page. They are
# selected by content (a Postal Code that is not "M" + digit + uppercase letter)
# rather than by the hard-coded labels 103-107, so the cell keeps working if the
# page gains or loses rows and can be re-run without a KeyError for labels that
# have already been dropped.
is_postal_code = df1['Postal Code'].str.match(r'M\d[A-Z]$', na=False)
df1.drop(index=df1.index[~is_postal_code], inplace=True)


In [75]:
# Count how often each postal code occurs. The Series method is used instead of
# the top-level pd.value_counts helper, which newer pandas releases deprecate.
df1['Postal Code'].value_counts()

M6A    1
M5M    1
M9M    1
M1M    1
M5C    1
      ..
M1B    1
M6G    1
M1J    1
M2M    1
M1L    1
Name: Postal Code, Length: 103, dtype: int64

In [78]:
# Show the dtype of each column.
df1.dtypes

Postal Code     object
Neighborhood    object
Borough         object
dtype: object

In [94]:
# Number of rows flagged by DataFrame.duplicated(); the flags are summed
# (True counts as 1) and converted to a plain int.
int(df1.duplicated().sum())

0

`duplicated()` returns one boolean per row, and summing them counts the `True` values (`False` counts as 0, `True` as 1). Because the sum is 0, every value is `False`, so each row is unique.

In [90]:
# Count Neighborhood cells that are missing or blank. The column holds the
# stripped text of scraped cells, so an empty cell is the string '' and would
# not be flagged by isnull() alone; both cases are counted here.
neighborhood_col = df1['Neighborhood']
int((neighborhood_col.isnull() | neighborhood_col.str.strip().eq('')).sum())

0

As before, the sum counts the `True` values of a boolean column. Because it is 0, every value is `False`, so the Neighborhood column has no missing values.

In [95]:
# Preview the first five rows of the cleaned frame.
df1.head()

Unnamed: 0,Postal Code,Neighborhood,Borough
0,M3A,Parkwoods,North York
1,M4A,Victoria Village,North York
2,M5A,"Regent Park, Harbourfront",Downtown Toronto
3,M6A,"Lawrence Manor, Lawrence Heights",North York
4,M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto


In [100]:
# Number of Postal Code values whose string length is not exactly 3.
code_lengths = df1['Postal Code'].str.len()
int(code_lengths.ne(3).sum())

0

The Postal Code column contains no value whose length differs from 3 characters, which is the length expected for these postal codes.

In [96]:
# (rows, columns) of the frame after cleaning.
df1.shape

(103, 3)