In [27]:
!pip install BeautifulSoup4
!pip install requests



In [102]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [103]:
# getting data from webpage
URL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
URL_page= requests.get(URL).text

# using beautiful soup to read lxml codes.
soup = BeautifulSoup(URL_page,'xml')

In [104]:
# extracting the raw table inside that webpage
def table_cell(i):
    cells = i.find_all('td')
    row = []
    
    for cell in cells:
        if cell.a:            
            if (cell.a.text):
                row.append(cell.a.text)
                continue
        row.append(cell.string.strip())
        
    return row

def table_row():    
    data = []  
    
    for tr in table.find_all('tr'):
        row = table_cell(tr)
        if len(row) != 3:
            continue
        data.append(row)        
    
    return data

In [105]:
#Creating DataFrame that consist of three columns: PostalCode, Borough, and Neighborhood
data = table_row()
columns = ['Postcode', 'Borough', 'Neighbourhood']
wiki = pd.DataFrame(data, columns=columns)
wiki.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [106]:
wiki.shape

(180, 3)

## Cleaning the data:

In [107]:
#Ignore cells with a borough that is Not assigned
wiki = wiki[wiki['Borough'] != 'Not assigned']

#Lets combine the neighborhoods with same Postcode in rows.
wiki["Neighbourhood"] = wiki.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

#We have to separate the Neighbourhood with"Comma" instead of "/"
wiki["Neighbourhood"]= wiki["Neighbourhood"].str.replace("/", ", ")

#To remove Duplicate Postcodes, we could assign the column as index and retain data as well.
wiki.set_index("Postcode", inplace=True)


In [108]:
#Check the Data
wiki.head(20)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park , Harbourfront"
M6A,North York,"Lawrence Manor , Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,"Malvern , Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill , Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


In [109]:
#look up the Shape of wikidata
print (wiki.shape)

(103, 2)
