# Notebook for scraping data from a Wikipedia page

### Import libraries

In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Scrape data from Wikipedia

In [8]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever if the server does not answer
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data,'lxml')
# soup.tbody returns the first <tbody> in the document; the postal-code table
# is assumed to be that first table -- TODO confirm if the page layout changes
table = soup.tbody
if table is None:
    raise ValueError("No <tbody> element found in the page at " + url)

In [12]:
# Start from an empty frame that already carries the three target columns
column_names = ['Postalcode', 'Borough', 'Neighborhood']
df_Toronto = pd.DataFrame(data=None, columns=column_names)

In [13]:
# Turn every data row of the scraped table into one record.
# The records are collected in a plain list and the DataFrame is built once at
# the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row copies the whole frame on every iteration.
records = []
for tr_code in table.find_all('tr'):
    cells = tr_code.find_all('td')
    # Rows without three <td> cells (e.g. a header row made of <th> cells) carry
    # no data. Skipping them avoids reading postcode/borough/neighborhood
    # before they are assigned, or reusing the values of the previous row.
    if len(cells) < 3:
        continue
    postcode = cells[0].text
    borough = cells[1].text
    neighborhood = cells[2].text.strip('\n').replace(']','')
    # Ignore cells with a borough that is Not assigned
    if borough == "Not assigned":
        continue
    records.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood})

# Passing the column list keeps the three columns even when no row was kept
df_Toronto = pd.DataFrame(records, columns=['Postalcode','Borough','Neighborhood'])

df_Toronto.head(10)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [14]:
# Display the (rows, columns) size of the scraped frame before any clean-up
df_Toronto.shape

(211, 3)

In [15]:
# A row that has a borough but a "Not assigned" neighborhood takes the
# borough name as its neighborhood
unassigned = df_Toronto['Neighborhood'] == 'Not assigned'
df_Toronto.loc[unassigned, 'Neighborhood'] = df_Toronto.loc[unassigned, 'Borough']

### Clean the data

In [16]:
# How many distinct postal codes are present in the scraped data
n = df_Toronto['Postalcode'].nunique()
print(n)

103


In [17]:
# Collapse the frame to one row per (Postalcode, Borough) pair, joining the
# neighborhoods of each pair into a single comma-separated string
df_Toronto = (
    df_Toronto
    .groupby(['Postalcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .to_frame()
    .reset_index()
)

In [18]:
# Show the first 10 rows of the cleansed dataframe (one row per postal code / borough pair)
df_Toronto.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
