### Performing Website Scrapping using Beautiful Soup Library

This package is imported from bs4 library

In [None]:
from bs4 import BeautifulSoup

Importing requests package to download the data from the web

In [None]:
import requests

In [None]:
import pandas as pd
import numpy as np

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [None]:
response = requests.get(url)

In [None]:
response.status_code

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
soup

#### Extracting the Table Structure from Wikipedia

In [19]:
table = soup.find('table', {'class': 'wikitable sortable'}).tbody
print(table)

<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North Yor

In [23]:
rows = soup.find_all('tr')
rows[0]

<tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>

#### Extracting the Columns for the Data Frame

In [141]:
columns = [i.text.replace('\n', '') for i in rows[0].find_all('th')]
columns

['Postcode', 'Borough', 'Neighbourhood']

In [142]:
df = pd.DataFrame(columns = columns)

In [143]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood


#### Putting each row into the Data Frame

In [144]:
for i in range(1, len(rows)):
    td_tags = rows[i].find_all('td')
    
    if len(td_tags) == 3:
        data = [td_tags[0].text.replace('\n', ''), td_tags[1].text.replace('\n', ''), td_tags[2].text.replace('\n', '')]
    else:
        data = [td.text.replace('\n', '') for td in td_tags]
   
    df = df.append(pd.Series(data, index = columns), ignore_index= True)
    

ValueError: Length of passed values is 1, index implies 3

In [145]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [146]:
df.Borough.value_counts()

Not assigned        77
Etobicoke           44
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

#### Eliminating all the rows having Borough as Not assigned

In [147]:
df = df[df['Borough'] != 'Not assigned']

In [155]:
df= df.reset_index()

In [160]:
df.drop('index', axis = 1, inplace = True)

In [161]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### Replacing all the 'Not assigned' values in Neighbourhood Column with the values from Borough

In [178]:
for i, row in enumerate(df):
    if df.loc[i,'Neighbourhood'] == 'Not assigned':
        df.loc[i,'Neighbourhood'] = df.Borough

In [179]:
df.Neighbourhood.value_counts()

St. James Town                           2
Runnymede                                2
L'Amoreaux West                          1
South Niagara                            1
Flemingdon Park                          1
Dovercourt Village                       1
CFB Toronto                              1
Dorset Park                              1
Lawrence Manor East                      1
Alderwood                                1
Silverthorn                              1
The Junction South                       1
Don Mills North                          1
East Toronto                             1
North Toronto West                       1
Victoria Village                         1
Parkwoods                                1
Steeles East                             1
Newtonbrook                              1
Caledonia-Fairbanks                      1
Willowdale West                          1
Kingsview Village                        1
The Danforth West                        1
The Queensw

In [186]:
df['Postcode'].unique()

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

### Final Shape of the Data Frame

In [188]:
df.shape

(210, 3)