# Scraping

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Take the first <tbody> element found in the parsed page ...
table = soup.find('tbody')
# ... collect every <tr> beneath it ...
rows = table.select('tr')
# ... and keep only the text content of each row, as a list of strings.
row = list(map(lambda tr: tr.get_text(), rows))

In [6]:
# Peek at the first five scraped row strings (header row included).
row[:5]

['\nPostcode\nBorough\nNeighbourhood\n',
 '\nM1A\nNot assigned\nNot assigned\n',
 '\nM2A\nNot assigned\nNot assigned\n',
 '\nM3A\nNorth York\nParkwoods\n',
 '\nM4A\nNorth York\nVictoria Village\n']

# Creating the Dataframe

In [9]:
# One scraped row string per record, held in a single column labelled 0.
df = pd.DataFrame(row)
# Split each string on newlines; expand=True spreads the pieces across columns.
df1 = df.loc[:, 0].str.split(pat='\n', expand=True)

In [10]:
# Preview the first five rows of the split frame.
df1.head()

Unnamed: 0,0,1,2,3,4
0,,Postcode,Borough,Neighbourhood,
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,


In [11]:
# Relabel the columns using the values held in the first row (the scraped header).
header_row = df1.iloc[0]
df2 = df1.rename(columns=header_row)

In [12]:
# Preview the frame after relabelling; the header values are still present as row 0.
df2.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
0,,Postcode,Borough,Neighbourhood,
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,


In [13]:
# The header text is already in the column labels, so drop its copy in the first row.
first_label = df2.index[0]
df3 = df2.drop(index=first_label)
df3.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,


We now clean the dataframe by removing the rows whose 'Borough' is 'Not assigned'.

In [14]:
# Boolean mask: True for rows whose Borough is anything other than 'Not assigned'.
has_borough = df3['Borough'].ne('Not assigned')
df4 = df3.loc[has_borough]
df4.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,
6,,M5A,Downtown Toronto,Regent Park,
7,,M6A,North York,Lawrence Heights,


Group the dataframe by the 'Postcode' and 'Borough' attributes, joining the neighbourhoods of each group into a single comma-separated string.

In [15]:
# Collapse the frame to one row per (Postcode, Borough) pair, joining the
# neighbourhoods of each pair into a single comma-separated string.
# - sort=False keeps the groups in their order of first appearance.
# - 'Neighbourhood' is selected explicitly so the aggregation never touches the
#   two empty-named columns left over from splitting the scraped text.
# - reset_index() turns the group keys back into ordinary columns and returns a
#   new frame, so no in-place mutation is needed.
df5 = (
    df4.groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
df5.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


We replace the cells whose 'Neighbourhood' attribute is 'Not assigned' with the name of the Borough itself.

In [16]:
# Where a row's Neighbourhood is 'Not assigned', substitute that row's own
# Borough. The previous version hard-coded "Queen's Park" and replaced the
# text in every column, which is only correct while a single borough is affected.
neighbourhood_missing = df5['Neighbourhood'] == 'Not assigned'
df6 = df5.assign(
    Neighbourhood=df5['Neighbourhood'].mask(neighbourhood_missing, df5['Borough'])
)
df6.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


We now examine the shape of the ready-to-use Dataframe.

In [18]:
# (rows, columns) of the cleaned frame.
df6.shape

(103, 3)

# Adding Latitude and Longitude for each postcode

In [21]:
# Location of the coordinates CSV, relative to the notebook's working directory.
# A relative path keeps the notebook runnable on any machine; adjust this one
# constant if the file is stored elsewhere.
GEO_CSV_PATH = 'Geospatial_Coordinates.csv'

# Later cells read the 'Postal Code', 'Latitude' and 'Longitude' columns.
df_cord = pd.read_csv(GEO_CSV_PATH)

In [23]:
import numpy as np  # not used by this cell any more; kept in case later cells rely on `np`

# Attach coordinates to every postcode in df6 by looking them up in df_cord.
# A vectorised lookup replaces the row-by-row loop, which wrote a NumPy array
# into each scalar cell and failed when a postcode had no match (or several).
assert df_cord['Postal Code'].is_unique, "df_cord lists a postal code more than once"

# Coordinates indexed by postal code, so they can be looked up by label.
coords_by_postcode = df_cord.set_index('Postal Code')[['Latitude', 'Longitude']]

# Postcodes missing from df_cord get NaN. Re-running the cell just overwrites
# the two columns with the same values.
for coord_col in ('Latitude', 'Longitude'):
    df6[coord_col] = df6['Postcode'].map(coords_by_postcode[coord_col])

# Display the results
df6.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
