<h1>Scraping postcodes, getting the latitudes and longitudes</h1>

In [33]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import numpy as np

# Empty frame with the three target columns; it is filled in once the table
# rows have been extracted.
column_list = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_list)

# Download and parse the Wikipedia page. The context manager closes the HTTP
# response as soon as BeautifulSoup has consumed it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
with urllib.request.urlopen(url) as response:
    page = BeautifulSoup(response, 'lxml')

# Locate the first table whose class attribute is 'wikitable sortable'.
right_table = page.find('table', {'class':'wikitable sortable'})

# Fail here with a clear message instead of a confusing AttributeError on
# None when the table is used further down.
if right_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)

In [34]:
def _cell_text(cell):
    """Return all text inside a table cell, without surrounding whitespace.

    Stripping whitespace (rather than cutting off the last character) keeps
    the value intact whether or not the cell text ends with a newline, and an
    empty cell yields '' instead of raising.
    """
    return cell.get_text().strip()


# Cleaned values of every kept row, one list per output column.
PostalCodes = []
Boroughs = []
Neighbourhoods = []

# Keep only rows with exactly three <td> cells (anything else, presumably the
# header row, is skipped) and drop rows whose borough is 'Not assigned'.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 3:
        continue

    postal_code, borough, neighbourhood = (_cell_text(cell) for cell in cells)
    if borough == 'Not assigned':
        continue

    PostalCodes.append(postal_code)
    Boroughs.append(borough)
    Neighbourhoods.append(neighbourhood)

# The values above are already clean; these names are kept so that any cell
# still referring to them continues to work.
New_PostalCodes = PostalCodes
New_Boroughs = Boroughs
New_Neighbourhoods = Neighbourhoods

df['PostalCode'] = New_PostalCodes
df['Borough'] = New_Boroughs
df['Neighborhood'] = New_Neighbourhoods

<h2>Cleaning the data</h2>

In [29]:
# Replace 'Not assigned' neighbourhoods with the borough of the same row.
# The write goes through df.loc so the change is stored in the frame itself;
# rebinding a loop variable while iterating over the columns would leave df
# untouched. Rows whose borough is also 'Not assigned' keep that value, since
# the copied borough is the same placeholder.
neighbourhood_missing = df['Neighborhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighborhood'] = df.loc[neighbourhood_missing, 'Borough']

In [30]:
# Show the first five rows of df as the cell output.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [31]:
# Read the coordinates CSV straight from its URL.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
lats_longs = pd.read_csv(geospatial_csv_url)

# Relabel the three columns positionally so the key column is called
# 'PostalCode', matching the column of the same name in df.
lats_longs.columns = ['PostalCode', 'Latitude', 'Longitude']

# Show the first five rows as the cell output.
lats_longs.head(n=5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h2>Final dataframe</h2>

In [37]:
# Left join on 'PostalCode': every row of df is kept, and rows with no match
# in lats_longs get NaN for Latitude and Longitude.
final_frame = pd.merge(df, lats_longs, how='left', on='PostalCode')

# Show the first five rows as the cell output.
final_frame.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
