In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

First, let's fetch the Wikipedia page of interest, then use BeautifulSoup to parse it.

In [2]:
# Download the Wikipedia list of Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an exception instead of letting an error
# page be parsed as if it were the data.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded HTML into a navigable tree using the lxml parser.
soup = BeautifulSoup(source, features='lxml')

Let's open a CSV file and write our headers to it.

In [19]:
# Open the output CSV for writing.
# newline='' is what the csv module expects of its file object; without it,
# extra blank rows appear on platforms that translate '\n' on write.
# The encoding is pinned to UTF-8 so the file is read back correctly by
# pandas (whose read_csv defaults to UTF-8) regardless of the platform locale.
csv_file = open('postal_code.csv', 'w', newline='', encoding='utf-8')

In [20]:
# Wrap the open file in a CSV writer ('excel' is the csv module's default dialect).
csv_writer = csv.writer(csv_file, dialect='excel')

In [21]:
# Emit the header row first; the data rows written later follow this column order.
column_names = ['PostalCode', 'Borough', 'Neighbourhood']
csv_writer.writerow(column_names)

34

In [22]:
# soup.find returns the first <table> element in the document, or None when
# there is none. Fail here with a clear message rather than with an opaque
# AttributeError when the rows are iterated later.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [23]:
# Write one CSV line per table row that has exactly three data cells.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without exactly three <td> cells (e.g. a row made of <th> headers)
    # are skipped.
    if len(cells) == 3:
        csv_writer.writerow([cell.text.strip() for cell in cells])

In [24]:
# Close the file so buffered rows are flushed to disk before the CSV is read back.
csv_file.close()

With the file closed, load it into a pandas DataFrame.

In [88]:
# Load the scraped table back from disk as a DataFrame.
csv_path = 'postal_code.csv'
data = pd.read_csv(csv_path)

In [89]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


If a row doesn't have an assigned borough, we can drop it. If a borough has no assigned neighbourhood, we can fix it by using the borough name as the neighbourhood.

In [91]:
# Keep only rows whose borough is assigned, then renumber the index from 0.
has_borough = data['Borough'] != 'Not assigned'
data = data.loc[has_borough].reset_index(drop=True)

In [92]:
# List the remaining rows that still lack a neighbourhood.
data[data['Neighbourhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M7A,Queen's Park,Not assigned


In [93]:
# Give every row without a neighbourhood its borough name as the neighbourhood.
# Selecting by mask instead of a hard-coded row label and value keeps the fix
# correct if the scraped table gains, loses, or reorders rows.
unassigned = data['Neighbourhood'] == 'Not assigned'
data.loc[unassigned, 'Neighbourhood'] = data.loc[unassigned, 'Borough']

Let's find all the unique postal codes and join the neighbourhoods together

In [94]:
# Distinct postal codes, in order of first appearance.
unique = data['PostalCode'].unique().tolist()

In [95]:
# Collapse the table to one entry per postal code:
#   {postal code: [borough(s), comma-separated neighbourhoods]}
# A single groupby pass replaces re-filtering the whole frame once per postal
# code. sort=False keeps the codes in order of first appearance.
# Boroughs are joined with ', ' like the neighbourhoods, so a code listed under
# more than one borough does not get the names glued together without a
# separator (for a single borough the result is unchanged).
new_data = {
    postal: [
        ', '.join(group['Borough'].unique()),
        ', '.join(group['Neighbourhood'].unique()),
    ]
    for postal, group in data.groupby('PostalCode', sort=False)
}

In [96]:
# Build a frame with one row per postal code (dict keys become the index),
# then move that index into an ordinary column named 'index'.
pd_2 = pd.DataFrame.from_dict(
    new_data,
    orient='index',
    columns=['Borough', 'Neighborhood'],
).reset_index()

In [97]:
# Preview the first five collapsed rows.
pd_2.head(n=5)

Unnamed: 0,index,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [100]:
# Give the former index column its proper name.
pd_2 = pd_2.rename(columns={'index': 'PostalCode'})

In [105]:
# Preview the first fifteen rows after renaming.
pd_2.head(n=15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [104]:
# (rows, columns) of the collapsed table.
pd_2.shape

(103, 3)

All done

# Geospatial Data

In [110]:
# Load the latitude/longitude lookup keyed by postal code.
geospatial_path = 'Geospatial_Coordinates.csv'
geospatial = pd.read_csv(geospatial_path)

In [111]:
# Preview the first five coordinate rows.
geospatial.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [119]:
# Attach coordinates to each postal code: index both frames by postal code,
# left-join so every row of pd_2 is kept, then restore PostalCode as a column.
boroughs_by_code = pd_2.set_index('PostalCode')
coords_by_code = geospatial.set_index('Postal Code')
combined = boroughs_by_code.join(coords_by_code, how='left').reset_index()

In [120]:
# Preview the first five rows of the joined table.
combined.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
