In [1]:
import pickle
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

Get the Wiki page and parse it with BeautifulSoup

In [2]:
# Download the Wikipedia list of "M" postal codes and parse the HTML with lxml.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
wiki_toronto = BeautifulSoup(wiki_response.content, 'lxml')

Extract postal codes, boroughs and neighborhoods and put them in a DataFrame

In [39]:
# Build one (PostalCode, Borough, Neighborhood) row per assigned postal code.
# For every <td> of the first table, the code is read from item.p.b and the
# area description from item.p.span. The description is either "Not assigned"
# or a borough optionally followed by a parenthesised, " / "-separated list of
# neighborhoods.

# Raw string: "\s" and "\(" are regex escapes, not Python string escapes.
# The borough stops at the first "(" or line break, so the split works whether
# the parenthesis is preceded by a newline, a space, or nothing at all. The
# neighborhood group is optional ("?"): it can occur at most once.
area_pattern = re.compile(r"(?P<borough>[^(\n]+)(?:\s*\((?P<neighborhood>.+)\))?")

items = []
for item in wiki_toronto.table.find_all('td'):
    postal_code = item.p.b.text
    area_text = item.p.span.text.strip()
    if area_text == "Not assigned":
        continue

    match = area_pattern.match(area_text)
    if match is None:
        # Report the cell that could not be parsed and move on; stopping here
        # would silently drop every remaining postal code.
        print(area_text)
        continue

    borough = match.group('borough').strip()
    # A cell without a neighborhood list uses the borough name as its neighborhood.
    neighborhood = (match.group('neighborhood') or borough).replace(' / ', ', ')
    items.append((postal_code, borough, neighborhood))

df_toronto = pd.DataFrame(items, columns=['PostalCode', 'Borough', 'Neighborhood'])
df_toronto.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,East Toronto
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [36]:
# (rows, columns) of the parsed postal-code frame.
df_toronto.shape

(103, 3)

In [24]:
# The code was removed by DSX for sharing.
# NOTE(review): no visible cell defines API_key, which the geocoding request
# uses; presumably this hidden cell set it. Confirm, and define API_key
# (without hardcoding the secret) before running the notebook.

In [29]:
# Geocode every postal code with the Google Geocoding API and collect
# (PostalCode, Latitude, Longitude) rows. Postal codes the API cannot resolve
# are printed and skipped; any other non-OK status aborts with the reason.
url = 'https://maps.googleapis.com/maps/api/geocode/json'
locations = []

for item in df_toronto['PostalCode']:

    # Pass the query as params so requests URL-encodes the key and the quoted
    # address instead of relying on a hand-built query string.
    query = {'key': API_key, 'address': '"Toronto, ON {}"'.format(item)}

    # The timeout prevents one stalled request from blocking the whole loop.
    response = requests.get(url, params=query, timeout=30).json()

    status = response['status']
    if status == 'ZERO_RESULTS':
        print(item)
        continue
    if status != 'OK':
        # Statuses such as REQUEST_DENIED or OVER_QUERY_LIMIT carry no results;
        # fail with the API's own message rather than an opaque IndexError.
        raise RuntimeError('Geocoding {} failed with status {}: {}'.format(
            item, status, response.get('error_message', 'no error message')))

    geographical_data = response['results'][0]['geometry']['location']  # first match's coordinates
    latitude = geographical_data['lat']
    longitude = geographical_data['lng']
    locations.append((item, latitude, longitude))

df_locations = pd.DataFrame(locations, columns=['PostalCode', 'Latitude', 'Longitude'])
df_locations.tail()

Unnamed: 0,PostalCode,Latitude,Longitude
98,M8X,43.653654,-79.506944
99,M4Y,43.66586,-79.38316
100,M7Y,43.662744,-79.321558
101,M8Y,43.636258,-79.498509
102,M8Z,43.628841,-79.520999


In [21]:
# Preview the first ten rows instead of rendering the whole frame.
df_locations.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M3A,43.753259,-79.329656
1,M4A,43.725882,-79.315572
2,M5A,43.654260,-79.360636
3,M6A,43.718518,-79.464763
4,M7A,43.662301,-79.389494
5,M9A,43.667856,-79.532242
6,M1B,43.806686,-79.194353
7,M3B,43.745906,-79.352188
8,M4B,43.706397,-79.309937
9,M5B,43.657162,-79.378937


In [43]:
# Save both intermediate frames to disk with pickle (imported in the first
# cell with the other imports), one file per frame.
for pickle_path, frame in (('df_toronto.p', df_toronto), ('df_locations.p', df_locations)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)

In [48]:
# Left join on PostalCode: every row of df_toronto is kept, and rows without a
# matching entry in df_locations get NaN coordinates.
df_data = pd.merge(df_toronto, df_locations, how='left', on="PostalCode")

In [49]:
# Write the merged frame to df_data.p in binary mode.
with open('df_data.p', 'wb') as merged_file:
    pickle.dump(df_data, merged_file)

In [51]:
# Display the first ten rows of the merged frame.
df_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
