Importing necessary libraries

In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Using BeautifulSoup to scrape the web page

In [2]:
# Download the Wikipedia list of "M" postal codes and parse the HTML.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of letting an
# error page be parsed as if it were the postal-code table.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_response = requests.get(WIKI_URL, timeout=30)
wiki_response.raise_for_status()
wiki_toronto = BeautifulSoup(wiki_response.content, 'lxml')

In [3]:
#function to clean the text
def clean(content):
    """Return `content` with scraped-table punctuation normalised.

    The substitutions run in this order: ' / ' becomes ', ', then every
    '(' and ')' is removed, then every newline is removed.
    """
    substitutions = (
        (' / ', ', '),
        ('(', ''),
        (')', ''),
        ('\n', ''),
    )
    cleaned = content
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

# Cell text that marks a postal code with no borough assigned.
NOT_ASSIGNED = "Not assigned"
# Stand-alone parenthesis text nodes to drop from a row's strings.
PAREN_TOKENS = ("(", ")", "()")

items = []
# Skip the first <tr> (assumed to be the header row) of the page's first table.
for item in wiki_toronto.table.find_all('tr')[1:]:
    cells = item.find_all('td')
    if len(cells) < 2:
        # Row without both a postal-code and a borough cell; nothing to record.
        continue
    # Strip surrounding whitespace so neither the "Not assigned" check nor the
    # later alignment on PostalCode depends on how the HTML is formatted.
    postal_code = cells[0].text.strip()
    if cells[1].text.strip() == NOT_ASSIGNED:
        continue
    # stripped_strings yields every non-empty text node in the row:
    # index 0 is the postal code, index 1 the borough, the rest neighborhoods.
    parts = [it for it in item.stripped_strings if it not in PAREN_TOKENS]
    borough = parts[1]
    neighborhood = ','.join(parts[2:])

    items.append((postal_code, clean(borough), clean(neighborhood)))

df = pd.DataFrame(items, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
95,M2M,North York,Newtonbrook
96,M2M,North York,Willowdale
97,M3M,North York,Downsview Central
98,M4M,East Toronto,Studio District


Using the groupby function along with 'apply' to aggregate neighborhoods that share the same Postal Code and Borough.

In [4]:
# Collapse rows sharing a (PostalCode, Borough) pair into one row whose
# Neighborhood is the comma-joined list of that pair's neighborhoods.
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.apply(','.join).reset_index()

In [5]:
# Preview the grouped frame: Neighborhood now holds the comma-joined names
# for each (PostalCode, Borough) pair.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# (rows, columns) after aggregation; rows = distinct (PostalCode, Borough) pairs.
df.shape

(103, 3)

Adding latitudes and longitudes to the dataframe

In [7]:
# Coordinate lookup table, read straight from its URL; the cell below aligns
# it with df through its 'Postal Code' column.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
latlng = pd.read_csv(GEOSPATIAL_CSV_URL)

Concatenating df and latlng dataframes

In [12]:
# Index both frames by postal code so the column-wise concat aligns rows on it.
boroughs_by_code = df.set_index('PostalCode')
coords_by_code = latlng.set_index('Postal Code')

# reset_index() turns the aligned index back into a column named 'index',
# which is then renamed to 'PostalCode'.
df_latlng = (
    pd.concat([boroughs_by_code, coords_by_code], axis=1, sort=False)
    .reset_index()
    .rename(columns={'index': 'PostalCode'})
)

In [16]:
# Show the combined frame: PostalCode, Borough, Neighborhood, Latitude, Longitude.
df_latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437
