# Scrape the Wikipedia page for Toronto postal code data

In [44]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [45]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_doc = response.content
soup = BeautifulSoup(html_doc, 'html.parser')

In [46]:
# The header cells (<th>) of the first table on the page give the column names;
# only the first child node of each header is used, with newlines removed.
header_cells = soup.table.find_all('th')
columns = [str(cell.contents[0]).replace('\n', '') for cell in header_cells]
columns

['Postcode', 'Borough', 'Neighbourhood']

In [47]:
# Build one list of cell texts per table row, skipping the header row.
rows = []
for table_row in soup.table.tbody.find_all('tr')[1:]:
    # get_text() returns the visible text of the whole cell, so links are
    # unwrapped without regexes, cells holding several nodes are not truncated
    # to their first child, and an empty cell yields '' instead of raising
    # IndexError. split()/join drops newlines and collapses repeated spaces.
    rows.append([
        ' '.join(cell.get_text().split())
        for cell in table_row.find_all('td')
    ])

In [48]:
# Assemble the scraped rows into a table labelled with the scraped column names.
df = pd.DataFrame(rows, columns=columns)

In [49]:
# Keep only the rows whose borough is something other than 'not assigned'
# (compared case-insensitively).
borough_lower = df['Borough'].apply(str.lower)
df_cleaned = df[borough_lower != 'not assigned']
def replace_not_assigned(df_row):
    """Fill a 'not assigned' neighbourhood with the row's borough.

    Args:
        df_row: one table row (a pandas Series) with string ``Neighbourhood``
            and ``Borough`` entries.

    Returns:
        ``df_row`` itself when its neighbourhood is already assigned;
        otherwise a copy of the row whose ``Neighbourhood`` equals its
        ``Borough``. The comparison with 'not assigned' ignores case.
    """
    if df_row.Neighbourhood.lower() != 'not assigned':
        return df_row
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    fixed_row = df_row.copy()
    fixed_row['Neighbourhood'] = fixed_row['Borough']
    return fixed_row
# Run the neighbourhood fix on every row (axis=1 hands each row to the function).
df_cleaned = df_cleaned.apply(replace_not_assigned, axis=1)

In [50]:
# Collapse the rows of each (Postcode, Borough) pair into a single row whose
# Neighbourhood is the ', '-joined list of that pair's neighbourhoods, then
# turn the group keys back into ordinary columns.
neighbourhoods_by_code = df_cleaned.groupby(['Postcode', 'Borough']).agg({'Neighbourhood': ', '.join})
df_cleaned = neighbourhoods_by_code.reset_index()

In [51]:
# Display the cleaned table (pandas abbreviates the middle rows when rendering).
df_cleaned

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [52]:
#!pip install geocoder

# Get coordinates for each postcode

In [53]:
# Load the latitude/longitude of each postal code from a local CSV file.
# The key column here is named 'Postal Code' (not 'Postcode' as in the scraped
# table); it is deliberately left un-renamed.
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [54]:
# Attach the coordinates to each postcode: index both tables by postal code,
# left-join the coordinates onto the cleaned table, then restore 'Postcode'
# as a regular column.
areas_by_code = df_cleaned.set_index('Postcode')
coords_by_code = geo.set_index('Postal Code')
df_final = areas_by_code.join(coords_by_code).reset_index()
df_final

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [55]:
# geocoder didn't seem to work, so the coordinates are read from
# Geospatial_Coordinates.csv instead. The attempt is kept below as an inert
# string literal: nothing inside it is executed.
# NOTE(review): before re-enabling it, note that the loop prints `g.latlng` but
# reads `g.lat_long`, and that `lat_long` is never reset to None, so the
# `while` body would only ever run for the first postal code.
"""
import geocoder

latitude = []
longitude = []
lat_long = None

# loop until you get the coordinates
for postal_code in df_cleaned['Postcode']:
    while(lat_long is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        print(g.latlng)
        lat_long = g.lat_long
    latitude.append(lat_long[0])
    longitude.append(lat_long[1])
"""       

"\nimport geocoder\n\nlatitude = []\nlongitude = []\nlat_long = None\n\n# loop until you get the coordinates\nfor postal_code in df_cleaned['Postcode']:\n    while(lat_long is None):\n        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n        print(g.latlng)\n        lat_long = g.lat_long\n    latitude.append(lat_long[0])\n    longitude.append(lat_long[1])\n"