# Applied Data Science Capstone Project

Week 3: Web scraping information about Toronto and its neighborhoods

Import Libraries

In [3]:
pip install folium

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 6.0 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Obtaining the data from Wikipedia

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 503, ...) into an exception
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

Working with the data

In [5]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_data, features='html.parser')

In [6]:
# Separate the first table on the page from the parsed HTML.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Start with an empty list, walk through the table cells, build one dictionary
# per assigned postal code and save it to the list.
pc_table = []

for td in table.find_all('td'):
    # Each cell is read as a postal code (<p>) plus a <span> of the form
    # "Borough(Neighborhood / Neighborhood ...)". Cells that lack either tag
    # are skipped instead of raising AttributeError on None.
    if td.p is None or td.span is None:
        continue

    span_text = td.span.text
    if span_text == 'Not assigned':
        continue

    parts = span_text.split('(')
    # A cell without "(...)" gets an empty neighborhood rather than an IndexError.
    neighborhood_raw = parts[1] if len(parts) > 1 else ''

    pc_table.append({
        'Postal Code': td.p.text[:3],
        'Borough': parts[0],
        # Drop the closing parenthesis and turn the " /" separators into commas.
        'Neighborhood': neighborhood_raw.replace(')', ' ').replace(' /', ',').strip(' '),
    })

In [7]:
# Build the postal-code table from the scraped records. Naming the columns
# explicitly keeps the three expected columns even if the scrape produced no
# records, so later cells fail on "no rows" rather than on a missing column.
df = pd.DataFrame(pc_table, columns=['Postal Code', 'Borough', 'Neighborhood'])
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
# Report the size of the scraped table as (number of rows, number of columns).
df.shape #How many rows and columns do we have?

(103, 3)

In [9]:
# Some Borough values come out of the scrape with extra text fused onto the
# borough name. Map each of those exact strings to a readable label; values
# that are not listed here are left unchanged.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [10]:
# Show the table again to check the result of the Borough replacements.
display(df)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [14]:
# Preview the first 12 rows of the table.
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
# Re-check the size; replacing Borough values does not add or remove rows.
df.shape

(103, 3)

# Geo Coordinates for Toronto Postal Codes



In [20]:
# Geocode the city itself to obtain a reference latitude/longitude for Toronto.
place = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(place)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(place))

latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [21]:
# Getting coordinates for each postal code using the Nominatim geocoder.
#
# One geocoder is created once and shared by every look-up. RateLimiter keeps
# consecutive requests at least one second apart, retries geocoder service
# errors, and returns None if a look-up still fails, so one bad request does
# not abort the whole loop.
geolocator = Nominatim(user_agent='toronto_explorer')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

coordinates = {}
zipcodes = list(df['Postal Code'])

for zipcode in zipcodes:
    location = geocode('{}, Toronto, Ontario'.format(zipcode))

    if location is None:
        # No match (or the request failed): store a real float NaN, not the
        # string 'NaN', so the coordinate values stay numeric.
        latitude, longitude = np.nan, np.nan
    else:
        latitude, longitude = location.latitude, location.longitude

    coordinates[zipcode] = [latitude, longitude]

# NOTE: built from a dict of lists, so coord_df has one COLUMN per postal code;
# row 0 holds the latitudes and row 1 the longitudes.
coord_df = pd.DataFrame(coordinates)

In [29]:
import types
from botocore.client import Config
import ibm_boto3

# Stand-in iterator method; it is only attached below so that the object
# exposes an __iter__ attribute.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# NOTE(review): the key and endpoint below are placeholders and must be filled
# in (preferably from environment variables, not literals) before this runs.
client_id = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='enter ibm_api_key_id',
    ibm_auth_endpoint="enter ibm_auth_endpoint",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

body = client_id.get_object(Bucket='bucket',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Read the CSV into the frame the merge cell expects. Without this assignment
# geo_coord is undefined on a fresh kernel and the merge raises NameError.
geo_coord = pd.read_csv(body)
geo_coord.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Merge the dataframes together on the Postal Code column.
# A left join keeps every scraped postal code, in the order of df, and adds
# its Latitude/Longitude; a postal code missing from geo_coord shows up with
# NaN coordinates. (A right join would follow geo_coord's row order and keep
# coordinate-only rows that have no Borough/Neighborhood.)
df_geo = df.merge(geo_coord, how='left', on=['Postal Code'])
df_geo.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
