Segmenting and Clustering Neighborhoods in Toronto

In [2]:
# import necessary libraries
from bs4 import BeautifulSoup
import requests # Requests allows you to send organic, grass-fed HTTP/1.1 requests, without the need for manual labor.
from urllib import request, response, error, parse
from urllib.request import urlopen
import pandas as pd

In [3]:
# Load article, turn into soup and check the title to confirm it
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html =urlopen(website_url)
soup = BeautifulSoup(html, "lxml")
title = soup.title
titleText = title.get_text ()
print(titleText)


List of postal codes of Canada: M - Wikipedia


In [4]:
# find class ‘wikitable sortable’ in the HTML script
My_table = soup.find('table',{'class':'wikitable sortable'})

In [5]:
# Search through the tables for the one with the headings we want.
for table in My_table:
    ths = My_table.findAll('th')
    headings = [th.text.strip() for th in ths]
    headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']

In [6]:
# Extract the columns we want and write to a comma-delimited file.
with open('output.csv', 'w') as fo:
    for tr in My_table.findAll('tr'):
        tds = tr.findAll('td')
        if not tds:
            continue
        Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
        print(', '.join([Postcode, Borough, Neighbourhood]), file=fo)

In [21]:
#Convert the list into Pandas DataFrame to work in python
import pandas as pd
df = pd.read_csv('output.csv', header=None, names=['PostalCode', 'Borough', 'Neighbourhood'])
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [8]:
#Drop a row if it contains 'Not assigned' 
df[df.Borough != 'Not assigned']

df = df.dropna()
null = 'Not assigned'
df = df[(df.PostalCode != null) & (df.Borough != null) & (df.Neighbourhood != null)]

In [9]:
df.shape

(287, 3)

In [24]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighbourhood'].tolist()))
                    
grp = df.groupby(['PostalCode', 'Borough'])
df2 = grp.apply(neighborhood_list).reset_index(name='Neighbourhood')

In [10]:
import numpy as np # library to handle data in a vectorized manner
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library



Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

# Part II- Add Latitude and Longitude

In [13]:
!conda install -c conda-forge geocoder --yes 
import geocoder # import geocoder

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

    geocoder: 1.38.1-py_1 conda-forge
    ratelim:  0.1.6-py_2  conda-forge


Downloading and Extracting Packages
ratelim-0.1.6        | 6 KB      | ##################################### | 100% 
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [14]:

def get_latlng(postal_code):
# initialize your variable to None
    lat_lng_coords = None
# loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords
    
get_latlng('M9W')

[43.71174000000008, -79.57918134599998]

In [25]:
postal_codes = df2['PostalCode']    
coordins = [ get_latlng(postal_code) for postal_code in postal_codes.tolist() ]

In [26]:
df3 = pd.DataFrame(coordins, columns=['Latitude', 'Longitude'])
df2['Latitude'] = df3['Latitude']
df2['Longitude'] = df3['Longitude']

In [27]:
df2[df2.PostalCode == 'M9W']

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
176,M9W,Etobicoke,Northwest,43.71174,-79.579181
