Applied Data Science Capstone
=============================

Week 3: Segmenting and Clustering Neighborhoods in Toronto
------

In [94]:
# Import libraries.
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import lxml
import geocoder

### Part 1: Scraping the online table

We will scrape the Wikipedia page using the BeautifulSoup package and convert the table on the website into a pandas DataFrame:

In [95]:
# Scrape the Wikipedia page using the BeautifulSoup library.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')

# Find the table of interest within the webpage.
table = soup.find('table', class_='wikitable sortable')

# Go through the table and extract the values into three lists, one for each column.
A = []
B = []
C = []
column_names = []
for row in table.findAll('tr'):
    # Extract and save the column headers when you find them.
    headers = row.findAll('th')
    if len(headers) == 3:
        for header in headers:
            column_names.append(header.find(text=True).rstrip().replace(' ', ''))
    cells = row.findAll('td')
    # Extract and save the values.
    if len(cells) == 3:
        A.append(cells[0].find(text=True).rstrip())
        B.append(cells[1].find(text=True).rstrip())
        C.append(cells[2].find(text=True).rstrip())

# Create a dataframe with the values scraped from the webpage table.
df = pd.DataFrame(A, columns=[column_names[0]])
df[column_names[1]] = B
df[column_names[2]] = C

# Remove rows with non-assigned boroughs. Remember to reset the indices on the table.
df.drop(df[df.Borough == 'Not assigned'].index, inplace=True)
df = df.reset_index(drop=True)

We first run some checks that the data has been cleaned correctly:

In [96]:
# Check that there are no more non-assigned boroughs or neighbourhoods.
print('Number of non-assigned boroughs:', len(df.loc[df.Borough == 'Not assigned']))
print('Number of non-assigned neighbourhoods:', len(df.loc[df.Neighbourhood == 'Not assigned']))

# Check that there are no duplicate values in the postal codes column.
print('Postal code duplicates:', df.PostalCode.duplicated().any())

Number of non-assigned boroughs: 0
Number of non-assigned neighbourhoods: 0
Postal code duplicates: False


Now we can display the first few rows of the table:

In [97]:
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


And finally we display the shape of the table:

In [93]:
df.shape

(103, 3)

### Part 2: Adding geographical coordinates

We will use the Geocoder package to find the longitude and latitude values for each postal code. Within Geocoder we will make use of the ArcGIS system, as it is free and stable.

In [32]:
latitude = []
longitude = []
for postal_code in df.PostalCode:
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
    latitude.append(g.latlng[0])
    longitude.append(g.latlng[1])

df['Latitude'] = latitude
df['Longitude'] = longitude

df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66263,-79.52831
6,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
7,M3B,North York,Don Mills,43.74923,-79.36186
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


Now we can display the new table with the added geographical coordinates:

In [98]:
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Part 3: Segmenting and clustering neighbourhoods

In [37]:
import sys
!{sys.executable} -m pip install 'geocoder'

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 3.8 MB/s eta 0:00:01
Collecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 2.3 MB/s eta 0:00:01
[?25hCollecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 4.6 MB/s eta 0:00:01
[?25hCollecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Using legacy 'setup.py install' for future, since package 'wheel' is not installed.
Installing collected packages: ratelim, future, click, geocoder
    Running setup.py install for future ... [?25ldone
[?25hSuccessfully installed click-7.1.2 future-0.18.2 geocoder-1.38.1 ratelim-0.1.6
