# Segmenting and Clustering Neighborhoods in Toronto
This notebook contains my submission for week 3 of the Coursera course 'Applied Data Science Capstone'.

In [36]:
import pandas as pd
import numpy as np
import lxml
import urllib.request
import geocoder
import folium

## 1. Data collection
The method of scraping is based on:
- https://scipython.com/blog/scraping-a-wikipedia-table-with-beautiful-soup/
- https://scipython.com/blog/scraping-a-wikipedia-table-with-pandas/

In [3]:
html_file_name = 'postal_codes_canada.html'
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()

with open(html_file_name, 'w') as fo:
    fo.write(article)

In [4]:
tables = pd.read_html(html_file_name, header=0,
                      keep_default_na=False)
headers = ['Postcode','Borough','Neighbourhood']
for df in tables:
    current_headers = df.columns.values[:4]
    if len(current_headers) != len(headers):
        continue
    if all(current_headers == headers):
        break
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Data cleaning

In [7]:
# first, remove rows where borough is not assigned
df = df[df.Borough != 'Not assigned']
# second, set neighbourhood equal to borough when neighbourhood is not assigned
df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'] = df.Borough
# third, combine neighbourhood with same postcode in one row, separate by a comma
df = df.groupby(by=['Postcode', 'Borough'], as_index=False, sort=False).agg(', '.join)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [26]:
df.shape

(103, 5)

### Adding coordinates of postcodes

In [33]:
# first create two columns containing NaNs
df['Latitude'] = np.nan
df['Longitude'] = np.nan
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Harbourfront,,
3,M6A,North York,"Lawrence Heights, Lawrence Manor",,
4,M7A,Queen's Park,Queen's Park,,


In [42]:
# fill two columns with lat, lng
for x in range(len(df)):
    g = geocoder.arcgis(df.Postcode[x] + ', Toronto, Ontario')
    df.Latitude[x] = g.lat
    df.Longitude[x] = g.lng
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75242,-79.329242
1,M4A,North York,Victoria Village,43.7306,-79.313265
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.72327,-79.451286
4,M7A,Queen's Park,Queen's Park,43.66115,-79.391715


## 2. Data exploration

In [43]:
# retrieve coordinates of Toronto
g = geocoder.arcgis('Toronto, Ontario')
latitude = g.lat
longitude = g.lng
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.648690000000045, -79.38543999999996.


In [47]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [48]:
# continue with only boroughs containing the word Toronto
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166
1,M9A,Downtown Toronto,Queen's Park,43.662299,-79.528195
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818
3,M5C,Downtown Toronto,St. James Town,43.65121,-79.375481
4,M4E,East Toronto,The Beaches,43.676531,-79.295425
