## Title: Segmenting and Clustering Neighborhoods in Toronto

Import all the necessary dependencies.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

import json # library to handle JSON files

# transform JSON file into a pandas dataframe
# `json_normalize` is exposed at the top level of pandas since 1.0; the old
# `pandas.io.json` path is deprecated and not importable in recent releases.
# Try the current location first and fall back for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# import BeautifulSoup for data scraping
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


Build a dataframe with the Toronto neighbourhoods from Wikipedia.

In [12]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# NOTE: despite its name, `wiki_url` holds the HTTP response object, not the URL
# string; the name is kept so any other cell referring to it keeps working.
wiki_url = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # fail with an exception instead of hanging the kernel
)
wiki_url.raise_for_status()  # stop on an HTTP error rather than parsing an error page

soup = BeautifulSoup(wiki_url.content, "html.parser") # create a BeautifulSoup object from the wikipedia page

# find the information regarding the wikipedia table
wiki_table = soup.find("table", { "class" : "wikitable sortable" })
if wiki_table is None:
    # Without this guard a layout change surfaces as an opaque AttributeError below.
    raise RuntimeError(
        "Could not find a 'wikitable sortable' table on the Wikipedia page; "
        "the page layout may have changed."
    )


def _cell_text(cell):
    """Return the first text node of a table cell, without newlines or padding.

    Returns an empty string when the cell contains no text at all, so the
    later ', '.join never receives None.
    """
    text = cell.find(string=True)
    return "" if text is None else str(text).replace("\n", "").strip()


# Collect one [PostalCode, Borough, Neighbourhood] record per table row (tr element)
# that has exactly three data cells; header rows have no td elements and are skipped.
records = []
for row in wiki_table.find_all("tr"):
    cells = row.find_all("td")
    if len(cells) == 3:
        records.append([_cell_text(cell) for cell in cells])

# Build the dataframe once from the collected records.
toronto_raw = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighbourhood"])

# clean-up the dataframe
# remove rows with a "Not assigned" Borough; the text is already stripped of
# newlines at this point, so the comparison cannot be defeated by a trailing "\n".
toronto_df = toronto_raw[toronto_raw["Borough"] != "Not assigned"].copy()

# replace not assigned neighborhood name with its respective borough name
# (.loc avoids chained assignment, which may silently write to a temporary copy)
unnamed = toronto_df["Neighbourhood"] == "Not assigned"
toronto_df.loc[unnamed, "Neighbourhood"] = toronto_df.loc[unnamed, "Borough"]

# combine neighbourhoods within the same postal code area;
# as_index=False also gives the result a fresh 0..n-1 index,
# and sort=False keeps the groups in order of first appearance.
toronto_df = toronto_df.groupby(
    ["PostalCode", "Borough"], as_index=False, sort=False
)["Neighbourhood"].agg(", ".join)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [3]:
# Dimensions of the cleaned table as (rows, columns).
toronto_df.shape

(103, 3)

Get the latitude and longitude coordinates of each postal code area from the provided CSV file.

In [17]:
# Import the csv file to a df and match the key column name used by toronto_df.
# `rename` touches only the column labelled exactly 'Postal Code'; a substring
# replace over all column labels could also alter unrelated columns.
latlon_df = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'PostalCode'}
)
latlon_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Create the final combined dataframe with the coordinates for each postal code area.

In [23]:
# Attach the coordinates to every postal code area of toronto_df.
# A left join keeps exactly the rows (and row order) of toronto_df; an outer join
# is documented to sort the keys lexicographically and would also add
# coordinate-only rows with missing Borough/Neighbourhood.
# validate='many_to_one' raises if a postal code appears more than once in
# latlon_df, which would otherwise silently duplicate rows.
df = pd.merge(
    toronto_df,
    latlon_df,
    on='PostalCode',
    how='left',
    validate='many_to_one',
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
