In [1]:
# install the lxml to read html pages

!conda install -c conda-forge lxml --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



__Install the required libraries__

In [2]:
#install the required libraries

import numpy as np
import pandas as pd
import lxml

__Read the html page and store the required rows in a dataframe__

In [3]:
# read the html page
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

# read the table
df = df[0]

# remove all Not assigned borough rows

df = df[df['Borough'] != 'Not assigned']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


__Sort the dataframe__

In [4]:
# sort the table on postcode to ensure all same postcode rows comes together for merging

df.sort_values(by='Postcode',inplace=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
29,M1C,Scarborough,Port Union
28,M1C,Scarborough,Rouge Hill
27,M1C,Scarborough,Highland Creek
...,...,...,...
232,M9V,Etobicoke,Mount Olive
234,M9V,Etobicoke,South Steeles
235,M9V,Etobicoke,Thistletown
233,M9V,Etobicoke,Silverstone


__Loop through the dataframe and merge the neighbourhood in the rows with same Postcode__

In [5]:
# initialize the data elements to be used in the loop

list = []  # empty list
columns = ['Postcode','Borough','Neighbourhood'] # set the column headers
neigh_list = ''
prev_postcode = df.iloc[0][0]
prev_borough = df.iloc[0][1]
first_row = 'TRUE'

# loop through the entire list

for postcode, borough, neighbourhood in zip(df['Postcode'], df['Borough'], df['Neighbourhood']): 
    if neighbourhood == 'Not assigned': # set Not assigned neighbourhood with Borough names
        neighbourhood = borough
    if postcode == prev_postcode: # compare with previous rows
        if first_row == 'TRUE': # special processing for first row to take care of ,
            neigh_list = neighbourhood
            first_row = 'FALSE'
        else:
            neigh_list = neigh_list + ', ' + neighbourhood # keep appending the neighbourhood to the list
    else: # we are done with all same postcode rows so append to the list
        list.append([prev_postcode,prev_borough,neigh_list])
        neigh_list = neighbourhood
    prev_postcode = postcode # save the current row values
    prev_borough = borough # save the current row values

# write the last row
list.append([postcode,borough,neigh_list])

# convert list to dataframe
df_n = pd.DataFrame(list,columns=columns)
df_n

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Richview Gardens, Kingsview Village, St. Phill..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


__Print the shape of the final dataframe__

In [6]:
df_n.shape

(103, 3)

__Read the Geospatial Coordinates data__

In [7]:
# read the csv file with latitude and longitude (Geocoder not working..)

df_latlon = pd.read_csv('Geospatial_Coordinates.csv')
df_latlon = df_latlon.rename(columns = {'Postal Code':'Postcode'}) # rename the column to merge
df_latlon

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


__Merge the dataframe with the original dataframe__

In [8]:
# Merge both the dataframe on Postcode

df_merged = pd.merge(df_n, df_latlon, on='Postcode')
df_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Richview Gardens, Kingsview Village, St. Phill...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


__Drop the columns and retain only Latitude and Longitude__

In [9]:
df_new = df_merged.drop(['Postcode','Neighbourhood','Borough'],1)
df_new

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476
...,...,...
98,43.706876,-79.518188
99,43.696319,-79.532242
100,43.688905,-79.554724
101,43.739416,-79.588437


__Include the libraries required for creating Maps and Clusters__

In [10]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

__Using kMeans, cluster the Neighbourhood__

In [11]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_new)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 2], dtype=int32)

In [12]:
# add clustering labels
df_new.insert(0, 'Cluster Labels', kmeans.labels_)


__Draw the map using Folium and show the Clusters__

In [13]:

# Center the map around Toronto

latitude = 43.65
longitude = -79.34

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(df_new['Latitude'], df_new['Longitude'], df_new['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters