In [1]:
!pip install lxml




In [2]:
import pandas as pd

#scraping table from wikipedia page using pandas

df = pd.DataFrame(pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0])
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


In [3]:
#drop rows with "Not assignted" borough

df2 = df.drop(df[df['Borough']=='Not assigned'].index)
df2 = df2.rename(columns={"Postal code": "PostalCode"})
df2.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [4]:
df2.shape

(103, 3)

In [5]:
coordinates = pd.read_csv('Geospatial_Coordinates.csv')
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [6]:
merged_df = pd.merge(df2, coordinates[['PostalCode', 'Latitude', 'Longitude']], on = 'PostalCode')
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [8]:
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    brotlipy-0.7.0             |py36h8c4c3a4_1000         346 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    cryptography-2.9.2         |   py36h45558ae_0         613 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1g             |       h51

In [9]:
toronto_data = merged_df[merged_df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
2,M5B,Downtown Toronto,Garden District / Ryerson,43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [10]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [11]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [37]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [26]:
# set number of clusters
kclusters = 5

toronto_clustering = toronto_data.drop(['PostalCode','Borough','Neighborhood'], axis = 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([0, 0, 0, 0, 4, 0, 0, 3, 0, 1], dtype=int32)

In [35]:
# add clustering labels
#toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_data

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,0,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
2,0,M5B,Downtown Toronto,Garden District / Ryerson,43.657162,-79.378937
3,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,0,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568
9,1,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259


In [39]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Cluster 1

In [61]:
toronto_data.loc[toronto_data['Cluster Labels'] == 0, toronto_data.columns[[1] + list(range(0, toronto_data.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
0,M5A,0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,M7A,0,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
2,M5B,0,M5B,Downtown Toronto,Garden District / Ryerson,43.657162,-79.378937
3,M5C,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M5E,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,0,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568
10,M5J,0,M5J,Downtown Toronto,Harbourfront East / Union Station / Toronto Is...,43.640816,-79.381752
13,M5K,0,M5K,Downtown Toronto,Toronto Dominion Centre / Design Exchange,43.647177,-79.381576
16,M5L,0,M5L,Downtown Toronto,Commerce Court / Victoria Hotel,43.648198,-79.379817


## Cluster 2

In [62]:
toronto_data.loc[toronto_data['Cluster Labels'] == 1, toronto_data.columns[[1] + list(range(0, toronto_data.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
9,M6H,1,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259
22,M6P,1,M6P,West Toronto,High Park / The Junction South,43.661608,-79.464763
25,M6R,1,M6R,West Toronto,Parkdale / Roncesvalles,43.64896,-79.456325
28,M6S,1,M6S,West Toronto,Runnymede / Swansea,43.651571,-79.48445


## Cluster 3

In [63]:
toronto_data.loc[toronto_data['Cluster Labels'] == 2, toronto_data.columns[[1] + list(range(0, toronto_data.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
18,M4N,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
19,M5N,2,M5N,Central Toronto,Roselawn,43.711695,-79.416936
20,M4P,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
21,M5P,2,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
23,M4R,2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
26,M4S,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
29,M4T,2,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
31,M4V,2,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


## Cluster 4

In [64]:
toronto_data.loc[toronto_data['Cluster Labels'] == 3, toronto_data.columns[[1] + list(range(0, toronto_data.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
7,M6G,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
11,M6J,3,M6J,West Toronto,Little Portugal / Trinity,43.647927,-79.41975
14,M6K,3,M6K,West Toronto,Brockton / Parkdale Village / Exhibition Place,43.636847,-79.428191
24,M5R,3,M5R,Central Toronto,The Annex / North Midtown / Yorkville,43.67271,-79.405678
27,M5S,3,M5S,Downtown Toronto,University of Toronto / Harbord,43.662696,-79.400049
30,M5T,3,M5T,Downtown Toronto,Kensington Market / Chinatown / Grange Park,43.653206,-79.400049


## Cluster 5

In [65]:
toronto_data.loc[toronto_data['Cluster Labels'] == 4, toronto_data.columns[[1] + list(range(0, toronto_data.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
4,M4E,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
12,M4K,4,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
15,M4L,4,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
17,M4M,4,M4M,East Toronto,Studio District,43.659526,-79.340923
38,M7Y,4,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
