__Needed libraries__

In [134]:
import pandas as pd
import numpy as np

__Reading the table directly with `.read_html` pandas method__

In [135]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


__Ignoring the 'Not assigned' Borough cells__

In [136]:
i = df[((df.Borough == 'Not assigned'))].index
print(len(i))
df.drop(i, inplace=True)
print(df.shape)
df.head()

77
(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


__Reseting the index__

In [137]:
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


__Checking if any Neighborhood has 'Not assigned' value__

In [138]:
df.isin(['Not assigned']).any()

Postal Code     False
Borough         False
Neighborhood    False
dtype: bool

__Checking if any postal code is duplicated__

In [139]:
df['Postal Code'].duplicated().any()

False

In [140]:
df.shape

(103, 3)

#####################################################################################################################

__Importing the Geospatial data__

In [141]:
df2 = pd.read_csv('http://cocl.us/Geospatial_data')
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [142]:
df2.shape

(103, 3)

__Joining the 2 dataframes__

In [143]:
df3 = df.join(df2.set_index('Postal Code'), on='Postal Code')
df3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#####################################################################################################################

In [146]:
df4 = df3[['Latitude','Longitude']]
df4.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [147]:
from sklearn.preprocessing import StandardScaler
X = df4.values
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet[:5]

array([[ 0.93187104,  0.69818881],
       [ 0.40749844,  0.84388426],
       [-0.96437519,  0.37773518],
       [ 0.26644077, -0.6993678 ],
       [-0.81034434,  0.07922652]])

In [148]:
from sklearn.cluster import KMeans 
clusterNum = 7
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(X)
labels = k_means.labels_
print(labels)

[2 2 0 1 0 5 3 4 2 0 1 5 3 2 2 0 1 5 3 2 0 1 3 0 0 0 3 4 4 0 0 1 3 4 6 0 0
 0 2 4 1 0 0 0 2 4 6 0 0 1 6 2 4 6 0 4 1 6 2 4 6 4 4 1 6 2 4 4 0 1 6 2 4 4
 0 1 5 6 3 0 0 1 2 0 0 3 0 0 5 6 2 0 0 5 6 3 0 0 5 0 0 5 5]


In [149]:
df3["Clus_km"] = labels
df3.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Clus_km
0,M3A,North York,Parkwoods,43.753259,-79.329656,2
1,M4A,North York,Victoria Village,43.725882,-79.315572,2
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0


In [150]:
locations = df3[['Latitude','Longitude']]
labels = df3[['Clus_km']]
locationlist = locations.values.tolist()
labelslist = labels.values.tolist()
colors = ['blue', 'green', 'red', 'black', 'orange', 'darkgreen' ,'gray']

In [155]:
# !conda install -c conda-forge folium=0.5.0 --yes
import folium
map = folium.Map(location=[43.7, -79.3], zoom_start=10)
for point in range(0, len(locationlist)):
    folium.Marker(locationlist[point], icon=folium.Icon(color=colors[labelslist[point][0]])).add_to(map)
map