In [12]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
! pip install folium==0.5.0
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors



# Section 1

In this section, I import the table as a dataframe. I remove any rows with an unassigned borough and reset the index, as the filter would otherwise have left gaps in the index. I then group the rows that share a postal code and borough and concatenate their neighbourhoods. Then, I replace any unassigned neighbourhood with its borough name. Lastly, I print the dataframe shape.

In [5]:
# Scrape the postal-code table; read_html returns every table found on the
# page and the first one is the table used here.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)
df = dfs[0]

# Keep only rows that have a borough, renumbering the index after the filter.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# One row per (postal code, borough); neighbourhoods joined as "a, b, c".
df = (
    df.groupby(['Postal Code', 'Borough'])
      .agg({'Neighbourhood': ', '.join})
      .reset_index()
)

# Where the neighbourhood is unassigned, fall back to that row's borough.
# mask() substitutes row by row; replace() with a Series as the value is not
# an element-wise substitution.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned', df['Borough']
)
print(df.shape)

(103, 3)


# Section 2

In [6]:
# Load the coordinates table and attach its columns to the postal-code frame
# by matching rows on 'Postal Code'.
gs_data = pd.read_csv('https://cocl.us/Geospatial_data')
df_2 = df.merge(gs_data, on='Postal Code')
df_2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


# Section 3

In [16]:
# Keep the rows whose borough name contains "Toronto", then renumber the index.
is_toronto = df_2['Borough'].astype(str).str.contains('Toronto')
df_3 = df_2.loc[is_toronto].reset_index(drop=True)

In this first part, I created a dataframe containing only the rows whose borough name includes "Toronto".

In [17]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns by name (instead
# of dropping the others with a positional axis argument, which newer pandas
# rejects) also keeps the feature matrix unchanged when this cell is re-run
# after 'Cluster Labels' has been added to df_3.
toronto_grouped_clustering = df_3[['Latitude', 'Longitude']]

# run k-means clustering; random_state makes the result repeatable and n_init
# is spelled out so the number of restarts does not depend on the default of
# the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Put the label of each row in the first column. Labels left over from an
# earlier run of this cell are removed first; insert() raises if the column
# already exists.
df_3 = df_3.drop(columns='Cluster Labels', errors='ignore')
df_3.insert(0, 'Cluster Labels', kmeans.labels_)

In the cell above, I completed the clustering. I chose to use 5 clusters. I applied the k-means method to the latitude and longitude of each row and added the resulting cluster labels to the dataframe as a new first column.

In [18]:
# create map (the centre coordinates and zoom level are fixed here)
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# along the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(df_3['Latitude'], df_3['Longitude'], df_3['Neighbourhood'], df_3['Cluster Labels']):
    # Labels run from 0 to kclusters - 1, so they index the palette directly;
    # subtracting 1 would send label 0 to the last colour via a negative index.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters