In [10]:
import requests
import pandas as pd
from sklearn.cluster import KMeans

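## The following steps are implemented in the code:
1. Read the HTML table on the wiki page as a pandas dataframe
2. Clean the table by dropping rows whose borough is "Not assigned"; a neighbourhood that is "Not assigned" takes the name of its borough
3. Merge rows that share the same postal code, joining their neighbourhoods with commas
4. Print the number of rows of the resulting dataframe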

In [6]:
# Source table: Toronto ("M") postal codes with their borough and neighbourhood.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df = pd.read_html(wiki_url)[0]

# Keep only rows that have a borough, and renumber the index from 0.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# A neighbourhood without a name of its own inherits the borough name.
unnamed = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(unnamed, df['Borough'])


def foo(a):
    """Join the neighbourhood names of one postcode into a comma-separated string."""
    return ", ".join(a)


# One row per postcode, in order of first appearance: the first borough seen
# and every neighbourhood joined together.
df_final = (
    df.groupby(df["Postcode"], sort=False)
    .agg({'Borough': 'first', 'Neighbourhood': foo})
    .reset_index()
)
print(df_final.shape[0])

103


### Read the CSV file of latitude and longitude values for areas in Toronto (since the geocoder library wasn't available in the IBM Watson Studio notebook), and add them to our original dataframe.


In [7]:
# Coordinates per postal code; the key column is renamed so it matches df_final.
latlon = pd.read_csv("http://cocl.us/Geospatial_data").rename(
    columns={'Postal Code': 'Postcode'}
)

# Left join: every postcode of df_final is kept, with or without coordinates.
df_info = df_final.merge(latlon, how='left', on='Postcode')
df_info.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [2]:
# Install folium by running pip as a module of sys.executable, i.e. with the
# same Python interpreter that is executing this notebook's kernel.
import sys
!{sys.executable} -m pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/72/ff/004bfe344150a064e558cb2aedeaa02ecbf75e60e148a55a9198f0c41765/folium-0.10.0-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 280kB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.0


In [8]:
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

### The locations all around Toronto are clustered with k-means into 5 different groups based on their latitude and longitude coordinates.

In [14]:
# Number of k-means clusters; the same value drives how many marker colours
# are generated, so the two can never get out of step.
n_clusters = 5

coords = df_info[['Latitude', 'Longitude']]

# Centre the map on the mean coordinate of all postcodes instead of on
# whichever postcode happens to be the first row of the dataframe.
map_lat = coords['Latitude'].mean()
map_lon = coords['Longitude'].mean()
toronto_map = folium.Map(location=[map_lat, map_lon], zoom_start=10)

# Fixed random_state keeps the cluster assignment reproducible between runs.
km = KMeans(n_clusters=n_clusters, random_state=4).fit(coords)

# One evenly spaced rainbow colour (as a hex string) per cluster label.
colors_array = cm.rainbow(np.linspace(0, 1, n_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on every re-run. Overwrite the labels in that case; on the
# first run still place the column at position 0 as before.
if 'cluster' in df_info.columns:
    df_info['cluster'] = km.labels_
else:
    df_info.insert(0, 'cluster', km.labels_)

for lat, lon, borough, neigh, cluster in zip(
    df_info['Latitude'],
    df_info['Longitude'],
    df_info['Borough'],
    df_info['Neighbourhood'],
    df_info['cluster'],
):
    # parse_html=True makes folium escape the label text (e.g. apostrophes in
    # "Queen's Park") instead of treating it as raw HTML.
    popup = folium.Popup('{}, {}'.format(neigh, borough), parse_html=True)
    # Labels run 0..n_clusters-1, so they index the colour list directly
    # (previously label 0 reached a colour only via negative-index wraparound).
    # parse_html is a Popup option and is already set above, so it is no longer
    # passed to CircleMarker.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map