In [4]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Install geocoder with the same interpreter that runs this kernel
# (sys.executable), instead of whichever pip is first on PATH.
# NOTE(review): geocoder is not imported by any cell visible in this notebook -
# confirm it is still needed before keeping this install.
import sys
!{sys.executable} -m pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 2.8 MB/s eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [38]:
# Import the libraries used throughout the notebook
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display_html

In [40]:
# Download the Wikipedia page that lists Toronto ("M") postal codes.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate, readable error
# instead of letting it be parsed as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# Name kept for compatibility: URL holds the page's HTML text, not the address.
URL = response.text
bs = BeautifulSoup(URL, 'lxml')

# bs.table is the first <table> element on the page, or None when there is none.
# Fail here with a clear message rather than passing the string 'None' onwards.
if bs.table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Serialise the table back to HTML so it can be rendered and parsed by pandas.
table = str(bs.table)
display_html(table, raw=True)

Postal Code,Borough,Neighborhood
M1A,Not assigned,
M2A,Not assigned,
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,"Malvern, Rouge"


In [41]:
# pd.read_html returns a list with one DataFrame per <table> it finds.
# The HTML is wrapped in StringIO because recent pandas releases deprecate
# passing literal HTML strings; a file-like object works on old and new versions.
df = pd.read_html(StringIO(table))

# Only one table was serialised into `table`, so the first entry is the one we want.
df1 = df[0]
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [46]:
# Keep only postal codes that have an assigned borough. The .copy() gives an
# independent frame, so the fill below never writes into a view of df1.
df2 = df1[df1['Borough'] != 'Not assigned'].copy()

# A neighbourhood that is missing (a real NaN, or the literal 'Not assigned')
# takes its borough's name. This must happen BEFORE the join: comparing against
# the string 'NaN' never matches a missing value, and ', '.join cannot
# concatenate NaN with strings.
no_neighborhood = df2['Neighborhood'].isna() | (df2['Neighborhood'] == 'Not assigned')
df2.loc[no_neighborhood, 'Neighborhood'] = df2.loc[no_neighborhood, 'Borough']

# Combine rows that share a postal code and borough into a single row whose
# neighbourhoods are comma-separated; sort=False keeps the original row order.
df3 = (
    df2.groupby(['Postal Code', 'Borough'], sort=False)['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df3

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [47]:
# Sanity check: (rows, columns) of the cleaned postal-code table.
df3.shape

(103, 3)

new data for latitudes and longitudes

In [48]:
# Latitude/longitude lookup keyed by postal code, read straight from the CSV URL.
geo_source = 'http://cocl.us/Geospatial_data'
ll = pd.read_csv(geo_source)
ll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# Attach coordinates to every cleaned postal code (default inner join on the
# shared 'Postal Code' column).
df4 = df3.merge(ll, on='Postal Code')
df4.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [50]:
# Keep only the boroughs whose name contains the literal text 'Toronto'
# (regex=False: plain substring match, no pattern interpretation).
in_toronto = df4['Borough'].str.contains('Toronto', regex=False)
df5 = df4[in_toronto]
df5

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


folium visualization 

In [51]:
import folium 
from IPython.display import Image 
from IPython.core.display import HTML 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [54]:
# Base map using the same centre coordinates and zoom level as before.
map1 = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One red circle per postal code; the popup shows "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(
    df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighborhood']
):
    # parse_html is a Popup option; the copy that used to be passed to
    # CircleMarker as well is not a marker/path option, so it is dropped.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_opacity=0.7,
    ).add_to(map1)
map1

kmeans analysis

In [55]:
# Number of clusters to fit.
k = 5

# Make the cell safe to re-run: remove labels left by a previous execution so
# they are neither fed back into KMeans as a feature nor cause insert() to raise
# on a duplicate column. drop() also returns a new frame, so the insert below
# works on an independent DataFrame rather than on a slice of df4.
df5 = df5.drop(columns='Cluster Labels', errors='ignore')

# Cluster on the remaining columns only (the coordinates). `columns=` replaces
# the positional axis argument, which pandas 2.0 no longer accepts.
t_cluster = df5.drop(columns=['Postal Code', 'Borough', 'Neighborhood'])

# random_state fixes the initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=k, random_state=0).fit(t_cluster)

# Put the labels in the first column; later cells rely on this column order.
df5.insert(0, 'Cluster Labels', kmeans.labels_)

In [56]:
# Base map using the same centre coordinates and zoom level as the first map.
map2 = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One colour per cluster, evenly spaced over matplotlib's rainbow colormap.
# (The former x / ys helpers were only used for their length, which is k.)
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lng, cluster in zip(df5['Latitude'], df5['Longitude'], df5['Cluster Labels']):
    # The colour index is cluster - 1, kept from the original so the rendered
    # colours do not change; label 0 therefore wraps round to the last colour.
    marker_color = rainbow[cluster - 1]
    label = folium.Popup('Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map2)

map2

In [57]:
# Show the Toronto frame again, now with the 'Cluster Labels' column in front.
df5

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,2,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [69]:
# Neighborhoods assigned to cluster 0. The column is selected by name: the old
# positional index (3, plus an empty range starting at 6) picked this same
# single column but would silently break if the column order ever changed.
df5.loc[df5['Cluster Labels'] == 0, ['Neighborhood']]

Unnamed: 0,Neighborhood
2,"Regent Park, Harbourfront"
4,"Queen's Park, Ontario Provincial Government"
9,"Garden District, Ryerson"
15,St. James Town
20,Berczy Park
24,Central Bay Street
30,"Richmond, Adelaide, King"
36,"Harbourfront East, Union Station, Toronto Islands"
42,"Toronto Dominion Centre, Design Exchange"
48,"Commerce Court, Victoria Hotel"


In [70]:
# Neighborhoods assigned to cluster 1. The column is selected by name: the old
# positional index (3, plus an empty range starting at 6) picked this same
# single column but would silently break if the column order ever changed.
df5.loc[df5['Cluster Labels'] == 1, ['Neighborhood']]

Unnamed: 0,Neighborhood
19,The Beaches
41,"The Danforth West, Riverdale"
47,"India Bazaar, The Beaches West"
54,Studio District
100,Business reply mail Processing Centre


In [71]:
# Neighborhoods assigned to cluster 2. The column is selected by name: the old
# positional index (3, plus an empty range starting at 6) picked this same
# single column but would silently break if the column order ever changed.
df5.loc[df5['Cluster Labels'] == 2, ['Neighborhood']]

Unnamed: 0,Neighborhood
31,"Dufferin, Dovercourt Village"
69,"High Park, The Junction South"
75,"Parkdale, Roncesvalles"
81,"Runnymede, Swansea"


In [72]:
# Neighborhoods assigned to cluster 3. The column is selected by name: the old
# positional index (3, plus an empty range starting at 6) picked this same
# single column but would silently break if the column order ever changed.
df5.loc[df5['Cluster Labels'] == 3, ['Neighborhood']]

Unnamed: 0,Neighborhood
25,Christie
37,"Little Portugal, Trinity"
43,"Brockton, Parkdale Village, Exhibition Place"
74,"The Annex, North Midtown, Yorkville"
80,"University of Toronto, Harbord"
84,"Kensington Market, Chinatown, Grange Park"


In [73]:
# Neighborhoods assigned to cluster 4. The column is selected by name: the old
# positional index (3, plus an empty range starting at 6) picked this same
# single column but would silently break if the column order ever changed.
df5.loc[df5['Cluster Labels'] == 4, ['Neighborhood']]

Unnamed: 0,Neighborhood
61,Lawrence Park
62,Roselawn
67,Davisville North
68,Forest Hill North & West
73,North Toronto West
79,Davisville
83,"Moore Park, Summerhill East"
86,"Summerhill West, Rathnelly, South Hill, Forest..."


Results and observations: I grouped the Toronto boroughs into 5 clusters based on the data. Cluster 0 (Downtown Toronto) has the most varied neighborhoods, including government buildings, parks, a post office, etc. East Toronto has beaches, which makes it unique compared with the other areas. The remaining areas (the west and central parts) have a village-like character.

Conclusion: I would prefer to work in Downtown Toronto. Its varied neighborhoods make daily life convenient, and I can find almost everything I am interested in there. Besides, I am not fond of the sun and am not interested in beaches.