In [2]:
pip install beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install requests


Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
import lxml.html as lh
import bs4 as bs
import urllib.request
import numpy as np 
import pandas as pd

In [6]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#opening connection grabbing page
results = requests.get(url)
#parsing
soup = bs.BeautifulSoup(results.content,'lxml')
#grabbing only the 'table' portion of the page (right click inspect)
table = soup.find_all('table')[0]


In [7]:
df = pd.read_html(str(table))
#convert df to json string, orientation 'records':list like
data = pd.read_json(df[0].to_json(orient='records'))

In [8]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
#selecting only where Borough is filled
remove_nan = data[data['Borough'] != 'Not assigned']
remove_nan.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
org_data = remove_nan.groupby(["Borough","Postal Code"], as_index=False).agg(','.join)
org_data.head()


Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


In [11]:

#Replace 'Not Assigned' values in Neighborhood column by the Borough value
#-----------------np.where(condition, output when condition = true, output when condition = false)-----------------------
org_data['Neighborhood'] = np.where(org_data['Neighborhood'] == 'Not Assigned', org_data['Borough'], org_data['Neighborhood'])
                            

In [12]:
org_data

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
...,...,...,...
98,York,M6C,Humewood-Cedarvale
99,York,M6E,Caledonia-Fairbanks
100,York,M6M,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
101,York,M6N,"Runnymede, The Junction North"


In [13]:
org_data.shape

(103, 3)

# Part 2: Obtaining Lat and Lon and merging data sets

In [14]:
#geospatial data read
geo_url = "http://cocl.us/Geospatial_data"
geo_data = pd.read_csv(geo_url)

In [15]:
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
#merging data sets

data_merged = pd.merge(org_data, geo_data, on = 'Postal Code')

In [17]:
data_merged.head()

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


# Part 3: plotting neighborhoods in toronto on map

In [18]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium
import json 
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge geopy --yes
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [19]:

#-------------Unable to get coordinates from geolocator--------------
address = 'Toronto, Canada'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto Canada are {}, {}.'.format(latitude, longitude))

ConfigurationError: Using Nominatim with default or sample `user_agent` "geopy/2.0.0" is strongly discouraged, as it violates Nominatim's ToS https://operations.osmfoundation.org/policies/nominatim/ and may possibly cause 403 and 429 HTTP errors. Please specify a custom `user_agent` with `Nominatim(user_agent="my-application")` or by overriding the default `user_agent`: `geopy.geocoders.options.default_user_agent = "my-application"`.

In [20]:

#Found coordinates from external source
toronto_map = folium.Map(location=[43.653963, -79.387207], zoom_start=11)

X = data_merged['Latitude']
Y = data_merged['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=4, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
data_merged['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(data_merged['Latitude'], data_merged['Longitude'], data_merged['Borough'], data_merged['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)  

toronto_map