In [43]:
import pandas as pd
import numpy as np
import wikipedia as wp
from bs4 import BeautifulSoup
import lxml

In [44]:
!pip install wikipedia
!pip install lxml



# Part 1: Scrape and clean the Toronto postal code table

In [45]:
# Fetch the rendered HTML of the Wikipedia list of postal codes starting with "M".
# auto_suggest=False makes the library look up exactly this title instead of
# whatever title its search suggestion would return.
html = wp.page('List of postal codes of Canada: M', auto_suggest=False).html().encode('utf-8')
# Preview only the beginning of the payload; the full page is too long to display.
html[:500]

b'<div class="mw-parser-output"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Wikipedia list article</div>\n<p>This is a list of <a href="/wiki/Postal_codes_in_Canada" title="Postal codes in Canada">postal codes in Canada</a> where the first letter is M. Postal codes beginning with M are located within the city of <a href="/wiki/Toronto" title="Toronto">Toronto</a> in the province of <a href="/wiki/Ontario" title="Ontario">Ontario</a>. Only the first three characters are listed, corresponding to the Forward Sortation Area.\n</p><p><a href="/wiki/Canada_Post" title="Canada Post">Canada Post</a> provides a free postal code look-up tool on its website,<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">&#91;1&#93;</a></sup> via its <a href="/wiki/Mobile_app" title="Mobile app">applications</a> for such <a href="/wiki/Smartphones" class="mw-redirect" title="Smartphones">smartphones</a> as the <a href="/wiki/IPhone" title="IPhone">iPhone</

In [46]:
# Parse the tables found in the downloaded page, using each table's first row as
# its header, and keep the first table.
df = pd.read_html(io=html, header=0)[0]

In [47]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [48]:
#df = df.groupby(['Postal code', 'Borough'])['Neighborhood']

In [49]:
# Remove rows whose borough is the 'Not assigned' placeholder.
# The placeholder is first turned into NaN so that a single dropna() removes both
# placeholder rows and rows whose borough was already missing.
# Explicit assignment is used instead of `df[col].replace(..., inplace=True)`:
# the chained in-place form acts on a temporary column selection and does not
# reliably update `df` (it warns in pandas 2.x and is a no-op under Copy-on-Write).
df['Borough'] = df['Borough'].replace('Not assigned', np.nan)
df = df.dropna(subset=['Borough'])
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [51]:
# Collapse the table to one row per (postal code, borough) pair, concatenating
# the neighborhood strings of each group with ', '.
df = (
    df.groupby(['Postal code', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
# Restate the column labels explicitly after the index reset.
df.columns = ['Postal code', 'Borough', 'Neighborhood']
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [52]:
# Replace the 'Not assigned' neighborhood placeholder with "Queen's Park".
# Assign the result back explicitly: calling replace(..., inplace=True) on the
# `df['Neighborhood']` selection is chained assignment, which warns in pandas 2.x
# and leaves `df` unchanged under Copy-on-Write.
df['Neighborhood'] = df['Neighborhood'].replace('Not assigned', "Queen's Park")

df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [53]:
# Sanity check: (rows, columns) of the cleaned postal code table.
df.shape

(103, 3)

# Part 2: Add latitude and longitude to each postal code

In [55]:
# Download the CSV of per-postal-code coordinates.
dfgeo = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')

In [56]:
# Relabel the three columns positionally so the key column is spelled
# 'Postal code', matching the scraped table.
dfgeo.columns = [
    'Postal code',
    'Latitude',
    'Longitude',
]

In [57]:
# Preview the first five rows of the coordinates table.
dfgeo.head(n=5)

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [59]:
# Preview the cleaned postal code table before merging.
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [63]:
# Inner-join the postal code table with its coordinates on the shared
# 'Postal code' column.
dfmerged = df.merge(dfgeo, how='inner', on='Postal code')

In [65]:
# Preview the first five rows of the merged table.
dfmerged.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [66]:
# Sanity check: (rows, columns) of the merged table.
dfmerged.shape

(103, 5)

# Part 3: Explore and map the Toronto boroughs

## Toronto dataframe

In [67]:
# Keep only the rows whose borough name contains the word 'Toronto'.
dftorn = dfmerged.loc[dfmerged['Borough'].str.contains('Toronto')]

In [69]:
# Renumber the rows from 0; drop=True discards the old index labels left over
# from the filter instead of keeping them as a column.
dftorn = dftorn.reset_index(drop=True)

In [70]:
# Display the Toronto-only table.
dftorn

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


## Clustering

In [82]:
from sklearn.cluster import KMeans

In [89]:
# Display the Toronto-only table again as the input to this section.
dftorn

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


In [None]:
#!conda install -c conda-forge geopy --yes

In [75]:
# Look up the coordinates of Toronto; they are used to centre the map below.
from geopy.geocoders import Nominatim

address = 'Toronto, Canada'

# Nominatim's usage policy requires an application-specific user agent, and
# geopy 2.x refuses to construct the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.7.12
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /Users/zhe/newminiconda2/envs/dsenv3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    certifi-2020.4.5.1         |   py37hc8dfbb8_0         151 KB  conda-forge
    conda-4.8.3                |   py37hc8dfbb8_1         3.0 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1g             |       h0b31af3_0         1.9 MB  conda-forge
    python_abi-3.7             |          1_cp37m 

  """


In [90]:
# create map of Toronto using latitude and longitude values
# Create a map of Toronto centred on the geocoded latitude and longitude.
import folium

map_tohood = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of dftorn; each popup reads
# "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(dftorn['Latitude'], dftorn['Longitude'], dftorn['Borough'], dftorn['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    popup = folium.Popup(label, parse_html=True)
    # parse_html is an option of Popup, not of CircleMarker, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    ).add_to(map_tohood)

# Leave the map as the last expression so the notebook renders it.
map_tohood

<folium.vector_layers.CircleMarker at 0x1229800f0>

<folium.vector_layers.CircleMarker at 0x1229805c0>

<folium.vector_layers.CircleMarker at 0x1229801d0>

<folium.vector_layers.CircleMarker at 0x122986f28>

<folium.vector_layers.CircleMarker at 0x12282bcc0>

<folium.vector_layers.CircleMarker at 0x12282ba20>

<folium.vector_layers.CircleMarker at 0x12282b160>

<folium.vector_layers.CircleMarker at 0x12282b780>

<folium.vector_layers.CircleMarker at 0x12282b320>

<folium.vector_layers.CircleMarker at 0x12298f0f0>

<folium.vector_layers.CircleMarker at 0x12298f2e8>

<folium.vector_layers.CircleMarker at 0x12298f3c8>

<folium.vector_layers.CircleMarker at 0x12298f550>

<folium.vector_layers.CircleMarker at 0x12298f6d8>

<folium.vector_layers.CircleMarker at 0x12298f7b8>

<folium.vector_layers.CircleMarker at 0x12298f7f0>

<folium.vector_layers.CircleMarker at 0x12298f6a0>

<folium.vector_layers.CircleMarker at 0x1229915f8>

<folium.vector_layers.CircleMarker at 0x122994a90>

<folium.vector_layers.CircleMarker at 0x1229917b8>

<folium.vector_layers.CircleMarker at 0x12299fcf8>

<folium.vector_layers.CircleMarker at 0x12299ff60>

<folium.vector_layers.CircleMarker at 0x12299fac8>

<folium.vector_layers.CircleMarker at 0x1229a8208>

<folium.vector_layers.CircleMarker at 0x1229a8748>

<folium.vector_layers.CircleMarker at 0x1229a8e80>

<folium.vector_layers.CircleMarker at 0x1229a88d0>

<folium.vector_layers.CircleMarker at 0x1229a85f8>

<folium.vector_layers.CircleMarker at 0x1229b3438>

<folium.vector_layers.CircleMarker at 0x1229b3d30>

<folium.vector_layers.CircleMarker at 0x1229b3f28>

<folium.vector_layers.CircleMarker at 0x1229b6438>

<folium.vector_layers.CircleMarker at 0x1229b6940>

<folium.vector_layers.CircleMarker at 0x1229b6d30>

<folium.vector_layers.CircleMarker at 0x1229b6b70>

<folium.vector_layers.CircleMarker at 0x1229bb4a8>

<folium.vector_layers.CircleMarker at 0x1229bb828>

<folium.vector_layers.CircleMarker at 0x1229bbc88>

<folium.vector_layers.CircleMarker at 0x1229bbf98>