In [29]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1b             |       h14c3975_1         4.0 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.6.16          |           py36_0         148 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         5.0 MB

The following NEW packages will be 

## Question 1
### Scrape the Wikipedia page

In [16]:
# Download the Wikipedia list of "M" postal codes and parse the HTML.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the article.
wiki = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
wiki.raise_for_status()
soup = BeautifulSoup(wiki.content, 'lxml')

### Populate the table

In [17]:
# Build the (PostalCode, Borough, Neighborhood) frame from the first <table>
# found in the parsed page.
table_zc = soup.find('table')
if table_zc is None:
    raise ValueError('No <table> element found in the downloaded page')
# Flat list of every data cell; no longer used for parsing, kept only so the
# name stays defined for anything else that refers to it.
cell = table_zc.find_all('td')

postalcode = []
borough = []
neighborhood = []

# Parse row by row rather than striding through the flat cell list in steps
# of three: a row with a different number of cells is skipped instead of
# shifting every later value into the wrong column (or raising IndexError
# when the total cell count is not a multiple of three).
for row in table_zc.find_all('tr'):
    fields = [td.text.strip() for td in row.find_all('td')]
    if len(fields) != 3:
        # Header rows (no <td>) and malformed rows end up here.
        continue
    postalcode.append(fields[0])
    borough.append(fields[1])
    neighborhood.append(fields[2])

if not postalcode:
    raise ValueError('No three-column rows found in the table; the page layout may have changed')

df_zc = pd.DataFrame(
    {'PostalCode': postalcode, 'Borough': borough, 'Neighborhood': neighborhood},
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

### Remove "Not assigned" cells

In [18]:
# Drop every row whose Borough is 'Not assigned' (or already missing).
# The value is first turned into NaN so a single dropna() handles both cases.
# Assignment is used instead of `df_zc['Borough'].replace(..., inplace=True)`:
# an in-place call on a selected column is a chained mutation that pandas
# deprecates and that does not modify the frame under Copy-on-Write.
df_zc = (
    df_zc
    .assign(Borough=df_zc['Borough'].replace('Not assigned', np.nan))
    .dropna(subset=['Borough'])
)

### Group neighborhoods that share the same postal code and borough

In [19]:
# Collapse rows that share a postal code and borough into one row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
df_zc2 = (
    df_zc
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
df_zc2.columns = ['PostalCode', 'Borough', 'Neighborhood']

### Fix the "Queen's Park" issue

In [24]:
# Give rows whose Neighborhood is 'Not assigned' the name "Queen's Park".
# Assigning the result back replaces the earlier chained
# `df_zc2['Neighborhood'].replace(..., inplace=True)`, which pandas deprecates
# and which leaves the frame unchanged under Copy-on-Write.
# NOTE(review): the replacement value is hard-coded; presumably the only
# affected row belongs to the Queen's Park borough -- confirm against the data.
df_zc2['Neighborhood'] = df_zc2['Neighborhood'].replace('Not assigned', "Queen's Park")

df_zc2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [25]:
# Display the (rows, columns) shape of the grouped postal-code table.
df_zc2.shape

(103, 3)

## Question 2

In [26]:
# Load the geospatial CSV and rename its 'Postal Code' column so it matches
# the key column of df_zc2. The rename is chained (no in-place mutation), and
# the former mid-cell `df_gsdata.head()` was dropped because only the last
# expression of a cell is displayed.
df_gsdata = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Inner join (the pd.merge default, spelled out here): postal codes that are
# missing from either frame are dropped from the result.
df_merged = pd.merge(df_zc2, df_gsdata, on='PostalCode', how='inner')
df_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Question 3

In [30]:
# Look up the coordinates used to centre the map.
address = 'Toronto'

# Nominatim needs an application-specific user agent: calling Nominatim()
# with no arguments emits a DeprecationWarning on older geopy releases and is
# rejected by newer ones.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [35]:
# Draw one circle marker per row of df_merged on a map centred on the
# geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# The loop variables use their own names so they do not overwrite the
# module-level `borough` / `neighborhood` lists built while scraping the table.
for latit, longi, borough_name, neighborhood_name in zip(
        df_merged['Latitude'],
        df_merged['Longitude'],
        df_merged['Borough'],
        df_merged['Neighborhood']):
    label_text = '{}, {}'.format(neighborhood_name, borough_name)
    # parse_html=True is passed so the popup text (which can contain
    # apostrophes, e.g. "Queen's Park") is handled by folium's HTML parsing.
    label = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [latit, longi],
        radius=5,
        popup=label,
        color='white',
        fill=True,
        fill_color='#0000ff',
        fill_opacity=0.5,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one; kept unchanged -- confirm whether it has any effect.
        parse_html=False).add_to(map_toronto)

map_toronto