In [2]:
import numpy as np # library to handle data in a vectorized manner
import wikipedia as wp # client used below to fetch the rendered HTML of a Wikipedia page
import pandas as pd # library for data analysis


### Scrape the Wikipedia page "List of postal codes of Canada: M"

In [3]:
# Download the rendered article and let pandas parse the HTML tables in it;
# the first parsed table is the postal-code listing used by the rest of the notebook.
article = wp.page("List_of_postal_codes_of_Canada:_M")
html = article.html().encode("UTF-8")
tables = pd.read_html(html)
df = tables[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


- Only process the cells that have an assigned borough; ignore cells whose borough is `Not assigned`.
- If a cell has a borough but a `Not assigned` neighborhood, the neighborhood is set to the same value as the borough.

In [4]:
# Turn the 'Not assigned' placeholder into real missing values.
# The results are assigned back instead of calling `inplace=True` on a column
# selection: `df['col'].method(..., inplace=True)` is chained assignment, which
# recent pandas warns about and which does not modify `df` under Copy-on-Write.
df = df.assign(
    Borough=df['Borough'].replace('Not assigned', np.nan),
    Neighbourhood=df['Neighbourhood'].replace('Not assigned', np.nan),
)

# Rows without a borough are not processed at all.
df = df.dropna(subset=['Borough'])

# A missing neighbourhood falls back to the borough of the same row
# (fillna with a Series aligns on the index).
df = df.assign(Neighbourhood=df['Neighbourhood'].fillna(df['Borough']))
df.head(8)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue


- **More than one neighborhood can exist in one postal code area; such rows are combined into a single row, with the neighborhoods separated by commas.**

In [5]:
# For every row, compute the comma-joined list of all neighbourhoods that share
# its postcode; rows of the same postcode then carry the same joined value.
joined_per_postcode = df.groupby(['Postcode'])['Neighbourhood'].transform(','.join)
df['Neighbourhood'] = joined_per_postcode

# Rows that are now identical collapse into one; renumber the survivors from 0.
df = df.drop_duplicates().reset_index(drop=True)
df.head(8)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North


In [6]:
# (rows, columns) of the frame after the neighbourhoods were combined per postcode.
df.shape

(103, 3)

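- **The latitude/longitude of each postal code is available as [geospatial data](http://cocl.us/Geospatial_data); the next cell reads a local copy saved as `Geospatial_Coordinates.csv`.**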

In [35]:
# Coordinates table, read from a CSV file in the working directory.
geo = pd.read_csv('Geospatial_Coordinates.csv')

# The join key must have the same name on both sides, so rename the scraped column.
df = df.rename(columns={'Postcode': 'Postal Code'})

# Default (inner) join on the postal code.
data = df.merge(geo, on='Postal Code')

data.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [36]:
# Keep the rows whose borough name contains 'Toronto', discard the postal code
# column, and renumber the remaining rows from 0.
is_toronto_borough = data['Borough'].str.contains('Toronto')
neighborhoods = (
    data.loc[is_toronto_borough]
    .drop(columns=['Postal Code'])
    .reset_index(drop=True)
)
neighborhoods.head(11)

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
1,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
2,Downtown Toronto,St. James Town,43.651494,-79.375418
3,East Toronto,The Beaches,43.676357,-79.293031
4,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,Downtown Toronto,Christie,43.669542,-79.422564
7,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
8,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259
9,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


In [44]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# pandas 1.0 exposed json_normalize at the top level and deprecated the
# pandas.io.json import path, so prefer the new location and fall back to the
# old one only on pandas versions that predate it.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [47]:
# Distinct borough names that remain after the 'Toronto' filter.
neighborhoods['Borough'].unique()

array(['Downtown Toronto', 'East Toronto', 'West Toronto',
       'Central Toronto'], dtype=object)

In [68]:
# Keep only the Downtown Toronto rows and renumber them from 0.
is_downtown = neighborhoods['Borough'] == 'Downtown Toronto'
Downtown_Toronto = neighborhoods.loc[is_downtown].reset_index(drop=True)

# A cell renders only its final expression, so the former mid-cell `.head()`
# call was evaluated and thrown away; it has been removed.
Downtown_Toronto['Latitude']

0     43.654260
1     43.657162
2     43.651494
3     43.644771
4     43.657952
5     43.669542
6     43.650571
7     43.640816
8     43.647177
9     43.648198
10    43.662696
11    43.653206
12    43.628947
13    43.679563
14    43.646435
15    43.667967
16    43.648429
17    43.665860
Name: Latitude, dtype: float64

In [71]:
# Query string for the geocoder; in this cell it is only referenced by the
# commented-out do_geocode(address) call.
address = 'Downtown Toronto, Ontario'
# Alternative query, left disabled:
#address = 'New York City, NY'
from geopy.exc import GeocoderTimedOut

def do_geocode(address, max_attempts=5, pause=1.0):
    """Geocode ``address`` with Nominatim, retrying a bounded number of times on timeout.

    The previous version retried by calling itself with no limit and no delay,
    so a persistently slow service produced back-to-back requests until Python
    raised ``RecursionError``. This version loops instead.

    Parameters
    ----------
    address : str
        Free-form query passed to ``Nominatim.geocode``.
    max_attempts : int, optional
        Total number of requests to make before giving up (default 5).
    pause : float, optional
        Seconds to wait between attempts (default 1.0).

    Returns
    -------
    Whatever ``Nominatim.geocode`` returns for the query (geopy documents this
    as a ``Location``, or ``None`` when nothing matches).

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    GeocoderTimedOut
        If every attempt timed out; the last timeout is re-raised.
    """
    import time  # local import so the function does not depend on another cell

    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    # Build the client once and reuse it across retries.
    geolocator = Nominatim(user_agent="ny_explorer")
    for attempt in range(1, max_attempts + 1):
        try:
            return geolocator.geocode(address)
        except GeocoderTimedOut:
            if attempt == max_attempts:
                raise
            # Back off briefly before asking the service again.
            time.sleep(pause)
#do_geocode(address)

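**Latitude = 43.6529, Longitude = -79.3849**
- These coordinates for Downtown Toronto are hard-coded in the map cell below instead of being geocoded at run time, to avoid geocoder timeouts.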

In [70]:

# Base map centred on the hard-coded Downtown Toronto coordinates.
map_dt = folium.Map(location=[43.6529, -79.3849], zoom_start=10)

# Appearance shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# add markers to map: one circle per row, labelled "<neighbourhood(s)>, <borough>"
marker_rows = zip(
    Downtown_Toronto['Latitude'],
    Downtown_Toronto['Longitude'],
    Downtown_Toronto['Borough'],
    Downtown_Toronto['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_dt)

map_dt