# Applied Data Science Capstone Week 3 Lab

## Part 1

### Install beautifulsoup and other required libs for web scraping

In [1]:
#!pip install lxml html5lib beautifulsoup4

### Import pandas and read data frames from the Wikipedia page

In [2]:
import pandas as pd

# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns a list of DataFrames, one per HTML table it finds.
dfs = pd.read_html(url)
# Number of tables that were parsed from the page.
print(len(dfs))

3


### Select the appropriate data frame

In [3]:
# Inspect the first parsed table to check that it holds the postal code data.
print(dfs[0])

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                         Neighbourhood  
0                                         Not assigned  
1                                         Not assigned  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
..                                                 ...  
175                                       Not assigned  
176                                       Not assigned  
177                                       

### Assign the data frame and filter out rows where Borough is not assigned

In [4]:
# The first table on the page is the postal code listing.
df1 = dfs[0]
# True for every row whose borough has actually been assigned.
mask = df1['Borough'].ne('Not assigned')
# Keep only the assigned rows; the original row labels are preserved.
df2 = df1.loc[mask]
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Group by postal code



In [5]:
# Collapse rows that share a postal code into a single row.
# A plain .sum() concatenates strings with no separator (it would produce
# e.g. "ScarboroughScarborough" and fused neighbourhood names), so aggregate
# explicitly: keep the first borough and join neighbourhoods with ", ".
df3 = df2.groupby("Postal Code").agg(
    {"Borough": "first", "Neighbourhood": ", ".join}
)
df3

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### Reset index



In [6]:
# groupby left "Postal Code" as the index; reset_index turns it back into a
# regular column and restores a default integer index.
df4 = df3.reset_index()
df4

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### Print the shape of the dataframe

In [7]:
# (number of rows, number of columns) of the grouped frame.
df4.shape

(103, 3)

# ========================================

## Part 2

### Get the geolocations and append them to the dataframe

In [8]:
import requests
from pandas.io.json import json_normalize

### Single example of geolocation extraction, to be used in the for loop below

In [9]:
# Query the geolocation service for one postal code as a smoke test.
resp = requests.get('http://geogratis.gc.ca/services/geolocation/en/locate?q=M1B')
df_test = pd.json_normalize(resp.json())
# Coordinates of the first match; index 1 is printed as latitude and
# index 0 as longitude.
m1b_coords = df_test['geometry.coordinates'][0]
print(f'M1B latitude = {m1b_coords[1]}')
print(f'M1B longitude = {m1b_coords[0]}')

M1B latitude = 43.809444
M1B longitude = -79.193321


### Loop through all postal codes to retrieve geolocations

The geolocation service returns an empty JSON response for M7R.  
Its latitude, longitude coordinates are: 43.63705212384915, -79.61562720000305  
This case is dealt with "manually" through an if/else structure.

In [10]:
# Look up latitude/longitude for every postal code in df4 and attach them
# as two new columns.
lat = []
long = []
url_base = "http://geogratis.gc.ca/services/geolocation/en/locate?q="

for postal_code in df4["Postal Code"]:
    if postal_code == 'M7R':
        # The service returns an empty result for M7R, so its coordinates are
        # supplied by hand. They are stored as floats (not strings) so the
        # Latitude/Longitude columns keep a numeric dtype like every other row.
        lat.append(43.637052)
        long.append(-79.615627)
    else:
        url = url_base + postal_code
        # Bound the wait and fail loudly on an HTTP error rather than trying
        # to parse an error page as JSON.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        df = pd.json_normalize(resp.json())
        if 'geometry.coordinates' not in df.columns:
            # An empty response yields a frame without this column; name the
            # offending postal code instead of raising a bare KeyError.
            raise ValueError(
                "No geolocation returned for postal code " + postal_code
            )
        # Use the first match: index 1 is the latitude, index 0 the longitude.
        lat.append(df['geometry.coordinates'][0][1])
        long.append(df['geometry.coordinates'][0][0])

df4['Latitude'] = lat
df4['Longitude'] = long
df4

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8094,-79.1933
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.788,-79.1587
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7697,-79.1731
3,M1G,Scarborough,Woburn,43.7724,-79.2151
4,M1H,Scarborough,Cedarbrae,43.7713,-79.2408
...,...,...,...,...,...
98,M9N,York,Weston,43.7067,-79.52
99,M9P,Etobicoke,Westmount,43.6984,-79.5343
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.6897,-79.5572
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.7454,-79.5865


# ========================================

## Part 3