<a href="https://cognitiveclass.ai"><img src = "https://ibm.box.com/shared/static/9gegpsmnsoo25ikkbl4qzlvlyjbgxs5x.png" width = 400> </a>


# Notebook for Applied Data Science Capstone - Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto by Emil Fuerstenberg Haegg

## Part 1

### Import and install libraries

In [134]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis

# Set display options: show every column and every row when a dataframe is printed
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import requests # library to handle requests

#!pip install bs4  # uncomment if not already installed
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


### Scrape data from HTML tables into a DataFrame using BeautifulSoup and Pandas

In [135]:
# URL of the Wikipedia page listing Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [136]:
# Download the page and store its HTML text in variable 'data'.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [137]:
# Parse the downloaded HTML in 'data' with the "html.parser" backend
soup = BeautifulSoup(markup=data, features="html.parser")

In [138]:
# Collect every HTML table on the page (tables are represented by the <table> tag)
tables = soup.find_all(name='table')

In [139]:
# Check how many <table> elements were found on the page
len(tables)

3

In [140]:
# Print the position of every table that was found
for table_index, _ in enumerate(tables):
    print(table_index)

0
1
2


In [141]:
# Print the <caption> of each table to see whether the tables are named
# (prints None for a table without a caption)
for table in tables:
    print(table.caption)

None
None
None


In [142]:
# Inspecting the webpage shows that the table of interest has the CSS class
# "wikitable", so search for that class directly rather than looking through
# all the tables found above
wikitables = soup.find_all("table", class_="wikitable")
# Number of matching tables
len(wikitables)

1

In [143]:
# Table of interest: the first table with class "wikitable"
tablen = wikitables[0]

In [144]:
# Collect one record per table row, then build the dataframe in a single step.
# (DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the whole dataframe on every iteration.)
rows = []

# Walk over the table rows and read the three data cells of each one
for row in tablen.tbody.find_all("tr"):
    cells = row.find_all("td")
    if not cells:
        # Rows without <td> cells (e.g. the header row) carry no data
        continue
    rows.append({
        # Postal code and borough keep their raw cell text here; trailing
        # newlines are removed in a later processing step
        "PostalCode": cells[0].text,
        "Borough": cells[1].text,
        "Neighborhood": cells[2].text.strip(),
    })

# Create dataframe with a fixed column order
postal_data = pd.DataFrame(rows, columns=["PostalCode", "Borough", "Neighborhood"])

postal_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M2A\n,Not assigned\n,Not assigned
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


### Process dataframe according to instructions

In [145]:
# Remove newline characters from the dataframe by replacing them with an empty string
postal_data = postal_data.replace(to_replace='\n', value='', regex=True)

postal_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [146]:
# Boolean mask that is True for rows whose borough is assigned
selection = postal_data["Borough"] != "Not assigned"

# Keep only the selected rows. The explicit .copy() makes the result an
# independent dataframe, so the assignment below does not raise pandas'
# SettingWithCopyWarning.
postal_data = postal_data[selection].copy()

# If a cell has a borough but a "Not assigned" neighborhood, then the
# neighborhood will be the same as the borough
unnamed = postal_data["Neighborhood"] == "Not assigned"
postal_data.loc[unnamed, "Neighborhood"] = postal_data.loc[unnamed, "Borough"]

# Reset index so the remaining rows are numbered from 0 again
postal_data = postal_data.reset_index(drop=True)

# Display full dataframe
postal_data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [147]:
# Shape of the cleaned dataframe as (number of rows, number of columns)
postal_data.shape

(103, 3)

## Part 2

### Import and install libraries

In [148]:
#!pip install pgeocode # uncomment if not already installed
import pgeocode

print('Libraries imported.')

Libraries imported.


In [153]:
# Find latitudes and longitudes for postcodes

# One geocoder for Canada ('ca') is enough; it is reused for every lookup
geolocator = pgeocode.Nominatim('ca')
postal_codes = postal_data['PostalCode'].tolist()
latitudes = []
longitudes = []
for i, postal_code in enumerate(postal_codes):

    print(f'--Getting Postal Code: {postal_code}')
    g = geolocator.query_postal_code(postal_code)
    remaining = len(postal_codes) - (i + 1)

    # A lookup can come back non-empty yet without coordinates, so `g.empty`
    # does not detect unresolved codes; test the coordinates themselves.
    if pd.isna(g.latitude) or pd.isna(g.longitude):
        print(f'Postal Code {postal_code} returned no coordinates. {remaining} codes left')
    else:
        print(f'Postal Code {postal_code} has been retrieved. {remaining} codes left')

    # Append on every iteration (NaN for unresolved codes) so both lists stay
    # aligned, row for row, with the dataframe they are attached to afterwards.
    latitudes.append(g.latitude)
    longitudes.append(g.longitude)

--Getting Postal Code: M3A
Postal Code M3A has been retrieved. 102 codes left
--Getting Postal Code: M4A
Postal Code M4A has been retrieved. 101 codes left
--Getting Postal Code: M5A
Postal Code M5A has been retrieved. 100 codes left
--Getting Postal Code: M6A
Postal Code M6A has been retrieved. 99 codes left
--Getting Postal Code: M7A
Postal Code M7A has been retrieved. 98 codes left
--Getting Postal Code: M9A
Postal Code M9A has been retrieved. 97 codes left
--Getting Postal Code: M1B
Postal Code M1B has been retrieved. 96 codes left
--Getting Postal Code: M3B
Postal Code M3B has been retrieved. 95 codes left
--Getting Postal Code: M4B
Postal Code M4B has been retrieved. 94 codes left
--Getting Postal Code: M5B
Postal Code M5B has been retrieved. 93 codes left
--Getting Postal Code: M6B
Postal Code M6B has been retrieved. 92 codes left
--Getting Postal Code: M9B
Postal Code M9B has been retrieved. 91 codes left
--Getting Postal Code: M1C
Postal Code M1C has been retrieved. 90 codes left

In [154]:
# Attach the collected coordinates to the dataframe as two new columns
postal_data = postal_data.assign(Latitude=latitudes, Longitude=longitudes)

In [155]:
# Display dataframe.
# NOTE: the lookup for M7R did not fail, yet no coordinates were returned for it.
postal_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


In [156]:
# Shape after adding the coordinate columns as (number of rows, number of columns)
postal_data.shape

(103, 5)