# Segmenting and Clustering Neighborhoods in Toronto   
Peer-graded assignment   
Coursera Applied data science capstone

# Import libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1 Scrape table from Wikipedia page   

Scrape this Wikipedia page: <https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M>
in order to obtain the data that is in the table of postal codes.

### Using the `requests` library to scrape the url

In [2]:
# Wikipedia page to scrape:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# establish the connection to the web page;
# the timeout keeps this cell from hanging indefinitely if the server never answers
results = requests.get(url, timeout=30)

# report the status code, then fail fast on a 4xx/5xx response
# so that an error page is never parsed as if it were the postal-code table
print("status: {}".format(results.status_code))
results.raise_for_status()

# pull the html text out of the response
html_text = results.text


status: 200


### Creating a BeautifulSoup object

In [3]:
# Parse the downloaded HTML with the lxml parser.
# (call soup.prettify() to inspect the parsed tree while developing)
soup = BeautifulSoup(markup=html_text, features='lxml')

In [4]:
# the table to extract is the <table> element with class = 'wikitable sortable'
table = soup.find('table', {'class': "wikitable sortable"})
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the next line
    raise ValueError("no <table class='wikitable sortable'> found in the downloaded page")

# find all table rows (tr)
table_rows = table.find_all('tr')

# header labels come from the <th> cells of the first row only;
# column_name stays a list holding one list of labels, because it is read as column_name[0]
column_name = [
    [cell.text.strip() for cell in row.find_all('th')]
    for row in table_rows[:1]
]

# data comes from the <td> cells of every row after the first;
# rows without any <td> cell are skipped so they do not turn into empty records
row_data = []
for row in table_rows[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:
        row_data.append(cells)

In [5]:
# Build the dataframe from the scraped rows, labelled with the scraped header row.
df_postal = pd.DataFrame(data = row_data, columns = column_name[0])

# Drop every row whose Borough contains "Not assigned" and renumber the index from 0.
borough_unassigned = df_postal['Borough'].str.contains("Not assigned")
df_postal = df_postal.loc[~borough_unassigned].reset_index(drop = True)
df_postal.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Alternative without BeautifulSoup: let pandas parse the tables on the page
# (first row used as header) and take the first table it returns.
page_tables = pd.read_html(url, header = 0)
df_postal = page_tables[0]

# Same filter as before: remove rows whose Borough contains "Not assigned", then renumber.
borough_assigned = ~df_postal['Borough'].str.contains("Not assigned")
df_postal = df_postal[borough_assigned].reset_index(drop = True)
df_postal.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Check dataframe

In [7]:
# Spot-check postal code M5A for duplicate rows with different neighborhoods:
# the result below is a single row whose Neighborhood field already lists both names.
is_m5a = df_postal['Postal Code'].eq('M5A')
df_postal.loc[is_m5a]

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# Look for rows whose Neighborhood is exactly 'Not assigned';
# the result below is empty, so there are none.
neighborhood_unassigned = df_postal['Neighborhood'].eq('Not assigned')
df_postal.loc[neighborhood_unassigned]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [9]:
# save the table to csv file, to be used for the part2 notebook
output_path = Path('../data/toronto_postal.csv')

# create the target directory if it is missing, so the write does not fail
# on a checkout where ../data does not exist yet
output_path.parent.mkdir(parents = True, exist_ok = True)
df_postal.to_csv(output_path, index = False)

In [10]:
# Report the final size of the cleaned table as (rows, columns).
print(f"df_postal (row, column): {df_postal.shape}")

df_postal (row, column): (103, 3)
