## Segmenting and Clustering Neighborhoods in Toronto

# PART 1

##### Install libraries

In [1]:
# install beautifulsoup4 library using to parse HTML page

!pip install beautifulsoup4
!pip install html5lib

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests


print('Complete imported libraries')

Complete imported libraries


##### Use the BeautifulSoup library to scrape the Wikipedia page for the postal codes

In [2]:
# Parse the HTML to extract the table of Canadian postal codes
def getPostalCode_DataFrame(page_content):
    """Extract the postal-code table from an HTML page into a DataFrame.

    Parameters
    ----------
    page_content : str or bytes
        Raw HTML markup, handed unchanged to ``BeautifulSoup``.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``PostalCode``, ``Borough`` and
        ``Neighborhood`` holding the text of the first three ``<td>`` cells
        of every data row, on a default 0..n-1 index. A table that has no
        usable data rows yields an empty frame with those three columns.
        ``None`` is returned when ``find_all(class_='wikitable sortable')``
        matches nothing in the page.
    """
    column_names = ["PostalCode", "Borough", "Neighborhood"]

    soup = BeautifulSoup(page_content, 'html.parser')

    # find the table that contains Postal_codes_in_Canada
    tables = soup.find_all(class_='wikitable sortable')
    if not tables:
        return None

    records = []
    # The first <tr> is treated as the header row; only later rows carry data.
    for row in tables[0].find_all('tr')[1:]:
        cells = row.find_all('td')

        # Skip rows that do not provide all three columns instead of failing
        # with an IndexError.
        if len(cells) < len(column_names):
            continue

        # get_text() returns only the text content, so tag attributes, nested
        # markup (e.g. links) and HTML entities do not leak into the values.
        records.append(
            [cell.get_text().replace('\n', '').strip()
             for cell in cells[:len(column_names)]]
        )

    # Build the frame once from the collected rows rather than growing and
    # re-concatenating it on every iteration.
    return pd.DataFrame(records, columns=column_names)

In [3]:
# Request the HTML page. Note that `html` holds the page URL, not the markup.
html = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
page = requests.get(html, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page later.
page.raise_for_status()
page_content = page.content


##### Cleanse the data and combine the postal code and geocode data frames into a final data frame

In [5]:
# Cleanse the postal code dataframe.

# Get the raw postal code dataframe from the parsing function.
df_raw = getPostalCode_DataFrame(page_content)
if df_raw is None:
    # The parser returns None when the expected table is missing from the page.
    raise ValueError("No 'wikitable sortable' table was found in the downloaded page")

# Drop the rows whose Borough is 'Not assigned'
# (the original author noted that 77 such rows were removed).
df_postal = df_raw[df_raw['Borough'] != 'Not assigned'].copy()

# Every remaining row has a valid Borough, so a Neighborhood of 'Not assigned'
# takes its Borough's value: mark those entries as missing, then fill them in.
# The result is assigned back to the column because an in-place fillna on a
# selected column does not update the frame under pandas Copy-on-Write.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_postal['Neighborhood'] = (
    df_postal['Neighborhood']
    .replace('Not assigned', np.nan)
    .fillna(df_postal['Borough'])
)

# Renumber the rows 0..n-1 after the filtering above.
df_postal = df_postal.reset_index(drop=True)

df_postal.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Sanity check: (number of rows, number of columns) of the cleansed frame.
df_postal.shape

(103, 3)