## Segmenting and Clustering Neighborhoods in Toronto

##### Install libraries

In [100]:
# install beautifulsoup4 library using to parse HTML page

!pip install beautifulsoup4
!pip install html5lib

print('Complete install beautifulsoup4')

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

Complete install beautifulsoup4


##### Use the BeautifulSoup library to scrape the Wikipedia page for the postal codes

In [94]:
# parse the HTML page and extract the postal code table
def getPostalCode_DataFrame(page_content):
    """Extract the postal code table from an HTML page into a DataFrame.

    The first element whose class is exactly 'wikitable sortable' is taken
    as the table. Its first row is treated as the header and skipped; every
    later row contributes the text of its first three <td> cells.

    Parameters
    ----------
    page_content : str or bytes
        HTML markup of the page to parse.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns PostalCode, Borough and Neighborhood (one
        row per parsed table row, possibly empty), or None when the page
        contains no matching table.
    """
    column_names = ["PostalCode", "Borough", "Neighborhood"]

    soup = BeautifulSoup(page_content, 'html.parser')

    # find the table that contains the postal codes
    tables = soup.find_all(class_='wikitable sortable')
    if not tables:
        return None

    records = []
    # rows[0] is the header row, so start from the second row
    for row in tables[0].find_all('tr')[1:]:
        cells = row.find_all('td')

        # skip malformed rows instead of failing with an IndexError
        if len(cells) < len(column_names):
            continue

        # get_text() drops the <td> wrapper as well as any nested markup
        # (links, attributes); newlines are removed as before
        records.append([
            cell.get_text().replace('\n', '').strip()
            for cell in cells[:len(column_names)]
        ])

    # build the frame once; an empty table yields an empty frame
    return pd.DataFrame(records, columns=column_names)

In [95]:
# request html page
html = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# a timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(html, timeout=30)

# stop here on an HTTP error instead of handing an error page to the parser
page.raise_for_status()

page_content = page.content


##### Cleanse the data and combine the postal code and geographical coordinate data frames into a final data frame

In [96]:
# cleansing postal code dataframe

# get postal code dataframe from parsing function
df_raw = getPostalCode_DataFrame(page_content)

# the parser returns None when the page has no postal code table
if df_raw is None:
    raise ValueError('No postal code table was found in the downloaded page')

# remove rows where Borough = 'Not assigned'
df_postal = df_raw[df_raw.Borough != 'Not assigned'].copy()

# A row with a Borough but a 'Not assigned' Neighborhood takes the Borough's
# name as its Neighborhood. Since the 'Not assigned' Boroughs were removed
# above, every remaining Borough is a valid replacement value.
# The values are assigned directly with .loc: the earlier
# replace -> NaN -> fillna(inplace=True) route relied on np.NaN (removed in
# NumPy 2.0) and on a chained in-place fillna, which does not update the
# frame under pandas copy-on-write.
unassigned = df_postal['Neighborhood'] == 'Not assigned'
df_postal.loc[unassigned, 'Neighborhood'] = df_postal.loc[unassigned, 'Borough']

# index by postal code so the frame can be merged with the coordinates
df_postal = df_postal.set_index('PostalCode')

In [97]:
# Read the latitude/longitude of each postal code and index it by postal code
url = 'http://cocl.us/Geospatial_data'
df_geo = (
    pd.read_csv(url, index_col=None, header='infer')
    .rename(columns={'Postal Code': 'PostalCode'})
    .set_index('PostalCode')
)

df_geo.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [98]:
# Join the postal codes with their coordinates on the shared PostalCode index,
# then turn PostalCode back into a regular column
df = (
    df_postal
    .merge(df_geo, left_index=True, right_index=True)
    .reset_index()
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [99]:
# (rows, columns) of the merged postal code / coordinates frame
df.shape

(103, 5)