This notebook will be used to analyze clusters of neighborhoods in Canada.

First, we will grab the packages we need and open the url.

In [74]:
import pandas as pd
import numpy as mp
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html = urlopen(url)

Next, we'll use the BeautifulSoup package to access the html code from the site.

In [75]:
soup = BeautifulSoup(html,'lxml')

Now that we have the code, we'll need to grab the data that is in the table. Specifically, that means we need to find everything between the tr tags in the code.

In [76]:
#print the table rows in the html code
rows = soup.find_all('tr')

#make the rows into a list

list_rows = []

for row in rows:

    row_td = row.find_all('td')

    str_cells = str(row_td)

    cleantext = BeautifulSoup(str_cells, "lxml").get_text()

    list_rows.append(cleantext)

print(list_rows[:10])

#create dataframe from the list

df = pd.DataFrame(list_rows)
df.head()


['[]', '[M1A, Not assigned, Not assigned\n]', '[M2A, Not assigned, Not assigned\n]', '[M3A, North York, Parkwoods\n]', '[M4A, North York, Victoria Village\n]', '[M5A, Downtown Toronto, Harbourfront\n]', '[M5A, Downtown Toronto, Regent Park\n]', '[M6A, North York, Lawrence Heights\n]', '[M6A, North York, Lawrence Manor\n]', "[M7A, Queen's Park, Not assigned\n]"]


Unnamed: 0,0
0,[]
1,"[M1A, Not assigned, Not assigned\n]"
2,"[M2A, Not assigned, Not assigned\n]"
3,"[M3A, North York, Parkwoods\n]"
4,"[M4A, North York, Victoria Village\n]"


Now that we have the data in a dataframe, we need to format it according to the assignment specifications.

In [77]:
# remove blank row, split on comma, and grab headers
df1 = df.iloc[1:]

df2=df1[0].str.split(',',expand=True)

#strip the opening bracket and the new line text
df2[0] = df2[0].str.strip('[')
df2[2] = df2[2].str.strip('\n]')

#keep only the PostalCode, Borough, and Neighborhood columns
df2 = df2.iloc[:,0:3]

#create a list of header names and assign it to the dataframe
headers = ['PostalCode','Borough','Neighborhood']

df2.columns = headers

#remove irrelevant rows

df2=df2.iloc[0:289,:]

print(df2)

    PostalCode            Borough  \
1          M1A       Not assigned   
2          M2A       Not assigned   
3          M3A         North York   
4          M4A         North York   
5          M5A   Downtown Toronto   
6          M5A   Downtown Toronto   
7          M6A         North York   
8          M6A         North York   
9          M7A       Queen's Park   
10         M8A       Not assigned   
11         M9A          Etobicoke   
12         M1B        Scarborough   
13         M1B        Scarborough   
14         M2B       Not assigned   
15         M3B         North York   
16         M4B          East York   
17         M4B          East York   
18         M5B   Downtown Toronto   
19         M5B   Downtown Toronto   
20         M6B         North York   
21         M7B       Not assigned   
22         M8B       Not assigned   
23         M9B          Etobicoke   
24         M9B          Etobicoke   
25         M9B          Etobicoke   
26         M9B          Etobicoke   
2

Now we need to remove unassigned boroughs and format the neighborhood names that are unassigned

In [83]:
#remove the leading space in the Borough cell
df2.Borough = df2.Borough.str.strip()

#remove boroughs that are not assigned
df3 = df2[df2.Borough != 'Not assigned']

df3.head()

# replace missing neighborhood with borough

# remove leading space in neighborhood and replace "Not assigned" neighborhood names with borough names
df3.Neighborhood = df3.Neighborhood.str.strip()

df3.Neighborhood.replace('Not assigned', df3.Borough, inplace = True)

df3.reset_index(inplace = True, drop = True)

df4 = df3.groupby(['PostalCode','Borough'])['Neighborhood'].agg(lambda x: ",".join(x)).reset_index()

df4.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [84]:
df4.shape

(103, 3)