## Clustering Toronto Neighborhoods

### Install Beautiful Soup & Import libraries

In [17]:
!pip -q install --upgrade beautifulsoup4

In [18]:
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd

### Web scrapiing the wiki page

In [60]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url)
table = soup.find_all('table')[0]

### read the resulting html code back into a dataframe

In [74]:
df = pd.read_html(str(table))[0]

df

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned


In [75]:
# Create a new variable called 'header' from the first row of the dataset
header = df.iloc[0]

In [76]:
# Replace the dataframe with a new one which does not contain the first row
df = df[1:]

In [77]:
# Rename the dataframe's column values with the header variable

mydf = df.rename(columns = header)

mydf.columns = ['PostalCode', 'Borough', 'Neighborhood']
mydf

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [78]:
mydf.drop(mydf.index[mydf['Borough'] == 'Not assigned'], inplace = True)
mydf

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


### More than one neighborhood can exist in one postal code area. COmbine rows into one row with the neighborhoods separated with a comma

In [79]:
grouped_df = mydf.groupby("PostalCode").agg(lambda x: ','.join(set(x))).reset_index()
grouped_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood,West Hill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview,East Birchmount Park,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Oakridge,Golden Mile"
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [80]:
grouped_df.loc[grouped_df.Neighborhood == 'Not assigned', 'Neighborhood'] = grouped_df.Borough

In [81]:
grouped_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood,West Hill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [83]:
grouped_df.shape

(103, 3)