# Segmenting and Clustering Neighborhoods in Toronto

## Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

   ## Read html using BeautifulSoup library

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M';
html = requests.get(url).text
soup = BeautifulSoup(html,'lxml')

In [3]:
# find the table
postal_codes_table = soup.find('table',{'class':'wikitable sortable'})

In [4]:
# extract td's only
table_cells = postal_codes_table.find_all('td')

In [5]:
# group each 3 cells together
N = 3
table_rows = [table_cells[n:n+N] for n in range(0, len(table_cells), N)]
postal_code = []
borough = []
neighborhood = []
for row in table_rows:
    postal_code_text = row[0].text
    borough_text = row[1].text
    neighborhood_text = row[2].text.rstrip('\n')
    if borough_text != 'Not assigned':
        postal_code.append(postal_code_text)
        borough.append(borough_text)
        neighborhood.append(neighborhood_text)

## Create dataframe

In [6]:
df = pd.DataFrame()
df['PostalCode'] = postal_code
df['Borough'] = borough
df['Neighborhood'] = neighborhood
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M2A\n,Not assigned\n,Not assigned
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


In [7]:
# group neighborhoods of the same postal code
df = df.groupby(['PostalCode','Borough']).aggregate(lambda x : ', '.join(x)).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn


In [8]:
# check M5A 
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# fill 'Not assigned' neighborhood to the proper borough
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

In [10]:
# check M5A 
df.loc[df['PostalCode'] == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [11]:
# df shape
df.shape

(180, 3)