# Notebook to Cluster Neighborhoods in Toronto

#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

#### Scraping Wikipedia Page

In [2]:
# Address of the Wikipedia "List of postal codes of Canada: M" page that the
# following cells download and parse.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page and read the whole body into memory immediately.
# An HTTP response is a one-shot stream: keeping the raw response object in
# `page` leaves the connection open, and re-running the parsing cell on its own
# would hand the parser an already-consumed (empty) stream. Storing the bytes
# makes the parsing cell safe to re-run, and the `with` block closes the
# connection as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [4]:
# Build the parse tree for the fetched page, using lxml as the parser backend.
soup = BeautifulSoup(page, features="lxml")

In [7]:
# Locate the first <table> whose class attribute is exactly "wikitable sortable".
table = soup.find('table', class_='wikitable sortable')

# find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError on None in the row-extraction cell.
if table is None:
    raise ValueError(
        "No <table class='wikitable sortable'> was found in the parsed page; "
        "the page layout may have changed or the page content is empty."
    )

#### Getting the required table and cleaning

In [9]:
# Walk the table row by row and collect the three columns into parallel lists.
postal_codes = []
boroughs = []
neighborhoods = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row, which has no <td> cells) is skipped.
    if len(cells) != 3:
        continue
    # get_text() joins every text node inside the cell, so a value that is
    # wrapped in or split across links is kept whole, and an empty cell yields
    # '' instead of None (which would break later string operations).
    # strip() removes the surrounding whitespace and trailing newline.
    code, borough, neighborhood = (cell.get_text().strip() for cell in cells)
    postal_codes.append(code)
    boroughs.append(borough)
    neighborhoods.append(neighborhood)

In [19]:
# Remove every newline character from the scraped values, updating each list
# in place, one column at a time.
for column in (postal_codes, boroughs, neighborhoods):
    for idx in range(len(postal_codes)):
        column[idx] = column[idx].replace('\n', '')

#### Converting to Dataframe

In [21]:
# Assemble the three parallel lists into one DataFrame, one column per list,
# then preview the first rows.
df = pd.DataFrame({
    'PostalCode': postal_codes,
    'Borough': boroughs,
    'Neighborhoods': neighborhoods,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Removing unassigned Boroughs

In [27]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
new_df = df.loc[has_borough]

In [28]:
# Preview the first rows of the filtered frame.
new_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [29]:
# (rows, columns) of the frame after dropping the 'Not assigned' boroughs.
new_df.shape

(103, 3)

#### Checking that every row has a unique postal code

In [38]:
# Count how many rows carry each postal code, then show only the codes that
# occur more than once (an empty result means there are no duplicates).
counts = new_df['PostalCode'].value_counts()
repeated_codes = counts.loc[counts.gt(1)]
repeated_codes

Series([], Name: PostalCode, dtype: int64)

Since the result is an empty Series, every row has a unique postal code.

In [39]:
# Show the shape again; the uniqueness check above only reads new_df, so the
# frame itself is unchanged.
new_df.shape

(103, 3)