# Neighborhoods in Toronto
(Data Ingestion and Data Cleansing)

### Import libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Request URL

In [3]:
# Download the Wikipedia page listing Canadian postal codes that start with
# "M" and parse the returned HTML with the lxml parser.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() stops here on an HTTP error instead of
# handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
Canada_data = BeautifulSoup(source, 'lxml')

### Scrape data into dataframe

In [4]:
# Column labels for the Toronto table. 'PostalCode' is spelled the same way
# as the key used when the scraped rows are built and filtered.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
# Start from an empty frame that already carries those labels.
toronto = pd.DataFrame(columns=column_names)

In [5]:
content = Canada_data.find('div', class_='mw-parser-output')
table = content.table.tbody
table_contents = []

# scrape data from url
# Each <td> is read as a single string: the first three characters are the
# postal code, the borough follows, and the neighborhoods (if any) sit
# between parentheses.
for tr in table.find_all('tr'):
    for td in tr.find_all('td'):
        # Clean the text once and take every index from that same string, so
        # slice positions cannot drift with whitespace in the raw cell text.
        text = td.text.replace('\n', '').strip()
        postcode = text[:3]
        details = text[3:]
        open_idx = details.find('(')
        close_idx = details.find(')')

        if open_idx == -1:
            borough = details
        else:
            borough = details[:open_idx].strip()

        # Decide the neighborhood per cell: a cell without a parenthesised
        # part must not inherit the value left over from the previous cell.
        if open_idx != -1 and close_idx != -1:
            neighborhood = details[open_idx + 1:close_idx].replace(' /', ',')
        else:
            neighborhood = ''

        table_contents.append({
            'PostalCode': postcode,
            'Borough': borough,
            'Neighborhood': neighborhood,
        })

# assign content into dataframe
# Explicit columns keep the frame usable, with a fixed column order, even
# when no cells were scraped.
toronto = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])

# special handling
# Map specific raw borough strings (matched as whole values) to the cleaned-up
# labels used in the rest of the notebook.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
toronto['Borough'] = toronto['Borough'].replace(borough_fixes)

In [6]:
# Keep only the rows in which none of the three fields holds the
# 'Not assigned' placeholder.
empty = 'Not assigned'
postcode_assigned = toronto.PostalCode != empty
borough_assigned = toronto.Borough != empty
neighborhood_assigned = toronto.Neighborhood != empty
df = toronto[postcode_assigned & borough_assigned & neighborhood_assigned]

df.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,"Regent Park, Harbourfront",M5A
5,North York,"Lawrence Manor, Lawrence Heights",M6A
6,Queen's Park,Ontario Provincial Government,M7A


In [7]:
# group neighborhoods by postal code and borough
def neighborhood_list(grouped):
    """Return the group's 'Neighborhood' values, sorted and joined with ', '."""
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# Build one row per (PostalCode, Borough) pair, with that pair's neighborhoods
# merged into a single string by neighborhood_list.
group_keys = ['PostalCode', 'Borough']
grp = df.groupby(group_keys)
merged = grp.apply(neighborhood_list)
df2 = merged.reset_index(name='Neighborhood')

In [8]:
# print the dataframe's shape as (rows, columns), then preview the first rows
n_rows, n_cols = df2.shape
print((n_rows, n_cols))

df2.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Save the grouped table as CSV in the working directory.
output_file = 'toronto_neighborhood.csv'
df2.to_csv(output_file)