# Scraping Toronto Postal Codes From Wikipedia

In [97]:
import pandas as pd
import numpy as np
import requests

# using BeautifulSoup for parsing the html
from bs4 import BeautifulSoup

## First, get the source

I use the `requests` library to download the page's HTML from the URL. Then, using Beautiful Soup, I can begin to parse the data.

In [None]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast instead of hanging forever if the server is unreachable, and stop
# on HTTP errors (4xx/5xx) so an error page is never parsed as if it were the
# postal-code table.
response = requests.get(wiki_url, timeout=10)
response.raise_for_status()

source = response.text
soup = BeautifulSoup(source, 'lxml')

## Next, find the data

The data is in a table on the Wikipedia page. Going row by row through the table, I decide which data to keep.

If the borough is 'Not assigned' then the entry is skipped.
If a postal code has more than one neighborhood, they are combined into a single 'Neighborhood' entry, separated by commas.

If a row has a borough but its neighborhood is 'Not assigned', then the neighborhood is set to the borough name.

In [99]:
# Placeholder text the page uses for cells that have no value.
NOT_ASSIGNED = 'Not assigned'
# Upper bound on the row index to scan, carried over from the original cell;
# presumably where the postal-code table ends -- TODO confirm against the page.
LAST_TABLE_ROW = 290

neighborhood_data = []  # one record per postal code, in first-seen order
records_by_code = {}    # postal code -> its record inside neighborhood_data

# Calling the soup object is shorthand for find_all(); rows[0] is skipped as
# the header row.
rows = soup('tr')
for row in rows[1:LAST_TABLE_ROW]:
    # Strip surrounding whitespace (e.g. the trailing '\n' of the last cell)
    # rather than blindly slicing off the final character.
    cells = [cell.text.strip() for cell in row('td')]

    # Rows without the three expected data cells cannot be a table entry.
    if len(cells) < 3:
        continue
    postal_code, borough, neighborhood = cells[:3]

    # If the borough is not assigned, skip this row.
    if borough == NOT_ASSIGNED:
        continue
    # If the neighborhood is not assigned, fall back to the borough name.
    if neighborhood == NOT_ASSIGNED:
        neighborhood = borough

    record = records_by_code.get(postal_code)
    if record is None:
        # First time this postal code is seen: start a new record.
        record = {
            'PostalCode': postal_code,
            'Borough': borough,
            'Neighborhood': neighborhood,
        }
        neighborhood_data.append(record)
        records_by_code[postal_code] = record
    else:
        # Known postal code: append the neighborhood to the existing entry.
        record['Neighborhood'] += ', ' + neighborhood

# create dataframe for neighborhoods
neighborhood_df = pd.DataFrame(neighborhood_data, columns=['PostalCode', 'Borough', 'Neighborhood'])

neighborhood_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [96]:
# Sanity check: (number of rows, number of columns) of the neighborhoods frame.
neighborhood_df.shape

(103, 3)