### Import necessary libraries

In [6]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Reached only after every import above has succeeded.
print('Libraries Imported!')

Libraries Imported!


### Create a dataframe of Canadian postal codes beginning with M

In [7]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.text

In [8]:
# The downloaded document is an HTML page, so parse it with an HTML parser.
# 'html.parser' ships with Python and needs no extra install; the 'xml'
# parser is intended for XML documents.
bs = BeautifulSoup(results, 'html.parser')

In [9]:
# Take the first <table> element of the parsed page.
table = bs.find('table')
if table is None:
    # Fail here with a clear message instead of an AttributeError on None
    # when the rows are iterated in a later cell.
    raise ValueError('No <table> element found in the downloaded page')

In [10]:
# Column labels for the scraped table.
col = ['PostalCode', 'Borough', 'Neighborhood']
# Empty frame: the columns exist, but there are no rows yet.
df = pd.DataFrame(data=None, index=None, columns=col)

In [11]:
# Gather the stripped text of each table row's <td> cells. Rows that do not
# yield exactly one value per column are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(col):
        rows.append(row_data)

# Build the frame once from the collected rows. This avoids growing it one
# row at a time, and re-running the cell rebuilds df instead of appending
# duplicate rows to it.
df = pd.DataFrame(rows, columns=col)

In [12]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Data Cleaning

1. Only process the cells that have an assigned borough. Ignore cells with a borough that is __Not assigned__.

In [13]:
# Keep only the rows whose Borough is assigned. The explicit .copy() makes df
# an independent frame, so assigning to one of its columns later does not
# raise SettingWithCopyWarning.
df = df[df.Borough != 'Not assigned'].copy()

In [14]:
# Preview the first five rows left after filtering.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


2. For postal codes with multiple neighborhoods, separate the neighborhood names with a comma instead of a slash (/).

In [15]:
# Pull the Neighborhood column out as a plain Python list.
listNeigh = df['Neighborhood'].tolist()

In [16]:
# Swap every " /" separator for "," in each neighborhood string.
newListNeigh = [neigh.replace(" /", ",") for neigh in listNeigh]

In [17]:
# Wrap the cleaned names in a Series (it gets a fresh 0..n-1 index).
n = pd.Series(newListNeigh)
# Assign the raw values, not the Series itself: df still carries the gapped
# row labels left by filtering, so assigning the Series would align on index
# labels rather than by position.
df['Neighborhood'] = n.values

In [18]:
# Preview the first five rows with the comma-separated neighborhoods.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


3. If a cell has a borough but a __Not assigned__ neighborhood, then the neighborhood will be the same as the borough.

In [19]:
# Boolean array: True where a neighborhood is still the 'Not assigned' placeholder.
(df['Neighborhood'] == 'Not assigned').values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

**Note:** No row has a __Not assigned__ neighborhood, so this step is not needed.

4. Reset the index values

In [20]:
# Replace the gapped row labels left over from filtering with a fresh 0..n-1
# index; drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)

In [21]:
# Preview the first twelve rows after re-indexing.
df.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


5. Show the size of the dataframe.

In [22]:
# (number of rows, number of columns) of the cleaned frame.
df.shape

(103, 3)

### Get Latitudes and Longitudes for each Postal Code

In [23]:
# Load the coordinates table from a CSV file given by a relative path.
# NOTE(review): the file presumably has PostalCode, Latitude and Longitude
# columns — confirm against the file itself.
latLon = pd.read_csv('Geospatial_Coordinates.csv')

In [24]:
# Attach the coordinate columns to each postal code. With an inner join, rows
# whose PostalCode is missing from either frame are left out of the result.
inner_join = df.merge(latLon, how='inner', on='PostalCode')

In [25]:
# Preview the first twelve rows of the merged frame.
inner_join.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
