# Capstone Project Notebook (part 1 of 3)
### by Dimitar Kunev
This notebook will be used as the primary vehicle for my capstone project.

In [1]:
%%time
# set up the libraries
#    this list will grow as I progress through the project
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
# from bs4 import BeautifulSoup

Wall time: 3.37 s


In [2]:
%%time
# Scrape the table of Toronto postal codes from Wikipedia into the DataFrame
# `df`, one column per cell of the page's first table row.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on network problems: without a timeout the request can hang
# indefinitely, and without the status check an HTTP error page would be
# parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

source = lh.fromstring(page.content)
tr_els = source.xpath('//tr')  # every <tr> in the document, in document order
if not tr_els:
    raise ValueError('no <tr> elements found at ' + url)

# The first row supplies the column titles; pair each title with an empty
# list that will collect that column's cell texts.
headers = [(cell.text_content(), []) for cell in tr_els[0]]

# Walk the rows after the header and stop at the first one whose cell count
# differs from the header's -- that row is taken to be past the end of the table.
for row in tr_els[1:]:
    if len(row) != len(headers):
        break
    for (_, column), cell in zip(headers, row.iterchildren()):
        column.append(cell.text_content())

Dict = {title: column for (title, column) in headers}
df = pd.DataFrame(Dict)

Wall time: 201 ms


In [3]:
# Drop the newline characters from every cell value, trim surrounding
# whitespace from the column titles, then preview the first rows.
df = df.replace(to_replace='\n', value='', regex=True)
df.columns = [title.strip() for title in df.columns]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# number of rows in df
len(df)

181

In [5]:
# Keep only the rows that have a borough assigned and a non-empty postal code.
has_borough = df['Borough'] != 'Not assigned'
has_code = df['Postal Code'] != ""
df1 = df[has_borough & has_code]
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# number of rows in df1
len(df1)

103

In [7]:
# Sort the data frame by Postal Code (just to be on the safe side) and
# renumber the rows from 0. The result is rebound to df1 instead of using
# inplace=True: df1 was produced by boolean filtering, and in-place edits of
# such a frame may raise SettingWithCopyWarning in some pandas versions.
df1 = df1.sort_values(by=['Postal Code']).reset_index(drop=True)
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Assumption:
No postal code has more than one neighbourhood listed where at least one of those neighbourhoods is "Not assigned" ***and*** at least one other is known.

In [8]:
# Collapse all rows that share a postal code and borough into one row whose
# Neighbourhood is the ', '-joined list of the group's neighbourhood names.
by_code_and_borough = df1.groupby(['Postal Code', 'Borough'])
joined_names = by_code_and_borough['Neighbourhood'].apply(', '.join)
df1 = joined_names.reset_index()
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Where the neighbourhood is "Not assigned", fall back to the borough name;
# every other neighbourhood value is kept as is.
unassigned = df1['Neighbourhood'].eq('Not assigned')
df1['Neighbourhood'] = df1['Neighbourhood'].mask(unassigned, df1['Borough'])

In [10]:
# (rows, columns) of df1 after the clean-up steps above
df1.shape

(103, 3)

In [11]:
# Save the cleaned table to disk. index=True is the pandas default, spelled
# out here because it also writes the row index as the first, unnamed column.
df1.to_csv('toronto.csv', index=True)