# Postal Data from Canada's Neighborhoods

## Dataframe Scraping and Preparation



**First, load the dependencies:**

In [86]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None) # None = no limit: show every column when a dataframe is displayed
pd.set_option('display.max_rows', None) # None = no limit: show every row when a dataframe is displayed

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): since pandas 1.0 this function is exposed as pandas.json_normalize and the
# pandas.io.json import path is deprecated -- confirm against the installed pandas version.
from pandas.io.json import json_normalize # transform a JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

from bs4 import BeautifulSoup  # HTML parsing

# import the library we use to open URLs
import urllib.request

# confirmation message printed once every import above has succeeded
print('Libraries imported.')

Libraries imported.


**Scrape the Wikipedia page and isolate the table from the page:**

In [87]:
# URL of the Wikipedia page we are going to scrape
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the URL and parse the HTML into a BeautifulSoup tree. The `with` block
# guarantees the HTTP response is closed once parsing has finished, even if
# the parser raises.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# every <table> element found in the page
all_tables = soup.find_all("table")

# the first table whose class attribute is exactly 'wikitable sortable'
postal_table = soup.find('table', class_='wikitable sortable')

# Fail here with a clear message instead of letting later cells crash on None.
if postal_table is None:
    raise ValueError(
        "No table with class 'wikitable sortable' was found at " + url
    )

**Loop through the rows to assign data to cells, then build the dataframe:**

In [88]:
# Walk the table rows and collect the three cell values of each row into
# parallel lists (same position in A, B and C = same table row).

A = []  # text of the first cell of each row
B = []  # text of the second cell of each row
C = []  # text of the third cell of each row


for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # rows that do not contain exactly three <td> cells are skipped
    if len(cells) != 3:
        continue
    # Take the first text node of every cell and remove newline characters.
    # find() returns None when a cell holds no text at all, so fall back to ''
    # rather than raising AttributeError on .replace().
    values = [(cell.find(string=True) or '').replace('\n', '') for cell in cells]
    A.append(values[0])
    B.append(values[1])
    C.append(values[2])
        


In [89]:
# Assemble the three parallel lists into a single dataframe, one column each.

code_df = pd.DataFrame(
    {
        'Postal Code': A,
        'Borough': B,
        'Neighborhood': C,
    }
)
code_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Clean the dataframe:**

In [92]:
# Keep only the rows whose Borough is not the placeholder 'Not assigned',
# then renumber the index from 0 (the old index is discarded).
code_df = (
    code_df
    .loc[code_df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
code_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [91]:
# (number of rows, number of columns) of code_df
code_df.shape

(103, 3)