# List of postal codes of Toronto, Canada

This notebook uses Beautiful Soup to scrape data from the web into a DataFrame.

In [1]:
# intall packages need
!pip3 install requests
!pip3 install beautifulsoup4



In [2]:
# Establish environment
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Load page from URL
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Optional: print the raw page content (pprint gives a more readable display).
# The output is very long, so this step can be skipped.
# import pprint
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(page.content)

In [5]:
# Take the first <table> element on the page as the source of the data
venues = soup.find('table')
# find() returns None when nothing matches; fail with a clear message rather
# than an AttributeError on the prettify() call below.
if venues is None:
    raise ValueError('No <table> element found on the page')
venues.prettify()

'<table class="wikitable sortable">\n <tbody>\n  <tr>\n   <th>\n    Postal Code\n   </th>\n   <th>\n    Borough\n   </th>\n   <th>\n    Neighbourhood\n   </th>\n  </tr>\n  <tr>\n   <td>\n    M1A\n   </td>\n   <td>\n    Not assigned\n   </td>\n   <td>\n    Not assigned\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M2A\n   </td>\n   <td>\n    Not assigned\n   </td>\n   <td>\n    Not assigned\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M3A\n   </td>\n   <td>\n    North York\n   </td>\n   <td>\n    Parkwoods\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M4A\n   </td>\n   <td>\n    North York\n   </td>\n   <td>\n    Victoria Village\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M5A\n   </td>\n   <td>\n    Downtown Toronto\n   </td>\n   <td>\n    Regent Park, Harbourfront\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M6A\n   </td>\n   <td>\n    North York\n   </td>\n   <td>\n    Lawrence Manor, Lawrence Heights\n   </td>\n  </tr>\n  <tr>\n   <td>\n    M7A\n   </td>\n   <td>\n    Downtown Toronto\n   </td>\n   <td>

In [6]:
# In the prettified table, every value we need sits inside a <td> cell,
# so collect all <td> elements of the table into one flat list
toronto = venues.find_all('td')

In [7]:
# Preview the first table row. The flat cell list repeats in groups of three:
# position % 3 == 0 is Postal Code, 1 is Borough, 2 is Neighbourhood.
first_row = [toronto[position].text.strip() for position in range(3)]
print(*first_row, sep='   ')

M1A   Not assigned   Not assigned


In [8]:
# Extract the cell values into three lists and create a data frame from them.
# The flat <td> list repeats in groups of three cells:
# Postal Code, Borough, Neighbourhood.
cell_texts = [cell.text.strip() for cell in toronto]

# zip() silently drops a trailing partial row, so verify up front that the
# cells divide evenly into three-column rows.
if len(cell_texts) % 3 != 0:
    raise ValueError(
        'Expected the number of <td> cells to be a multiple of 3, got '
        + str(len(cell_texts)))

# Stride slicing picks every third cell, starting at each column's offset.
postal_code = cell_texts[0::3]
borough = cell_texts[1::3]
neighbourhood = cell_texts[2::3]

# create data frame
df = pd.DataFrame(list(zip(postal_code, borough, neighbourhood)),
                  columns=['Postal Code', 'Borough', 'Neighbourhood'])

df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
# Create the final data frame: keep only rows whose Borough is assigned
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
# (rows, columns) of the data frame
df.shape

(103, 3)

# Add Latitude and Longitude to the data frame

In [11]:
# Read the CSV of coordinates from its URL into a data frame
geospatial_csv = 'http://cocl.us/Geospatial_data'
lng_lat = pd.read_csv(geospatial_csv)
lng_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Attach the coordinates to each postal code. The inner join keeps only
# the postal codes that appear in both data frames.
toronto_venues = df.merge(lng_lat, how='inner', on='Postal Code')
toronto_venues.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [18]:
# (rows, columns) of the merged data frame
toronto_venues.shape

(103, 5)