In [2]:
#To be used for IBM Applied Data Science Capstone Project
import pandas as pd
import numpy as np

In [3]:
#Import other useful libraries for segmenting and clustering
import xml
import csv
import json
import requests
from bs4 import BeautifulSoup

In [4]:
#Get HTML data from wikipedia page
# A timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting the
# following cells parse an error page as if it were the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page's HTML text


In [5]:
#Parse the downloaded HTML into a BeautifulSoup tree so its tags can be searched
# BeautifulSoup is already imported in the imports cell near the top of the notebook,
# so it is not imported again here.
soup = BeautifulSoup(website_url, 'lxml')  # website_url holds HTML text; 'lxml' selects the parser

In [6]:
#Locate the first table with the 'wikitable' class and collect all of its <tr> rows
neighborhood = soup.find('table', class_='wikitable')
# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the find_all() call below.
if neighborhood is None:
    raise ValueError("No table with class 'wikitable' was found in the downloaded page")
neighborhood_rows = neighborhood.find_all('tr')

In [7]:
#Get postal code, borough and neighbourhood from the table
# Read every row cell by cell (<th> cells in the header row, <td> cells in data rows)
# rather than splitting the row text on newlines, so the result does not depend on
# how whitespace happens to be laid out between the tags.
information = []
for row in neighborhood_rows:
    info = [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
    if info:  # skip rows that contain no cells at all
        information.append(info)

#Convert to pandas data frame: the first parsed row supplies the column names
neighbor_df = pd.DataFrame(information[1:], columns=information[0])

#Remove unassigned boroughs and renumber the remaining rows from 0.
# Chained instead of inplace=True so re-running the cell always rebuilds the same frame.
neighbor_df = neighbor_df[neighbor_df['Borough'] != 'Not assigned'].reset_index(drop=True)
neighbor_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


# Combining neighbourhoods with the same postal code

In [8]:
#Combining neighbourhoods with the same postal code
grouped = neighbor_df.groupby(['Postcode']) # group by Postcode
# join the neighbourhoods of each postcode into one comma-separated string
# (the earlier x.sum() concatenation was immediately overwritten, so it is dropped)
neighborhood_grouped = grouped['Neighbourhood'].apply(', '.join)
# matches a borough to each postcode: take the first borough listed for the postcode.
# set(x).pop() returned an arbitrary element, so the result could vary between runs
# whenever a postcode appears with more than one borough.
borough_grouped = grouped['Borough'].first()
# turn borough_grouped and neighborhood_grouped into dataframes
borough = borough_grouped.to_frame()
# NOTE: this rebinds `neighborhood`, which previously held the parsed <table> tag
neighborhood = neighborhood_grouped.to_frame()
#combine the dataframe borough and the dataframe neighborhood into one dataframe
grouped_final = borough.merge(neighborhood, on="Postcode")
#Confirm shape of the newly created dataframe
grouped_final.shape

(103, 2)

In [9]:
#Load the latitude and longitude recorded for each postal code from the local CSV file
geospatial_data = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
#Preview the first rows to check the columns that were read
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#Rename 'Postal Code' to 'Postcode' so both tables share the same join key name.
# The column is selected by name rather than by position, so a change in the CSV's
# column order cannot cause a different column to be relabelled as the key.
geospatial_data = geospatial_data.rename(columns={'Postal Code': 'Postcode'})

# Add the geospatial data to the main table by matching on Postcode.
# merge() defaults to an inner join, so postcodes missing from either table are dropped.
full_table = grouped_final.merge(geospatial_data, on='Postcode')
full_table.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
