In [None]:
!pip install geopy

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a dataframe is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize transforms a JSON document into a pandas dataframe.
# It is exposed as pandas.json_normalize since pandas 1.0; the older
# pandas.io.json location is deprecated and only tried as a fallback
# for installations that predate the top-level name.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Website scraping with BeautifulSoup
#!conda install -c conda-forge beautifulsoup4 --yes  # uncomment this line if beautifulsoup4 is not installed yet
from bs4 import BeautifulSoup as bs

print('Libraries imported.')

In [None]:

# Download the Wikipedia page listing the Canadian postal codes that start with "M"
# and save it next to the notebook. requests is used instead of a shell call so the
# cell does not need wget, and raise_for_status() stops the cell on an HTTP error
# instead of printing the success message for a download that never happened.
WIKI_POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# NOTE(review): a descriptive User-Agent is sent because Wikimedia asks automated
# clients to identify themselves -- adjust the string to your project/contact.
response = requests.get(
    WIKI_POSTAL_CODES_URL,
    headers={'User-Agent': 'toronto-postal-codes-notebook/1.0 (educational use)'},
    timeout=30,
)
response.raise_for_status()

# Write the raw bytes so the file keeps the encoding the server sent
with open('canada_postal_code_list_from_wikipedia.html', 'wb') as html_file:
    html_file.write(response.content)

print('HTML Postal Code page downloaded!')

In [None]:
# Parse the downloaded page. The encoding is passed explicitly because open()
# otherwise uses the platform default (assumes the saved page is UTF-8).
with open("canada_postal_code_list_from_wikipedia.html", encoding="utf-8") as fp:
    soup = bs(fp, 'lxml')

# Get the first table element of the page; fail with a clear message when
# the page has none instead of hitting an AttributeError on None further down.
tagTable = soup.table
if tagTable is None:
    raise ValueError("No table found in canada_postal_code_list_from_wikipedia.html")

# Get the table body; when the markup has no explicit tbody element the rows
# hang directly off the table, so fall back to the table itself.
body = tagTable.tbody if tagTable.tbody is not None else tagTable

In [None]:
# Define the dataframe columns from the header cells ('th' tags) of the
# first table row.
colTab = body.tr.find_all('th')

# Only the first three headers are used. get_text() is read straight from the
# already-parsed tags (no re-parsing) and still returns the text when a header
# wraps it in nested markup, a case where .string would be None.
colNames = [header.get_text().strip() for header in colTab[:3]]

# instantiate the (still empty) dataframe and display it
postcode_df = pd.DataFrame(columns=colNames)
postcode_df

In [None]:
# Build the postal-code dataframe from the table rows.
#
# Every row after the header row is read cell by cell (postcode, borough,
# neighbourhood). Rows whose borough is 'Not assigned' are ignored; a
# neighbourhood that is 'Not assigned' takes the name of its borough.
# The rows are collected in a plain list and turned into a dataframe once,
# which avoids growing a dataframe row by row (DataFrame.append is gone in
# pandas 2.0 and copies the whole frame on every call in older versions).

# extract all 'tr' tagged fields except the first one (column names)
codesTab = body.find_all('tr')[1:]

records = []
for code in codesTab:
    # One text value per direct cell of the row. Reading per cell (rather than
    # per string of the whole row) keeps the three fields aligned when a cell
    # is empty or holds several text fragments such as links.
    cells = [cell.get_text().strip()
             for cell in code.find_all(['td', 'th'], recursive=False)]
    # Pad short rows so the unpacking below always finds three values;
    # anything beyond the third cell is ignored.
    cells += [""] * (3 - len(cells))
    postcode, borough, neighbourhood = cells[:3]

    # Skip rows without a postal code and rows with a borough that is Not assigned.
    if not postcode or borough == 'Not assigned':
        continue

    # check a neighbourhood is assigned else set it with borough
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    records.append({'Postcode': postcode,
                    'Borough': borough,
                    'Neighbourhood': neighbourhood})

# Column order is fixed explicitly so the frame has the same layout even when
# no row was kept.
postcode_df = pd.DataFrame(records, columns=['Postcode', 'Borough', 'Neighbourhood'])

# Combine rows with same postal code into one row with the neighbourhoods separated with a comma
df = postcode_df.groupby('Postcode', as_index=False).agg({'Borough':'first', 'Neighbourhood':', '.join})
print ("Toronto postal codes dataframe dimensions = ", df.shape)
df.head(15)

In [None]:
 #Store dataframe locally as a csv file to be easily used later
# Persist the cleaned postal-code table as a local file next to the notebook.
# index=False keeps the auto-generated row index out of the CSV so the file
# reads back with exactly the Postcode / Borough / Neighbourhood columns.
df.to_csv('toronto_postal_codes.csv', index=False)


In [None]:
# Fetch the postal-code -> latitude/longitude table and keep a local copy.
# raise_for_status() stops the cell on an HTTP error, so a failed download is
# reported here instead of surfacing later as a confusing read_csv error.
response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
response.raise_for_status()
with open('Geospatial_Coordinates.csv', 'wb') as csv_file:
    csv_file.write(response.content)

# Rename the 'Postal Code' column so both dataframes share the 'Postcode' merge key.
# rename() returns a new frame, which keeps the cell safe to re-run.
df_coordinates = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'Postcode'}
)

# For each postal code get the latitude and longitude values by merging the
# two dataframes on the Postcode column. pd.merge defaults to an inner join,
# so a postal code present in only one of the frames is left out of df_latlg.
df_latlg = pd.merge(df, df_coordinates, on='Postcode')
df_latlg.head(15)