In [6]:
#Import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import re
import numpy as np
import pandas as pd
import csv

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [8]:
def remove_tags(data_arr_list):
    tags = ["<td>", "</td>", "\n", "td>" , "</td", "]]"]
    for i in range(0, len(data_arr_list)):
        for j in range(0, len(tags)):
            if str(tags[j]) in str(data_arr_list[i]):
                data_arr_list[i] = data_arr_list[i].replace(tags[j], "")
                if 'title="' in str(data_arr_list[i]):
                    data_arr_list[i] = str(data_arr_list[i]).split('title="')[1].split('">')[0]
    
    return (data_arr_list)

In [9]:
# Funtion: Compile_Postal

def compile_postal(data_arr_list):

    #Compare the postal code to the next one in order
    for i in range (0, len(data_arr_list)-3, 3):

        if str(data_arr_list[i]) == str(data_arr_list[i+3]):
            #Add to the current postal code
            if str(data_arr_list[i+4]) not in data_arr_list[i+1]:
                data_arr_list[i+1] = str(data_arr_list[i+1]) + ", " + str(data_arr_list[i+4])
            if str(data_arr_list[i+5]) not in data_arr_list[i+2]:
                data_arr_list[i+2] = str(data_arr_list[i+2]) + ", " + str(data_arr_list[i+5])
            
            #Remove old entry(s)
            del(data_arr_list[i+3])
            del(data_arr_list[i+3])
            del(data_arr_list[i+3])
            
            data_arr_list = compile_postal(data_arr_list)
            
            break
            
    return data_arr_list

In [10]:
# Description: Drop borough rows that are N/A 

def drop_na_borough(data_arr_list):

    for i in range (1, len(data_arr_list)-1, 3):
        if str(data_arr_list[i]) == 'Not assigned':
            
            #Remove the row
            del(data_arr_list[i-1])
            del(data_arr_list[i-1])
            del(data_arr_list[i-1])
            
            data_arr_list = drop_na_borough(data_arr_list)
            break
            
    return data_arr_list

In [11]:
# Description: Assign borough value to neighborhood if neighborhood is N/A 
 
def neighborhood_borough(data_arr_list):
    
    for i in range (2, len(data_arr_list), 3):
        if str(data_arr_list[i]) == 'Not assigned':
            
            data_arr_list[i] = str(data_arr_list[i-1])
            data_arr_list = neighborhood_borough(data_arr_list)
            
            break
            
    return data_arr_list

In [12]:
# specify the url
quote_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# query the website and return the html to the variable ‘page’
page = urlopen(quote_page)

# parse the html using beautiful soup and store in variable `soup`
soup = BeautifulSoup(page, "html.parser")

#Define array to hold all of the data points
data_arr = []

#Get the first table in the html
data = soup.findAll('table')

#assign the cells to the array
for row in data:
    for item in row.findAll('td'):
        if "<td>" in str(item):
            data_arr.append(str(item))

#Remove the last element in the list as it is invalid
data_arr.pop()            

#Clean up the tags and data points

#Remove HTML tags
data_arr = remove_tags(data_arr)

#Compile postal codes
data_arr = compile_postal(data_arr)

#Drop Not assigned boroughs
data_arr = drop_na_borough(data_arr)

#Assign borough to n/a neighborhoods
data_arr = neighborhood_borough(data_arr)

In [20]:
#Create a dictionary
toronto_dict = {'Postal_Code':data_arr[0::3], 'Borough': data_arr[0::3], 
                                     'Neighborhood':data_arr[0::3] }

#Pandas Data frame
toronto_df = pd.DataFrame.from_dict(toronto_dict)

#*********Uncomment these lines to focus only on those boroughs in Toronto - containing the word Toronto*********#
#toronto_df = toronto_df[toronto_df['Borough'].str.contains("Toronto")==True]
#toronto_df.reset_index(drop=True, inplace=True)

#Print the shape of the new frame and display the first 5 rows
print(toronto_df.shape)

toronto_df.head()

(104, 3)


Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M3A,M3A,M3A
1,M4A,M4A,M4A
2,M5A,M5A,M5A
3,M6A,M6A,M6A
4,M7A,M7A,M7A


In [22]:
toronto_df['Latitude'] = 'Not Set'
toronto_df['Longitude'] = 'Not Set'

#Open file containing the geospacial coordinates for Toronto
with open('Geospatial_Coordinates.csv', 'r') as csvfile:
    geo_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in geo_reader:
        #Find the postal code in the fame and add coordinates
        toronto_df.loc[toronto_df['Postal_Code'] == str(row[0]), "Latitude"] = str(row[1])
        toronto_df.loc[toronto_df['Postal_Code'] == str(row[0]), "Longitude"] = str(row[2])
        
#Set type to numeric
toronto_df['Latitude'] = pd.to_numeric(toronto_df['Latitude'])
toronto_df['Longitude'] = pd.to_numeric(toronto_df['Longitude'])


toronto_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Geospatial_Coordinates.csv'

In [23]:
CLIENT_ID = 'ZV4HIDZPRGJHKKBI454FEA534ILEJ224B2GSBNCSGAJXXLGT' 
CLIENT_SECRET = '0HJ22432R2XCYBFACPRGUFDS1IWRW3SLNBTFJ5IVQVFWGZFW' 
VERSION = '20200601' 
radius=500
LIMIT=100

In [24]:
## Cognitive Class.ai
## Segmenting and Clustering Neighborhoods in New York City
## Note: This function is taken from : https://labs.cognitiveclass.ai/tools/jupyterlab/lab/tree/labs/DP0701EN/DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb
## I do not take credit for writing the below function
## I have used this function and made changes where nessesary for use in this project

#This function will get the near-by venues of a location using coordinates

def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    remove = []
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #Log postal codees without nearby addresses 
        if not results:
            remove.append(name)

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal_Code', 
                  'Postal_Latitude', 
                  'Postal_Longitude', 
                  'Venue', 
                  'Venue_Latitude', 
                  'Venue_Longitude', 
                  'Venue_Category']
    
    return(nearby_venues, remove)

In [25]:
#Radius of 500m and a limit of 100 venues
radius = 500
LIMIT = 100


#Get the venues near Toronto postal codes
toronto_venues,remove = getNearbyVenues(names=toronto_df['Postal_Code'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude'],
                                   radius = radius
                                  )

#Remove those postal codes with no nearby venues
for item in remove:
    indexNames = toronto_df[ toronto_df['Postal_Code'] == item ].index
    # Delete these row indexes from dataFrame
    toronto_df.drop(indexNames , inplace=True)

KeyError: 'groups'