Data Science Capstone Project Assignment - Toronto Postcode, Borough and Neighborhood Web Page Scraping

In [40]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


# Store the required Postcode web page url 
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# Use the BeautifulSoup package for processing the web page - set the returned data to be in an lxml format
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

# Scrape the Postcode table on the web page
web_table = soup.find('table',{'class':'wikitable sortable'})

# Load the contents of the Postcode table into a text array, use the strip method to remove unwanted characters from the dataset 
table_rows = []
# Loop through the contents of table based on tr and td tags
for table_row in web_table.find_all('tr'):
    table_data = table_row.find_all('td')
    # Convert each table row element to text, strip out unwanted characters and store in a text array
    table_rows.append([i.text.strip() for i in table_data]) 
    
# Isolate the Postcode table column headers (th - table header) and use the strip method to remove unwanted characters from the dataset
table_header = web_table.find_all('th')
# Convert each header list element to text and store in table_header 
table_header = [c.text for c in table_header]
# Loop through each of the characters in the string and use the strip method to remove unwanted characters such as '\n' new line escape codes
table_header = [i.strip() for i in table_header]
#print(table_header)

# Load the table rows and column headers into a Postcode dataframe - pdf
pdf = pd.DataFrame(data = table_rows, columns = table_header)
#pdf.head()

# Remove the empty top row from the dataframe and reset the index
pdf = pdf.drop([0])
pdf = pdf.reset_index(drop = True)
#pdf.head()

# Only include Boroughs in the dataframe that are assigned a Postcode - Ignore cells with a Borough that are 'Not assigned'
pdf = pdf[pdf.Borough != 'Not assigned']

# Sort the values in the dataframe
pdf.sort_values_by = ['Postcode', 'Borough', 'Neighborhood']

# For a given Postcode that has more than one assigned Neighborhood, join the cells to form just one Postcode cell separated by a comma
pdf = pdf.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()

pdf.shape



(103, 3)

Add Geospatial latitude and longitude data for a given postcode to the Postcode dataframe

In [41]:
# Locate the required geospatial data for a given postcode
geo_link = 'http://cocl.us/Geospatial_data'
geo_data = requests.get(geo_link)

# Strip out unwanted characters from the geospatial data and form into rows
geo_data = geo_data.text.strip('\n')
geo_data = geo_data.split('\r')

# Loop through each row in the geospatial data and strip out 'new line' esacpe codes
geo_data = [c.strip('\n') for c in geo_data]

# Split each geospatial data row into columns
geo_data = [i.split(',') for i in geo_data]

# Construct the Coordinates dataframe from the prepared geospatial data set
cdf = pd.DataFrame(data = geo_data, columns = geo_data[0])

# From the Coordinates dataframe, remove the top row, column headers, from the full data set and reset the index
cdf = cdf.drop([0])
cdf = cdf.reset_index(drop = True)

# Append Latitude and Longitude columns to the Postcode dataframe and initiate with 'null' data
pdf.insert(3, 'Latitude', '')
pdf.insert(4, 'Longitude', '')

# Where Postcode and Coordinate dataframe postcodes match, update the Postcode dataframe Latitude and Longitude values
# Loop through all the Postcodes in the Postcode dataframe
for i in pdf['Postcode']:
    # Loop through all the Postal Codes in the Coordinates dataframe
    for c in cdf['Postal Code']:
        # Identify matching postcode values in the Postcode and Coordinates dataframes
        if i == c:
            # Based on matching postcode values, update the Postcode dataframe with the associated Latitude and Longitude values
            pdf.loc[pdf['Postcode'] == i, 'Latitude'] = cdf.loc[cdf['Postal Code'] == c, 'Latitude']
            pdf.loc[pdf['Postcode'] == i, 'Longitude'] = cdf.loc[cdf['Postal Code'] == c, 'Longitude']
            
pdf.head()       

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761


Display the Toronto postcode and geospatial data in a cluster map

In [42]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 

# Return Latitude and Longitude coordinates for a given addresses 
from geopy.geocoders import Nominatim 

# JSON to Pandas DadaFrame 
from pandas.io.json import json_normalize
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans
!pip -q install folium
import folium # map rendering library

address = 'Toronto, Ontario'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

# Create a map of Toronto city using the Latitude and Longitude information
Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Using the location information from geospat file, add markers on the map for each neighbourhood
    
for lat, lng, borough, neighborhood in zip(pdf['Latitude'], pdf['Longitude'], pdf['Borough'], pdf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough) # This will label the markers by neighborhood, borough name\
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    ).add_to(Toronto_map)
   
Toronto_map



GeocoderUnavailable: Service not available

Cluster Neighborhoods of the Downtown Toronto Borough

In [None]:
# Build the Toronto DataFrame tdf based on the Downtown Toronto Borough and cluster the associated Neighbourhoods around it

tdf = pdf[pdf['Borough'].str.contains('Toronto')].reset_index(drop = True)

address = 'Downtown Toronto, TO'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude)) 
#print(*zip(tdf['Latitude'], tdf['Longitude'], tdf['Neighbourhood']))

# Map of Downtown Toronto and its surrounding Neighborhoods
map_dt = folium.Map(location=[latitude, longitude], zoom_start=11)
  
# Downtown Toronto as Red on the map
folium.CircleMarker([latitude, longitude], radius=10, popup=label, color='Red', fill=True, fill_color='Red', fill_opacity=0.9,).add_to(map_dt)

# Neighborhoods as Blue
for lat, lng, label in zip(tdf['Latitude'], tdf['Longitude'], tdf['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            ).add_to(map_dt)
    
map_dt

Foursquare API Venue Data

In [None]:
# The code was removed by Watson Studio for sharing.

Function to call Foursquare API 

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):

    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
                    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
                
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    NearbyVenues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    
    NearbyVenues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    return(NearbyVenues)

In [None]:
# Set the radius of the search to be within 500m
radius = 500
# Set the limit to 100 venues per neighborhood
LIMIT = 100

# For each of the Neighborhoods in the Toronto dataframe, return the Venues within a 500m radius up to a limit of 100 Venues per Neighborhood
Toronto_venues = getNearbyVenues(names = tdf['Neighborhood'], latitudes = tdf['Latitude'], longitudes = tdf['Longitude'])
Toronto_venues.head(10)
Toronto_venues.shape

In [None]:
# Number of venues returned by Venue Category
Toronto_venues.groupby('Venue Category').count()

In [None]:
# Determine the number of unique categories that can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

Analyse each Neighborhood

In [None]:
# Use one hot encoding to turn text values into binary for counting purposes 
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# Move the Neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.shape

Group rows by Neighborhood and take the mean of the frequency of occurrence of each category

In [None]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped
Toronto_grouped.shape

Print each Neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# Use a function to return the top venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# Determine the top 10 venues in each Neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Clustering the Toronto Neighbourhoods and associated Venues

In [None]:
# Set the number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Create a dataframe that includes the cluster and top 10 venues for each Neighborhood.

In [None]:
# Add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = tdf

# Merge toronto_grouped with the Toronto dataframe, tdf, to include latitude and longitude for each associated Neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head()

Visualise the resulting clusters

In [None]:
# Create a cluster map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set colour scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Examine the first  three clusters

In [None]:
# Cluster 1
Toronto_merged.loc[manhattan_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
# Cluster 2
Toronto_merged.loc[manhattan_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
# Cluster 3
Toronto_merged.loc[manhattan_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]