Data Science Capstone Project Assignment - Web Page Scraping 

In [27]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Store the required Postcode web page url 
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# Use the BeautifulSoup package for processing the web page - set the returned data to be in an lxml format
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

# Scrape the Postcode table on the web page
web_table = soup.find('table',{'class':'wikitable sortable'})

# Load the contents of the Postcode table into a text array, use the strip method to remove unwanted characters from the dataset 
table_rows = []
# Loop through the contents of table based on tr and td tags
for table_row in web_table.find_all('tr'):
    table_data = table_row.find_all('td')
    # Convert each table row element to text, strip out unwanted characters and store in a text array
    table_rows.append([i.text.strip() for i in table_data]) 
    
# Isolate the Postcode table column headers (th - table header) and use the strip method to remove unwanted characters from the dataset
table_header = web_table.find_all('th')
# Convert each header list element to text and store in table_header 
table_header = [c.text for c in table_header]
# Loop through each of the characters in the string and use the strip method to remove unwanted characters such as '\n' new line escape codes
table_header = [i.strip() for i in table_header]
#print(table_header)

# Load the table rows and column headers into a dataframe
pdf = pd.DataFrame(data = table_rows, columns = table_header)
#pdf.head()

# Remove the empty top row from the dataframe and reset the index
pdf = pdf.drop([0])
pdf = pdf.reset_index(drop = True)
#pdf.head()

# Only include Boroughs in the dataframe that are assigned a Postcode - Ignore cells with a Borough that are 'Not assigned'
pdf = pdf[pdf.Borough != 'Not assigned']

# Sort the values in the dataframe
pdf.sort_values(by = ['Postcode', 'Borough', 'Neighbourhood'])

# For a given Postcode that has more than one assigned Neighbourhood, join the cells to form just one Postcode cell separated by a comma
pdf = pdf.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

# If a cell has a Borough but 'Not assigned' to a Neighborhood, then make the Neighborhood the same as the Borough
pdf.loc[pdf.Neighbourhood == 'Not assigned', 'Neighbourhood'] = pdf.Borough
#pdf

pdf.shape

(103, 3)

Add Geospatial latitude and longitude data for a given postcode to the Postcode dataframe

In [28]:
# Locate the required geospatial data for a given postcode
geo_link = 'http://cocl.us/Geospatial_data'
geo_data = requests.get(geo_link)

# Strip out unwanted characters from the geospatial data and form into rows
geo_data = geo_data.text.strip('\n')
geo_data = geo_data.split('\r')

# Loop through each row in the geospatial data and strip out 'new line' esacpe codes
geo_data = [c.strip('\n') for c in geo_data]

# Split each geospatial data row into columns
geo_data = [i.split(',') for i in geo_data]

# Construct the Coordinates dataframe from the prepared geospatial data set
cdf = pd.DataFrame(data = geo_data, columns = geo_data[0])

# From the Coordinates dataframe, remove the top row, column headers, from the full data set and reset the index
cdf = cdf.drop([0])
cdf = cdf.reset_index(drop = True)

# Append Latitude and Longitude columns to the Postcode dataframe and initiate with 'null' data
pdf.insert(3, 'Latitude', '')
pdf.insert(4, 'Longitude', '')

# Where Postcode and Coordinate dataframe postcodes match, update the Postcode dataframe Latitude and Longitude values
# Loop through all the postcodes in the Postcode dataframe
for i in pdf.Postcode:
    # Loop through all the postcodes in the Coordinates dataframe
    for c in cdf['Postal Code']:
        # Identify matching postcode values in the Postcode and Coordinates dataframes
        if i == c:
            # Based on matching postcode values, update the Postcode dataframe with the associated Latitude and Longitude values
            pdf.loc[pdf['Postcode'] == i, 'Latitude'] = cdf.loc[cdf['Postal Code'] == c, 'Latitude']
            pdf.loc[pdf['Postcode'] == i, 'Longitude'] = cdf.loc[cdf['Postal Code'] == c, 'Longitude']
            
pdf.head()       

KeyError: 'Postal Code'

Display the Toronto postcode and  geospatial data in a cluster map

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 

# Return Latitude and Longitude coordinates for a given addresses 
from geopy.geocoders import Nominatim 

# JSON to Pandas DadaFrame 
from pandas.io.json import json_normalize
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans
!pip -q install folium
import folium # map rendering library

address = 'Toronto, Ontario'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))


# Create a map of Toronto city using the Latitude and Longitude information
Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Using the location information from geospat file, add markers on the map for each neighbourhood
    
for lat, lng, borough, neighborhood in zip(pdf['Latitude'], pdf['Longitude'], pdf['Borough'], pdf['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough) # This will label the markers by neighborhood, borough name\
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    ).add_to(Toronto_map)
   
Toronto_map


Cluster Neighbourhoods of the Downtown Toronto Borough

In [None]:
# Build the Toronto DataFrame tdf based on the Downtown Toronto Borough and cluster the associated Neighbourhoods around it

tdf = pdf[pdf['Borough'].str.contains('Toronto')].reset_index(drop = True)

address = 'Downtown Toronto, TO'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude)) 
#print(*zip(tdf['Latitude'], tdf['Longitude'], tdf['Neighbourhood']))

# Map of Downtown Toronto and its surrounding Neighborhoods
map_dt = folium.Map(location=[latitude, longitude], zoom_start=11)
  
# Downtown Toronto as Red on the map
folium.CircleMarker([latitude, longitude], radius=10, popup=label, color='Red', fill=True, fill_color='Red', fill_opacity=0.9,).add_to(map_dt)

# Neighbourhoods as Blue
for lat, lng, label in zip(tdf['Latitude'], tdf['Longitude'], tdf['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            ).add_to(map_dt)
    
map_dt

Venues from FourSquare API

In [None]:
# Based on the FourSquare API lab create a function that returns Downtown Toronto nearby venues within a 500m radius of the 
# associated Neighbourhood and create a DataFrame of the venues returned by the function

CLIENT_ID = '$$$$'
CLIENT_SECRET = '$$$$'
VERSION = '20180901' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

def NearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Build the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # Build the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood',
                            'Neighbourhood Latitude',
                            'Neighbourhood Longitude',
                            'Venue',
                            'Venue Latitude',
                            'Venue Longitude',
                            'Venue Category']
   
return(nearby_venues)

Clustering the Toronto Neighbourhoods

In [None]:
# Set the number of clusters
k = 5
torronto_grouped_clustering = tdf.drop('Neighbourhood', 1)

# Apply k-means clustering
kmeans = KMeans(n_clusters=k, random_state=0).fit(torronto_grouped_clustering)

# Check cluster labels generated for each row in the Dataframe
kmeans.labels_[0:38]
    
# Merge all the required data together including the cluster labels, neighborhoods and top 10 venues in each neighborhood and venue name
torronto_merged = tdf
          
# Apply cluster labels
torronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
torronto_merged = torronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
torronto_merged # check the last columns!

# Visualize the clusters
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Apply the cluster colours
x = np.arange(k)
ys = [i+x+(i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(torronto_merged['Latitude'], torronto_merged['Longitude'], torronto_merged['Neighbourhood'], torronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters