In [47]:
import io
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup #library to parse html documents

<h4> We send a GET request to the Wikipedia URL whose table needs to be scraped and store the HTML response in a variable. Not every website may legally be scraped, so check the site's terms of use first. We then check the status code: 200 shows that the request succeeded and the page was downloaded. </h4>

<h4> Next we'll get the response in the form of html </h4>

In [48]:
# Send a GET request to the Wikipedia page that holds the postal-code table and
# keep the HTML response. Not every site permits scraping, so check its terms of
# use first; the status code printed below only tells us whether the download
# itself succeeded (200 = OK).

wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# NOTE(review): table_class is not referenced by any cell visible in this notebook;
# the parsing cell looks the table up by the plain "wikitable" class instead.
table_class = "wikitable sortable jquery-tablesorter"
response = requests.get(wikiurl, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # stop here instead of parsing an HTTP error page later
print(response.status_code)

200


<h4>Now let's parse data from the html into a beautifulsoup object </h4>

In [49]:
# Parse the downloaded HTML into a BeautifulSoup object and pick out the first
# <table> element that carries the "wikitable" class.

soup = BeautifulSoup(response.text, 'html.parser')
torontotable = soup.find('table', class_='wikitable')
if torontotable is None:
    # Without this guard the string "None" would be handed to pd.read_html,
    # which fails with an unrelated-looking "no tables found" error.
    raise ValueError("No table with class 'wikitable' was found in the downloaded page")

In [50]:
# pd.read_html returns a list with one DataFrame per table it finds; the input
# here is a single table element, so the first entry is the frame we want.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(io.StringIO(str(torontotable)))[0]
print(df.head())

  Postal Code           Borough              Neighbourhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront


<h4> Let's get rid of any 'not assigned' Boroughs. </h4>

In [51]:
# Keep only the postal codes whose borough is something other than "Not assigned".
has_borough = df["Borough"].ne("Not assigned")
df = df.loc[has_borough]
print(df.head())

  Postal Code           Borough                                Neighbourhood
2         M3A        North York                                    Parkwoods
3         M4A        North York                             Victoria Village
4         M5A  Downtown Toronto                    Regent Park, Harbourfront
5         M6A        North York             Lawrence Manor, Lawrence Heights
6         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


<h4> Assumptions from the course:  </h4>

-More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.

- If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [52]:
# Number of rows and columns in the dataset, as a (rows, columns) tuple:

shape = df.shape
print(shape)

(103, 3)


<h4> From the course: </h4>
    
Here is a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [53]:
# The imports are repeated on purpose: this cell doubles as a self-contained
# reference for building a DataFrame from a CSV file served over HTTP.

import pandas as pd
import io
import requests

url = "http://cocl.us/Geospatial_data"
csv_response = requests.get(url, timeout=30)
csv_response.raise_for_status()  # do not try to parse an HTTP error page as CSV
s = csv_response.content
coord_df = pd.read_csv(io.StringIO(s.decode('utf-8')))
print(coord_df.head())

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


In [54]:
# Attach the coordinates to each postal code (default inner join on "Postal Code").
df = df.merge(coord_df, on="Postal Code")
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [55]:
import json # library to handle JSON files

!pip3 install geopy # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip3 install folium # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [56]:
# Resolve the city name to coordinates; they are used to centre the maps below.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Create a map of Toronto with neighborhoods superimposed on top¶

In [57]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every neighbourhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row, with a "neighbourhood, borough" popup.
marker_columns = ('Latitude', 'Longitude', 'Borough', 'Neighbourhood')
for lat, lng, borough, neighbourhood in zip(*(df[col] for col in marker_columns)):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_Toronto)
map_Toronto

In [42]:
# Boroughs whose name contains "Toronto", re-indexed from zero.
is_toronto_borough = df['Borough'].str.contains("Toronto")
df_Toronto_boroughs = df.loc[is_toronto_borough].reset_index(drop=True)
df_Toronto_boroughs

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M5A,Downtown Toronto,"Regent Park, Harbourfront"
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
2,M5B,Downtown Toronto,"Garden District, Ryerson"
3,M5C,Downtown Toronto,St. James Town
4,M4E,East Toronto,The Beaches
5,M5E,Downtown Toronto,Berczy Park
6,M5G,Downtown Toronto,Central Bay Street
7,M6G,Downtown Toronto,Christie
8,M5H,Downtown Toronto,"Richmond, Adelaide, King"
9,M6H,West Toronto,"Dufferin, Dovercourt Village"


In [66]:
# Foursquare credentials are read from the environment so they never end up in
# the notebook source or its saved output. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError naming it.
# NOTE(review): the key pair that used to be hard-coded here was saved with the
# notebook and should be treated as compromised and revoked.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20201212' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

# Confirm the credentials were loaded without echoing the secret.
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID[:4] + '...')
print('CLIENT_SECRET:' + '*' * 8)

Your credentails:
CLIENT_ID: 5FRY...
CLIENT_SECRET:********


3.1. Explore the Neighbourhoods in Toronto¶

Function to get all venues near a location in the Toronto boroughs:

In [64]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each named location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to explore; they are walked
        in parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as a progress indicator.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        # A payload without groups/items is treated as "no venues here".
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without a category cannot take part in the
                # category-based analysis, so it is skipped instead of
                # raising IndexError.
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor keeps the schema intact even
    # when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [65]:
# Fetch the venues around every neighbourhood of the Toronto boroughs.
toronto_boroughs_venues = getNearbyVenues(
    df_Toronto_boroughs['Neighbourhood'],
    df_Toronto_boroughs['Latitude'],
    df_Toronto_boroughs['Longitude'],
)

KeyError: 'Latitude'

In [62]:
# Print the (rows, columns) size of the venues table, then preview its first rows.
print(toronto_boroughs_venues.shape)
toronto_boroughs_venues.head()

NameError: name 'toronto_boroughs_venues' is not defined

In [None]:
# Count the non-null entries of every column per neighbourhood.
toronto_boroughs_venues.groupby('Neighborhood').count()

In [None]:
# Number of distinct venue categories across all returned venues.
n_categories = len(toronto_boroughs_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(n_categories))

3.2. Analyze each neighbourhood¶

In [None]:
# one hot encoding (integer 0/1 indicators, one column per venue category)
toronto_boroughs_onehot = pd.get_dummies(
    toronto_boroughs_venues[['Venue Category']], prefix="", prefix_sep="", dtype=int)

# If a venue category is itself named "Neighborhood", its indicator column would
# be overwritten by the label column added below and the column reorder would
# move the wrong column; rename it out of the way first.
toronto_boroughs_onehot = toronto_boroughs_onehot.rename(
    columns={'Neighborhood': 'Neighborhood (venue category)'})

# add the neighborhood label as the first column
toronto_boroughs_onehot.insert(0, 'Neighborhood', toronto_boroughs_venues['Neighborhood'])

# sample() raises when asked for more rows than exist, so cap the preview size
toronto_boroughs_onehot.sample(min(10, len(toronto_boroughs_onehot)))

In [None]:
# (rows, columns) of the one-hot encoded venue table.
toronto_boroughs_onehot.shape

Next, let's group the rows by neighborhood, taking the mean of the frequency of occurrence of each category.

In [None]:
# Mean of every category indicator per neighbourhood; the neighbourhood stays a regular column.
toronto_boroughs_grouped = toronto_boroughs_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_boroughs_grouped

In [None]:
# (rows, columns) of the per-neighbourhood frequency table.
toronto_boroughs_grouped.shape

Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in toronto_boroughs_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Turn the neighbourhood's single row into a two-column (venue, freq) table.
    hood_row = toronto_boroughs_grouped.loc[toronto_boroughs_grouped['Neighborhood'] == hood]
    temp = hood_row.T.reset_index()
    temp.columns = ['venue', 'freq']
    # The first entry holds the neighbourhood name itself, not a frequency.
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
    )
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

Let's put that into a pandas dataframe. First, let's write a function to sort the venues in descending order:

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first ``num_top_venues``
    entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # an explicit bounds check replaces the former bare `except`, which would
    # also have hidden unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_boroughs_grouped['Neighborhood']

# fill each row with that neighbourhood's most frequent venue categories
# (plain range() is used: numpy is not needed for simple counting)
for ind in range(toronto_boroughs_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_boroughs_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

3.3. Cluster Neighbourhoods
Run k-means to cluster the neighborhood into 5 clusters.

In [None]:
# set number of clusters
kclusters = 5

# the clustering input is every frequency column, i.e. everything except the label;
# `columns=` is used because the positional axis argument of drop() was removed in pandas 2
toronto_boroughs_grouped_clustering = toronto_boroughs_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so the result does not depend on the
# installed scikit-learn's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_boroughs_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels; a column left over from an earlier run is dropped first
# so that this cell can be executed more than once
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and the top venues onto the Toronto borough table,
# matching its 'Neighbourhood' column against the 'Neighborhood' labels
toronto_boroughs_merged = df_Toronto_boroughs.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

# Neighbourhoods without a match get NaN labels, which also turns the whole
# column into floats; drop those rows and restore integer labels.
toronto_boroughs_merged = (
    toronto_boroughs_merged
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

toronto_boroughs_merged.head() # check the last columns!

Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
# rows without a cluster label cannot be coloured, so they are left off the map
clustered = toronto_boroughs_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster Labels']):
    cluster = int(cluster)  # labels arrive as floats when the column ever held NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 maps label 0 onto the last colour; every cluster still gets its own colour
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

3.4. Examine Clusters¶


CLUSTER 1¶

In [None]:


# Rows labelled 0, showing column 2 plus every column from position 5 onward.
in_cluster = toronto_boroughs_merged['Cluster Labels'] == 0
shown_columns = toronto_boroughs_merged.columns[[2] + list(range(5, toronto_boroughs_merged.shape[1]))]
toronto_boroughs_merged.loc[in_cluster, shown_columns]



This is the Ice-Cream and Gardening Neighbourhood of Toronto.

CLUSTER 2¶

In [None]:
# Rows labelled 1, showing column 2 plus every column from position 5 onward.
in_cluster = toronto_boroughs_merged['Cluster Labels'] == 1
shown_columns = toronto_boroughs_merged.columns[[2] + list(range(5, toronto_boroughs_merged.shape[1]))]
toronto_boroughs_merged.loc[in_cluster, shown_columns]

These are the all-round neighbourhoods of Toronto.

CLUSTER 3¶

In [None]:
# Rows labelled 2, showing column 2 plus every column from position 5 onward.
in_cluster = toronto_boroughs_merged['Cluster Labels'] == 2
shown_columns = toronto_boroughs_merged.columns[[2] + list(range(5, toronto_boroughs_merged.shape[1]))]
toronto_boroughs_merged.loc[in_cluster, shown_columns]

This is the Summer Camp and Wine Neighbourhood of Toronto.

CLUSTER 4¶

In [None]:


# Rows labelled 3, showing column 2 plus every column from position 5 onward.
in_cluster = toronto_boroughs_merged['Cluster Labels'] == 3
shown_columns = toronto_boroughs_merged.columns[[2] + list(range(5, toronto_boroughs_merged.shape[1]))]
toronto_boroughs_merged.loc[in_cluster, shown_columns]



This is the Jewelry and Trailing Neighbourhood of Toronto.

CLUSTER 5¶

In [None]:
# Rows labelled 4, showing column 2 plus every column from position 5 onward.
in_cluster = toronto_boroughs_merged['Cluster Labels'] == 4
shown_columns = toronto_boroughs_merged.columns[[2] + list(range(5, toronto_boroughs_merged.shape[1]))]
toronto_boroughs_merged.loc[in_cluster, shown_columns]

This is the Outdoors Neighbourhood of Toronto.