## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto



Created on Thu Jun 04 2020

@author: Ivica Klecina

# ---------------------------------------------------------------------------------------------------------------

In [3]:
import pandas as pd # library for data analsysis

import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from sklearn.cluster import KMeans # import k-means from clustering stage

import folium # map rendering library

# ---------------------------------------------------------------------------------------------------------------

## Assignment 1


###

1 Scrape Wikipedia page in order to obtain the table of postal codes and transform that data into a pandas dataframe

2 Ignore the cells with a borough that is Not assigned

3 If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
###

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#1
df = pd.DataFrame(pd.read_html(url)[0])

#2
df = df[df['Borough'] != 'Not assigned']

#3
for index, hood in enumerate(df['Neighborhood']):
    if hood == 'Not assigned':
        df.loc[index, 'Neighborhood'] = df.loc[index, 'Borough']


df.rename(columns = {'Postal Code': 'PostalCode'}, inplace = True)
df.reset_index(drop = True, inplace = True)

df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# print the number of rows and columns in the dataframe
df.shape

(103, 3)

# ---------------------------------------------------------------------------------------------------------------

## Assignment 2

###

1 Imoprt the geographical coordinates of each postal code (http://cocl.us/Geospatial_data) into coordinates data frame

2 Sort coordinates dataframe according to the values in PostalCode column in df dataframe (not required)

3 Join df and coordinates data frames into a new data frame called toronto_data
###

In [6]:
filepath = 'D:/Coursera/IBM_Applied Data Science Specialization/04_Applied Data Science Capstone/Geospatial_Coordinates.csv'

#1
coordinates = pd.read_csv(filepath)

#2
coordinates = coordinates.set_index('Postal Code')
coordinates = coordinates.reindex(index = df['PostalCode'])
coordinates = coordinates.reset_index()

#3
toronto_data = df.join(coordinates[['Latitude', 'Longitude']])
toronto_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [7]:
# check the size of the new dataframe
toronto_data.shape

(103, 5)


## ----------------------------------------------------------------------------------------------------------------------------------

## Assignment 3

Explore and cluster the neighborhoods in Toronto.

Analysis used in the "Segmenting and Clustering Neighborhoods in New York City" lab will be replicated here.

In [8]:
#Define Foursquare Credentials and Version

path1 = 'D:/Coursera/IBM_Applied Data Science Specialization/04_Applied Data Science Capstone/credentials.txt'

with open(path1) as f:
    data = [line.rstrip() for line in f]
    
CLIENT_ID = data[0] # Foursquare ID
CLIENT_SECRET = data[1] # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 5 # Number of venues returned by Foursquare API

getNearbyVenues function from the analysis of New Yor city data will be used to get top 5 venus within a radius of 200 meters from each neighborhood in Toronto.

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius = 200):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Get Top 5 venues for each neighborhood in Toronto.

In [13]:
toronto_venues = getNearbyVenues(names = toronto_data['Neighborhood'],
                                   latitudes = toronto_data['Latitude'],
                                   longitudes = toronto_data['Longitude']
                                  )

Let's check the size of the resulting dataframe

In [17]:
print(toronto_venues.shape)
toronto_venues.head(10)

(191, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
1,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
2,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
3,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
4,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
5,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
6,"Regent Park, Harbourfront",43.65426,-79.360636,The Extension Room,43.653313,-79.359725,Gym / Fitness Center
7,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Roots,43.718214,-79.463893,Boutique
8,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Kitchen Stuff Plus (Clearance Outlet),43.719096,-79.462675,Furniture / Home Store
9,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,International Clothiers,43.717997,-79.46672,Clothing Store


Let's find out how many unique venue categories have been returned.

In [18]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 87 uniques categories.


Now we are going to analyze each neighborhoods using one hot encoding.

In [33]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']])

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,Venue Category_Auto Workshop,Venue Category_BBQ Joint,Venue Category_Bakery,Venue Category_Bank,Venue Category_Basketball Court,Venue Category_Bookstore,Venue Category_Boutique,...,Venue Category_Supermarket,Venue Category_Sushi Restaurant,Venue Category_Tea Room,Venue Category_Thai Restaurant,Venue Category_Theme Restaurant,Venue Category_Trail,Venue Category_Turkish Restaurant,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Wings Joint,Venue Category_Yoga Studio
0,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.

In [41]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,Venue Category_Auto Workshop,Venue Category_BBQ Joint,Venue Category_Bakery,Venue Category_Bank,Venue Category_Basketball Court,Venue Category_Bookstore,Venue Category_Boutique,...,Venue Category_Supermarket,Venue Category_Sushi Restaurant,Venue Category_Tea Room,Venue Category_Thai Restaurant,Venue Category_Theme Restaurant,Venue Category_Trail,Venue Category_Turkish Restaurant,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Wings Joint,Venue Category_Yoga Studio
0,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Let's confirm the new size

In [27]:
toronto_grouped.shape

(60, 88)

First, let's write a function to sort the venues in descending order.

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 3 venues for each neighborhood.

In [44]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Alderwood, Long Branch",Venue Category_Coffee Shop,Venue Category_Pizza Place,Venue Category_Yoga Studio
1,"Bathurst Manor, Wilson Heights, Downsview North",Venue Category_Coffee Shop,Venue Category_Middle Eastern Restaurant,Venue Category_Restaurant
2,"Bedford Park, Lawrence Manor East",Venue Category_Comfort Food Restaurant,Venue Category_Coffee Shop,Venue Category_Sushi Restaurant
3,"Birch Cliff, Cliffside West",Venue Category_Café,Venue Category_Yoga Studio,Venue Category_Comfort Food Restaurant
4,"Brockton, Parkdale Village, Exhibition Place",Venue Category_Café,Venue Category_Yoga Studio,Venue Category_Comfort Food Restaurant
5,"Business reply mail Processing Centre, South C...",Venue Category_Brewery,Venue Category_Auto Workshop,Venue Category_Fish & Chips Shop
6,"CN Tower, King and Spadina, Railway Lands, Har...",Venue Category_Performing Arts Venue,Venue Category_Gym,Venue Category_Comic Shop
7,Canada Post Gateway Processing Centre,Venue Category_Coffee Shop,Venue Category_Yoga Studio,Venue Category_Comfort Food Restaurant
8,Central Bay Street,Venue Category_Coffee Shop,Venue Category_Bookstore,Venue Category_Middle Eastern Restaurant
9,Church and Wellesley,Venue Category_Bubble Tea Shop,Venue Category_Theme Restaurant,Venue Category_Mexican Restaurant


At the end of this assignment we are going to cluster and visualize all neighborhoods from toronto_data dataframe.

Use k-means to cluster the neighborhoods into 5 clusters.

In [45]:
# set number of clusters
kclusters = 5

toronto_data_clustering = toronto_data.drop(['PostalCode', 'Borough', 'Neighborhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(toronto_data_clustering)

kmeans.labels_

array([4, 4, 2, 0, 2, 1, 3, 4, 4, 2, 0, 1, 3, 4, 4, 2, 2, 1, 3, 4, 2, 2,
       3, 4, 2, 2, 3, 0, 0, 4, 2, 2, 3, 4, 0, 4, 2, 2, 4, 0, 0, 2, 2, 2,
       4, 0, 1, 4, 2, 1, 1, 3, 0, 1, 2, 0, 1, 1, 4, 0, 1, 0, 0, 1, 1, 4,
       0, 0, 2, 1, 1, 4, 0, 0, 2, 2, 1, 1, 3, 2, 2, 1, 4, 2, 2, 3, 2, 2,
       1, 1, 4, 2, 2, 1, 1, 3, 2, 2, 1, 2, 4, 1, 1])

Add a new column with cluster labels to toronto_data dataframe.

In [47]:
toronto_data['ClusterLabels'] = kmeans.labels_

Finally, let's visualize the resulting clusters.

In [48]:
Lat = 43.651070
Long = -79.347015

# create map
toronto_map_clusters = folium.Map(location=[Lat, Long], zoom_start=10)

colors = ['red', 'blue', 'green', 'orange', 'purple']

# add markers to the map
markers_colors = []
for lat, lon, hood, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['ClusterLabels']):
    label = folium.Popup(str(hood) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color = colors[cluster -1],
        fill = True,
        fill_color = colors[cluster -1],
        fill_opacity=0.7).add_to(toronto_map_clusters)
       
toronto_map_clusters