### Week 3 Assignment - Segmenting and Clustering Neighborhoods in Toronto

#### Yichuan Wang

In [34]:
# import needed packages
import os

import pandas as pd
import numpy as np
import requests
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

##### Part 1. Read the Toronto neighborhood table from Wikipedia 

In [7]:
# Wikipedia page listing the Toronto ("M") postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# let pandas parse every HTML table on that page, treating the first row as the header
list_of_df = pd.read_html(io=url, header=0)

# the postal code table is the first table found
toronto_df = list_of_df[0]
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


We need to drop entries where Boroughs are 'Not assigned' and replace 'Not assigned' Neighborhoods with the Borough names.

In [9]:
# first change the spelling of Neighbourhood to Neighborhood
toronto_df.columns=['Postal Code','Borough','Neighborhood']

# drop 'Not assigned' Boroughs; take an explicit copy so the assignment below
# writes to an independent table rather than to a slice of the scraped one
toronto_df=toronto_df[toronto_df['Borough']!='Not assigned'].copy()

# replace 'Not assigned' neighborhoods with Borough names
# (one .loc assignment: chained indexing such as df[col][mask] = ... may write
# to a temporary object and leave the table unchanged)
idx=toronto_df['Neighborhood']=='Not assigned'
toronto_df.loc[idx,'Neighborhood']=toronto_df.loc[idx,'Borough']

# reset the table index
toronto_df=toronto_df.reset_index(drop=True)

# print a summary view of the table
print(toronto_df.head())
print('\nThe table size is: ',toronto_df.shape)
print('\n',toronto_df.describe())

  Postal Code           Borough                                 Neighborhood
0         M3A        North York                                    Parkwoods
1         M4A        North York                             Victoria Village
2         M5A  Downtown Toronto                    Regent Park, Harbourfront
3         M6A        North York             Lawrence Manor, Lawrence Heights
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government

The table size is:  (103, 3)

        Postal Code     Borough Neighborhood
count          103         103          103
unique         103          10           99
top            M1B  North York    Downsview
freq             1          24            4


Since all postal codes are unique in the table, we do not need to combine any entries. A quick inspection of the table on the web shows there's no missing entry in the Postal Code column.

##### Part 2. Obtain the coordinates for each neighborhood

I'm using the CSV file provided to obtain the coordinates.

In [10]:
# load the postal code -> coordinates lookup from the provided CSV file
coord_df = pd.read_csv('Geospatial_Coordinates.csv')

# attach the coordinates to each postal code (inner join on the shared key)
nbhd_df = toronto_df.merge(coord_df, how='inner', on='Postal Code')

nbhd_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


##### Part 3. Clustering analysis

The clustering analysis will be performed on all the neighborhoods in Toronto. First we need to find the coordinates of the city for plotting purposes.

In [13]:
# look up the coordinates of the city; they are used later to center the map
address='Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() gives back None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the next line
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

toronto_lat = location.latitude
toronto_long = location.longitude

# report the values stored above (the names `latitude`/`longitude` are never
# defined in this notebook, so printing them fails on a fresh kernel)
print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_lat, toronto_long))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


To use the Foursquare API, we need the credentials.

In [12]:
# @hidden_cell
# Foursquare credentials are read from the environment so that the secrets are
# never stored in, or shared together with, the notebook. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here rather than failing later in a request.
# NOTE(review): the keys that used to be hardcoded in this cell have been
# exposed and should be rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20201019' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

Use the function provided in the lab to get the top venues in each neighborhood.

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and the coordinates to search around. They are
        consumed in parallel, one request per (name, latitude, longitude).
    radius : int, default 500
        Value sent as the `radius` parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, and error statuses are raised instead of
        # surfacing later as a confusing KeyError
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works for zero rows
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [22]:
# obtain the venues from Foursquare
toronto_venues = getNearbyVenues(names=nbhd_df['Neighborhood'],
                                   latitudes=nbhd_df['Latitude'],
                                   longitudes=nbhd_df['Longitude']
                                  )

# merge with previous neighborhood table to add postal code and borough info.
# Neighborhood names are not unique (several postal codes share a name), so the
# coordinates are part of the join key; joining on the name alone would pair
# every venue with every postal code of that name and duplicate rows.
toronto_venues=pd.merge(toronto_venues, nbhd_df,
                        left_on=['Neighborhood','Neighborhood Latitude','Neighborhood Longitude'],
                        right_on=['Neighborhood','Latitude','Longitude'])

# the coordinates from nbhd_df repeat the 'Neighborhood Latitude/Longitude' columns
toronto_venues=toronto_venues.drop(['Latitude','Longitude'], axis=1)

toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Postal Code,Borough
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park,M3A,North York
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop,M3A,North York
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena,M4A,North York
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant,M4A,North York
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop,M4A,North York


Using the code provided in the lab, we get the top 10 venues in each neighborhood.

In [42]:
# one hot encoding: one indicator column per venue category
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot = category_dummies.assign(Neighborhood=toronto_venues['Neighborhood'])

# averaging the indicators gives, per neighborhood, the share of its venues in each category
neighborhood_groups = toronto_onehot.groupby('Neighborhood')
toronto_grouped = neighborhood_groups.mean().reset_index()

# function to return most common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`.

    The first entry of `row` is skipped; the remaining entries are sorted in
    descending order and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # the first ranks use their own suffix, every later rank uses 'th'
    # (explicit bounds check instead of a bare `except:` that would also hide
    # unrelated errors)
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per neighborhood
nbhd_venues_sorted = pd.DataFrame(columns=columns)
nbhd_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with the most common venue categories of that neighborhood
for ind in np.arange(toronto_grouped.shape[0]):
    nbhd_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

nbhd_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Clothing Store,Lounge,Breakfast Spot,Skating Rink,Latin American Restaurant,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,"Alderwood, Long Branch",Pizza Place,Sandwich Place,Coffee Shop,Pub,Pharmacy,Gym,Greek Restaurant,Discount Store,Department Store,Dessert Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pharmacy,Deli / Bodega,Bridal Shop,Shopping Mall,Sandwich Place,Diner,Restaurant,Middle Eastern Restaurant
3,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop,Greek Restaurant,Sushi Restaurant,Juice Bar,Café,Thai Restaurant,Restaurant,Indian Restaurant


Now we perform K-means clustering on the neighborhoods in Toronto.

In [43]:
# set number of clusters
kclusters = 5

# keep only the numeric category columns for clustering
# (keyword form: pandas no longer accepts the axis as a positional argument)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned to 10 restarts because newer
# scikit-learn releases changed the default, which would change the labels
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# add clustering labels; remove labels left by an earlier run first so that
# re-running this cell does not fail on an already existing column
if 'Cluster Labels' in nbhd_venues_sorted.columns:
    nbhd_venues_sorted = nbhd_venues_sorted.drop(columns='Cluster Labels')
nbhd_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge this table with our previous neighborhood table
toronto_merged = pd.merge(nbhd_df, nbhd_venues_sorted, on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Food & Drink Shop,Yoga Studio,Drugstore,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Pizza Place,Hockey Arena,Coffee Shop,Portuguese Restaurant,Intersection,Electronics Store,Eastern European Restaurant,Escape Room,Ethiopian Restaurant,Event Space
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Café,Theater,Spa,Brewery,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,Clothing Store,Accessories Store,Boutique,Gift Shop,Furniture / Home Store,Event Space,Coffee Shop,Women's Store,Vietnamese Restaurant,Airport Service
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Yoga Studio,Sushi Restaurant,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Café,Restaurant,Chinese Restaurant


Now we plot the map of Toronto and markers for neighborhoods with their clustering information.

In [44]:
# create map centered on Toronto
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, nbhd, pc, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], 
                                       toronto_merged['Neighborhood'], toronto_merged['Postal Code'],
                                       toronto_merged['Cluster Labels']):
    # separate the neighborhood name from the postal code in the popup text
    label = folium.Popup('{} ({}) Cluster {}'.format(nbhd, pc, cluster), parse_html=True)
    # labels start at 0, so the label itself is the color index
    # (cluster-1 made cluster 0 wrap around to the last color)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

It looks like the majority of Toronto neighborhoods are not that different. Three out of the five clusters ended up with only one neighborhood. There is no distinct geographical separation of the two larger clusters.