In [112]:
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes


import pandas as pd 
import numpy as np
import folium 
import matplotlib.cm as cm 
import matplotlib.colors as colors

from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim #address to coordinates

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Week 3 part 3: clustering of Toronto neighborhoods

"df_compressed" pandas dataframe was calculated in previous notebook - Week 3 part 2

In [110]:
#This file was generated in previous notebook
df_compressed = pd.read_csv('coordinates.csv')

In [111]:
df_compressed.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [25]:
#Filtering areas, keeping only Toronto
toronto = df_compressed[df_compressed['Borough'].isin([
    'Central Toronto', 
    'Downtown Toronto',
    'East Toronto'
    'West Toronto'
    ])].reset_index(drop=True)

In [113]:
CLIENT_ID = '<THIS VALUE HAS BEEN MASKED>' # your Foursquare ID
CLIENT_SECRET = '<THIS VALUE HAS BEEN MASKED>' # your Foursquare Secret
VERSION = '20180930' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: <THIS VALUE HAS BEEN MASKED>
CLIENT_SECRET:<THIS VALUE HAS BEEN MASKED>


In [94]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.




In [30]:
#Function for getting venues details from Foursquare and arranging them in dataframe

def getNearbyVenues(names, latitudes, longitudes, limit=100, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [31]:
#Getting venues data
toronto_venues = getNearbyVenues(
    names = toronto['Neighborhood'], 
    latitudes = toronto['Latitude'], 
    longitudes=toronto['Longitude']
)

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hi...
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbo...
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


In [61]:
#toronto_venues.to_csv('toronto_venues.csv')
#toronto_venues = pd.read_csv('toronto_venues.csv')
print(toronto_venues.shape)
toronto_venues.head(5)

(1403, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Dim Sum Deluxe,43.726953,-79.39426,Dim Sum Restaurant
2,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


In [59]:
#Keeping only Neighbourhoods with over 20 venues
toronto_venues_filtered = toronto_venues.groupby('Neighborhood').filter(lambda x: x['Venue'].count() > 20).reset_index(drop=True)
print('Remaining venues count: {}'.format(toronto_venues_filtered.shape[0]))

#Remaining neighbourhoods after filtering
neighborhoods_filtered = toronto_venues_filtered.iloc[:,0:3].drop_duplicates()
print('Remaining neighborhoods count: {}'.format(neighborhoods_filtered.shape[0]))

Remaining venues count: 1333
Remaining neighborhoods count: 18


In [74]:
#one hot encoding for Venue Category
toronto_onehot = pd.get_dummies(toronto_venues_filtered[['Venue Category']], prefix="v", prefix_sep="_")

#Putting Neighborhood column back in. 
toronto_onehot.insert(0, 'Neighborhood', toronto_venues_filtered['Neighborhood'])

In [105]:
#Creating features for each Neighbourhood by taking a mean for each attribute
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print('Resulting dataframe has {} neighborhoods and {} features'.format(
        toronto_grouped.shape[0], toronto_grouped.shape[1]))

Resulting dataframe has 18 neighborhoods and 195 features


In [106]:
#Listing top5 venue categories for each neighborhood
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                   venue  freq
0          v_Coffee Shop  0.07
1                 v_Café  0.06
2  v_American Restaurant  0.04
3           v_Steakhouse  0.04
4                  v_Gym  0.04


----Berczy Park----
              venue  freq
0     v_Coffee Shop  0.07
1    v_Cocktail Bar  0.05
2     v_Cheese Shop  0.04
3  v_Farmers Market  0.04
4      v_Steakhouse  0.04


----Cabbagetown, St. James Town----
                  venue  freq
0         v_Coffee Shop  0.08
1          v_Restaurant  0.08
2  v_Italian Restaurant  0.04
3              v_Bakery  0.04
4  v_Chinese Restaurant  0.04


----Central Bay Street----
                  venue  freq
0         v_Coffee Shop  0.14
1                v_Café  0.06
2  v_Italian Restaurant  0.05
3      v_Sandwich Place  0.04
4                 v_Bar  0.04


----Chinatown, Grange Park, Kensington Market----
                             venue  freq
0                           v_Café  0.08
1                            v_Bar  0.06
2

In [107]:
#Grouping 18 neighbourhoods into 3 clusters using k-means
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=3, random_state=0).fit(toronto_grouped_clustering)
toronto_grouped['Cluster'] = pd.Series(kmeans.predict(toronto_grouped_clustering))

In [108]:
#Putting together Neighborhood, coordinates and cluster labels
toronto_merged = toronto_grouped.iloc[:,[0,-1]].join(neighborhoods_filtered.set_index('Neighborhood'), on='Neighborhood')

In [97]:
#Visualising of resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(3)
ys = [i+x+(i*x)**2 for i in range(3)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(
        toronto_merged['Neighborhood Latitude'], 
        toronto_merged['Neighborhood Longitude'], 
        toronto_merged['Neighborhood'], 
        toronto_merged['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters