In [None]:
# importing libraries
import pandas as pd # to read csv file
import numpy as np # for matrix operations

!conda install -c conda-forge folium=0.5.0 --yes
import folium # to draw map

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # to get latitude and longitude

import requests # to request
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans # for clustering

And now we can read the data which we created at first two notebooks.

In [2]:
toronto_data = pd.read_csv("toronto_fina.csv")

We need latitude and longitude of the Toronto. We are getting the coordinates via geolocator. 

In [3]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Now we are creating a map of Toronto which includes the data in our data frame.

In [4]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neigborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

I have analyzed the venues at borough which is "Central Toronto". 

In [5]:
toronto_central = toronto_data[toronto_data['Borough'] == 'Central Toronto'].reset_index(drop=True)
toronto_central

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neigborhood,Latitude,Longitude
0,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,47,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
5,49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
6,63,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,64,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
8,65,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


And now let's find the coordinates of "Central Toronto"

In [6]:
address = 'Central Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.653963, -79.387207.


Let's create a map of the locations on "Central Toronto". We only have nine values so we will see nine nodes.

In [7]:
# create map of Manhattan using latitude and longitude values
map_central_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_central['Latitude'], toronto_central['Longitude'], toronto_central['Neigborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_central_toronto)  
    
map_central_toronto

And now let's initial our client_id and client_secret at Foursquare API. I have deleted my id and secret. So if you wish to run the notebook you need to use your own client id and secret.

In [8]:
CLIENT_ID = '5CFXPWZ1EJXNZKOSOBLTUFHOMJUXTXORCNCIVALS1LQ5CFLU' # your Foursquare ID
CLIENT_SECRET = 'K3BLTRSXBGXXBXL51DCRGCFNPL0WNA4YLNDIAU5ZSWLD3VK0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5CFXPWZ1EJXNZKOSOBLTUFHOMJUXTXORCNCIVALS1LQ5CFLU
CLIENT_SECRET:K3BLTRSXBGXXBXL51DCRGCFNPL0WNA4YLNDIAU5ZSWLD3VK0


At this junk we are creating a function all the venues at Central Toronto by using api URL 

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT = 100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now let's check the venues of Central Toronto. 

In [10]:
toronto_venues = getNearbyVenues(names=toronto_central['Neigborhood'],
                                   latitudes=toronto_central['Latitude'],
                                   longitudes=toronto_central['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville


Let's check the shape and first five values of the data frame.

In [11]:
print(toronto_venues.shape)
toronto_venues.head()

(112, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Averax Group,43.727406,-79.383103,Construction & Landscaping
2,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


And group them according to neighborhoods count.

In [12]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,34,34,34,34,34,34
Davisville North,7,7,7,7,7,7
"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",14,14,14,14,14,14
"Forest Hill North, Forest Hill West",4,4,4,4,4,4
Lawrence Park,4,4,4,4,4,4
"Moore Park, Summerhill East",4,4,4,4,4,4
North Toronto West,21,21,21,21,21,21
Roselawn,2,2,2,2,2,2
"The Annex, North Midtown, Yorkville",22,22,22,22,22,22


And check the unique categories.

In [13]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 62 uniques categories.


Now we have an encoding operation categories.

In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,...,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's check the shape of the data frame.

In [15]:
toronto_onehot.shape

(112, 63)

At this junk we are grouping the Neighborhood column of the toronto_onehot and finding the mean of the values.

In [16]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,...,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,0.0,0.0,...,0.0,0.058824,0.0,0.0,0.029412,0.029412,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
3,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0
6,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619
7,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"The Annex, North Midtown, Yorkville",0.045455,0.045455,0.0,0.0,0.045455,0.0,0.136364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0


The shape of the toronto_grouped data frame.

In [17]:
toronto_grouped.shape

(9, 63)

We are using return_most_common_venues function to get top venues and using the function for the top 10 venues.

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Sandwich Place,Pizza Place,Dessert Shop,Sushi Restaurant,Café,Gym,Coffee Shop,Italian Restaurant,Pharmacy,Park
1,Davisville North,Park,Hotel,Breakfast Spot,Gym,Department Store,Sandwich Place,Food & Drink Shop,Yoga Studio,Flower Shop,Fried Chicken Joint
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Pub,Coffee Shop,Sports Bar,Vietnamese Restaurant,Fried Chicken Joint,Light Rail Station,Liquor Store,Pizza Place,Restaurant,American Restaurant
3,"Forest Hill North, Forest Hill West",Sushi Restaurant,Trail,Jewelry Store,Mexican Restaurant,Gas Station,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden
4,Lawrence Park,Construction & Landscaping,Bus Line,Swim School,Park,Yoga Studio,Garden,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Gift Shop


Now we are going to set the clusters. I prefer to use 3 cluster for this project.

In [20]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 0, 1, 0, 1, 2, 1], dtype=int32)

Now we are going to add the labels of clusters to the neighborhoods_venues_sorted data frame and merge it with the toronto_central data frame.    

In [21]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_data_merged = toronto_central

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_data_merged = toronto_data_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neigborhood')

toronto_data_merged.head() # check the last columns!

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neigborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Construction & Landscaping,Bus Line,Swim School,Park,Yoga Studio,Garden,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Gift Shop
1,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Park,Hotel,Breakfast Spot,Gym,Department Store,Sandwich Place,Food & Drink Shop,Yoga Studio,Flower Shop,Fried Chicken Joint
2,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Clothing Store,Coffee Shop,Yoga Studio,Cosmetics Shop,Gift Shop,Mexican Restaurant,Fast Food Restaurant,Park,Pet Store,Diner
3,47,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Sandwich Place,Pizza Place,Dessert Shop,Sushi Restaurant,Café,Gym,Coffee Shop,Italian Restaurant,Pharmacy,Park
4,48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,0,Playground,Trail,Tennis Court,Park,Fried Chicken Joint,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Yoga Studio


Now we can draw the map.

In [22]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data_merged['Latitude'], toronto_data_merged['Longitude'], toronto_data_merged['Neigborhood'], toronto_data_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

At the end let's check our 3 clusters.

In [23]:
toronto_data_merged.loc[toronto_data_merged['Cluster Labels'] == 0, toronto_data_merged.columns[[1] + list(range(5, toronto_data_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4T,-79.38316,0,Playground,Trail,Tennis Court,Park,Fried Chicken Joint,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Yoga Studio
7,M5P,-79.411307,0,Sushi Restaurant,Trail,Jewelry Store,Mexican Restaurant,Gas Station,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden


In [24]:
toronto_data_merged.loc[toronto_data_merged['Cluster Labels'] == 1, toronto_data_merged.columns[[1] + list(range(5, toronto_data_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,-79.38879,1,Construction & Landscaping,Bus Line,Swim School,Park,Yoga Studio,Garden,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Gift Shop
1,M4P,-79.390197,1,Park,Hotel,Breakfast Spot,Gym,Department Store,Sandwich Place,Food & Drink Shop,Yoga Studio,Flower Shop,Fried Chicken Joint
2,M4R,-79.405678,1,Clothing Store,Coffee Shop,Yoga Studio,Cosmetics Shop,Gift Shop,Mexican Restaurant,Fast Food Restaurant,Park,Pet Store,Diner
3,M4S,-79.38879,1,Sandwich Place,Pizza Place,Dessert Shop,Sushi Restaurant,Café,Gym,Coffee Shop,Italian Restaurant,Pharmacy,Park
5,M4V,-79.400049,1,Pub,Coffee Shop,Sports Bar,Vietnamese Restaurant,Fried Chicken Joint,Light Rail Station,Liquor Store,Pizza Place,Restaurant,American Restaurant
8,M5R,-79.405678,1,Café,Sandwich Place,Coffee Shop,Indian Restaurant,Pub,BBQ Joint,Burger Joint,Cosmetics Shop,History Museum,Liquor Store


In [25]:
toronto_data_merged.loc[toronto_data_merged['Cluster Labels'] == 2, toronto_data_merged.columns[[1] + list(range(5, toronto_data_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M5N,-79.416936,2,Music Venue,Garden,Yoga Studio,Diner,Gym / Fitness Center,Gym,Greek Restaurant,Gourmet Shop,Gift Shop,Gas Station


Thank you for your patience and for your feedbacks. 