# Compare and analyze coffee shops in Toronto and Vancouver

Install and import required libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml
!pip install requests
!pip install folium
!pip install geopy
!pip install opencage
import bs4 as bs
import requests
import numpy as np 
import pandas as pd
import folium 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize
from opencage.geocoder import OpenCageGeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Part 1: Download and explore dataset

The code below scrapes postal codes and neighborhoods from the following Wikipedia pages: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V

In [2]:
# get postal codes and neighborhoods for Toronto
source_tor = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = bs.BeautifulSoup(source_tor, 'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
data_tor = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.replace('\n', '') for tr in td]
    data_tor.append(row)

# get postal codes and neighborhoods for Vancouver
source_van = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V').text
soup = bs.BeautifulSoup(source_van, 'lxml')
table = soup.find('table')
table_rows = table.find_all('tr')
data_van = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.replace('\n', '') for tr in td]
    for entry in row:
        # filter out non-Vancouver neighborhoods
        if 'West Vancouver' not in entry and 'North Vancouver' not in entry:
            if 'Vancouver' in entry:
                postal_code = entry[:3]
                neighborhoods = entry[entry.find("(")+1:entry.find(")")].replace('/', ',')
                data_van.append([postal_code, neighborhoods])


Transform the data into a pandas dataframe

In [3]:
df_tor = pd.DataFrame(data_tor, columns=["Postcode", "Borough", "Neighborhood"])
df_van = pd.DataFrame(data_van, columns=["Postcode", "Neighborhood"])

# Ignore cells with a borough that is Not assigned or Null 
df_tor = df_tor[df_tor['Borough'] != 'Not assigned']
df_tor = df_tor[df_tor['Borough'].notnull()]

# Combine neighborhoods into one comma separated row
df_tor = df_tor.groupby(['Postcode', 'Borough'])['Neighborhood'].agg(lambda x : ','.join(x)).to_frame().reset_index()

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough (only 1 case)
df_tor.loc[df_tor['Neighborhood'] == 'Not assigned', 'Neighborhood'] = 'Queen\'s Park'

In [4]:
df_tor.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
df_van.head()

Unnamed: 0,Postcode,Neighborhood
0,V6A,"Strathcona , Chinatown , Downtown Eastside"
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter..."
2,V6C,"Waterfront , Coal Harbour , Canada Place"
3,V6E,"SE West End , Davie Village"
4,V6G,"NW West End , Stanley Park"


In [6]:
df_tor.shape

(103, 3)

In [7]:
df_van.shape

(31, 2)

## Part 2: get coordinates for each neighborhood in Toronto and Vancouver

First create a dataframe with Toronto neighborhoods and geolocations.

In [8]:
!wget -q -O 'geospacial_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [9]:
geospacial_data = pd.read_csv('geospacial_data.csv')
geospacial_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)
geospacial_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Transform dataframe to leave only boroughs that contain the word Toronto

In [33]:
df_tor = df_tor[df_tor.Borough.str.contains("Toronto")].reset_index(drop=True)

Combine two dataframes

In [34]:
df_tor = df_tor.merge(geospacial_data, on='Postcode')
df_tor.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


Now create a dataframe with Vancouver neighborhoods and geolocations.

In [27]:
# Steps to download data from geocode API - need to get your own key
# key = ''
# geocoder = OpenCageGeocode(key)

# geospacial_data_list = []
# for index, postcode in df_van.iterrows():
#     query = postcode['Postcode']
#     results = geocoder.geocode(query)
#     for result in results:
#         if result['components']['country_code'] == 'ca':
#             geospacial_data_list.append([query, result['geometry']['lat'], result['geometry']['lng']] )
#             break
            
# geospacial_data_van = pd.DataFrame.from_records(geospacial_data_list, columns=["Postcode", "Latitude", "Longitude"])
# geospacial_data_van.to_csv('geospacial_data_van.csv')

geospacial_data_van = pd.read_csv('geospacial_data_van.csv')
geospacial_data_van.drop("Unnamed: 0", axis=1, inplace=True)
df_van = df_van.merge(geospacial_data_van, on='Postcode')
df_van.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
0,V6A,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter...",49.278226,-123.10578
2,V6C,"Waterfront , Coal Harbour , Canada Place",49.29181,-123.115989
3,V6E,"SE West End , Davie Village",49.287537,-123.120389
4,V6G,"NW West End , Stanley Park",49.299723,-123.137791


## Part 3: Plot Toronto and Vancouver Neighborhoods

In [24]:
# function to get geographical coordinates given location name
def get_lat_lon(location):
    geolocator = Nominatim(user_agent="to_explorer")
    location = geolocator.geocode(location)
    latitude = location.latitude
    longitude = location.longitude
    print('The geographical coordinate of {} are {}, {}.'.format(location, latitude, longitude))
    return (latitude, longitude)
    
to_coordinates = get_lat_lon('Toronto,ON')
van_coordinates = get_lat_lon('Vancouver,B.C.')

The geographical coordinate of Toronto, Golden Horseshoe, Ontario, M6K 1X9, Canada are 43.653963, -79.387207.
The geographical coordinate of Vancouver, Metro Vancouver, British Columbia, Canada are 49.2608724, -123.1139529.


Create a map of Toronto neighborhoods

In [26]:
# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[to_coordinates[0], to_coordinates[1]], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)  
    
toronto_map

In [28]:
# create map of Vancouver using latitude and longitude values
vancouver_map = folium.Map(location=[van_coordinates[0], van_coordinates[1]], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(df_van['Latitude'], df_van['Longitude'], df_van['Neighborhood']):
    label = str(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(vancouver_map)  
    
vancouver_map

## Part 4: Utilize the Foursquare API to explore the coffee shops

In [29]:
CLIENT_ID = 'NSD51NG2XKLQCQXNKBV40V3EPD2UCYJ5OVJGWNDTEC5GEKOU' # your Foursquare ID
CLIENT_SECRET = 'JLAMKUZ3PIT1OUBW2LDNGU4PPOWVT2L10LNPG4UU10II4ZNG' # your Foursquare Secret
VERSION = '20190926' # Foursquare API version

### Find all coffee shops in Toronto by neighborhood

In [73]:
from pprint import pprint
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value
categories = ['4bf58dd8d48988d1e0931735','5665c7b9498e7d8a4f2c0f06'] # coffee shop , corporate coffee shop
radius = 600

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name
# url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius=500&limit=100'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION)
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&categoryId={},{}&v={}&radius={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    neighborhood_latitude, 
    neighborhood_longitude,
    categories[0], 
    categories[1], 
    VERSION, 
    radius)
results = requests.get(url).json()['response']['venues']

for result in results:
    print(result['name'], result['location']['lat'], result['location']['lng'], result['categories'][0]['name'], )

Tori's Bakeshop 43.672113947269565 -79.29033140068843 Vegetarian / Vegan Restaurant
Savoury Grounds 43.680540277686 -79.28742125217829 Coffee Shop
The Remarkable Bean 43.67280091854837 -79.28703802232461 Coffee Shop
Oscar 43.6725317361254 -79.28741455815401 Coffee Shop
Veloute Bistro 43.672267 -79.289584 French Restaurant
Dip 'n Sip 43.67889707815811 -79.29774501670785 Coffee Shop
Prana Coffee 43.671306 -79.294092 Coffee Shop
Best Coffee House 43.672494 -79.28769 Coffee Shop


In [83]:
LIMIT=100
def getNearbyCoffeeShops(names, latitudes, longitudes, radius=600):
    
    coffee_shop_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&categoryId={},{}&v={}&radius={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            lat, 
            lng,
            categories[0], 
            categories[1], 
            VERSION, 
            radius)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']
        
        # return only relevant information for each nearby venue
        coffee_shop_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results])

    nearby_coffee = pd.DataFrame([item for coffee_list in coffee_shop_list for item in coffee_list])
    nearby_coffee.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_coffee)

Create a variable toronto_venues to store all neighborhoods

In [84]:
toronto_coffee_shops = getNearbyCoffeeShops(names=df_tor['Neighborhood'],
                                   latitudes=df_tor['Latitude'],
                                   longitudes=df_tor['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvall

Check size of the resulting dataframe

In [85]:
print(toronto_coffee_shops.shape)
toronto_coffee_shops.head()

(588, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
1,The Beaches,43.676357,-79.293031,Savoury Grounds,43.68054,-79.287421,Coffee Shop
2,The Beaches,43.676357,-79.293031,The Remarkable Bean,43.672801,-79.287038,Coffee Shop
3,The Beaches,43.676357,-79.293031,Veloute Bistro,43.672267,-79.289584,French Restaurant
4,The Beaches,43.676357,-79.293031,Oscar,43.672532,-79.287415,Coffee Shop


How many coffee shops were returned for each neighborhood

In [86]:
toronto_coffee_shops.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"Brockton,Exhibition Place,Parkdale Village",15,15,15,15,15,15
Business Reply Mail Processing Centre 969 Eastern,2,2,2,2,2,2
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",2,2,2,2,2,2
"Cabbagetown,St. James Town",9,9,9,9,9,9
Central Bay Street,30,30,30,30,30,30
"Chinatown,Grange Park,Kensington Market",30,30,30,30,30,30
Christie,6,6,6,6,6,6
Church and Wellesley,30,30,30,30,30,30


Let's find out how many unique categories can be curated from all the returned venues

In [20]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 237 uniques categories.


## Analyze each neighborhood

In [21]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
toronto_onehot.shape

(1719, 237)

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [23]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.066667,0.066667,0.066667,0.066667,0.2,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
toronto_grouped.shape

(38, 237)

### Print each neighborhood along with the top 3 most common venues

In [25]:
num_top_venues = 3

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
         venue  freq
0  Coffee Shop  0.08
1         Café  0.05
2          Bar  0.04


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1    Cocktail Bar  0.05
2  Farmers Market  0.04


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0  Breakfast Spot  0.09
1            Café  0.09
2     Coffee Shop  0.09


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2          Restaurant  0.05


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0   Airport Service  0.20
1  Airport Terminal  0.13
2          Boutique  0.07


----Cabbagetown,St. James Town----
         venue  freq
0  Coffee Shop  0.07
1          Pub  0.05
2  Pizza Place  0.05


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.14
1                Café  0.

### Add this info to pandas dataframe

In [26]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [27]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Burger Joint,Cosmetics Shop,Restaurant,Thai Restaurant,Hotel
1,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Steakhouse,Seafood Restaurant,Cheese Shop,Café,Farmers Market,Jazz Club
2,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Gym,Intersection,Performing Arts Venue,Pet Store,Grocery Store,Climbing Gym,Caribbean Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Comic Shop,Pizza Place,Restaurant,Butcher,Burrito Place,Brewery,Skate Park
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Terminal,Boat or Ferry,Airport Lounge,Coffee Shop,Harbor / Marina,Sculpture Garden,Airport Gate,Airport Food Court,Airport


### Cluster neighborhooods

Run k-means to cluster the neighborhood into 5 clusters

In [28]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [29]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Other Great Outdoors,Health Food Store,Trail,Pub,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Wings Joint
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Pizza Place,Brewery,Bubble Tea Shop,Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Park,Sandwich Place,Pet Store,Movie Theater,Italian Restaurant,Fish & Chips Shop,Intersection,Sushi Restaurant,Brewery,Food & Drink Shop
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Sandwich Place,Bar,Stationery Store,Fish Market,Coworking Space
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Bus Line,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


Visualize resulting clusters

In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine clusters

In [31]:
# Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Other Great Outdoors,Health Food Store,Trail,Pub,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Wings Joint
1,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Pizza Place,Brewery,Bubble Tea Shop,Restaurant
2,East Toronto,0,Park,Sandwich Place,Pet Store,Movie Theater,Italian Restaurant,Fish & Chips Shop,Intersection,Sushi Restaurant,Brewery,Food & Drink Shop
3,East Toronto,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Sandwich Place,Bar,Stationery Store,Fish Market,Coworking Space
5,Central Toronto,0,Gym,Clothing Store,Restaurant,Food & Drink Shop,Dance Studio,Hotel,Sandwich Place,Breakfast Spot,Park,Eastern European Restaurant
6,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Health & Beauty Service,Italian Restaurant,Salon / Barbershop,Restaurant,Rental Car Location,Park,Mexican Restaurant
7,Central Toronto,0,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Restaurant,Gym,Italian Restaurant,Coffee Shop,Café,Thai Restaurant
9,Central Toronto,0,Coffee Shop,Pub,Health & Beauty Service,American Restaurant,Restaurant,Sushi Restaurant,Liquor Store,Fried Chicken Joint,Sports Bar,Bagel Shop
11,Downtown Toronto,0,Coffee Shop,Bakery,Restaurant,Café,Pub,Market,Pizza Place,Italian Restaurant,Bank,Playground
12,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Pub,Pizza Place,Men's Store,Mediterranean Restaurant,Hotel


In [32]:
# Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,1,Playground,Tennis Court,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [33]:
# Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,2,Pool,Garden,Wings Joint,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [34]:
# Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,3,Park,Swim School,Bus Line,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


In [35]:
# Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,4,Park,Playground,Trail,Building,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
23,Central Toronto,4,Sushi Restaurant,Park,Trail,Jewelry Store,Wings Joint,Donut Shop,Discount Store,Dog Run,Doner Restaurant,Eastern European Restaurant


In [None]:
# TODO
# Also look at coworking spaces and tech startups