# Compare and analyze coffee shops in Toronto and Vancouver

Install and import required libraries

In [340]:
!pip install beautifulsoup4
!pip install lxml
!pip install requests
!pip install folium
!pip install geopy
!pip install opencage
import bs4 as bs
import requests
import numpy as np 
import pandas as pd
import folium 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize
from opencage.geocoder import OpenCageGeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Part 1: Download and prepare data

The code below scrapes postal codes and neighborhoods from the following Wikipedia pages: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V

In [341]:
# get postal codes and neighborhoods for Toronto
source_tor = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = bs.BeautifulSoup(source_tor, 'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
data_tor = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.replace('\n', '') for tr in td]
    data_tor.append(row)

# get postal codes and neighborhoods for Vancouver
source_van = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V').text
soup = bs.BeautifulSoup(source_van, 'lxml')
table = soup.find('table')
table_rows = table.find_all('tr')
data_van = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.replace('\n', '') for tr in td]
    for entry in row:
        # filter out non-Vancouver neighborhoods
        if 'West Vancouver' not in entry and 'North Vancouver' not in entry:
            if 'Vancouver' in entry:
                postal_code = entry[:3]
                neighborhoods = entry[entry.find("(")+1:entry.find(")")].replace('/', ',')
                data_van.append([postal_code, neighborhoods])


Transform the data into a pandas dataframe

In [342]:
df_tor = pd.DataFrame(data_tor, columns=["Postcode", "Borough", "Neighborhood"])
df_van = pd.DataFrame(data_van, columns=["Postcode", "Neighborhood"])

# Ignore cells with a borough that is Not assigned or Null 
df_tor = df_tor[df_tor['Borough'] != 'Not assigned']
df_tor = df_tor[df_tor['Borough'].notnull()]

# Combine neighborhoods into one comma separated row
df_tor = df_tor.groupby(['Postcode', 'Borough'])['Neighborhood'].agg(lambda x : ','.join(x)).to_frame().reset_index()

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough (only 1 case)
df_tor.loc[df_tor['Neighborhood'] == 'Not assigned', 'Neighborhood'] = 'Queen\'s Park'

In [343]:
df_tor.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [344]:
df_van.head()

Unnamed: 0,Postcode,Neighborhood
0,V6A,"Strathcona , Chinatown , Downtown Eastside"
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter..."
2,V6C,"Waterfront , Coal Harbour , Canada Place"
3,V6E,"SE West End , Davie Village"
4,V6G,"NW West End , Stanley Park"


In [345]:
df_tor.shape

(103, 3)

In [346]:
df_van.shape

(31, 2)

### Get coordinates for each neighborhood in Toronto and Vancouver

First create a dataframe with Toronto neighborhoods and geolocations.

In [347]:
!wget -q -O 'geospacial_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [348]:
geospacial_data = pd.read_csv('geospacial_data.csv')
geospacial_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)
geospacial_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Transform dataframe to leave only boroughs that contain the word Toronto

In [349]:
# df_tor = df_tor[df_tor.Borough.str.contains("Toronto")].reset_index(drop=True)

Combine two dataframes

In [350]:
df_tor = df_tor.merge(geospacial_data, on='Postcode')
df_tor.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


Now create a dataframe with Vancouver neighborhoods and geolocations.

In [351]:
# Steps to download data from geocode API - need to get your own key
# key = ''
# geocoder = OpenCageGeocode(key)

# geospacial_data_list = []
# for index, postcode in df_van.iterrows():
#     query = postcode['Postcode']
#     results = geocoder.geocode(query)
#     for result in results:
#         if result['components']['country_code'] == 'ca':
#             geospacial_data_list.append([query, result['geometry']['lat'], result['geometry']['lng']] )
#             break
            
# geospacial_data_van = pd.DataFrame.from_records(geospacial_data_list, columns=["Postcode", "Latitude", "Longitude"])
# geospacial_data_van.to_csv('geospacial_data_van.csv')

geospacial_data_van = pd.read_csv('geospacial_data_van.csv')
geospacial_data_van.drop("Unnamed: 0", axis=1, inplace=True)
df_van = df_van.merge(geospacial_data_van, on='Postcode')
df_van.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
0,V6A,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter...",49.278226,-123.10578
2,V6C,"Waterfront , Coal Harbour , Canada Place",49.29181,-123.115989
3,V6E,"SE West End , Davie Village",49.287537,-123.120389
4,V6G,"NW West End , Stanley Park",49.299723,-123.137791


## Part 2: Plot Toronto and Vancouver Neighborhoods

In [352]:
# function to get geographical coordinates given location name
def get_lat_lon(location):
    geolocator = Nominatim(user_agent="to_explorer")
    location = geolocator.geocode(location)
    latitude = location.latitude
    longitude = location.longitude
    print('The geographical coordinate of {} are {}, {}.'.format(location, latitude, longitude))
    return (latitude, longitude)
    
to_coordinates = get_lat_lon('Toronto,ON')
van_coordinates = get_lat_lon('Vancouver,B.C.')

The geographical coordinate of Toronto, Golden Horseshoe, Ontario, M6K 1X9, Canada are 43.653963, -79.387207.
The geographical coordinate of Vancouver, Metro Vancouver, British Columbia, Canada are 49.2608724, -123.1139529.


Create a map of Toronto neighborhoods

In [404]:
# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[to_coordinates[0], to_coordinates[1]], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)  
    
toronto_map

In [405]:
# create map of Vancouver using latitude and longitude values
vancouver_map = folium.Map(location=[van_coordinates[0], van_coordinates[1]], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(df_van['Latitude'], df_van['Longitude'], df_van['Neighborhood']):
    label = str(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(vancouver_map)  
    
vancouver_map

In [355]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20190929' # Foursquare API version

### Find all coffee shops in Toronto by neighborhood

In [357]:
def getNearbyCoffeeShops(names, latitudes, longitudes, radius=700):
    
    # Id's correspond to Coffee shop, Corporate coffee shop, Café, Pet Café
    # Excluded from this list are College Cafeteria, Corporate Cafeteria, coffee houses, Gaming Cafe, and Internet Cafe 
    categories = ['4bf58dd8d48988d1e0931735', '5665c7b9498e7d8a4f2c0f06','4bf58dd8d48988d16d941735', '56aa371be4b08b9a8d573508'] 
    
    coffee_shop_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&categoryId={},{},{},{}&v={}&radius={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            lat, 
            lng,
            categories[0], 
            categories[1],
            categories[2],
            categories[3],
            VERSION, 
            radius)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']
        
        # return only coffee shops, filter out restaurants that serve coffee, try to eliminate duplicates
        for v in results: 
            entry = (name, lat, lng, v['name'], v['location']['lat'], v['location']['lng'],  v['categories'][0]['name'])
            if ('Coffee' in v['categories'][0]['name'] or 'Caf' in v['categories'][0]['name']) and entry not in coffee_shop_list:
                coffee_shop_list.append([entry])

    nearby_coffee = pd.DataFrame([item for coffee_list in coffee_shop_list for item in coffee_list])
    nearby_coffee.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_coffee)

Create a variable `toronto_coffee_shops` to store all neighborhoods and coffee shops

In [358]:
toronto_coffee_shops = getNearbyCoffeeShops(names=df_tor['Neighborhood'],
                                   latitudes=df_tor['Latitude'],
                                   longitudes=df_tor['Longitude']
                                  )

Check size of the resulting dataframe

In [359]:
print(toronto_coffee_shops.shape)
toronto_coffee_shops.head()

(977, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Tim Hortons,43.802,-79.198169,Coffee Shop
1,"Rouge,Malvern",43.806686,-79.194353,Second Cup,43.802165,-79.196114,Coffee Shop
2,"Rouge,Malvern",43.806686,-79.194353,Tim Hortons / Esso,43.801863,-79.199296,Coffee Shop
3,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Highland Harvest,43.784192,-79.169507,Café
4,"Guildwood,Morningside,West Hill",43.763573,-79.188711,Starbucks,43.77037,-79.18649,Coffee Shop


Looks like overall **Toronto has 977 coffee shops** according to Foursquare API results

### Find resident to coffee shop ratio

According to StatCan 2016 census the city of Toronto has 5,429,524 residents

In [378]:
num_residents_tor = 5429524
num_coffee_shops_tor = toronto_coffee_shops.shape[0]
ratio_tor = num_residents_tor/num_coffee_shops_tor
ratio_tor

5557.3428863868985

### Find out top 10 coffee chains in Toronto

In [403]:
tc = toronto_coffee_shops.groupby('Venue').count().sort_values('Neighborhood', ascending=False)
# vc = vc.drop(['Neighborhood'])
tc.head(10)

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Venue Category
Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Starbucks,201,201,201,201,201,201
Tim Hortons,165,165,165,165,165,165
Second Cup,31,31,31,31,31,31
Aroma Espresso Bar,20,20,20,20,20,20
Pilot Coffee Roasters,14,14,14,14,14,14
Coffee Time,12,12,12,12,12,12
Timothy's World Coffee,11,11,11,11,11,11
Country Style,10,10,10,10,10,10
McCafé,9,9,9,9,9,9
Versus Coffee,8,8,8,8,8,8


Map all of Toronto's coffee shops

In [374]:
from folium import plugins

toronto_coffee_map = folium.Map(location = [to_coordinates[0], to_coordinates[1]], zoom_start = 10)

coffee_spots = plugins.MarkerCluster().add_to(toronto_coffee_map)

# loop through the dataframe and add each data point to the mark cluster
for lat, lng, label, in zip(toronto_coffee_shops['Venue Latitude'], toronto_coffee_shops['Venue Longitude'], toronto_coffee_shops['Venue']):
    folium.Marker(
        location=[lat, lng],
        icon=None
    ).add_to(coffee_spots)

# display map
toronto_coffee_map

### Find out which neighborhood in Toronto has the most coffee shops

In [409]:
sorted_tor = toronto_coffee_shops.groupby('Neighborhood').count()
sorted_tor = sorted_tor.sort_values('Venue', ascending=False)
sorted_tor.head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,30,30,30,30,30,30
"Harbourfront East,Toronto Islands,Union Station",30,30,30,30,30,30
Stn A PO Boxes 25 The Esplanade,29,29,29,29,29,29
"Brockton,Exhibition Place,Parkdale Village",28,28,28,28,28,28
"Design Exchange,Toronto Dominion Centre",28,28,28,28,28,28


### Find and map all coffee shops in Vancouver by neighborhood

Create a variable `vancouver_coffee_shops` to store all neighborhoods and coffee shops

In [362]:
vancouver_coffee_shops = getNearbyCoffeeShops(names=df_van['Neighborhood'],
                                   latitudes=df_van['Latitude'],
                                   longitudes=df_van['Longitude']
                                  )

Check size of the resulting dataframe

In [363]:
print(vancouver_coffee_shops.shape)
vancouver_coffee_shops.head()

(406, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Railtown Cafe,49.270443,-123.100794,Café
1,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Tim Hortons,49.27315,-123.100556,Coffee Shop
2,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Tim Hortons,49.272134,-123.097706,Coffee Shop
3,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Starbucks,49.27293,-123.099825,Coffee Shop
4,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Starbucks,49.269334,-123.10285,Coffee Shop


Looks like overall **Vancouver has 406 coffee shops** according to Foursquare API results.

### Find resident to coffee shop ratio

According to StatCan there were 631,486 people living in the city of Vancouver as of 2016

In [379]:
num_residents_van = 631486
num_coffee_shops_van = vancouver_coffee_shops.shape[0]
ratio_van = num_residents_van/num_coffee_shops_van
ratio_van

1555.384236453202

### Find out 10 most common coffee chains in Vancouver

In [402]:
vc = vancouver_coffee_shops.groupby('Venue').count().sort_values('Neighborhood', ascending=False)
# vc = vc.drop(['Neighborhood'])
vc.head(10)

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Venue Category
Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Starbucks,88,88,88,88,88,88
Tim Hortons,42,42,42,42,42,42
JJ Bean,16,16,16,16,16,16
Blenz Coffee,10,10,10,10,10,10
Pallet Coffee Roasters,5,5,5,5,5,5
Giovane Café + Market + Eatery,4,4,4,4,4,4
Elysian Coffee,4,4,4,4,4,4
Matchstick Coffee Roasters,3,3,3,3,3,3
Bean Around The World,3,3,3,3,3,3
Bel Café,3,3,3,3,3,3


In [364]:
from folium import plugins

vancouver_coffee_map = folium.Map(location = [van_coordinates[0], van_coordinates[1]], zoom_start = 12)

van_coffee_spots = plugins.MarkerCluster().add_to(vancouver_coffee_map)

# loop through the dataframe and add each data point to the mark cluster
for lat, lng, label, in zip(vancouver_coffee_shops['Venue Latitude'], vancouver_coffee_shops['Venue Longitude'], vancouver_coffee_shops['Venue']):
    folium.Marker(
        location=[lat, lng],
        icon=None
    ).add_to(van_coffee_spots)

# display map
vancouver_coffee_map

### Find out which neighborhood in Vancouver has the most coffee shops

In [365]:
sorted_van = vancouver_coffee_shops.groupby('Neighborhood').count()
sorted_van.sort_values('Venue', ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Mount Pleasant,27,27,27,27,27,27
"Central Kitsilano , Greektown",26,26,26,26,26,26
"NW Shaughnessy , East Kitsilano , Quilchena",26,26,26,26,26,26
"NE Downtown , Gastown , Harbour Centre , International Village , Victory Square , Yaletown",25,25,25,25,25,25
"Waterfront , Coal Harbour , Canada Place",25,25,25,25,25,25
North Grandview-Woodland,25,25,25,25,25,25
Bentall Centre,23,23,23,23,23,23
"West Fairview , Granville Island , NE Shaughnessy",23,23,23,23,23,23
"SE West End , Davie Village",23,23,23,23,23,23
"Strathcona , Chinatown , Downtown Eastside",22,22,22,22,22,22


## Part 4: Cluster coffee shops in Toronto and Vancouver and map them

Run k-means to cluster the Toronto coffee into 5 clusters

In [366]:
kclusters = 5
toronto_onehot = pd.get_dummies(toronto_coffee_shops[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_coffee_shops['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
toronto_grouped_clustering.head()


Unnamed: 0,Café,Coffee Shop,Corporate Cafeteria,Gaming Cafe,Pet Café
0,0.148148,0.851852,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.333333,0.666667,0.0,0.0,0.0
3,0.5,0.5,0.0,0.0,0.0
4,0.5,0.5,0.0,0.0,0.0


In [367]:
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([3, 1, 3, 0, 0, 1, 0, 3, 3, 2, 3, 0, 3, 1, 1, 0, 4, 3, 0, 1, 0, 0, 3,
       3, 1, 2, 3, 0, 0, 0, 1, 3, 0, 1, 3, 2, 1, 1, 1, 1, 0, 3, 3, 4, 0, 0,
       1, 3, 3, 3, 4, 2, 1, 0, 3, 2, 1, 0, 3, 4, 4, 3, 3, 3, 2, 4, 0, 0, 3,
       3, 1, 2, 1, 0, 3, 1, 3, 1, 0, 0, 3, 0, 0, 1, 4, 3, 0, 1, 3, 0, 1, 1,
       0, 4, 1], dtype=int32)

In [368]:
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto_grouped_clustering
toronto_merged = pd.concat([toronto_merged, toronto_grouped], axis=1)
toronto_merged2 = toronto_merged.drop([toronto_merged.columns[2]] ,  axis='columns')
toronto_merged2 = toronto_merged2.merge(toronto_data, on='Neighborhood')
toronto_merged2.head()

Unnamed: 0,Café,Coffee Shop,Gaming Cafe,Pet Café,Cluster Labels,Neighborhood,Café.1,Coffee Shop.1,Gaming Cafe.1,Pet Café.1,Postcode,Borough,Latitude,Longitude
0,0.148148,0.851852,0.0,0.0,3,"Adelaide,King,Richmond",0.148148,0.851852,0.0,0.0,M5H,Downtown Toronto,43.650571,-79.384568
1,0.166667,0.833333,0.0,0.0,3,Berczy Park,0.166667,0.833333,0.0,0.0,M5E,Downtown Toronto,43.644771,-79.373306
2,0.5,0.5,0.0,0.0,0,"Brockton,Exhibition Place,Parkdale Village",0.5,0.5,0.0,0.0,M6K,West Toronto,43.636847,-79.428191
3,0.0,0.5,0.0,0.5,3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.5,0.0,0.5,M7Y,East Toronto,43.662744,-79.321558
4,0.0,1.0,0.0,0.0,1,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,1.0,0.0,0.0,M5V,Downtown Toronto,43.628947,-79.39442


Map coffee shop clusters

In [384]:
# create map
map_clusters = folium.Map(location=[to_coordinates[0], to_coordinates[1]], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged2['Latitude'], toronto_merged2['Longitude'], toronto_merged2['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Run k-means to cluster the Vancouver coffee into 5 clusters

In [370]:
vancouver_onehot = pd.get_dummies(vancouver_coffee_shops[['Venue Category']], prefix="", prefix_sep="")
vancouver_onehot['Neighborhood'] = vancouver_coffee_shops['Neighborhood'] 
fixed_columns = [vancouver_onehot.columns[-1]] + list(vancouver_onehot.columns[:-1])
vancouver_onehot = vancouver_onehot[fixed_columns]
vancouver_grouped = vancouver_onehot.groupby('Neighborhood').mean().reset_index()
vancouver_grouped_clustering = vancouver_grouped.drop('Neighborhood', 1)
vancouver_grouped_clustering.head()

Unnamed: 0,Café,Coffee Shop,Pet Café
0,0.086957,0.913043,0.0
1,0.384615,0.615385,0.0
2,0.375,0.625,0.0
3,0.37037,0.592593,0.037037
4,0.333333,0.666667,0.0


In [371]:
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(vancouver_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 1, 1, 1, 1, 3, 2, 1, 0, 1, 4, 3, 0, 1, 2, 2, 3, 4, 3, 4, 1, 1, 2,
       1, 1, 1, 1, 4, 1], dtype=int32)

In [380]:
vancouver_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
vancouver_merged = vancouver_grouped_clustering
vancouver_merged = pd.concat([vancouver_merged, vancouver_grouped], axis=1)
vancouver_merged2 = vancouver_merged.drop([vancouver_merged.columns[2]] ,  axis='columns')
vancouver_merged2 = vancouver_merged2.merge(vancouver_coffee_shops, on='Neighborhood')
vancouver_merged2.head()

Unnamed: 0,Café,Coffee Shop,Cluster Labels,Neighborhood,Café.1,Coffee Shop.1,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0.086957,0.913043,2,Bentall Centre,0.086957,0.913043,49.287537,-123.120389,Starbucks,49.287608,-123.119688,Coffee Shop
1,0.086957,0.913043,2,Bentall Centre,0.086957,0.913043,49.287537,-123.120389,Starbucks,49.288327,-123.117649,Coffee Shop
2,0.086957,0.913043,2,Bentall Centre,0.086957,0.913043,49.287537,-123.120389,Tim Hortons,49.287662,-123.11503,Coffee Shop
3,0.086957,0.913043,2,Bentall Centre,0.086957,0.913043,49.287537,-123.120389,Starbucks,49.287831,-123.123609,Coffee Shop
4,0.086957,0.913043,2,Bentall Centre,0.086957,0.913043,49.287537,-123.120389,Starbucks,49.28299,-123.1157,Coffee Shop


Map coffee shop clusters in Vancouver

In [381]:
# create map
map_clusters = folium.Map(location=[van_coordinates[0], van_coordinates[1]], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(vancouver_merged2['Neighborhood Latitude'], vancouver_merged2['Neighborhood Longitude'], vancouver_merged2['Neighborhood'], vancouver_merged2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters