# Compare and analyze coffee shops in Toronto and Vancouver

Install and import required libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml
!pip install requests
!pip install folium
!pip install geopy
!pip install opencage
import bs4 as bs
import requests
import numpy as np 
import pandas as pd
import folium 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize
from opencage.geocoder import OpenCageGeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Part 1: Download and prepare data

The code below scrapes postal codes and neighborhoods from the following Wikipedia pages: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V

In [2]:
# Wikipedia pages scraped for the Toronto (M) and Vancouver (V) postal codes
TORONTO_POSTCODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
VANCOUVER_POSTCODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_V'


def _download_page(url):
    """Return the body text of `url`; raise on HTTP errors instead of parsing an error page."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text


def _table_cell_texts(html, attrs=None):
    """Return one list of <td> texts (newlines removed) per <tr> of the first matching table.

    Rows without any <td> cell (e.g. header rows) are skipped.
    Raises ValueError when the page contains no matching table.
    """
    soup = bs.BeautifulSoup(html, 'lxml')
    table = soup.find('table', attrs=attrs or {})
    if table is None:
        raise ValueError('no matching <table> found in the downloaded page')
    rows = []
    for table_row in table.find_all('tr'):
        cells = [cell.text.replace('\n', '') for cell in table_row.find_all('td')]
        if cells:
            rows.append(cells)
    return rows


# get postal codes and neighborhoods for Toronto
source_tor = _download_page(TORONTO_POSTCODES_URL)
data_tor = _table_cell_texts(source_tor, attrs={'class': 'wikitable sortable'})

# get postal codes and neighborhoods for Vancouver
source_van = _download_page(VANCOUVER_POSTCODES_URL)
data_van = []
for row in _table_cell_texts(source_van):
    for entry in row:
        # keep Vancouver entries only; West Vancouver and North Vancouver are filtered out
        if 'Vancouver' not in entry:
            continue
        if 'West Vancouver' in entry or 'North Vancouver' in entry:
            continue
        # the first three characters of the cell are the postal code prefix
        postal_code = entry[:3]
        # assumes every kept entry has a parenthesised, '/'-separated neighborhood list - TODO confirm
        neighborhoods = entry[entry.find("(") + 1:entry.find(")")].replace('/', ',')
        data_van.append([postal_code, neighborhoods])


Transform the data into a pandas dataframe

In [3]:
df_tor = pd.DataFrame(data_tor, columns=["Postcode", "Borough", "Neighborhood"])
df_van = pd.DataFrame(data_van, columns=["Postcode", "Neighborhood"])

# Ignore rows whose borough is Null or 'Not assigned'
has_borough = df_tor['Borough'].notnull() & (df_tor['Borough'] != 'Not assigned')
df_tor = df_tor[has_borough]

# Combine the neighborhoods that share a postcode/borough into one comma separated row
df_tor = df_tor.groupby(['Postcode', 'Borough'])['Neighborhood'].agg(','.join).reset_index()

# If a row has a borough but a 'Not assigned' neighborhood, the neighborhood becomes the borough.
# (This used to be hard-coded as "Queen's Park", the single case noted in the original comment;
# copying the borough keeps that result and also covers any further such row.)
no_neighborhood = df_tor['Neighborhood'] == 'Not assigned'
df_tor.loc[no_neighborhood, 'Neighborhood'] = df_tor.loc[no_neighborhood, 'Borough']

In [4]:
# Preview the first Toronto rows (one row per Postcode/Borough pair after the groupby)
df_tor.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# Preview the first Vancouver rows (postal code prefix plus its comma separated neighborhoods)
df_van.head()

Unnamed: 0,Postcode,Neighborhood
0,V6A,"Strathcona , Chinatown , Downtown Eastside"
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter..."
2,V6C,"Waterfront , Coal Harbour , Canada Place"
3,V6E,"SE West End , Davie Village"
4,V6G,"NW West End , Stanley Park"


In [6]:
# (rows, columns) of the Toronto frame
df_tor.shape

(103, 3)

In [7]:
# (rows, columns) of the Vancouver frame
df_van.shape

(31, 2)

### Get coordinates for each neighborhood in Toronto and Vancouver

First create a dataframe with Toronto neighborhoods and geolocations.

In [8]:
# Download the coordinates CSV with the `wget` shell command:
# -q silences wget's output, -O names the local output file.
# NOTE: the message below is printed even when the download fails, because a
# failing `!` shell command does not raise a Python exception.
!wget -q -O 'geospacial_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [9]:
# Load the downloaded coordinates and rename the key column so it matches df_tor's 'Postcode'
geospacial_data = (
    pd.read_csv('geospacial_data.csv')
    .rename(columns={'Postal Code': 'Postcode'})
)
geospacial_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Filter the dataframe to keep only the boroughs whose name contains the word "Toronto"

In [33]:
# Boolean mask of the rows whose borough name contains "Toronto"
in_toronto = df_tor['Borough'].str.contains("Toronto")
df_tor = df_tor.loc[in_toronto].reset_index(drop=True)

Combine the two dataframes: merge the Toronto neighborhoods with their coordinates on `Postcode`

In [34]:
# Attach Latitude/Longitude to each Toronto postcode (default inner join on 'Postcode')
df_tor = pd.merge(df_tor, geospacial_data, on='Postcode')
df_tor.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


Now create a dataframe with Vancouver neighborhoods and geolocations.

In [27]:
# The Vancouver coordinates are read from the cached file 'geospacial_data_van.csv'.
# The disabled steps below show how that file can be produced with the OpenCage
# geocoding API - you need to get your own key:
#
# key = ''
# geocoder = OpenCageGeocode(key)

# geospacial_data_list = []
# for index, postcode in df_van.iterrows():
#     query = postcode['Postcode']
#     results = geocoder.geocode(query)
#     for result in results:
#         if result['components']['country_code'] == 'ca':
#             geospacial_data_list.append([query, result['geometry']['lat'], result['geometry']['lng']] )
#             break

# geospacial_data_van = pd.DataFrame.from_records(geospacial_data_list, columns=["Postcode", "Latitude", "Longitude"])
# geospacial_data_van.to_csv('geospacial_data_van.csv')

# The cached CSV carries an "Unnamed: 0" column, which is dropped before merging
geospacial_data_van = pd.read_csv('geospacial_data_van.csv').drop(columns="Unnamed: 0")
df_van = pd.merge(df_van, geospacial_data_van, on='Postcode')
df_van.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
0,V6A,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585
1,V6B,"NE Downtown , Gastown , Harbour Centre , Inter...",49.278226,-123.10578
2,V6C,"Waterfront , Coal Harbour , Canada Place",49.29181,-123.115989
3,V6E,"SE West End , Davie Village",49.287537,-123.120389
4,V6G,"NW West End , Stanley Park",49.299723,-123.137791


## Part 2: Plot Toronto and Vancouver Neighborhoods

In [24]:
# function to get geographical coordinates given location name
def get_lat_lon(location):
    """Geocode a place name with Nominatim and return its coordinates.

    Parameters
    ----------
    location : str
        Free-form place query, e.g. 'Toronto,ON'.

    Returns
    -------
    tuple
        (latitude, longitude) of the match returned by the geocoder.

    Raises
    ------
    ValueError
        If the geocoder finds no match for `location`.
    """
    geolocator = Nominatim(user_agent="to_explorer")
    match = geolocator.geocode(location)
    if match is None:
        # geocode() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on `.latitude`
        raise ValueError('Could not geocode location: {!r}'.format(location))
    latitude = match.latitude
    longitude = match.longitude
    # the resolved match (not the raw query string) is what gets printed
    print('The geographical coordinate of {} are {}, {}.'.format(match, latitude, longitude))
    return (latitude, longitude)
    
# Reference (latitude, longitude) pair for each city, as returned by get_lat_lon
to_coordinates = get_lat_lon('Toronto,ON')
van_coordinates = get_lat_lon('Vancouver,B.C.')

The geographical coordinate of Toronto, Golden Horseshoe, Ontario, M6K 1X9, Canada are 43.653963, -79.387207.
The geographical coordinate of Vancouver, Metro Vancouver, British Columbia, Canada are 49.2608724, -123.1139529.


Create a map of Toronto neighborhoods

In [26]:
# create map of Toronto centred on the city's latitude and longitude
city_lat, city_lng = to_coordinates[0], to_coordinates[1]
toronto_map = folium.Map(location=[city_lat, city_lng], zoom_start=10)

# one blue circle marker per row, with a "neighborhood, borough" popup
marker_columns = df_tor[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(toronto_map)

toronto_map

In [28]:
# create map of Vancouver centred on the city's latitude and longitude
city_lat, city_lng = van_coordinates[0], van_coordinates[1]
vancouver_map = folium.Map(location=[city_lat, city_lng], zoom_start=10)

# one blue circle marker per row, with the neighborhood list as popup
marker_columns = df_van[['Latitude', 'Longitude', 'Neighborhood']]
for lat, lng, neighborhood in marker_columns.itertuples(index=False, name=None):
    popup = folium.Popup(str(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(vancouver_map)

vancouver_map

## Part 3: Utilize the Foursquare API to explore the coffee shops

In [29]:
import os

# Foursquare credentials are read from the environment so that real keys never have to be
# typed into (and saved with) the notebook. When a variable is unset the value falls back
# to '' - the placeholder this cell used before.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20190928' # Foursquare API version

### Find all coffee shops in Toronto by neighborhood

In [91]:
# NOTE(review): this whole cell is disabled scratch code for querying a single neighborhood.
# It refers to `toronto_data` and a notebook-level `categories`, neither of which is defined
# in the visible cells - confirm or adapt before re-enabling.
# from pprint import pprint
# neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
# neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value
# radius = 600
# neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name
# # url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius=500&limit=100'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION)
# url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&categoryId={},{}&v={}&radius={}'.format(
#     CLIENT_ID, 
#     CLIENT_SECRET, 
#     neighborhood_latitude, 
#     neighborhood_longitude,
#     categories[0], 
#     categories[1], 
#     VERSION, 
#     radius)
# results = requests.get(url).json()['response']['venues']

# for result in results:
#     print(result['name'], result['location']['lat'], result['location']['lng'], result['categories'][0]['name'], )

In [133]:
def getNearbyCoffeeShops(names, latitudes, longitudes, radius=700, limit=None):
    """Query the Foursquare venue search around each neighborhood and collect its coffee shops.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood label and the coordinates to search around; iterated in parallel.
    radius : int, default 700
        Search radius forwarded to the API (presumably meters - confirm against the API docs).
    limit : int, optional
        Maximum number of venues requested per call. When None (default) the parameter is
        not sent and the API's own default applies.

    Returns
    -------
    pandas.DataFrame
        One row per (neighborhood, venue) with the columns Neighborhood, Neighborhood Latitude,
        Neighborhood Longitude, Venue, Venue Latitude, Venue Longitude, Venue Category.
        The frame is empty (but has these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Id's correspond to Coffee shop, Corporate coffee shop, Café, Pet Café
    # Excluded from this list are College Cafeteria, Corporate Cafeteria, coffee houses, Gaming Cafe, and Internet Cafe
    categories = ['4bf58dd8d48988d1e0931735', '5665c7b9498e7d8a4f2c0f06','4bf58dd8d48988d16d941735', '56aa371be4b08b9a8d573508']

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    coffee_shop_rows = []
    # entries already collected; a set makes the duplicate check effective and O(1)
    # (the previous check compared a tuple against one-element lists and never matched)
    seen = set()
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(lat, lng),
            'categoryId': ','.join(categories),
            'v': VERSION,
            'radius': radius,
        }
        if limit is not None:
            params['limit'] = limit

        # make the GET request; fail loudly on HTTP errors and never hang forever
        response = requests.get('https://api.foursquare.com/v2/venues/search', params=params, timeout=30)
        response.raise_for_status()
        venues = response.json()["response"]['venues']

        # return only coffee shops, filter out restaurants that serve coffee, eliminate duplicates
        for v in venues:
            venue_categories = v.get('categories') or []
            if not venue_categories:
                # a venue without any category cannot be classified; skip it instead of crashing
                continue
            category = venue_categories[0]['name']
            if 'Coffee' not in category and 'Caf' not in category:
                continue
            entry = (name, lat, lng, v['name'], v['location']['lat'], v['location']['lng'], category)
            if entry in seen:
                continue
            seen.add(entry)
            coffee_shop_rows.append(entry)

    # passing the column names here keeps an empty result valid
    return pd.DataFrame(coffee_shop_rows, columns=columns)

Create a variable `toronto_coffee_shops` to store all neighborhoods and coffee shops

In [134]:
# Search around every Toronto row's coordinates, using the function's default radius
toronto_coffee_shops = getNearbyCoffeeShops(
    df_tor['Neighborhood'],
    df_tor['Latitude'],
    df_tor['Longitude'],
)

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvall

Check size of the resulting dataframe

In [135]:
# Print (rows, columns), then show the first rows; each row is one (neighborhood, venue) pair
print(toronto_coffee_shops.shape)
toronto_coffee_shops.head()

(723, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Oscar Coffee and Espresso Bar,43.6726,-79.287701,Café
1,The Beaches,43.676357,-79.293031,The Remarkable Bean,43.672801,-79.287038,Coffee Shop
2,The Beaches,43.676357,-79.293031,Juice and Java,43.671062,-79.295755,Café
3,The Beaches,43.676357,-79.293031,Savoury Grounds,43.68054,-79.287421,Coffee Shop
4,The Beaches,43.676357,-79.293031,The Porch Light,43.680679,-79.286561,Coffee Shop


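Looks like overall **Toronto has 723 coffee shops** according to Foursquare API results. Note that each row is a (neighborhood, venue) pair: where the search circles of nearby postcodes overlap, the same venue can be listed under more than one neighborhood, so 723 is an upper bound on the number of distinct coffee shops.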

### Find out which neighborhood in Toronto has the most coffee shops

In [136]:
# Count the non-null entries of every column per neighborhood
venues_by_neighborhood = toronto_coffee_shops.groupby('Neighborhood')
sorted_tor = venues_by_neighborhood.count()
# NOTE: only the displayed view is sorted; `sorted_tor` itself keeps the groupby order
sorted_tor.sort_values(by='Venue', ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Harbourfront East,Toronto Islands,Union Station",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"Brockton,Exhibition Place,Parkdale Village",29,29,29,29,29,29
Stn A PO Boxes 25 The Esplanade,29,29,29,29,29,29
"Design Exchange,Toronto Dominion Centre",28,28,28,28,28,28
Church and Wellesley,27,27,27,27,27,27
"The Annex,North Midtown,Yorkville",27,27,27,27,27,27
"First Canadian Place,Underground city",27,27,27,27,27,27
"Commerce Court,Victoria Hotel",27,27,27,27,27,27
"Adelaide,King,Richmond",27,27,27,27,27,27


### Find and map all coffee shops in Vancouver by neighborhood

Create a variable `vancouver_coffee_shops` to store all neighborhoods and coffee shops

In [137]:
# Search around every Vancouver row's coordinates, using the function's default radius
vancouver_coffee_shops = getNearbyCoffeeShops(
    df_van['Neighborhood'],
    df_van['Latitude'],
    df_van['Longitude'],
)

Strathcona , Chinatown , Downtown Eastside
NE Downtown , Gastown , Harbour Centre , International Village , Victory Square , Yaletown
Waterfront , Coal Harbour , Canada Place
SE West End , Davie Village
NW West End , Stanley Park
West Fairview , Granville Island , NE Shaughnessy
NW Shaughnessy , East Kitsilano , Quilchena
North Hastings-Sunrise
Central Kitsilano , Greektown
North Grandview-Woodland
NW Arbutus Ridge , NE Dunbar-Southlands
South Hastings-Sunrise , North Renfrew-Collingwood
South Shaughnessy , NW Oakridge , NE Kerrisdale , SE Arbutus Ridge
South Grandview-Woodland , NE Kensington-Cedar Cottage
West Kerrisdale , South Dunbar-Southlands , Musqueam
SE Kensington-Cedar Cottage , Victoria-Fraserview
SE Kerrisdale , SW Oakridge , West Marpole
South Renfrew-Collingwood
West Kitsilano , West Point Grey , Jericho
Killarney
NW Dunbar-Southlands , Chaldecutt , South University Endowment Lands
East Mount Pleasant
UBC
West Kensington-Cedar Cottage , NE Riley Park-Little Mountain
SE Ri

Check size of the resulting dataframe

In [138]:
# Print (rows, columns), then show the first rows; each row is one (neighborhood, venue) pair
print(vancouver_coffee_shops.shape)
vancouver_coffee_shops.head()

(406, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Railtown Cafe,49.270443,-123.100794,Café
1,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Tim Hortons,49.27315,-123.100556,Coffee Shop
2,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Tim Hortons,49.272134,-123.097706,Coffee Shop
3,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Starbucks,49.27293,-123.099825,Coffee Shop
4,"Strathcona , Chinatown , Downtown Eastside",49.271119,-123.100585,Starbucks,49.269334,-123.10285,Coffee Shop


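Looks like overall **Vancouver has 406 coffee shops** according to Foursquare API results. As for Toronto, each row is a (neighborhood, venue) pair, so a venue that lies within the search radius of several postcodes can be counted more than once; 406 is an upper bound on the number of distinct coffee shops.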

### Find out which neighborhood in Vancouver has the most coffee shops

In [140]:
# Count the non-null entries of every column per neighborhood
venues_by_neighborhood = vancouver_coffee_shops.groupby('Neighborhood')
sorted_van = venues_by_neighborhood.count()
# NOTE: only the displayed view is sorted; `sorted_van` itself keeps the groupby order
sorted_van.sort_values(by='Venue', ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Mount Pleasant,27,27,27,27,27,27
"Central Kitsilano , Greektown",26,26,26,26,26,26
"NW Shaughnessy , East Kitsilano , Quilchena",26,26,26,26,26,26
"NE Downtown , Gastown , Harbour Centre , International Village , Victory Square , Yaletown",25,25,25,25,25,25
"Waterfront , Coal Harbour , Canada Place",25,25,25,25,25,25
North Grandview-Woodland,25,25,25,25,25,25
Bentall Centre,23,23,23,23,23,23
"West Fairview , Granville Island , NE Shaughnessy",23,23,23,23,23,23
"SE West End , Davie Village",23,23,23,23,23,23
"Strathcona , Chinatown , Downtown Eastside",22,22,22,22,22,22


## Part 4: Cluster coffee shops in Toronto and Vancouver and map them

Run k-means to cluster the neighborhoods into 5 clusters