# Part 1: Creating the dataframe

In [1]:
import pandas as pd
#!pip install lxml bs4 html5lib
import html5lib
import bs4
#!pip install BeautifulSoup4


In [2]:
boroughs = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',flavor='html5lib')[0]

In [3]:
boroughs.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
#removing "Not assigned" boroughs
boroughs = boroughs.where(boroughs['Borough'] != 'Not assigned').dropna()
boroughs.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Check if any cells have a neighborhood with 'Not assigned' or empty
boroughs[(boroughs['Neighbourhood'] == 'Not assigned') | (boroughs['Neighbourhood'] == '')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


No cells contain empty or 'not assigned' for their neighborhood value.

In [6]:
print('The shape of the dataframe is', boroughs.shape,'.')

The shape of the dataframe is (103, 3) .


# Part 2 : Finding coordinate data

Geocoder is not working, so we will import the csv file.

In [7]:
#import the csv
postalcodedata = pd.read_csv('http://cocl.us/Geospatial_data')

In [8]:
postalcodedata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#merging the two tables
finaltable = pd.merge(boroughs,postalcodedata,how='inner', left_on='Postal Code', right_on='Postal Code')

In [10]:
finaltable.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3: Segmenting Boroughs in Toronto

In [11]:
# Downsizing the dataframe for us to work with less boroughs:
counter =[]
for i in range(0,len(finaltable)):
    if 'Toronto' not in finaltable.iloc[i,1]:
        counter.append(i)
        
finaltable.drop(counter, inplace=True)
finaltable.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [13]:
#resetting index for finaltable
finaltable.reset_index(inplace=True, drop=True)
finaltable.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [37]:
#size of finaltable
finaltable.shape

(39, 6)

### Let's cluster the neighborhoods in Toronto based on the concentration of 4 different types of cuisine, and we will use the k-means algorithm for that, and later plot the results on a map

Let's first find the data for the top 100 venues in each neighborhood, before we choose what cuisines to group by.


In [90]:
# Defining the function for importing venues from Foursquare
def getNearbyVenues(names, latitudes, longitudes, Zip, radius=500):
    
    venues_list=[]
    for Zip, name, lat, lng in zip(Zip, names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            Zip,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code','Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
# Saving parameters for Foursquare (Content Hidden)
CLIENT_ID = 
CLIENT_SECRET = 
VERSION = '20180605'
LIMIT = 100 

In [91]:
# Using the function and creating a data table
import requests
toronto_venues = getNearbyVenues(names=finaltable['Borough'], latitudes=finaltable['Latitude'],longitudes=finaltable['Longitude'], Zip=finaltable['Postal Code'])

Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
East Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
West Toronto
Downtown Toronto
West Toronto
East Toronto
Downtown Toronto
West Toronto
East Toronto
Downtown Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
West Toronto
Central Toronto
Central Toronto
West Toronto
Central Toronto
Downtown Toronto
West Toronto
Central Toronto
Downtown Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
East Toronto


In [92]:
# Checking head of data table
toronto_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,Downtown Toronto,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,M5A,Downtown Toronto,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,M5A,Downtown Toronto,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,Downtown Toronto,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,Downtown Toronto,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [74]:
# Checking most popular types of venues
toronto_venues['Venue Category'].value_counts().to_dict()


{'Coffee Shop': 154,
 'Café': 83,
 'Restaurant': 50,
 'Italian Restaurant': 40,
 'Hotel': 38,
 'Park': 34,
 'Bakery': 34,
 'Japanese Restaurant': 32,
 'Pizza Place': 28,
 'Gym': 26,
 'Bar': 26,
 'Sushi Restaurant': 25,
 'Sandwich Place': 23,
 'Clothing Store': 22,
 'Seafood Restaurant': 22,
 'American Restaurant': 21,
 'Pub': 19,
 'Thai Restaurant': 19,
 'Vegetarian / Vegan Restaurant': 18,
 'Gastropub': 18,
 'Beer Bar': 17,
 'Cocktail Bar': 16,
 'Diner': 16,
 'Breakfast Spot': 16,
 'Grocery Store': 15,
 'Bookstore': 15,
 'Fast Food Restaurant': 14,
 'Deli / Bodega': 14,
 'Yoga Studio': 13,
 'Steakhouse': 13,
 'Theater': 13,
 'Salad Place': 12,
 'Dessert Shop': 12,
 'Brewery': 12,
 'Art Gallery': 12,
 'Greek Restaurant': 12,
 'Bank': 12,
 'Asian Restaurant': 11,
 'Farmers Market': 11,
 'Sporting Goods Shop': 11,
 'Gym / Fitness Center': 11,
 'Lounge': 10,
 'Ice Cream Shop': 10,
 'French Restaurant': 10,
 'Burrito Place': 10,
 'Mexican Restaurant': 10,
 'Cosmetics Shop': 10,
 'Pharmacy'

We can then select the top 4 entries: Italian/Pizza, Japanese/Sushi, American/Diner and Thai/Asian. Let's organize this in a dataframe.

In [192]:
# Selecting just those categories
italian = toronto_venues[toronto_venues['Venue Category']=='Italian Restaurant']
pizza = toronto_venues[toronto_venues['Venue Category']=='Pizza Place']
japanese = toronto_venues[toronto_venues['Venue Category']=='Japanese Restaurant']
sushi = toronto_venues[toronto_venues['Venue Category']=='Sushi Restaurant']
american = toronto_venues[toronto_venues['Venue Category']=='American Restaurant']
diner = toronto_venues[toronto_venues['Venue Category']=='Diner']
thai = toronto_venues[toronto_venues['Venue Category']=='Thai Restaurant']
asian = toronto_venues[toronto_venues['Venue Category']=='Asian Restaurant']

In [193]:
# extracting counts 
italian = italian.groupby('Postal Code').count().iloc[:,-1].reset_index()
pizza = pizza.groupby('Postal Code').count().iloc[:,-1].reset_index()
japanese = japanese.groupby('Postal Code').count().iloc[:,-1].reset_index()
sushi = sushi.groupby('Postal Code').count().iloc[:,-1].reset_index()
american = american.groupby('Postal Code').count().iloc[:,-1].reset_index()
diner = diner.groupby('Postal Code').count().iloc[:,-1].reset_index()
thai = thai.groupby('Postal Code').count().iloc[:,-1].reset_index()
asian = asian.groupby('Postal Code').count().iloc[:,-1].reset_index()

In [194]:
# Joining tables
italian = italian.merge(pizza, how='inner', on='Postal Code')
italian['Italian'] = italian['Venue Category_x'] + italian['Venue Category_y']
italian

Unnamed: 0,Postal Code,Venue Category_x,Venue Category_y,Italian
0,M4K,3,1,4
1,M4L,1,2,3
2,M4S,2,3,5
3,M4X,2,2,4
4,M5B,3,2,5
5,M5J,3,2,5
6,M5K,3,1,4
7,M5L,4,2,6
8,M5W,3,1,4
9,M5X,1,2,3


In [195]:
#Joining the other tables
italian = italian.drop(['Venue Category_x','Venue Category_y'], axis=1)
japanese = japanese.merge(sushi, how='inner', on='Postal Code')
japanese['Japanese'] = japanese['Venue Category_x'] + japanese['Venue Category_y']
japanese = japanese.drop(['Venue Category_x','Venue Category_y'], axis=1)
american = american.merge(diner, how='inner', on='Postal Code')
american['American'] = american['Venue Category_x'] + american['Venue Category_y']
american = american.drop(['Venue Category_x','Venue Category_y'], axis=1)
thai = thai.merge(asian, how='inner', on='Postal Code')
thai['Thai'] = thai['Venue Category_x'] + thai['Venue Category_y']
thai = thai.drop(['Venue Category_x','Venue Category_y'], axis=1)

In [196]:
# Joining all cuisines
cuisines = italian.merge(japanese, left_on='Postal Code', right_on='Postal Code', how='outer').merge(thai, left_on='Postal Code', right_on='Postal Code', how='outer').merge(american, left_on='Postal Code', right_on='Postal Code', how='outer')

In [197]:
cuisines.head(10)

Unnamed: 0,Postal Code,Italian,Japanese,Thai,American
0,M4K,4.0,,,
1,M4L,3.0,,,
2,M4S,5.0,,,
3,M4X,4.0,,,
4,M5B,5.0,,,
5,M5J,5.0,2.0,,
6,M5K,4.0,5.0,,
7,M5L,6.0,,3.0,
8,M5W,4.0,4.0,,2.0
9,M5X,3.0,6.0,5.0,


In [198]:
# Transforming into integers, changing NaN to 0
cuisines.fillna(0, inplace=True)
cuisines['Italian'] = cuisines['Italian'].astype('int')
cuisines['Japanese'] = cuisines['Japanese'].astype('int')
cuisines['Thai'] = cuisines['Thai'].astype('int')
cuisines['American'] = cuisines['American'].astype('int')


In [199]:
# Adding borough name, latitude and longitude
cuisines = cuisines.merge(finaltable, left_on='Postal Code', right_on='Postal Code', how='left')
cuisines.head()

Unnamed: 0,Postal Code,Italian,Japanese,Thai,American,Borough,Neighbourhood,Latitude,Longitude
0,M4K,4,0,0,0,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
1,M4L,3,0,0,0,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
2,M4S,5,0,0,0,Central Toronto,Davisville,43.704324,-79.38879
3,M4X,4,0,0,0,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
4,M5B,5,0,0,0,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Now let's start the modeling!

In [203]:
X = cuisines.values[:,1:5]
from sklearn.preprocessing import StandardScaler
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset[0:5]



array([[ 0.71898859, -0.76696499, -0.47087096, -0.47380354],
       [ 0.25512498, -0.76696499, -0.47087096, -0.47380354],
       [ 1.1828522 , -0.76696499, -0.47087096, -0.47380354],
       [ 0.71898859, -0.76696499, -0.47087096, -0.47380354],
       [ 1.1828522 , -0.76696499, -0.47087096, -0.47380354]])

Let's use 3 clusters to group the boroughs

In [204]:
from sklearn.cluster import KMeans 
num_clusters = 3

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

print(labels)

[0 0 0 0 0 0 0 0 1 2 0 0 1 1 1 2 1 1 1 1]


The algorithm has grouped boroughs based on the number of restaurants of each type it has.
Let's visualize this on a map.

In [214]:
# import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

#find mean latitude and longitude
latitude = cuisines['Latitude'].mean()
longitude = cuisines['Longitude'].mean()

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(num_clusters)
ys = [i + x + (i*x)**2 for i in range(num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cuisines['Latitude'], cuisines['Longitude'], cuisines['Neighbourhood'], labels):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

       
map_clusters

Cluster 0 corresponds to the following neighborhoods, which tend to have more Italian restaurants:

In [216]:
cuisines['Label'] = labels
cuisines[cuisines['Label']==0]

Unnamed: 0,Postal Code,Italian,Japanese,Thai,American,Borough,Neighbourhood,Latitude,Longitude,Label
0,M4K,4,0,0,0,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0
1,M4L,3,0,0,0,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0
2,M4S,5,0,0,0,Central Toronto,Davisville,43.704324,-79.38879,0
3,M4X,4,0,0,0,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,0
4,M5B,5,0,0,0,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
5,M5J,5,2,0,0,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,0
6,M5K,4,5,0,0,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0
7,M5L,6,0,3,0,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,0
10,M6J,2,0,0,0,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,0
11,M6S,4,0,0,0,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,0


The restaurants in label 1 have more Japanese restaurants, in general.

In [217]:
cuisines[cuisines['Label']==1]

Unnamed: 0,Postal Code,Italian,Japanese,Thai,American,Borough,Neighbourhood,Latitude,Longitude,Label
8,M5W,4,4,0,2,Downtown Toronto,Stn A PO Boxes,43.646435,-79.374846,1
12,M4Y,0,10,0,2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1
13,M5E,0,2,0,0,Downtown Toronto,Berczy Park,43.644771,-79.373306,1
14,M5G,0,2,0,0,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1
16,M5S,0,3,0,0,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049,1
17,M7A,0,3,0,0,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1
18,M5C,0,0,2,4,Downtown Toronto,St. James Town,43.651494,-79.375418,1
19,M4M,0,0,0,3,East Toronto,Studio District,43.659526,-79.340923,1


Restaurants in label 2 tend to have more Thai and Japanese restaurants and no American restaurants.

In [218]:
cuisines[cuisines['Label']==2]

Unnamed: 0,Postal Code,Italian,Japanese,Thai,American,Borough,Neighbourhood,Latitude,Longitude,Label
9,M5X,3,6,5,0,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,2
15,M5H,0,3,4,0,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,2
