# Toronto Segmentation and Clustering Project

In [2]:
# Imports
import pandas as pd
import numpy as np
import io
import requests
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
import json
from pandas.io.json import json_normalize

In [3]:
#!pip3 install folium

In [4]:
#!pip3 install beautifulsoup4 requests

In [5]:
#import requests
from bs4 import BeautifulSoup

# Spoof our headers to make sure we don't get blocked
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [6]:
# Retrieve the wiki page as beautiful soup object
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url, headers)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"aa028fba-7540-451b-a64e-24f8c0feb477","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1019189119,"wgRevisionId":1019189119,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Wikipedia

In [7]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

In [8]:
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [9]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [10]:
df.shape

(103, 3)

In [11]:
# Add latitude and longitude to the dataframe
url="https://cocl.us/Geospatial_data"
content_string=requests.get(url).content
content_table=pd.read_csv(io.StringIO(content_string.decode('utf-8')))

df_joined = df.join(content_table.set_index('Postal Code'), on='PostalCode')
df_joined.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [12]:
# Get the coordinates of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Toronto latitude and longitude: {}, {}.'.format(latitude, longitude))

Toronto latitude and longitude: 43.6534817, -79.3839347.


In [13]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# Markers
for lat, lng, borough, neighborhood in zip(df_joined['Latitude'], df_joined['Longitude'], df_joined['Borough'], df_joined['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='gray',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

In [14]:
toronto_map

In [15]:

CLIENT_ID = 'O0HJUMSBLMA4FOPTXPXA0IO0P4DEWM2XMKD0IJ5Q0ZJRPUV3' # your Foursquare 
CLIENT_SECRET = 'O0L2WTWXSGY10DARU2CBPI450RB00LARJRIZ5FZ2KF01GQYW' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
radius = 500


url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url

Your credentails:
CLIENT_ID: O0HJUMSBLMA4FOPTXPXA0IO0P4DEWM2XMKD0IJ5Q0ZJRPUV3
CLIENT_SECRET:O0L2WTWXSGY10DARU2CBPI450RB00LARJRIZ5FZ2KF01GQYW


'https://api.foursquare.com/v2/venues/search?client_id=O0HJUMSBLMA4FOPTXPXA0IO0P4DEWM2XMKD0IJ5Q0ZJRPUV3&client_secret=O0L2WTWXSGY10DARU2CBPI450RB00LARJRIZ5FZ2KF01GQYW&ll=43.6534817,-79.3839347&v=20180604&radius=500&limit=100'

In [16]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [17]:
# Results from the GET
results = requests.get(url).json()
results

# Pull out the venues
venues = results['response']['venues']

# dataframe
venues_dataframe = json_normalize(venues)
venues_dataframe.head()



Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.lat,location.lng,location.labeledLatLngs,location.distance,location.postalCode,location.cc,location.city,location.state,location.country,location.formattedAddress,location.crossStreet
0,5b193c42598e64002ca79b96,City of Toronto Civic Innovation Office,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",v-1623699995,False,100 Queen St W,43.653454,-79.383952,"[{'label': 'display', 'lat': 43.653454, 'lng':...",3,M5H 2N2,CA,Toronto,ON,Canada,"[100 Queen St W, Toronto ON M5H 2N2, Canada]",
1,4ad4c05ef964a5208ff620e3,Toronto City Hall,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",v-1623699995,False,100 Queen St. W.,43.65314,-79.383967,"[{'label': 'display', 'lat': 43.65313989695342...",38,M5H 2N2,CA,Toronto,ON,Canada,"[100 Queen St. W. (at Bay St.), Toronto ON M5H...",at Bay St.
2,4c41d47cd691c9b6fa4c8d0a,Kew Gardens Playground,"[{'id': '4bf58dd8d48988d1e7941735', 'name': 'P...",v-1623699995,False,Queen street and belfair ave,43.653225,-79.383185,"[{'label': 'display', 'lat': 43.65322456230742...",66,,CA,Toronto,ON,Canada,"[Queen street and belfair ave, Toronto ON, Can...",
3,4c0121fd9a950f47fa9208c6,City Hall Podium Green Roof,"[{'id': '4bf58dd8d48988d15a941735', 'name': 'G...",v-1623699995,False,100 Queen Street West,43.653504,-79.383866,"[{'label': 'display', 'lat': 43.65350358617817...",6,,CA,Toronto,ON,Canada,"[100 Queen Street West, Toronto ON, Canada]",
4,4c093ee0340720a153728493,City Hall Council Chambers,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",v-1623699995,False,,43.651827,-79.383949,"[{'label': 'display', 'lat': 43.65182710471462...",184,,CA,,,Canada,[Canada],


In [18]:
# Category extraction function
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [19]:
# Filter the categories and clean up the venue names
filtered_columns = ['name', 'categories'] + [col for col in venues_dataframe.columns if col.startswith('location.')] + ['id']
venues_df_filtered = venues_dataframe.loc[:, filtered_columns]
venues_df_filtered['categories'] = venues_df_filtered.apply(get_category_type, axis=1)
venues_df_filtered.columns = [column.split('.')[-1] for column in venues_df_filtered.columns]
venues_df_filtered.head()

Unnamed: 0,name,categories,address,lat,lng,labeledLatLngs,distance,postalCode,cc,city,state,country,formattedAddress,crossStreet,id
0,City of Toronto Civic Innovation Office,City Hall,100 Queen St W,43.653454,-79.383952,"[{'label': 'display', 'lat': 43.653454, 'lng':...",3,M5H 2N2,CA,Toronto,ON,Canada,"[100 Queen St W, Toronto ON M5H 2N2, Canada]",,5b193c42598e64002ca79b96
1,Toronto City Hall,City Hall,100 Queen St. W.,43.65314,-79.383967,"[{'label': 'display', 'lat': 43.65313989695342...",38,M5H 2N2,CA,Toronto,ON,Canada,"[100 Queen St. W. (at Bay St.), Toronto ON M5H...",at Bay St.,4ad4c05ef964a5208ff620e3
2,Kew Gardens Playground,Playground,Queen street and belfair ave,43.653225,-79.383185,"[{'label': 'display', 'lat': 43.65322456230742...",66,,CA,Toronto,ON,Canada,"[Queen street and belfair ave, Toronto ON, Can...",,4c41d47cd691c9b6fa4c8d0a
3,City Hall Podium Green Roof,Garden,100 Queen Street West,43.653504,-79.383866,"[{'label': 'display', 'lat': 43.65350358617817...",6,,CA,Toronto,ON,Canada,"[100 Queen Street West, Toronto ON, Canada]",,4c0121fd9a950f47fa9208c6
4,City Hall Council Chambers,City Hall,,43.651827,-79.383949,"[{'label': 'display', 'lat': 43.65182710471462...",184,,CA,,,Canada,[Canada],,4c093ee0340720a153728493


In [20]:
venues_df_filtered.shape

(65, 15)

In [21]:
# Nearby venues function
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
names=df_joined['Neighborhood']

In [23]:
# The neighborhood that have venues in Toronto
toronto_venues = getNearbyVenues(names=names,
                                   latitudes=df_joined['Latitude'],
                                   longitudes=df_joined['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [24]:
print('Number of rows returned{}',format(toronto_venues.shape[0]))

Number of rows returned{} 2007


In [25]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [26]:
# Venue count per neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",5,5,5,5,5,5
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
...,...,...,...,...,...,...
Willowdale West,6,6,6,6,6,6
"Willowdale, Newtonbrook",1,1,1,1,1,1
Woburn,4,4,4,4,4,4
Woodbine Heights,5,5,5,5,5,5


In [27]:

print('Number of unique categories is {}.'.format(len(toronto_venues['Venue Category'].unique())))

Number of unique categories is 256.


### Onehot Encoding

From Wikipedia:

"In digital circuits and machine learning, a one-hot is a group of bits among which the legal combinations of values are only those with a single high (1) bit and all the others low (0).[1] A similar implementation in which all bits are '1' except one '0' is sometimes called one-cold.[2] In statistics, dummy variables represent a similar technique for representing categorical data."

In [28]:
# Onehot encoding
tr_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
tr_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [tr_onehot.columns[-1]] + list(tr_onehot.columns[:-1])
final_onehot = tr_onehot[fixed_columns]

In [29]:
final_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Check to see all the data was encoding and there is no data loss
final_onehot.shape

(2007, 256)

In [31]:
# Group by neighborhood
tr_df_grouped = final_onehot.groupby('Neighborhood').mean().reset_index()
tr_df_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
num_top_venues = 3

for hood in tr_df_grouped['Neighborhood']:
    print("------- "+hood+" -------")
    temp = tr_df_grouped[tr_df_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

------- Agincourt -------
                       venue  freq
0  Latin American Restaurant  0.25
1                     Lounge  0.25
2             Clothing Store  0.25


------- Alderwood, Long Branch -------
            venue  freq
0     Pizza Place   0.4
1  Sandwich Place   0.2
2     Coffee Shop   0.2


------- Bathurst Manor, Wilson Heights, Downsview North -------
               venue  freq
0               Bank  0.10
1        Coffee Shop  0.10
2  Convenience Store  0.05


------- Bayview Village -------
                venue  freq
0                Café  0.25
1                Bank  0.25
2  Chinese Restaurant  0.25


------- Bedford Park, Lawrence Manor East -------
            venue  freq
0  Sandwich Place  0.08
1      Restaurant  0.08
2     Coffee Shop  0.08


------- Berczy Park -------
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.09
2        Bakery  0.07


------- Birch Cliff, Cliffside West -------
             venue  freq
0  College Stadium   0.2
1     Skating R

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Most Common Venues

In [34]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

In [35]:
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tr_df_grouped['Neighborhood']

for ind in np.arange(tr_df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tr_df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Breakfast Spot,Latin American Restaurant,Clothing Store,Diner
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pub,Sandwich Place,Distribution Center
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Bridal Shop,Ice Cream Shop
3,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store
4,"Bedford Park, Lawrence Manor East",Pizza Place,Sandwich Place,Restaurant,Coffee Shop,Thai Restaurant


### Clustering

In [54]:
# sRun Kmeans clustering and row labels
kclusters_num = 2
tr_df_grouped_clustering = tr_df_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters_num, random_state=0).fit(tr_df_grouped_clustering)
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int32)

In [55]:
# Clustering Labels - only once
#neighborhoods_venues_sorted.insert(0, 'ClusterLabels', kmeans.labels_)
neighborhoods_venues_sorted.head()

Unnamed: 0,ClusterLabels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,0,Agincourt,Lounge,Breakfast Spot,Latin American Restaurant,Clothing Store,Diner
1,0,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pub,Sandwich Place,Distribution Center
2,0,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Bridal Shop,Ice Cream Shop
3,0,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store
4,0,"Bedford Park, Lawrence Manor East",Pizza Place,Sandwich Place,Restaurant,Coffee Shop,Thai Restaurant


In [57]:
df_joined

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [61]:
# Let's merge the two datasets
tr_df_merged = df_joined
tr_df_merged = tr_df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood') 
tr_df_merged = tr_df_merged.dropna() # remove NAs
tr_df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Park,Fast Food Restaurant,Food & Drink Shop,Department Store,Escape Room
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Intersection,Hockey Arena,Coffee Shop,Financial or Legal Service,Portuguese Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Park,Pub,Bakery,Café
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Clothing Store,Coffee Shop,Vietnamese Restaurant,Gift Shop,Sporting Goods Shop
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,0.0,Coffee Shop,Sushi Restaurant,Burrito Place,Yoga Studio,Theater


In [65]:
tr_df_merged.loc[tr_df_merged['ClusterLabels'] ==1, tr_df_merged.columns[[1] + list(range(5, tr_df_merged.shape[1]))]]

Unnamed: 0,Borough,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1.0,Fast Food Restaurant,Women's Store,Falafel Restaurant,Ethiopian Restaurant,Escape Room


In [63]:
# Mapping with Folium
tr_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters_num)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lon, poi, cluster in zip(tr_df_merged['Latitude'], tr_df_merged['Longitude'], tr_df_merged['Neighborhood'], tr_df_merged['ClusterLabels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6).add_to(tr_map_clusters)
       
tr_map_clusters

Unnamed: 0,Borough,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1.0,Fast Food Restaurant,Women's Store,Falafel Restaurant,Ethiopian Restaurant,Escape Room
