# Final report: 
# Opening of new restaurants in Manhattan, NYC and Toronto: A comparative analysis

## Importing libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from bs4 import BeautifulSoup # used for scraping websites 

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 1. The Manhattan data set

### Download the New York dataset and save it into a panda data frame

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [3]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [4]:
#Extract the features of the new york data set
neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods_nyc = pd.DataFrame(columns=column_names)

#fill in the dataframe one row at a time
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods_nyc = neighborhoods_nyc.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

### Conisder only the Manhattan borough and check the data set

In [5]:
manhattan_data = neighborhoods_nyc[neighborhoods_nyc['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


### Get the geographical location of Manhattan

In [6]:
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location_manhattan = geolocator.geocode(address)
latitude_manhattan = location_manhattan.latitude
longitude_manhattan = location_manhattan.longitude

## 2. The Toronto data set

### Scraping the table of the Toronto postal codes

In [7]:
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))
neighborhoods_toronto = df[0]

### Dropping non assigned boroughs in the Toronto data set

In [8]:
# Rename Postcode to PostalCode
neighborhoods_toronto.rename(columns={'Postcode': 'PostalCode'}, inplace=True)
# Drop "not assigned" in borough
toronto_drop = neighborhoods_toronto[neighborhoods_toronto['Borough'] != 'Not assigned'].reset_index(drop=True)

### Group data by postal code

In [9]:
toronto_data = toronto_drop.groupby(['PostalCode','Borough'], as_index=False)['Neighbourhood'].apply(','.join).reset_index()
toronto_data.rename(columns={0: 'Neighbourhood'}, inplace=True)

### Replacing non assigned neighborhoods

In [10]:
toronto_data.loc[toronto_data['Neighbourhood']=='Not assigned', 'Neighbourhood'] = toronto_data.loc[toronto_data['Neighbourhood']=='Not assigned', 'Borough']
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Reading in the latitude and longitude data from the csv file (geocoder is too buggy!)

In [11]:
long_lat = pd.read_csv('https://cocl.us/Geospatial_data')

In [12]:
# Save the Toronto coordinate data
toronto_coord = pd.concat([toronto_data, long_lat[['Latitude','Longitude']]], axis=1)

### Restriction to boroughs with the word "Toronto"¶

In [13]:
toronto_borough = toronto_coord[toronto_coord['Borough'].str.contains('Toronto')].reset_index(drop=True)

### Get the geological coordinates of Toronto

In [14]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location_toronto = geolocator.geocode(address)
latitude_toronto = location_toronto.latitude
longitude_toronto = location_toronto.longitude

## 3. Segmentation and Clustering: Manhattan

### Foursquare credentials

In [15]:
# The code was removed by Watson Studio for sharing.

In [16]:
VERSION = '20180605' # Foursquare API version

### Define a function to get nearby venues

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Find the nearby venues

In [18]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


### Group the data via one-hot encoding

In [19]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()

### Function to find the most common venues

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Find the top 10 venues

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Park,Coffee Shop,Hotel,Wine Shop,Clothing Store,Gym,Women's Store,Memorial Site,Boat or Ferry,Pizza Place
1,Carnegie Hill,Coffee Shop,Pizza Place,Cosmetics Shop,Yoga Studio,Bakery,Gym,Bookstore,Café,Japanese Restaurant,Wine Shop
2,Central Harlem,African Restaurant,Bar,American Restaurant,Seafood Restaurant,French Restaurant,Chinese Restaurant,Tapas Restaurant,Spa,Cosmetics Shop,Beer Bar
3,Chelsea,Coffee Shop,Bakery,Italian Restaurant,Ice Cream Shop,Wine Shop,Theater,American Restaurant,Hotel,Nightclub,Bookstore
4,Chinatown,Chinese Restaurant,American Restaurant,Cocktail Bar,Salon / Barbershop,Spa,Optical Shop,Bakery,Vietnamese Restaurant,Hotpot Restaurant,Asian Restaurant


### Select only venues with 1st most common venue containing the word "Resturant"

In [22]:
#restaurants_manhattan = neighborhoods_venues_sorted[neighborhoods_venues_sorted['1st Most Common Venue'].str.contains('Restaurant')].reset_index(drop=True)
#restaurants_manhattan

### Cluster the most common venues

In [23]:
# set number of clusters
kclusters = 5

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Manhattan,Marble Hill,40.876551,-73.91066,3,Coffee Shop,Sandwich Place,American Restaurant,Kids Store,Bank,Supplement Shop,Gym,Miscellaneous Shop,Donut Shop,Shopping Mall
1,Manhattan,Chinatown,40.715618,-73.994279,1,Chinese Restaurant,American Restaurant,Cocktail Bar,Salon / Barbershop,Spa,Optical Shop,Bakery,Vietnamese Restaurant,Hotpot Restaurant,Asian Restaurant
2,Manhattan,Washington Heights,40.851903,-73.9369,4,Café,Bakery,Grocery Store,Mobile Phone Shop,Spanish Restaurant,Deli / Bodega,Chinese Restaurant,Mexican Restaurant,Tapas Restaurant,Coffee Shop
3,Manhattan,Inwood,40.867684,-73.92121,4,Mexican Restaurant,Lounge,Café,Pizza Place,Restaurant,Bakery,Park,Chinese Restaurant,American Restaurant,Frozen Yogurt Shop
4,Manhattan,Hamilton Heights,40.823604,-73.949688,4,Pizza Place,Café,Park,Coffee Shop,Mexican Restaurant,Yoga Studio,Bakery,Sandwich Place,Cocktail Bar,School


### Plot the map of the venues

In [24]:
# create map
map_clusters = folium.Map(location=[latitude_manhattan, longitude_manhattan], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster analysis

#### Cluster 1

In [25]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Harlem,African Restaurant,Bar,American Restaurant,Seafood Restaurant,French Restaurant,Chinese Restaurant,Tapas Restaurant,Spa,Cosmetics Shop,Beer Bar
8,Upper East Side,Italian Restaurant,Exhibit,Coffee Shop,Bakery,Art Gallery,Juice Bar,Gym / Fitness Center,French Restaurant,American Restaurant,Hotel
13,Lincoln Square,Theater,Café,Plaza,Concert Hall,Italian Restaurant,Performing Arts Venue,French Restaurant,Gym / Fitness Center,American Restaurant,Indie Movie Theater
14,Clinton,Theater,Italian Restaurant,Gym / Fitness Center,Coffee Shop,American Restaurant,Hotel,Wine Shop,Spa,Sandwich Place,Gym
18,Greenwich Village,Italian Restaurant,Clothing Store,Sushi Restaurant,Café,Indian Restaurant,Seafood Restaurant,French Restaurant,Dessert Shop,Gourmet Shop,Sandwich Place
21,Tribeca,American Restaurant,Park,Italian Restaurant,Spa,Café,Wine Shop,Wine Bar,Greek Restaurant,Men's Store,Coffee Shop
24,West Village,Italian Restaurant,New American Restaurant,Cosmetics Shop,Park,Cocktail Bar,Wine Bar,American Restaurant,Coffee Shop,Theater,Bakery
27,Gramercy,Bar,Italian Restaurant,Mexican Restaurant,Pizza Place,Grocery Store,Thai Restaurant,Playground,Bagel Shop,Diner,Comedy Club
31,Noho,Italian Restaurant,Cocktail Bar,French Restaurant,Mexican Restaurant,Gift Shop,Bookstore,Rock Club,Coffee Shop,Pizza Place,Grocery Store
32,Civic Center,Italian Restaurant,Gym / Fitness Center,Coffee Shop,Sandwich Place,French Restaurant,Hotel,Yoga Studio,Cocktail Bar,Spa,Park


#### Cluster 2

In [26]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Chinatown,Chinese Restaurant,American Restaurant,Cocktail Bar,Salon / Barbershop,Spa,Optical Shop,Bakery,Vietnamese Restaurant,Hotpot Restaurant,Asian Restaurant
10,Lenox Hill,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Cocktail Bar,Café,Burger Joint,Sporting Goods Shop,Gym,Gym / Fitness Center
12,Upper West Side,Italian Restaurant,Coffee Shop,Wine Bar,Bar,Bakery,Mediterranean Restaurant,Café,Gym / Fitness Center,Ice Cream Shop,Indian Restaurant
15,Midtown,Hotel,Food Truck,Coffee Shop,Theater,Clothing Store,Sporting Goods Shop,Bakery,Café,Japanese Restaurant,Bookstore
16,Murray Hill,Coffee Shop,Sandwich Place,American Restaurant,Japanese Restaurant,Hotel,Gym / Fitness Center,Italian Restaurant,Chinese Restaurant,Sushi Restaurant,Bar
17,Chelsea,Coffee Shop,Bakery,Italian Restaurant,Ice Cream Shop,Wine Shop,Theater,American Restaurant,Hotel,Nightclub,Bookstore
22,Little Italy,Italian Restaurant,Café,Bubble Tea Shop,Bakery,Mediterranean Restaurant,Sandwich Place,Pizza Place,Cocktail Bar,Clothing Store,Salon / Barbershop
23,Soho,Clothing Store,Boutique,Women's Store,Art Gallery,Shoe Store,Italian Restaurant,Sporting Goods Shop,Bakery,Mediterranean Restaurant,Men's Store
30,Carnegie Hill,Coffee Shop,Pizza Place,Cosmetics Shop,Yoga Studio,Bakery,Gym,Bookstore,Café,Japanese Restaurant,Wine Shop
33,Midtown South,Korean Restaurant,Hotel,Hotel Bar,Japanese Restaurant,Coffee Shop,Dessert Shop,American Restaurant,Gym / Fitness Center,Cocktail Bar,Scenic Lookout


#### Cluster 3

In [27]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Stuyvesant Town,Bar,Park,Boat or Ferry,Cocktail Bar,Baseball Field,Fountain,Harbor / Marina,Coffee Shop,Heliport,Farmers Market


#### Cluster 4

In [28]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,Coffee Shop,Sandwich Place,American Restaurant,Kids Store,Bank,Supplement Shop,Gym,Miscellaneous Shop,Donut Shop,Shopping Mall
11,Roosevelt Island,Deli / Bodega,Park,Sandwich Place,Coffee Shop,Bridge,Farmers Market,Greek Restaurant,Metro Station,Supermarket,Bubble Tea Shop
26,Morningside Heights,Park,Bookstore,Coffee Shop,American Restaurant,Food Truck,Sandwich Place,Burger Joint,Deli / Bodega,Pizza Place,Ice Cream Shop
28,Battery Park City,Park,Coffee Shop,Hotel,Wine Shop,Clothing Store,Gym,Women's Store,Memorial Site,Boat or Ferry,Pizza Place
29,Financial District,Coffee Shop,American Restaurant,Pizza Place,Hotel,Gym,Italian Restaurant,Wine Shop,Food Truck,Steakhouse,Gym / Fitness Center


#### Cluster 5

In [29]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Washington Heights,Café,Bakery,Grocery Store,Mobile Phone Shop,Spanish Restaurant,Deli / Bodega,Chinese Restaurant,Mexican Restaurant,Tapas Restaurant,Coffee Shop
3,Inwood,Mexican Restaurant,Lounge,Café,Pizza Place,Restaurant,Bakery,Park,Chinese Restaurant,American Restaurant,Frozen Yogurt Shop
4,Hamilton Heights,Pizza Place,Café,Park,Coffee Shop,Mexican Restaurant,Yoga Studio,Bakery,Sandwich Place,Cocktail Bar,School
5,Manhattanville,Coffee Shop,Deli / Bodega,Italian Restaurant,Park,Seafood Restaurant,Mexican Restaurant,Japanese Curry Restaurant,Café,Bike Trail,Sushi Restaurant
7,East Harlem,Mexican Restaurant,Thai Restaurant,Bakery,Latin American Restaurant,Deli / Bodega,Cuban Restaurant,Pizza Place,Beer Bar,Taco Place,Gas Station
9,Yorkville,Italian Restaurant,Coffee Shop,Gym,Bar,Sushi Restaurant,Deli / Bodega,Bakery,Wine Shop,Diner,Japanese Restaurant
19,East Village,Bar,Wine Bar,Ice Cream Shop,Chinese Restaurant,Mexican Restaurant,Cocktail Bar,Pizza Place,Vegetarian / Vegan Restaurant,Italian Restaurant,Coffee Shop
20,Lower East Side,Art Gallery,Coffee Shop,Pizza Place,Cocktail Bar,Bakery,Café,Ramen Restaurant,Park,Japanese Restaurant,Shoe Store
25,Manhattan Valley,Coffee Shop,Bar,Yoga Studio,Mexican Restaurant,Café,Thai Restaurant,Pizza Place,Indian Restaurant,Playground,Furniture / Home Store
36,Tudor City,Café,Park,Mexican Restaurant,Pizza Place,Deli / Bodega,Coffee Shop,Diner,Greek Restaurant,Garden,Asian Restaurant


## 4. Segmentation and Clustering: Toronto¶

### Get the nearby venues for our set of Toronto neighborhoods

In [30]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
toronto_venues = getNearbyVenues(names=toronto_borough['Neighbourhood'],
                                   latitudes=toronto_borough['Latitude'],
                                   longitudes=toronto_borough['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvalles
Runnymede

### Use One Hot encoding for the venues and group them

In [34]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

### Store the top 10 venues  into a panda data frame

In [52]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Hotel,Restaurant,Thai Restaurant,Asian Restaurant,Cosmetics Shop,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Steakhouse,Farmers Market,Seafood Restaurant,Beer Bar,Café,Belgian Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Bakery,Stadium,Bar,Restaurant,Intersection,Italian Restaurant,Burrito Place
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Park,Comic Shop,Pizza Place,Restaurant,Burrito Place,Brewery,Skate Park
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Bar,Airport,Airport Food Court,Airport Gate,Boutique,Rental Car Location,Harbor / Marina


### Cluster the most common venues

In [53]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto_borough

#merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged[['Cluster Labels']] = toronto_merged[['Cluster Labels']].fillna(0.0).astype(int)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Health Food Store,Pub,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Dessert Shop,Bookstore,Brewery,Bubble Tea Shop,Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Sandwich Place,Brewery,Burrito Place,Pub,Fish & Chips Shop,Sushi Restaurant,Fast Food Restaurant,Ice Cream Shop,Food & Drink Shop,Liquor Store
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Italian Restaurant,Brewery,American Restaurant,Bakery,Diner,Latin American Restaurant,Seafood Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Swim School,Bus Line,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


### Visulaization of the clusters

In [56]:
# create map
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster analysis

#### Cluster 1

In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Trail,Health Food Store,Pub,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
1,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Dessert Shop,Bookstore,Brewery,Bubble Tea Shop,Restaurant
2,East Toronto,0,Sandwich Place,Brewery,Burrito Place,Pub,Fish & Chips Shop,Sushi Restaurant,Fast Food Restaurant,Ice Cream Shop,Food & Drink Shop,Liquor Store
3,East Toronto,0,Café,Coffee Shop,Gastropub,Italian Restaurant,Brewery,American Restaurant,Bakery,Diner,Latin American Restaurant,Seafood Restaurant
5,Central Toronto,0,Park,Gym,Food & Drink Shop,Sandwich Place,Department Store,Convenience Store,Breakfast Spot,Hotel,Donut Shop,Doner Restaurant
6,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Bagel Shop,Gym / Fitness Center,Fast Food Restaurant,Diner,Dessert Shop,Mexican Restaurant,Park
7,Central Toronto,0,Dessert Shop,Pizza Place,Sandwich Place,Gym,Italian Restaurant,Sushi Restaurant,Café,Coffee Shop,Discount Store,Diner
9,Central Toronto,0,Coffee Shop,Pub,American Restaurant,Liquor Store,Sushi Restaurant,Sports Bar,Restaurant,Pizza Place,Fried Chicken Joint,Supermarket
11,Downtown Toronto,0,Coffee Shop,Bakery,Park,Pizza Place,Café,Restaurant,Pub,Market,Italian Restaurant,Indian Restaurant
12,Downtown Toronto,0,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Gay Bar,Gastropub,Mediterranean Restaurant,Men's Store,Café,Burger Joint


#### Cluster 2

In [58]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,1,Park,Gym,Playground,Trail,Restaurant,Doner Restaurant,Dog Run,Donut Shop,Cupcake Shop,Dumpling Restaurant
10,Downtown Toronto,1,Park,Playground,Trail,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store


#### Cluster 3

In [59]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,2,Garden,Women's Store,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


#### Cluster 4

In [60]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,3,Jewelry Store,Trail,Sushi Restaurant,Bus Line,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


#### Cluster 5

In [61]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,4,Park,Swim School,Bus Line,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
