<a href="https://www.bigdatauniversity.com"><img src="https://ibm.box.com/shared/static/cw2c7r3o20w9zn8gkecaeyjhgw3xdgbj.png" width="400" align="center"></a>




<h1 align="center"><font size="5"> Data Science Professional Certificate - Capstone: Week 3
 

Clustering Neighborhoods  - Toronto \
Part - 3 </font></h1>

Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you. 

Just make sure:

1. to add enough Markdown cells to explain what you decided to do and to report any observations you make. 
2. to generate maps to visualize your neighborhoods and how they cluster together. 

In [1]:
# Install and Import Libraries
# General and Analysis
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# For Geographic data and visualization
!pip install geopy
!pip install folium
print('Libraries installed!')

from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium

# For working with json using Foursquare API
from pandas.io.json import json_normalize
print('Libraries imported!')

Libraries installed!
Libraries imported!


In [2]:
# Import csv with borough and coordinates
df = pd.read_csv('toronto_postcode_coordinates.csv')
print(df.shape)

(103, 5)


In [None]:
# Let's see the first few rows of the dataframe
df.head()

In [None]:
# Let's see how many borough are there
print('The dataframe has {} boroughs'.format(len(df['Borough'].unique())))

**Let's create a map of Toronto with neighborhoods**

In [3]:
# Get coordinates of Toronto
address = 'Toronto , Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}'.format(address, latitude, longitude))

The geograpical coordinate of Toronto , Ontario are 43.6534817, -79.3839347


In [4]:
# Create map
toronto_map = folium.Map(location=[latitude, longitude], zoom_start = 10)

# Adding location markers
for lat, lng, boro, nbr in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(nbr, boro)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'orange',
        fill = True,
        fill_color = '2196cc',
        fill_opacity = 0.6,
        parse_html = False).add_to(toronto_map)

toronto_map    


**Let's select only the York neighborhoods**

In [5]:
df_york = df[df['Borough'].str.contains('York')].reset_index(drop = True)
df_york.head()
df_york.shape

(34, 5)

In [6]:
# Getting coordinates for York, Toronto

address = 'York, Toronto'

geolocator = Nominatim(user_agent = "toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude =location.longitude

print('The coordinate of {}; latitude: {}, and longitude: {}.'.format(address,
                                                                      latitude,
                                                                    longitude))


The coordinate of York, Toronto; latitude: 43.6896191, and longitude: -79.479188.


In [7]:
# Recreate the map of toronto with labels only of York Neighborhoods

#create map
map_york = folium.Map(location =[latitude, longitude], zoom_start = 10)

#add markers
for lat, lng, boro, nbr in zip(df_york['Latitude'], df_york['Longitude'], df_york['Borough'], df_york['Neighborhood']):
    label = '{}, {}'.format(nbr, boro)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'orange',
        fill = True,
        fill_color = '2196cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_york)
    
#display york map
map_york


**Set Up Foursquare credentials**

In [56]:
CLIENT_ID = 'FGW2MIVOabcLANNMCALTJ5DAJZHHSC'   #foursquare api credentials
CLIENT_SECRET = 'VGPC25WKX1SQJ51OVWSEKXxyzBWFPKRTVCF33G'
VERSION = '20210501' #use required date

print('Your Client ID: ', CLIENT_ID)
print('Your Client Secret: ', CLIENT_SECRET)

Your Client ID:  FGW2MIVOabcLANNMCALTJ5DAJZHHSC
Your Client Secret:  VGPC25WKX1SQJ51OVWSEKXxyzBWFKK5M4TBKPKRTVCF33G


**Exploring a neighborhood**

In [9]:
# Let's see the York dataset again
df_york.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills North,43.745906,-79.352188
4,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937


In [10]:
# Exploring Parkwoods neighborhood
nbr = df_york.loc[0, 'Neighborhood']
nbr_lat = df_york.loc[0,'Latitude']
nbr_lng = df_york.loc[0, 'Longitude']

print('Neighborhood: {}, Latitude: {}, Longitude: {}'.format(nbr,
                                                            nbr_lat,
                                                            nbr_lng))



Neighborhood: Parkwoods, Latitude: 43.7532586, Longitude: -79.3296565


In [11]:
# Explore top 50 venues in 500 meters radius of Parkwoods

LIMIT = 50
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,
CLIENT_SECRET,
VERSION,
nbr_lat,
nbr_lng,
radius,
LIMIT)

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '6097bba7c6095967960046c1'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [12]:
# Creating a function to get Name, Location, Categories of venues around Parkwoods

def getVenues (names, latitudes,longitudes, radius = 500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # 1 - Make API request - obtain json
        # 1.1 - Create URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # 1.2 - Send GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # 2 - Filter only essential information from json
        venues_list.append([(
            name,  #Neighborhood name
            lat,   #Neighborhood latitude
            lng,   #Neighborhood longitude
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    # 3 - create dataframe and rename columns
    venues_nearby = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venues_nearby.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(venues_nearby)      
    

**Getting list of venues around ***York*****

In [13]:
york_venues = getVenues(names = df_york['Neighborhood'],
                       latitudes = df_york['Latitude'],
                       longitudes = df_york['Longitude'])


Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills North
Parkview Hill, Woodbine Gardens
Glencairn
Don Mills South
Woodbine Heights
Humewood-Cedarvale
Caledonia-Fairbanks
Leaside
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Bayview Village
Downsview East
York Mills, Silver Hills
Downsview West
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview Central
Bedford Park, Lawrence Manor East
Del Ray, Mount Dennis, Keelsdale and Silverthorn
Humberlea, Emery
Willowdale South
Downsview Northwest
Runnymede, The Junction North
Weston
York Mills West
Willowdale West


In [14]:
# Let's see the outline of the york_venues
print(york_venues.shape)
york_venues.head()

(317, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [15]:
# Group 345 venues by the neighborhood they are in
york_venues.groupby('Neighborhood').count()
# Get unique categories of venues present in the df_york
print('Number of Unique Categories: {}'.format(len(york_venues['Venue Category'].unique())))

Number of Unique Categories: 113


### Analyzing Each Neighborhood##

**One Hot Encoding**

Create a new dataframe with encoded values for categories represented by each venue in every neighborhood in York

New df will have same # rows (=venues) x #of unique categories (i.e. 113)

In [16]:
# One hot encoding 
york1h = pd.get_dummies(york_venues[['Venue Category']],
                        prefix = '',
                        prefix_sep = '')
# york1h = only contain columns for venue categories
# Add neighborhood column back
york1h['Neighborhood'] = york_venues['Neighborhood']
# Move neighborhood column as first column
sorted_columns = [york1h.columns[-1]] + list(york1h.columns[:-1])
york1h = york1h[sorted_columns]

#visualize
print(york1h.shape)
york1h.head()

(317, 114)


Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Group Neighborhoods and take mean frequency of each category**

In [17]:
york_grp = york1h.groupby('Neighborhood').mean().reset_index()
print(york_grp.shape)
york_grp.head()

(32, 114)


Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.04,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


There are 32 neighborhoods with venues of different categories, and each has mean freqency for 113 categories in the dataframe - **york_grp**

### Obtain most common venue categories in each neighborhood###

In [18]:
# Let's see which is the 2 most common vnue in each neighborhood
no_top_venues = 2


for nbr in york_grp['Neighborhood']:
    print("------"+nbr+"------")
    temp = york_grp[york_grp['Neighborhood'] == nbr].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(no_top_venues))
    print('\n')

------Bathurst Manor, Wilson Heights, Downsview North------
         venue  freq
0  Coffee Shop   0.1
1         Bank   0.1


------Bayview Village------
                venue  freq
0  Chinese Restaurant  0.25
1                Café  0.25


------Bedford Park, Lawrence Manor East------
         venue  freq
0  Coffee Shop  0.08
1   Restaurant  0.08


------Caledonia-Fairbanks------
           venue  freq
0           Park  0.50
1  Women's Store  0.25


------Del Ray, Mount Dennis, Keelsdale and Silverthorn------
               venue  freq
0  Convenience Store  0.25
1        Coffee Shop  0.25


------Don Mills North------
                venue  freq
0                 Gym  0.17
1  Athletics & Sports  0.17


------Don Mills South------
            venue  freq
0      Restaurant   0.1
1  Clothing Store   0.1


------Downsview Central------
          venue  freq
0  Home Service  0.33
1    Food Truck  0.33


------Downsview East------
     venue  freq
0  Airport   0.5
1     Park   0.5


------Dow

In [19]:
# Create a function to sort the categories by mean frequency

def getCommonVenues(row, no_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:no_top_venues]


In [36]:
# Create dataframe of top venues for each neighborhood

no_top_venues = 10
suffix = ['st', 'nd', 'rd']

# Create columns corresponding to # of top venues entered
columns = ['Neighborhood'] # start list with 1st column = Neighborhood name

for counter in np.arange(no_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(counter+1, suffix[counter]))
    except:
        columns.append('{}th Most Common Venue'.format(counter+1))
        
# Get data for the dataframe
top_venues = pd.DataFrame(columns = columns)
top_venues['Neighborhood'] = york_grp['Neighborhood']

# Iterate through rows of york_grp and get most common venue for each neighborhood

for i in np.arange(york_grp.shape[0]):
    top_venues.iloc[i, 1:] = getCommonVenues(york_grp.iloc[i, :], no_top_venues)

top_venues.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Middle Eastern Restaurant,Mobile Phone Shop,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Butcher,Grocery Store,Liquor Store,Café,Juice Bar,Pharmacy
3,Caledonia-Fairbanks,Park,Women's Store,Pool,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Coffee Shop,Convenience Store,Sandwich Place,Discount Store,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega


### Clustering Neighborhoods###

We now create clusters of neighborhoods - we'll create and visualize clusters.

We try with 3, 5 and 7 clusters to see what comes back as interesting.


In [37]:
# Let's re-visit the clustering dataset
print(york_grp.shape)
york_grp.head(3)

(32, 114)


Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.04,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0


The data has mean frequency for each of 113 catgories of venues, goruped based on the neighborhood where they exist.

We can drop the Neighborhood column and only analyze the cluster of different venues by mean frequency of occurence.

In [38]:
# Set cluster
kclusters = 3

# Prepare data
york_clusters = york_grp.drop('Neighborhood', 1)

# Run k-means
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(york_clusters)

# Check the labels for first 10 rows
print(kmeans.labels_[0:10])
print(kmeans.labels_.shape)  #Shows # of rows in york_grp = 

[0 0 0 1 0 0 0 2 1 0]
(32,)


**Create dataframe with top *5* venues + cluster label of each neighborhood**


In [39]:
# We have created a dataframe of top 5 venues in each neighbourhood above
top_venues.head(3)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Middle Eastern Restaurant,Mobile Phone Shop,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Butcher,Grocery Store,Liquor Store,Café,Juice Bar,Pharmacy


We can get the top 10 or ***top x*** number of venues for each neighborhood bymodifying the ***no_top_venues*** variable above.

In [40]:
# Insert cluster label to each neighborhood
top_venues3 = top_venues
top_venues3.insert(0, 'Cluster Label', kmeans.labels_)  # Only 1 time

top_venues3.head(3)

Unnamed: 0,Cluster Label,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Middle Eastern Restaurant,Mobile Phone Shop,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop
1,0,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,0,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Butcher,Grocery Store,Liquor Store,Café,Juice Bar,Pharmacy


In [41]:
#top_venues3
# It shows all neighborhoods are assigned to some cluster 
# => NO NaN anywhere

Now we join this dataframe with the ***df_york*** to include Post Code and Borough names along with the Neighborhoods and common venues.

In [42]:
print(len(df_york['Neighborhood'].unique()))
df_york.head(3)

34


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763


In [43]:
york = df_york  # Work with a copy of df_york dataframe

# Join york dataframes
york_final3 = york.join(top_venues3.set_index('Neighborhood'), on = 'Neighborhood')

print(york_final3.shape)
york_final3.head(3)

(34, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Yoga Studio,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Portuguese Restaurant,Hockey Arena,Intersection,Coffee Shop,Yoga Studio,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Clothing Store,Boutique,Women's Store,Furniture / Home Store,Miscellaneous Shop,Coffee Shop,Accessories Store,Vietnamese Restaurant,Asian Restaurant,Falafel Restaurant


**Some Post Codes** (like Row # 19 M2L 	North York 	York Mills, Silver Hills) **do not have any venues, therefore are not added to any clusters**
We drop those postcodes / rows and then visualize the clusters.

In [44]:
york_final3.dropna(inplace = True)
print(york_final3.shape)

(32, 16)


**Visualize the Clusters**

In [45]:
# Create map
map_clusters = folium.Map(location = [latitude, longitude],
                          zoom_start=11)
# Select color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers
markers_colors = []
for lat, lon, nbr, cluster in zip(york_final3['Latitude'],
                                  york_final3['Longitude'],
                                  york_final3['Neighborhood'],
                                  york_final3['Cluster Label']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### Explore each cluster###

**Cluster 1: Label = 0**

In [46]:
york_final3.loc[york_final3['Cluster Label'] == 0,
                york_final3.columns[[1] + list(range(5,
                                                     york_final3.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,0.0,Portuguese Restaurant,Hockey Arena,Intersection,Coffee Shop,Yoga Studio,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
2,North York,0.0,Clothing Store,Boutique,Women's Store,Furniture / Home Store,Miscellaneous Shop,Coffee Shop,Accessories Store,Vietnamese Restaurant,Asian Restaurant,Falafel Restaurant
3,North York,0.0,Gym,Caribbean Restaurant,Athletics & Sports,Café,Dessert Shop,Japanese Restaurant,Yoga Studio,Dog Run,Convenience Store,Cosmetics Shop
4,East York,0.0,Pizza Place,Pharmacy,Bank,Gym / Fitness Center,Flea Market,Intersection,Pet Store,Café,Breakfast Spot,Gastropub
5,North York,0.0,Bakery,Asian Restaurant,Park,Sushi Restaurant,Japanese Restaurant,Pizza Place,Convenience Store,Construction & Landscaping,Cosmetics Shop,Curling Ice
6,North York,0.0,Coffee Shop,Clothing Store,Restaurant,Gym,Chinese Restaurant,Supermarket,Dim Sum Restaurant,Discount Store,Italian Restaurant,Sandwich Place
7,East York,0.0,Skating Rink,Spa,Park,Beer Store,Curling Ice,Athletics & Sports,Dessert Shop,Discount Store,Diner,Dim Sum Restaurant
8,York,0.0,Park,Trail,Hockey Arena,Field,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
10,East York,0.0,Coffee Shop,Burger Joint,Sporting Goods Shop,Bank,Shopping Mall,Furniture / Home Store,Dessert Shop,Pet Store,Department Store,Electronics Store
11,North York,0.0,Golf Course,Fast Food Restaurant,Pool,Mediterranean Restaurant,Dog Run,Food Service,Food Court,Comfort Food Restaurant,Furniture / Home Store,Construction & Landscaping


This cluster of neighborhoods seem to be central locations and good residential as well as commercial areas with restruants and coffee places, parks, and pharmacies common in the neighborhood.

**Cluster 2: Label = 1**

In [47]:
york_final3.loc[york_final3['Cluster Label'] == 1,
                york_final3.columns[[1] + list(range(5,
                                                     york_final3.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,1.0,Park,Food & Drink Shop,Yoga Studio,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
9,York,1.0,Park,Women's Store,Pool,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
16,East York/East Toronto,1.0,Park,Coffee Shop,Convenience Store,Yoga Studio,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dessert Shop
18,North York,1.0,Airport,Park,Yoga Studio,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
31,York,1.0,Park,Yoga Studio,Coffee Shop,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dessert Shop
32,North York,1.0,Park,Convenience Store,Yoga Studio,Coffee Shop,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dessert Shop


This group of neighborhoods could be a bit outside the core residential or commercial areas, with loads of parks, airports, gym and other recreational / outdoor activities.
There are some restraunts, coffee shops and convinience stores which might be targeted to customers of these outdoor activities.

**Cluster 3: Label = 2**

In [48]:
york_final3.loc[york_final3['Cluster Label'] == 2,
                york_final3.columns[[1] + list(range(5,
                                                     york_final3.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,North York,2.0,Food Truck,Home Service,Baseball Field,Yoga Studio,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
27,North York,2.0,Food Service,Baseball Field,Yoga Studio,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store


These neighborhoods could be further outside or in the relatively new and upcoming neighborhoods with most common venues being related to baseball field, yoga studio, and many related to construction and landscaping, with food trucks and few convinence stores, probably targeted for the sports events, construction personnels. 

### Optional/ Personal Exploration Below

**We can repeat with 5 Clusters**

In [49]:
# Create dataframe of top venues for each neighborhood

no_top_venues = 10
suffix = ['st', 'nd', 'rd']

# Create columns corresponding to # of top venues entered
columns = ['Neighborhood'] # start list with 1st column = Neighborhood name

for counter in np.arange(no_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(counter+1, suffix[counter]))
    except:
        columns.append('{}th Most Common Venue'.format(counter+1))
        
# Get data for the dataframe
top_venues = pd.DataFrame(columns = columns)
top_venues['Neighborhood'] = york_grp['Neighborhood']

# Iterate through rows of york_grp and get most common venue for each neighborhood

for i in np.arange(york_grp.shape[0]):
    top_venues.iloc[i, 1:] = getCommonVenues(york_grp.iloc[i, :], no_top_venues)

top_venues.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Middle Eastern Restaurant,Mobile Phone Shop,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Butcher,Grocery Store,Liquor Store,Café,Juice Bar,Pharmacy
3,Caledonia-Fairbanks,Park,Women's Store,Pool,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Coffee Shop,Convenience Store,Sandwich Place,Discount Store,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega


In [50]:
# Prepare data

# Set cluster
kclusters = 5

# Prepare data
york_clusters5 = york_grp.drop('Neighborhood', 1)

# Run k-means
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(york_clusters5)

# Insert cluster label to each neighborhood
top_venues5 = top_venues
top_venues5.insert(0, 'Cluster Label', kmeans.labels_)  

# Join top top venues - with cluster labels
york_final5 = york.join(top_venues.set_index('Neighborhood'), on = 'Neighborhood')
# Drop NaN
york_final5.dropna(inplace = True)



In [51]:
# Visualize

# Create map
map_clusters = folium.Map(location = [latitude, longitude],
                          zoom_start=11)
# Select color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers
markers_colors = []
for lat, lon, nbr, cluster in zip(york_final5['Latitude'],
                                  york_final5['Longitude'],
                                  york_final5['Neighborhood'],
                                  york_final5['Cluster Label']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

**Try 10 Clusters**

In [52]:
# Create dataframe of top venues for each neighborhood

no_top_venues = 10
suffix = ['st', 'nd', 'rd']

# Create columns corresponding to # of top venues entered
columns = ['Neighborhood'] # start list with 1st column = Neighborhood name

for counter in np.arange(no_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(counter+1, suffix[counter]))
    except:
        columns.append('{}th Most Common Venue'.format(counter+1))
        
# Get data for the dataframe
top_venues = pd.DataFrame(columns = columns)
top_venues['Neighborhood'] = york_grp['Neighborhood']

# Iterate through rows of york_grp and get most common venue for each neighborhood

for i in np.arange(york_grp.shape[0]):
    top_venues.iloc[i, 1:] = getCommonVenues(york_grp.iloc[i, :], no_top_venues)

top_venues.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Middle Eastern Restaurant,Mobile Phone Shop,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Butcher,Grocery Store,Liquor Store,Café,Juice Bar,Pharmacy
3,Caledonia-Fairbanks,Park,Women's Store,Pool,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Coffee Shop,Convenience Store,Sandwich Place,Discount Store,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega


In [54]:
# Prepare data

# Set cluster
kclusters = 5

# Prepare data
york_clusters10 = york_grp.drop('Neighborhood', 1)

# Run k-means
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(york_clusters10)

# Insert cluster label to each neighborhood
top_venues10 = top_venues
top_venues10.insert(0, 'Cluster Label', kmeans.labels_)  


# Join top top venues - with cluster labels
york_final10 = york.join(top_venues.set_index('Neighborhood'), on = 'Neighborhood')
# Drop NaN
york_final10.dropna(inplace = True)

In [55]:
# Visualize

# Create map
map_clusters = folium.Map(location = [latitude, longitude],
                          zoom_start=11)
# Select color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers
markers_colors = []
for lat, lon, nbr, cluster in zip(york_final10['Latitude'],
                                  york_final10['Longitude'],
                                  york_final10['Neighborhood'],
                                  york_final10['Cluster Label']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

While more analysis is required, the clusters seem to represent the the occupation of the neighborhoods for various uses - like  old and exiting residential, new and upcoming residential, recreation and outdoor activites, and commecial uses.