# Segmenting and Clustering Neighborhoods in Toronto#

In [1]:
#importing necessary libraries

import pandas as pd # library for data analsysis

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 1.Scraping data and converting into pandas data frame ##

In [2]:
#scraping the Wikipedia page in order to obtain the data that is in the table of postal codes 
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
type(table)

list

In [4]:
len(table)

3

In [5]:
#transforming the scraped data into a pandas dataframe
df = table[0]

In [6]:
type(df)

pandas.core.frame.DataFrame

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal code     180 non-null object
Borough         180 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [8]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


#### Ignoring cells where Borough is not assigned ####

In [9]:
# Ignoring cells with a borough that is Not assigned.
df = df[df.Borough != "Not assigned"]

In [10]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


#### Formatting neighborhood coloumn as mentioned ####

In [12]:
#Converting the format into the required format
df['Neighborhood'] = df['Neighborhood'].str.replace('/',',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [14]:
#resetting index 
df.reset_index(inplace=True)

In [15]:
del df['index']

In [16]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


#### Finding the number of rows in dataframe after necessary cleaning ####

In [17]:
# print the number of rows of your dataframe.
df.shape

(103, 3)

### --------End of Part 1 (According to the assignment)-------- ###

## 2.Loading Coordinates in pandas dataframe ##

In [18]:
loc_cor = pd.read_csv(r"C:\Users\GAURI TOSHNIWAL\Documents\zcoursera\Capstone\Coursera_Capstone\Geospatial_Coordinates.csv")

In [19]:
loc_cor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
df = df.sort_values(by=['Postal code'])
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
6,M1B,Scarborough,"Malvern , Rouge"
12,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
18,M1E,Scarborough,"Guildwood , Morningside , West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [21]:
df_new = pd.merge(df, loc_cor,left_on='Postal code', right_on='Postal Code', how='left').drop('Postal Code', axis=1)

#### Final dataframe with latitude and longitude columns ####

In [22]:
df_new.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### --------End of Part 2 (According to the assignment)-------- ###

### Using geopy library to get latitude and longitude values of Toronto ###

#### Used geopy library to get the latitude and longitude values of Toronto City ####

In [23]:
address = 'Toronto,ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address,timeout=10)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate Toronto City are 43.6534817, -79.3839347.


#### Map of Toronto with neighborhoods superimposed on top ####

In [24]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_new['Latitude'], df_new['Longitude'],df_new['Borough'], df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [25]:
df_new["Borough"].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

#### segment and cluster only the neighborhoods in North York ####

In [26]:
#sliced the original dataframe and created a new dataframe of the Central Toronto data.
toronto_data = df_new[df_new['Borough']== 'North York'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview , Henry Farm , Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"York Mills , Silver Hills",43.75749,-79.374714
4,M2M,North York,"Willowdale , Newtonbrook",43.789053,-79.408493


In [27]:
address = 'North York,ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address,timeout=10)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [28]:
# create map of Cetral Toronto using latitude and longitude values
map_ct = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ct)  
    
map_ct

In [29]:
CLIENT_ID = 'CVDUCJDQZN2GITGJSRMOA22KTGC4DV10JWSXB3BNVLVCY3FN' # your Foursquare ID
CLIENT_SECRET = 'L1NUQU2U0H3RGWRK2FUXKMBWQ12XUOM5HKXG0PH3KL53VH2W' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CVDUCJDQZN2GITGJSRMOA22KTGC4DV10JWSXB3BNVLVCY3FN
CLIENT_SECRET:L1NUQU2U0H3RGWRK2FUXKMBWQ12XUOM5HKXG0PH3KL53VH2W


In [30]:
toronto_data.loc[0, 'Neighborhood']

'Hillcrest Village'

In [31]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Hillcrest Village are 43.8037622, -79.3634517.


 #### top 100 venues that are in Hillcrest Village within a radius of 500 meters ####

In [32]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=CVDUCJDQZN2GITGJSRMOA22KTGC4DV10JWSXB3BNVLVCY3FN&client_secret=L1NUQU2U0H3RGWRK2FUXKMBWQ12XUOM5HKXG0PH3KL53VH2W&v=20180605&ll=43.8037622,-79.3634517&radius=500&limit=100'

In [33]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e999554006dce001b22d490'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.808262204500004,
    'lng': -79.3572281853783},
   'sw': {'lat': 43.7992621955, 'lng': -79.3696752146217}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad9dce6f964a520651b21e3',
       'name': "Eagle's Nest Golf Club",
       'location': {'address': '10000 Dufferin Rd',
        'lat': 43.805454826002794,
        'lng': -79.36418592243415,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.805454826002794,
          'lng': -79.36418592243415}],
        'distance': 197,
        'cc': 'CA',
        'city': 'Toronto

In [34]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [35]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Eagle's Nest Golf Club,Golf Course,43.805455,-79.364186
1,AY Jackson Pool,Pool,43.804515,-79.366138
2,Villa Madina,Mediterranean Restaurant,43.801685,-79.363938
3,Duncan Creek Park,Dog Run,43.805539,-79.360695


In [36]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


## 3. Explore Neighborhoods in North York ##

In [37]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [38]:
ct_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude'])

Hillcrest Village
Fairview , Henry Farm , Oriole
Bayview Village
York Mills , Silver Hills
Willowdale , Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor , Wilson Heights , Downsview North
Northwood Park , York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Bedford Park , Lawrence Manor East
Lawrence Manor , Lawrence Heights
Glencairn
North Park , Maple Leaf Park , Upwood Park
Humber Summit
Humberlea , Emery


In [39]:
print(ct_venues.shape)
ct_venues.head()

(245, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,"Fairview , Henry Farm , Oriole",43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


In [40]:
ct_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor , Wilson Heights , Downsview North",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park , Lawrence Manor East",26,26,26,26,26,26
Don Mills,27,27,27,27,27,27
Downsview,13,13,13,13,13,13
"Fairview , Henry Farm , Oriole",67,67,67,67,67,67
Glencairn,4,4,4,4,4,4
Hillcrest Village,4,4,4,4,4,4
Humber Summit,2,2,2,2,2,2
"Humberlea , Emery",2,2,2,2,2,2


In [41]:
print('There are {} uniques categories.'.format(len(ct_venues['Venue Category'].unique())))

There are 104 uniques categories.


## 4. Analyze Each Neighborhood ##

In [42]:
# one hot encoding
ct_onehot = pd.get_dummies(ct_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ct_onehot['Neighborhood'] = ct_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ct_onehot.columns[-1]] + list(ct_onehot.columns[:-1])
ct_onehot = ct_onehot[fixed_columns]

ct_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Fairview , Henry Farm , Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [43]:
ct_onehot.shape

(245, 105)

In [44]:
ct_grouped = ct_onehot.groupby('Neighborhood').mean().reset_index()
ct_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor , Wilson Heights , Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,...,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park , Lawrence Manor East",0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.076923,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.0,0.074074,0.037037,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.076923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Fairview , Henry Farm , Oriole",0.0,0.0,0.014925,0.0,0.014925,0.0,0.029851,0.029851,0.014925,...,0.014925,0.0,0.029851,0.0,0.014925,0.014925,0.014925,0.0,0.0,0.029851
6,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea , Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
ct_grouped.shape

(18, 105)

In [46]:
num_top_venues = 5

for hood in ct_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ct_grouped[ct_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor , Wilson Heights , Downsview North----
           venue  freq
0    Coffee Shop  0.11
1           Bank  0.11
2  Shopping Mall  0.05
3    Supermarket  0.05
4  Deli / Bodega  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park , Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.08
1          Restaurant  0.08
2         Pizza Place  0.08
3    Sushi Restaurant  0.08
4  Italian Restaurant  0.08


----Don Mills----
                 venue  freq
0           Restaurant  0.07
1     Asian Restaurant  0.07
2                  Gym  0.07
3  Japanese Restaurant  0.07
4          Coffee Shop  0.07


----Downsview----
                        venue  freq
0               Grocery Store  0.23
1                        Park  0.15
2                        Bank  0.08
3  Construction & Landscaping  0.

In [47]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [48]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ct_grouped['Neighborhood']

for ind in np.arange(ct_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ct_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor , Wilson Heights , Downsview North",Bank,Coffee Shop,Supermarket,Pizza Place,Bridal Shop,Diner,Pharmacy,Sandwich Place,Deli / Bodega,Shopping Mall
1,Bayview Village,Japanese Restaurant,Chinese Restaurant,Café,Bank,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
2,"Bedford Park , Lawrence Manor East",Restaurant,Sandwich Place,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Locksmith,Liquor Store,Juice Bar,Café
3,Don Mills,Restaurant,Beer Store,Asian Restaurant,Coffee Shop,Japanese Restaurant,Gym,Italian Restaurant,Café,Caribbean Restaurant,Gym / Fitness Center
4,Downsview,Grocery Store,Park,Liquor Store,Gym / Fitness Center,Baseball Field,Bank,Construction & Landscaping,Athletics & Sports,Food Truck,Airport


## 5. Cluster Neighborhoods ##

Run k-means to cluster the neighborhood into 8 clusters.

In [49]:
# set number of clusters
kclusters = 8

ct_grouped_clustering = ct_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ct_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 1, 0, 1, 6, 2, 3, 7, 1, 0, 1, 5, 1, 1, 4, 0])

In [50]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ct_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ct_merged = ct_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ct_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,43.803762,-79.363452,2.0,Dog Run,Golf Course,Pool,Mediterranean Restaurant,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
1,M2J,North York,"Fairview , Henry Farm , Oriole",43.778517,-79.346556,1.0,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Bakery,Bank,Tea Room,Food Court,Convenience Store,Electronics Store
2,M2K,North York,Bayview Village,43.786947,-79.385975,1.0,Japanese Restaurant,Chinese Restaurant,Café,Bank,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
3,M2L,North York,"York Mills , Silver Hills",43.75749,-79.374714,4.0,Park,Cafeteria,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
4,M2M,North York,"Willowdale , Newtonbrook",43.789053,-79.408493,,,,,,,,,,,


In [51]:
ct_merged['Cluster Lables']=ct_merged['Cluster Labels'].fillna(0,inplace=True)


In [52]:
ct_merged['Cluster Labels'] =ct_merged['Cluster Labels'].astype(int)

In [53]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ct_merged['Latitude'],ct_merged['Longitude'],ct_merged['Neighborhood'],ct_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 6. Examine Clusters ###

In [54]:
ct_merged.loc[ct_merged['Cluster Labels'] == 0, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
4,North York,0,,,,,,,,,,,
6,North York,0,Convenience Store,Bank,Bar,Park,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Cosmetics Shop,Deli / Bodega,
13,North York,0,Grocery Store,Park,Liquor Store,Gym / Fitness Center,Baseball Field,Bank,Construction & Landscaping,Athletics & Sports,Food Truck,Airport,
14,North York,0,Grocery Store,Park,Liquor Store,Gym / Fitness Center,Baseball Field,Bank,Construction & Landscaping,Athletics & Sports,Food Truck,Airport,
15,North York,0,Grocery Store,Park,Liquor Store,Gym / Fitness Center,Baseball Field,Bank,Construction & Landscaping,Athletics & Sports,Food Truck,Airport,
16,North York,0,Grocery Store,Park,Liquor Store,Gym / Fitness Center,Baseball Field,Bank,Construction & Landscaping,Athletics & Sports,Food Truck,Airport,
21,North York,0,Construction & Landscaping,Bakery,Park,Women's Store,Dog Run,Concert Hall,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,


In [55]:
ct_merged.loc[ct_merged['Cluster Labels'] == 1, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
1,North York,1,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Bakery,Bank,Tea Room,Food Court,Convenience Store,Electronics Store,
2,North York,1,Japanese Restaurant,Chinese Restaurant,Café,Bank,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,
5,North York,1,Coffee Shop,Ramen Restaurant,Pizza Place,Restaurant,Grocery Store,Sandwich Place,Café,Sushi Restaurant,Pet Store,Plaza,
7,North York,1,Coffee Shop,Ramen Restaurant,Pizza Place,Restaurant,Grocery Store,Sandwich Place,Café,Sushi Restaurant,Pet Store,Plaza,
9,North York,1,Restaurant,Beer Store,Asian Restaurant,Coffee Shop,Japanese Restaurant,Gym,Italian Restaurant,Café,Caribbean Restaurant,Gym / Fitness Center,
10,North York,1,Restaurant,Beer Store,Asian Restaurant,Coffee Shop,Japanese Restaurant,Gym,Italian Restaurant,Café,Caribbean Restaurant,Gym / Fitness Center,
11,North York,1,Bank,Coffee Shop,Supermarket,Pizza Place,Bridal Shop,Diner,Pharmacy,Sandwich Place,Deli / Bodega,Shopping Mall,
12,North York,1,Coffee Shop,Metro Station,Caribbean Restaurant,Massage Studio,Bar,Dog Run,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,
17,North York,1,Coffee Shop,Grocery Store,Hockey Arena,Portuguese Restaurant,Nail Salon,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,
18,North York,1,Restaurant,Sandwich Place,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Locksmith,Liquor Store,Juice Bar,Café,


In [56]:
ct_merged.loc[ct_merged['Cluster Labels'] == 2, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
0,North York,2,Dog Run,Golf Course,Pool,Mediterranean Restaurant,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,


In [57]:
ct_merged.loc[ct_merged['Cluster Labels'] == 3, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
22,North York,3,Pizza Place,Empanada Restaurant,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,


In [58]:
ct_merged.loc[ct_merged['Cluster Labels'] == 4, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
3,North York,4,Park,Cafeteria,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,


In [59]:
ct_merged.loc[ct_merged['Cluster Labels'] == 5, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
8,North York,5,Park,Food & Drink Shop,Fireworks Store,Discount Store,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,


In [60]:
ct_merged.loc[ct_merged['Cluster Labels'] == 6, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
20,North York,6,Japanese Restaurant,Playground,Bakery,Pub,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,


In [61]:
ct_merged.loc[ct_merged['Cluster Labels'] == 7, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Lables
23,North York,7,Food Service,Baseball Field,Women's Store,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,
