### Segmenting and Clustering Neighborhoods in Toronto, Ontario

#### Install packages and import libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

import requests # library to handle requests

from sklearn.cluster import KMeans # import k-means for clustering stage  

!pip install beautifulsoup4
from bs4 import BeautifulSoup

!pip install lxml

!pip install geopy # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# uncomment next line if you haven't installed folium
#!conda install -c conda-forge folium=0.5.0 --yes  

import folium # map rendering library
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


#### 1. Get the list of postal codes for Toronto region from Wikipedia, build a dataframe containing the corresponding boroughs and neighborhoods, and arrange data according to the requirements

In [2]:
# get the source file from wikipedia

source_file = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
print('Got the file from wikipedia')

Got the file from wikipedia


In [3]:
#get the data parsed and scraped:

my_soup = BeautifulSoup(source_file, 'lxml')
print('Data scraped to my_soup using Beautiful scraper and lxml parser')

Data scraped to my_soup using Beautiful scraper and lxml parser


In [4]:
#another option to get the data parsed and scraped:

#!wget -q -O 'toronto_data.html' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
#print('Data downloaded!')
#with open('toronto_data.html') as html_file:
#    my_soup = BeautifulSoup(html_file, 'lxml')
#print('Data scraped to my_soup using Beautiful scraper and lxml parser')

In [5]:
# the following (uncommented) code printed the html file:

#print(my_soup.prettify())

In [6]:
# the following (uncommented) code printed the table from the html file:

#the_table = my_soup.table
#print(the_table)

In [7]:
# the following (uncommented) code printed only the text in the table:

#the_text_in_the_table = my_soup.table.text
#print(the_text_in_the_table)

In [8]:
# get table columns names

match_header = my_soup.table.find_all('th')
column_names = []
for i, col_name in enumerate(match_header):
    column_name = col_name.text.split('\n')[0]
    column_names.append(column_name)

In [9]:
# instantiate the dataframe

toronto_neighborhoods = pd.DataFrame(columns=column_names)
toronto_neighborhoods

Unnamed: 0,Postal code,Borough,Neighborhood


In [10]:
#get data from table

match_data = my_soup.table.find_all('td') 
match_data_length = len(match_data)
for i in range(0, match_data_length):
    current_data= match_data[i].text.split('\n')[0]
    if (i%3) == 0: postal_code = current_data
    elif (i%3) == 1: borough = current_data
    else:  # i%3 == 2
        neighborhood = current_data
        toronto_neighborhoods = toronto_neighborhoods.append({'Postal code': postal_code,
                                                              'Borough': borough, 
                                                              'Neighborhood': neighborhood}, 
                                                              ignore_index=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [11]:
# replace "Not assigned" by NaN

toronto_neighborhoods['Borough'].replace("Not assigned", np.nan, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [12]:
# drop whole row with NaN in "price" column

toronto_neighborhoods.dropna(subset=['Borough'], axis=0, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [13]:
# reset index, because we droped two rows

toronto_neighborhoods.reset_index(drop=True, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [14]:
# replace "/" with "," in the Neighborhood column

toronto_neighborhoods['Neighborhood'].replace(" / ", ", ", regex = True, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# look for "Not assigned" values in Neighborhood column
# In case of one found, replace the "Not assigned" value with the corresponding Borough name

n=0
no_of_rows = toronto_neighborhoods.shape[0]
for i in range(0,no_of_rows):
    if toronto_neighborhoods.iloc[i, 2] == "Not assigned": 
        n = n + 1 
        toronto_neighborhoods.at[i, 'Neighborhood'] = toronto_neighborhoods.iloc[i, 1]
print("Number of -Not assigned- appearances: ", n)

Number of -Not assigned- appearances:  0


##### There was no cell in the Neighborhood column with a "Not assigned" value

In [18]:
# The answer for question 1:

toronto_neighborhoods.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### 2. Get the coordinates for the neighborhoods and attach them to the dataframe

In [19]:
# get the coordinates file

!wget -q -O 'coordinates.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [20]:
# get the dataframe for coordinates

toronto_coordinates = pd.read_csv('coordinates.csv')
toronto_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# sort the postal codes dataframe, toronto_neighborhoods, by postal codes

toronto_neighborhoods_ordered = toronto_neighborhoods.sort_values(by=['Postal code'])
toronto_neighborhoods_ordered.head()

Unnamed: 0,Postal code,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [22]:
# reset index for toronto_neighborhoods_ordered

toronto_neighborhoods_ordered = toronto_neighborhoods_ordered.reset_index(drop = True)
toronto_neighborhoods_ordered

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [23]:
# verify if the "Postal code" columns in toronto_neighborhoods_ordered and toronto_coordinates
# have the same length and print the length

if toronto_neighborhoods_ordered.shape[0] == toronto_coordinates.shape[0]:
    print("the same column length: ", toronto_coordinates.shape[0])
else:
    print("the column lengths are different")

the same column length:  103


In [24]:
# verify if the "Postal code" columns in toronto_neighborhoods_ordered 
# and toronto_coordinatesare identical
 
counter = 0
for i in range(0, toronto_coordinates.shape[0]):
    if toronto_neighborhoods_ordered.iloc[i,0] == toronto_coordinates.iloc[i,0]: 
        counter = counter + 1
if counter == toronto_coordinates.shape[0]:
    print("the columns are identical")
else:
    print("the columns are NOT identical")

the columns are identical


In [25]:
# add toronto_coordinates 'Latitude' column to toronto_neighborhoods_ordered

toronto_neighborhoods_ordered['Latitude']=toronto_coordinates['Latitude']

In [26]:
# add toronto_coordinates 'Longitude' column to toronto_neighborhoods_ordered

toronto_neighborhoods_ordered['Longitude']=toronto_coordinates['Longitude']
toronto_neighborhoods_ordered.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [30]:
# The answer to question 2:
    
toronto_neighborhoods_ordered

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


#### 3. Segmenting and Clustering the Neighborhoods

In [32]:
# look for Borough names containing the string 'Toronto'

only_toronto = toronto_neighborhoods_ordered[toronto_neighborhoods_ordered['Borough'].str.contains('Toronto')]
only_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [33]:
# reset index for only_toronto

only_toronto_ordered = only_toronto.reset_index(drop = True)
only_toronto_ordered.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [34]:
# check for number of rows in only_toronto_ordered

n_rows = only_toronto_ordered.shape[0]
print('There are {} rows in the only_toronto_ordered dataframe'.format(n_rows))

There are 39 rows in the only_toronto_ordered dataframe


##### Use geopy library to get the latitude and longitude values of Toronto

In order to define an instance of the geocoder, we need to define a user_agent. It's name will be <em>toronto_explorer</em>.

In [35]:
# get the geograpical coordinates of Toronto, Ontario

address = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, Ontario are 43.6534817, -79.3839347.


##### Create a map of Toronto with its neighborhoods.

In [36]:
# create map of Toronto using latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

In [37]:
# add markers to map

for lat, lng, borough, neighborhood in zip(only_toronto['Latitude'],  \
                                           only_toronto['Longitude'], \
                                           only_toronto['Borough'],   \
                                           only_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [39]:
# Define Foursquare Credentials and Version
# my_client_id and my_client_version were initialized in the previous cell, 
# but due to privacy, this cell was deleted.... :-)

CLIENT_ID = my_client_id # my Foursquare ID
CLIENT_SECRET = my_client_version # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

# print('My credentails are:')
# print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

##### Let's explore a neighborhood in our dataframe, the one with the M5T postal code. 

In [40]:
# get the row for postal code M5T

M5T_toronto = only_toronto_ordered[only_toronto_ordered['Postal code'].str.match('M5T')]
M5T_toronto

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
26,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049


In [41]:
# reset index for M5T_toronto

M5T_toronto = M5T_toronto.reset_index(drop = True)
M5T_toronto

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049


In [42]:
# get borough name and neighborhood / neighborhoods name / names for M5T

borough_name = M5T_toronto.loc[0, 'Borough']
print('Borough for M5T:', borough_name)
neighborhoods_names = M5T_toronto.loc[0, 'Neighborhood']
print('Neighborhoods for M5T:', neighborhoods_names)

Borough for M5T: Downtown Toronto
Neighborhoods for M5T: Kensington Market, Chinatown, Grange Park


In [43]:
# Get the neighborhood's latitude and longitude values.
 
neighborhoods_latitude = M5T_toronto.loc[0, 'Latitude']     # neighborhoods latitude value
neighborhoods_longitude = M5T_toronto.loc[0, 'Longitude']    # neighborhood longitude value
print('Latitude and longitude for {} are: {}, {}.'.format(neighborhoods_names, 
                                                              neighborhoods_latitude, 
                                                              neighborhoods_longitude))

Latitude and longitude for Kensington Market, Chinatown, Grange Park are: 43.6532057, -79.4000493.


##### Let's get the top (not more than 100) venues that are in Kensington Market, Chinatown, Grange Park within a radius of 500 meters from the center of the neighborhoods

In [44]:
# create the URL for the GET request

limit = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhoods_latitude, 
    neighborhoods_longitude, 
    radius, 
    limit)
# url

In [45]:
# send the GET request and examine the resutls

my_results = requests.get(url).json()

# in order to see my_results uncomment the next row
# my_results 

##### All the information is in the items key. 
##### We define a function to extract the category type of the venue:

In [46]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

##### Clean my_results json and structure it into a pandas dataframe.

In [47]:
venues = my_results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns] # filter columns
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1) # filter the category for each row
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns] # clean columns
nearby_venues.head()

  


Unnamed: 0,name,categories,lat,lng
0,Kid Icarus,Arts & Crafts Store,43.653933,-79.401719
1,Essence of Life Organics,Organic Grocery,43.654111,-79.400431
2,Blackbird Baking Co,Bakery,43.654764,-79.400566
3,Seven Lives - Tacos y Mariscos,Mexican Restaurant,43.654418,-79.400545
4,The Moonbean Cafe,Café,43.654147,-79.400182


In [48]:
# Get the number of venues found
print('The number of venues found: ', nearby_venues.shape[0])

The number of venues found:  55


##### We define a function to repeat the same process to all the neighborhoods in Toronto:

In [49]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)     # create the API request URL

        results = requests.get(url).json()["response"]['groups'][0]['items']  # make the GET request

        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])  # relevant information for each venue

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

##### We call the above function on each neighborhood and create a dataframe called toronto_venues.

In [50]:
toronto_venues = getNearbyVenues(names=only_toronto_ordered['Neighborhood'],
                                   latitudes=only_toronto_ordered['Latitude'],
                                   longitudes=only_toronto_ordered['Longitude']
                                  )
print('{} venues found in Toronto'.format(toronto_venues.shape[0]))
toronto_venues.head()

1636 venues found in Toronto


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


##### We can check how many venues were returned for each neighborhood:

In [51]:
toronto_venues_count = toronto_venues.groupby('Neighborhood').count()
toronto_venues_count.head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
Business reply mail Processing CentrE,16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",18,18,18,18,18,18
Central Bay Street,64,64,64,64,64,64


##### We can find out how many unique categories can be curated from all the returned venues:

In [52]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 234 uniques categories.


##### Now, we can analyze each neighborhood:

In [53]:
# get dummies encoding
toronto_venues_get_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_venues_get_dummies['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_venues_get_dummies.columns[-1]] + list(toronto_venues_get_dummies.columns[:-1])
toronto_venues_get_dummies = toronto_venues_get_dummies[fixed_columns]

print('The toronto_venues_get_dummies dataframe size is: ', toronto_venues_get_dummies.shape) 
toronto_venues_get_dummies.head()

The toronto_venues_get_dummies dataframe size is:  (1636, 234)


Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### We can group rows by neighborhood and by taking the mean of the frequency of occurrence of each category:

In [54]:
toronto_grouped = toronto_venues_get_dummies.groupby('Neighborhood').mean().reset_index()
print('The toronto_grouped dataframe size is: ', toronto_grouped.shape) 
toronto_grouped.head()

The toronto_grouped dataframe size is:  (39, 234)


Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,0.0,0.0


##### We can build a dataframe containing each neighborhood along with the top most common venues.
##### First, we define a function to sort the venues in descending order:

In [55]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

##### Next, we create the new dataframe and display the top 10 venues for each neighborhood:

In [56]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Restaurant,Café,Cheese Shop,Bakery,Seafood Restaurant,Farmers Market,Italian Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Bakery,Italian Restaurant,Stadium,Grocery Store,Gym,Furniture / Home Store
2,Business reply mail Processing CentrE,Light Rail Station,Park,Auto Workshop,Comic Shop,Pizza Place,Burrito Place,Restaurant,Brewery,Smoke Shop,Spa
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Bar,Boat or Ferry,Harbor / Marina,Coffee Shop,Sculpture Garden,Plane
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Burger Joint,Ice Cream Shop,Thai Restaurant,Salad Place,Middle Eastern Restaurant,Fried Chicken Joint


##### Now, we will run *k*-means to cluster the neighborhood into 5 clusters.

In [57]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 0, 1,
       1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

##### Now, we create a new dataframe that includes the cluster labels, as well as the top 10 venues, for each neighborhood.

In [58]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = only_toronto_ordered

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Pub,Trail,Health Food Store,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Liquor Store,Indian Restaurant,Japanese Restaurant,Juice Bar
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,1,Park,Fast Food Restaurant,Sushi Restaurant,Fish & Chips Shop,Brewery,Light Rail Station,Liquor Store,Italian Restaurant,Restaurant,Burrito Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Bakery,Brewery,American Restaurant,Gastropub,Yoga Studio,Bookstore,Sandwich Place,Cheese Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Bus Line,Swim School,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


##### Finally, let's visualize the resulting clusters.

In [59]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters

##### Now, we can examine each cluster, determine the venue categories that distinguish one from another, and, based on this, assign a name to each cluster.

##### And this conclude the answer to question 3.

##### *Cluster 1*

In [60]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,"Moore Park, Summerhill East",0,Park,Playground,Tennis Court,Restaurant,Creperie,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
10,Rosedale,0,Park,Playground,Trail,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


##### I will name Cluster 1: *Parks and Playgrounds*

##### *Cluster 2*

In [61]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,The Beaches,1,Pub,Trail,Health Food Store,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
1,"The Danforth West, Riverdale",1,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Liquor Store,Indian Restaurant,Japanese Restaurant,Juice Bar
2,"India Bazaar, The Beaches West",1,Park,Fast Food Restaurant,Sushi Restaurant,Fish & Chips Shop,Brewery,Light Rail Station,Liquor Store,Italian Restaurant,Restaurant,Burrito Place
3,Studio District,1,Café,Coffee Shop,Bakery,Brewery,American Restaurant,Gastropub,Yoga Studio,Bookstore,Sandwich Place,Cheese Shop
5,Davisville North,1,Sandwich Place,Gym,Hotel,Department Store,Convenience Store,Food & Drink Shop,Breakfast Spot,Park,Dog Run,Distribution Center
6,North Toronto West,1,Clothing Store,Coffee Shop,Seafood Restaurant,Salon / Barbershop,Restaurant,Rental Car Location,Café,Chinese Restaurant,Yoga Studio,Sporting Goods Shop
7,Davisville,1,Sandwich Place,Pizza Place,Dessert Shop,Thai Restaurant,Gym,Sushi Restaurant,Italian Restaurant,Toy / Game Store,Café,Coffee Shop
9,"Summerhill West, Rathnelly, South Hill, Forest...",1,Coffee Shop,Pub,Sushi Restaurant,Bagel Shop,Supermarket,Sports Bar,Bank,Pizza Place,Fried Chicken Joint,Liquor Store
11,"St. James Town, Cabbagetown",1,Coffee Shop,Bakery,Café,Restaurant,Italian Restaurant,Pharmacy,Pizza Place,Pub,Bank,Liquor Store
12,Church and Wellesley,1,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Yoga Studio,Gastropub,Men's Store,Pub,Hotel,Gay Bar


##### I will name Cluster 2: *Coffee Shops and Restaurants*

##### *Cluster 3*

In [62]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Roselawn,2,Garden,Women's Store,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


##### I will name Cluster 3: *Gardens*

##### *Cluster 4*

In [63]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Lawrence Park,3,Park,Bus Line,Swim School,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


##### I will name Cluster 4: *Parks*