# Assignment Week 3: Neighborhood in Toronto


Import libraries to scrape the following wikipedia page: [link](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

# Question 1

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
import folium
import requests 
import matplotlib.cm as cm
import matplotlib.colors as colors
# Empty List
tabs = []

# File handling
with open('List of postal codes of Canada_ M - Wikipedia.html', 'r') as fp:
    html_content = fp.read()

    table_doc = BeautifulSoup(html_content, 'html.parser')
 
len(table_doc.find_all('tr')[2:])

183

Retrieve the three columns: PostalCode, Borough, and Neighborhood

In [2]:
PostalCode = []
Borough = []
Neighborhood = []


for tr in table_doc.find_all('tr')[2:181]:
    tds = tr.find_all('td')
#     print(tds)
    PostalCode.append(tds[0].text.rstrip())
    Borough.append(tds[1].text.rstrip())
    Neighborhood.append(tds[2].text.rstrip())
    

In [3]:
# Create final dataset
df = pd.DataFrame(list(zip(PostalCode, Borough, Neighborhood)), 
               columns =['PostalCode', 'Borough', 'Neighborhood']) 

In [4]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [5]:
# Remove rows with Borough not assigned
df.drop(df[df['Borough']== 'Not assigned'].index, inplace = True) 

In [6]:
# Check if there are Not assigned  neighborhood
len(df[df['Neighborhood']== 'Not assigned']) # there are no cases


0

In [7]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"
5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
df.shape

(103, 3)

# Question 2

In [9]:
# Assign latitude and longitudine
# Unfortunately geocoder does not work well, hence I'm using the csv for the latitute and longitude

In [10]:
lat_lon = pd.read_csv('Geospatial_Coordinates.csv')  

In [11]:
# Sort the rows according to Postcodes
df_sort = df.sort_values(by=['PostalCode'])

# Reset indices to avoid problems with joint
df_sort.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
df_sort.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M1B,Scarborough,"Malvern, Rouge"
17,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
26,M1E,Scarborough,"Guildwood, Morningside, West Hill"
35,M1G,Scarborough,Woburn
44,M1H,Scarborough,Cedarbrae


In [13]:
lat_lon.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
neighborhoods = df_sort.join(lat_lon.set_index('PostalCode'), on='PostalCode')

# Question 3: Exploring the neighborhoods in Toronto


#### Load and explore the data


In [15]:
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
8,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
17,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
26,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
35,M1G,Scarborough,Woburn,43.770992,-79.216917
44,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>to_explorer</em>, as shown below.


In [17]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top.


In [18]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
neighborhoods['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

### Let's explore the Borough called Central Toronto

In [20]:
Central_Toronto_data = neighborhoods[neighborhoods['Borough'] == 'Central Toronto'].reset_index(drop=True)
Central_Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


Let's get the geographical coordinates of Central Toronto.


In [21]:
address = 'Central Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.6708625, -79.37279241253721.


Let's visualize the neiborhoods only of Central Toronto


In [22]:
# create map of Manhattan using latitude and longitude values
map_central = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Central_Toronto_data['Latitude'], Central_Toronto_data['Longitude'], Central_Toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_central)  
    
map_central

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.


#### Define Foursquare Credentials and Version


In [24]:
# CLIENT_ID = '' # your Foursquare ID
# CLIENT_SECRET = '' # your Foursquare Secret
# VERSION = '20180605' # Foursquare API version
# LIMIT = 100 # A default Foursquare API limit value

# print('Your credentails:')
# print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

#### Let's explore the first neighborhood in our dataframe.


Get the neighborhood's name.


In [25]:
Central_Toronto_data.loc[0, 'Neighborhood']

'Lawrence Park'

Get the neighborhood's latitude and longitude values.


In [26]:
neighborhood_latitude = Central_Toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Central_Toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Central_Toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


#### Now, let's get the top 100 venues that are in Lawrence Park within a radius of 500 meters.


First, let's create the GET request URL. Name your URL **url**.


In [28]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
#url # display URL



Send the GET request and examine the resutls


In [29]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fe529d95c2c4d623ff7b652'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

In [30]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

The function will help us to structure the json into a _pandas_ dataframe.


In [31]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


#### Let's use the getNearbyVenues function to repeat the same process to all the neighborhoods in Central Toronto


In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
central_toronto_venues = getNearbyVenues(names=Central_Toronto_data['Neighborhood'],
                                   latitudes=Central_Toronto_data['Latitude'],
                                   longitudes=Central_Toronto_data['Longitude']
                                  )


Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville


#### Let's check the size of the resulting dataframe


In [34]:
print(central_toronto_venues.shape)
central_toronto_venues.head()

(103, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


Let's check how many venues were returned for each neighborhood


In [35]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,34,34,34,34,34,34
Davisville North,9,9,9,9,9,9
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",2,2,2,2,2,2
"North Toronto West, Lawrence Park",16,16,16,16,16,16
Roselawn,1,1,1,1,1,1
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",14,14,14,14,14,14
"The Annex, North Midtown, Yorkville",20,20,20,20,20,20


#### Let's find out how many unique categories can be curated from all the returned venues


In [36]:
print('There are {} uniques categories.'.format(len(central_toronto_venues['Venue Category'].unique())))

There are 58 uniques categories.


<a id='item3'></a>


## 3. Analyze Each Neighborhood


In [37]:
# one hot encoding
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

central_toronto_onehot.head(10)

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Davisville North,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.


In [38]:
central_toronto_onehot.shape

(103, 59)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [39]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,...,0.0,0.0,0.058824,0.0,0.029412,0.029412,0.029412,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
5,"North Toronto West, Lawrence Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,...,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.071429,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,...,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
8,"The Annex, North Midtown, Yorkville",0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's confirm the new size


In [40]:
central_toronto_grouped.shape

(9, 59)

#### Let's print each neighborhood along with the top 5 most common venues


In [41]:
num_top_venues = 5

for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = central_toronto_grouped[central_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
            venue  freq
0  Sandwich Place  0.09
1    Dessert Shop  0.09
2             Gym  0.06
3     Pizza Place  0.06
4            Café  0.06


----Davisville North----
                  venue  freq
0                 Hotel  0.11
1      Department Store  0.11
2  Gym / Fitness Center  0.11
3                   Gym  0.11
4                  Park  0.11


----Forest Hill North & West, Forest Hill Road Park----
                 venue  freq
0                Trail  0.25
1        Jewelry Store  0.25
2     Sushi Restaurant  0.25
3                 Park  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0             Bus Line  0.33
1          Swim School  0.33
2                 Park  0.33
3  American Restaurant  0.00
4           Restaurant  0.00


----Moore Park, Summerhill East----
                 venue  freq
0                Trail   0.5
1                 Park   0.5
2  American Restaurant   0.0
3           Restaurant   0.0
4     Indoor Play

#### Let's put that into a _pandas_ dataframe


First, let's write a function to sort the venues in descending order.


In [42]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.


In [43]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Café,Sushi Restaurant,Coffee Shop,Pizza Place,Gym,Italian Restaurant,Gas Station,Gourmet Shop
1,Davisville North,Pizza Place,Sandwich Place,Gym / Fitness Center,Hotel,Food & Drink Shop,Park,Department Store,Gym,Breakfast Spot,Bus Line
2,"Forest Hill North & West, Forest Hill Road Park",Trail,Sushi Restaurant,Jewelry Store,Park,Yoga Studio,Dessert Shop,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
3,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
4,"Moore Park, Summerhill East",Trail,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


<a id='item4'></a>


##  Cluster Neighborhoods


I choose to cluster neighbouthood in 4 groups


In [44]:
# set number of clusters
kclusters = 4

central_toronto_grouped_clustering = central_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(central_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:9] 

array([1, 1, 3, 0, 3, 1, 2, 1, 1], dtype=int32)

In [45]:
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Café,Sushi Restaurant,Coffee Shop,Pizza Place,Gym,Italian Restaurant,Gas Station,Gourmet Shop
1,Davisville North,Pizza Place,Sandwich Place,Gym / Fitness Center,Hotel,Food & Drink Shop,Park,Department Store,Gym,Breakfast Spot,Bus Line
2,"Forest Hill North & West, Forest Hill Road Park",Trail,Sushi Restaurant,Jewelry Store,Park,Yoga Studio,Dessert Shop,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
3,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
4,"Moore Park, Summerhill East",Trail,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop
5,"North Toronto West, Lawrence Park",Coffee Shop,Clothing Store,Café,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Cosmetics Shop,Restaurant,Salon / Barbershop
6,Roselawn,Garden,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Flower Shop
7,"Summerhill West, Rathnelly, South Hill, Forest...",Coffee Shop,American Restaurant,Restaurant,Fried Chicken Joint,Vietnamese Restaurant,Light Rail Station,Liquor Store,Pub,Pizza Place,Bank
8,"The Annex, North Midtown, Yorkville",Sandwich Place,Café,Coffee Shop,History Museum,Flower Shop,Indian Restaurant,Donut Shop,Liquor Store,Middle Eastern Restaurant,Park


Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.


In [46]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)



In [47]:
central_toronto_merged = Central_Toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
central_toronto_merged = central_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

central_toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Pizza Place,Sandwich Place,Gym / Fitness Center,Hotel,Food & Drink Shop,Park,Department Store,Gym,Breakfast Spot,Bus Line
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,1,Coffee Shop,Clothing Store,Café,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Cosmetics Shop,Restaurant,Salon / Barbershop
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Dessert Shop,Sandwich Place,Café,Sushi Restaurant,Coffee Shop,Pizza Place,Gym,Italian Restaurant,Gas Station,Gourmet Shop
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,3,Trail,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


Finally, let's visualize the resulting clusters


In [48]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(central_toronto_merged['Latitude'], central_toronto_merged['Longitude'], central_toronto_merged['Neighborhood'], central_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<a id='item5'></a>


## Examine Clusters


Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.


#### Cluster 1


In [49]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 0, central_toronto_merged.columns[[2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Lawrence Park,0,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint


#### Cluster 2


In [50]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 1, central_toronto_merged.columns[[2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Davisville North,1,Pizza Place,Sandwich Place,Gym / Fitness Center,Hotel,Food & Drink Shop,Park,Department Store,Gym,Breakfast Spot,Bus Line
2,"North Toronto West, Lawrence Park",1,Coffee Shop,Clothing Store,Café,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Cosmetics Shop,Restaurant,Salon / Barbershop
3,Davisville,1,Dessert Shop,Sandwich Place,Café,Sushi Restaurant,Coffee Shop,Pizza Place,Gym,Italian Restaurant,Gas Station,Gourmet Shop
5,"Summerhill West, Rathnelly, South Hill, Forest...",1,Coffee Shop,American Restaurant,Restaurant,Fried Chicken Joint,Vietnamese Restaurant,Light Rail Station,Liquor Store,Pub,Pizza Place,Bank
8,"The Annex, North Midtown, Yorkville",1,Sandwich Place,Café,Coffee Shop,History Museum,Flower Shop,Indian Restaurant,Donut Shop,Liquor Store,Middle Eastern Restaurant,Park


#### Cluster 3


In [51]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 2, central_toronto_merged.columns[[2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Roselawn,2,Garden,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Flower Shop


#### Cluster 4


In [52]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 3, central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,3,Trail,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop
7,Central Toronto,3,Trail,Sushi Restaurant,Jewelry Store,Park,Yoga Studio,Dessert Shop,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
