# Segmenting and Clustering Neighborhoods in Toronto

## Task 3 - Clustering

### Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [8]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Read Data from csv from Second Part

In [11]:
df = pd.read_csv('DATA\data_ll.csv')
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.808626,-79.189913
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.785779,-79.157368
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.765806,-79.185284
3,M1G,Scarborough,Woburn,43.771545,-79.218135
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813


In [12]:
df.shape

(103, 5)

### Use geopy library to get the latitude and longitude values of New York City.
#### In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent ny_explorer, as shown below.

In [16]:
address = 'Toronto'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


### Create a map of Toronto with neighborhoods superimposed on top.

In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### We cluster the neighborhoods in Scarborough. So let's slice the original dataframe and create a new dataframe of the York data.

In [45]:
scarborough_data = df[df['Borough'] == 'Scarborough'].reset_index(drop=True)

In [46]:
scarborough_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.808626,-79.189913
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.785779,-79.157368
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.765806,-79.185284
3,M1G,Scarborough,Woburn,43.771545,-79.218135
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813


### Let's get the geographical coordinates of York.

In [58]:
address1 = 'Scarborough,Toronto'
geolocator1 = Nominatim()
location1 = geolocator1.geocode(address1)
latitude1 = location1.latitude
longitude1 = location1.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude1, longitude1))

  


The geograpical coordinate of Scarborough are 43.773077, -79.257774.


In [59]:
map_scarb = folium.Map(location=[latitude1, longitude1], zoom_start=11)

# add markers to map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_scarb)  
    
map_scarb

### Define Foursquare Credentials and Version

In [60]:
CLIENT_ID = '5OTH1M5X5YAZN5YYDYZBMJGUX5DSATGYJ1URNEJJ2A0LME5T'
CLIENT_SECRET = 'FZJRUN2OSGYERV0WOXXNFSITR4HQ1PVDZ0VGAIGBKNHTQNW4' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5OTH1M5X5YAZN5YYDYZBMJGUX5DSATGYJ1URNEJJ2A0LME5T
CLIENT_SECRET:FZJRUN2OSGYERV0WOXXNFSITR4HQ1PVDZ0VGAIGBKNHTQNW4


### Let's explore the first neighborhood in our dataframe.
#### Get the neighborhood's name.

In [53]:
scarborough_data.loc[0, 'Neighborhood']

'Malvern / Rouge'

#### Get the neighborhood's latitude and longitude values.

In [61]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern / Rouge are 43.80862623100006, -79.18991284599997.


### Now, let's get the top 100 venues that are in Malvern/Rouge within a radius of 500 meters.
#### First, let's create the GET request URL. Name your URL url.

In [62]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude1, 
    longitude1, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=5OTH1M5X5YAZN5YYDYZBMJGUX5DSATGYJ1URNEJJ2A0LME5T&client_secret=FZJRUN2OSGYERV0WOXXNFSITR4HQ1PVDZ0VGAIGBKNHTQNW4&v=20180605&ll=43.773077,-79.257774&radius=500&limit=100'

### Extract the category of the venue

In [65]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Now we are ready to clean the json and structure it into a pandas dataframe.

In [66]:
results = requests.get(url).json()
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(12)

Unnamed: 0,name,categories,lat,lng
0,Disney Store,Toy / Game Store,43.775537,-79.256833
1,SEPHORA,Cosmetics Shop,43.775017,-79.258109
2,St. Andrews Fish & Chips,Fish & Chips Shop,43.771865,-79.252645
3,DAVIDsTEA,Tea Room,43.77632,-79.258688
4,American Eagle Outfitters,Clothing Store,43.776012,-79.258334
5,Hot Topic,Clothing Store,43.77545,-79.257929
6,Tommy Hilfiger,Clothing Store,43.776015,-79.257369
7,St. Louis Bar & Grill,Bar,43.774157,-79.253808
8,Chipotle Mexican Grill,Mexican Restaurant,43.77641,-79.258069
9,Shoppers Drug Mart,Pharmacy,43.773305,-79.251662


### Number of seats returned by foursquare

In [67]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

45 venues were returned by Foursquare.


# Explore Neighborhoods in Scarborough 
### Let's create a function to repeat the same process to all the neighborhoods in Scarbourogh¶

In [68]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### We execute the above function in each neighborhood and create a new data frame called scarborough_venues.

In [69]:
scarborough_venues = getNearbyVenues(names=scarborough_data['Neighborhood'],
                                   latitudes=scarborough_data['Latitude'],
                                   longitudes=scarborough_data['Longitude']
                                  )

Malvern / Rouge
Rouge Hill / Port Union / Highland Creek
Guildwood / Morningside / West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park / Ionview / East Birchmount Park
Golden Mile / Clairlea / Oakridge
Cliffside / Cliffcrest / Scarborough Village West
Birch Cliff / Cliffside West
Dorset Park / Wexford Heights / Scarborough Town Centre
Wexford / Maryvale
Agincourt
Clarks Corners / Tam O'Shanter / Sullivan
Milliken / Agincourt North / Steeles East / L'Amoreaux East
Steeles West / L'Amoreaux West
Upper Rouge


### Let's check the size of the resulting dataframe

In [71]:
print(scarborough_venues.shape)
scarborough_venues.head()

(88, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern / Rouge,43.808626,-79.189913,Upper Rouge Trail,43.809988,-79.186147,Trail
1,Rouge Hill / Port Union / Highland Creek,43.785779,-79.157368,Fox and Fiddle,43.789082,-79.154459,Bar
2,Guildwood / Morningside / West Hill,43.765806,-79.185284,Chick-N-Joy,43.768752,-79.187982,Fried Chicken Joint
3,Guildwood / Morningside / West Hill,43.765806,-79.185284,Little Caesars Pizza,43.769046,-79.184386,Pizza Place
4,Guildwood / Morningside / West Hill,43.765806,-79.185284,Swiss Chalet,43.768122,-79.190493,Restaurant


### Let's check how many venues were returned for each neighborhood


In [72]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
Birch Cliff / Cliffside West,4,4,4,4,4,4
Cedarbrae,2,2,2,2,2,2
Clarks Corners / Tam O'Shanter / Sullivan,6,6,6,6,6,6
Cliffside / Cliffcrest / Scarborough Village West,6,6,6,6,6,6
Dorset Park / Wexford Heights / Scarborough Town Centre,4,4,4,4,4,4
Golden Mile / Clairlea / Oakridge,9,9,9,9,9,9
Guildwood / Morningside / West Hill,20,20,20,20,20,20
Kennedy Park / Ionview / East Birchmount Park,4,4,4,4,4,4
Malvern / Rouge,1,1,1,1,1,1


### Let's find out how many unique categories can be curated from all the returned venues

In [73]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 50 uniques categories.


# Analyze Each Neighborhood

In [77]:
# one hot encoding
scarb_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarb_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarb_onehot.columns[-1]] + list(scarb_onehot.columns[:-1])
scarb_onehot = scarb_onehot[fixed_columns]

scarb_onehot.head()

Unnamed: 0,Neighborhood,Auto Garage,Badminton Court,Bakery,Bank,Bar,Beer Store,Big Box Store,Bistro,Breakfast Spot,Brewery,Bubble Tea Shop,Bus Line,Bus Station,Chinese Restaurant,Coffee Shop,College Stadium,Construction & Landscaping,Department Store,Discount Store,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Gym,Hobby Shop,Indian Restaurant,Intersection,Korean Restaurant,Liquor Store,Metro Station,Mexican Restaurant,Other Great Outdoors,Pharmacy,Pizza Place,Rental Service,Restaurant,Salon / Barbershop,Sandwich Place,Shopping Mall,Skating Rink,Soccer Field,Sports Bar,Supermarket,Sushi Restaurant,Thrift / Vintage Store,Trail,Wine Shop
0,Malvern / Rouge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Rouge Hill / Port Union / Highland Creek,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [78]:
scarb_onehot.shape

(88, 51)

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [79]:
scarb_grouped = scarb_onehot.groupby('Neighborhood').mean().reset_index()
scarb_grouped.shape

(16, 51)

### Getting top 10 venues per neighborhood

In [80]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [82]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarb_grouped['Neighborhood']

for ind in np.arange(scarb_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarb_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Badminton Court,Sushi Restaurant,Supermarket,Skating Rink,Breakfast Spot,Wine Shop,Bus Station,Fast Food Restaurant,Electronics Store,Discount Store
1,Birch Cliff / Cliffside West,Pizza Place,Skating Rink,College Stadium,General Entertainment,Bank,Bar,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
2,Cedarbrae,Construction & Landscaping,Trail,Wine Shop,Bus Station,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
3,Clarks Corners / Tam O'Shanter / Sullivan,Pizza Place,Shopping Mall,Intersection,Coffee Shop,Pharmacy,Golf Course,Beer Store,Chinese Restaurant,Fast Food Restaurant,Electronics Store
4,Cliffside / Cliffcrest / Scarborough Village West,Pharmacy,Discount Store,Bistro,Sandwich Place,Liquor Store,Coffee Shop,Wine Shop,Fast Food Restaurant,Electronics Store,Department Store


# Cluster Neighborhoods
### Run k-means to cluster the neighborhood into 5 clusters.


In [91]:
scarb_data = scarborough_data.drop(16)

# set number of clusters
kclusters = 5

scarb_grouped_clustering = scarb_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarb_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 2])

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarb_merged = scarb_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
scarb_merged = scarb_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [94]:
scarb_merged.head() # check the last columns!

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,Malvern / Rouge,43.808626,-79.189913,0
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.785779,-79.157368,0
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.765806,-79.185284,2
3,M1G,Scarborough,Woburn,43.771545,-79.218135,0
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813,0


# Visualize the clusters

In [95]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarb_merged['Latitude'], scarb_merged['Longitude'], scarb_merged['Neighborhood'], scarb_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examine Clusters

#### Cluster 1

In [96]:
scarb_merged.loc[scarb_merged['Cluster Labels'] == 0, scarb_merged.columns[[1] + list(range(5, scarb_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels
0,Scarborough,0
1,Scarborough,0
3,Scarborough,0
4,Scarborough,0
5,Scarborough,0
6,Scarborough,0
7,Scarborough,0
8,Scarborough,0
12,Scarborough,0
13,Scarborough,0


#### Cluster 2

In [97]:
scarb_merged.loc[scarb_merged['Cluster Labels'] == 1, scarb_merged.columns[[1] + list(range(5, scarb_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels
11,Scarborough,1


#### Cluster 3

In [98]:
scarb_merged.loc[scarb_merged['Cluster Labels'] == 2, scarb_merged.columns[[1] + list(range(5, scarb_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels
2,Scarborough,2
9,Scarborough,2


#### Cluster4

In [99]:
scarb_merged.loc[scarb_merged['Cluster Labels'] == 3, scarb_merged.columns[[1] + list(range(5, scarb_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels
14,Scarborough,3


#### Cluster 5 

In [100]:
scarb_merged.loc[scarb_merged['Cluster Labels'] == 4, scarb_merged.columns[[1] + list(range(5, scarb_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels
10,Scarborough,4
