# Segmenting and Clustering Neighborhoods in Toronto
## by Dalia Y. Domínguez 

In [128]:
#Import the libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import json # library to handle JSON files
import numpy as np # library to handle data in a vectorized manner
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### 1. Create the Data Frame of the Neighborhoods in Toronto

Use the requests library to download the webpage https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M.
Save the text of the response as a variable named html_data and make the object beautiful_soup.

In [5]:
# url and get ready the information we'll need 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_data  = requests.get(url).text 
beautiful_soup= BeautifulSoup(html_data, 'html.parser')

The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [6]:
# get ready the data frame
torontoPC = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"]) #headers
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood


We're only process the cells that have an assigned borough.
if more than one neighborhood can exist in one postal code area. It will will have in one row the neighborhoods separated with a comma as shown.
If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [7]:
for row in beautiful_soup.table.find_all("tr"):
    for postalCodeInf in row.find_all("td"):
        if ( postalCodeInf.span.text != 'Not assigned'):
            postalCode = postalCodeInf.b.text
            borough = postalCodeInf.span.text.split('(')[0]
            neighborhood = postalCodeInf.span.text.split('(')[1].replace('/',',')[:-1] 
            torontoPC = torontoPC.append({"PostalCode":postalCode, "Borough":borough, "Neighborhood":neighborhood}, ignore_index=True)
            
torontoPC['Borough']=torontoPC ['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

#torontoPC[torontoPC['Neighborhood']=='Not assigned'] #verified if there's a missing neighborhood

Data Frame:

In [8]:
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


Number of rows and columns of the frame:

In [9]:
torontoPC.shape

(103, 3)

### 2. Get the latitude and longitude of each Neighborhood

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

In [None]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

#for every postal code:
for postal_code in torontoPC["PostalCode"]:
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
location = torontoPC.append({"Latitude":latitude, "Longitude":longitude}, ignore_index=True)

#### Important Note: There is a limit on how many times you can call geocoder.google function. It is 2500 times per day. 
Given that this package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, "GeoSpatial Dataset" csv file that has the geographical coordinates of each postal code.

In [10]:
with open('Geospatial_Coordinates.csv', newline='') as csvfile:
    location = pd.read_csv(csvfile)

In [11]:
location

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Order the postal codes of Toronto by Postal Code such as location data frame:

In [12]:
torontoPC=torontoPC.sort_values(by=['PostalCode'])
torontoPC=torontoPC.reset_index(drop=True) #to reset an index in data frame
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov..."
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."


Add the location of each postal code to torontoPC data frame:

In [13]:
torontoPC['Latitude'] = location['Latitude'] 
torontoPC['Longitude'] = location['Longitude']
torontoPC.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


### 3. Explore and cluster the neighborhoods in Toronto

Use geopy library to get the latitude and longitude values of New York City
In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>ny_explorer</em>, as shown below.

In [23]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top

In [27]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(torontoPC['Latitude'], torontoPC['Longitude'], torontoPC['Borough'], torontoPC['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in Scarborough. So let's slice the original dataframe and create a new dataframe of the Scarborough data.

In [54]:
Borough_data = torontoPC[torontoPC['Borough'] == torontoPC['Borough'][0]].reset_index(drop=True)
Borough_data 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


Let's get the geographical coordinates of scarborough.

In [56]:
address = 'Scarborough, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude


As we did with all of New York City, let's visualizat Scarborough the neighborhoods in it.

In [62]:
# create map of Scarborough using latitude and longitude values
map_Borough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Borough_data['Latitude'], Borough_data['Longitude'], Borough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Borough)  
    
map_Borough

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define Foursquare Credentials and Version

In [28]:
CLIENT_ID = 'OWPCBK4JBGUK1OEMAXO1MRZ43S3NNYVD4CYUA4J4BF4BMG0P' # your Foursquare ID
CLIENT_SECRET = 'SNK4WVH44RCCJLP201BLV4EKBW3WPOCIM2Y20XQ5M5WLUTP5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OWPCBK4JBGUK1OEMAXO1MRZ43S3NNYVD4CYUA4J4BF4BMG0P
CLIENT_SECRET:SNK4WVH44RCCJLP201BLV4EKBW3WPOCIM2Y20XQ5M5WLUTP5


#### Let's explore the first neighborhood in our dataframe.


Get the neighborhood's name.

In [64]:
Borough_data.loc[0, 'Neighborhood']

'Malvern , Rouge'

Get the neighborhood's latitude and longitude values.


In [82]:
neighborhood_latitude = Borough_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Borough_data.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_names = Borough_data.loc[0, 'Neighborhood'].split(' , ') # neighborhood names

['Malvern', 'Rouge']

#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.


First, let's create the GET request URL. Name your URL **url**.


In [70]:
# type your answer here

LIMIT = 100 
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OWPCBK4JBGUK1OEMAXO1MRZ43S3NNYVD4CYUA4J4BF4BMG0P&client_secret=SNK4WVH44RCCJLP201BLV4EKBW3WPOCIM2Y20XQ5M5WLUTP5&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

Send the GET request and examine the results

In [72]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60b2eba9ffa72828b3cb1295'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 1,
  'suggestedBounds': {'ne': {'lat': 43.8111863045, 'lng': -79.18812958073042},
   'sw': {'lat': 43.80218629549999, 'lng': -79.2005772192696}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': 'Wendy’s',
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',
    

Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.


In [73]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a _pandas_ dataframe.


In [74]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy’s,Fast Food Restaurant,43.807448,-79.199056


And how many venues were returned by Foursquare?

In [83]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

1 venues were returned by Foursquare.


### 2. Explore Neighborhoods in Scarborough
#### Let's create a function to repeat the same process to all the neighborhoods in Scarborough


In [84]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called _scarborough_venues_.


In [101]:
neighborhood_names = []
for neighborhood in Borough_data['Neighborhood']:
    neighborhood_names=neighborhood_names+neighborhood.split(" , ")
    
scarborough_venues = getNearbyVenues(names=neighborhood_names,
                                   latitudes=Borough_data['Latitude'],
                                   longitudes=Borough_data['Longitude'])

Malvern
Rouge
Rouge Hill
Port Union
Highland Creek
Guildwood
Morningside
West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park
Ionview
East Birchmount Park
Golden Mile
Clairlea
Oakridge


#### Let's check the size of the resulting dataframe


In [104]:
print(scarborough_venues.shape)
scarborough_venues.head()

(90, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Rouge,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Rouge,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,Rouge Hill,43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,Rouge Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


Let's check how many venues were returned for each neighborhood


In [105]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cedarbrae,4,4,4,4,4,4
Clairlea,12,12,12,12,12,12
East Birchmount Park,11,11,11,11,11,11
Golden Mile,3,3,3,3,3,3
Guildwood,2,2,2,2,2,2
Highland Creek,8,8,8,8,8,8
Ionview,4,4,4,4,4,4
Kennedy Park,8,8,8,8,8,8
Malvern,1,1,1,1,1,1
Morningside,5,5,5,5,5,5


#### Let's find out how many unique categories can be curated from all the returned venues


In [107]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 57 uniques categories.


## 3. Analyze each neighborhood

In [109]:
# one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Rental Car Location,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Supermarket,Thai Restaurant,Vietnamese Restaurant
0,Malvern,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rouge,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rouge Hill,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rouge Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [111]:
scarborough_grouped = scarborough_onehot.groupby('Neighborhood').mean().reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Rental Car Location,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Supermarket,Thai Restaurant,Vietnamese Restaurant
0,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,Clairlea,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.083333,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.083333,0.0,0.0
2,East Birchmount Park,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0
3,Golden Mile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Guildwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Highland Creek,0.0,0.0,0.125,0.0,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
6,Ionview,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
7,Kennedy Park,0.125,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.125,0.125,0.0,0.125,0.0,0.0,0.0,0.125
8,Malvern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Morningside,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues


In [112]:
num_top_venues = 5

for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = scarborough_grouped[scarborough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Cedarbrae----
                   venue  freq
0           Skating Rink  0.25
1  General Entertainment  0.25
2                   Café  0.25
3        College Stadium  0.25
4      Accessories Store  0.00


----Clairlea----
                  venue  freq
0  Fast Food Restaurant  0.25
1        Breakfast Spot  0.08
2              Pharmacy  0.08
3           Pizza Place  0.08
4        Cosmetics Shop  0.08


----East Birchmount Park----
                 venue  freq
0          Pizza Place  0.18
1             Pharmacy  0.09
2   Chinese Restaurant  0.09
3          Gas Station  0.09
4  Fried Chicken Joint  0.09


----Golden Mile----
                   venue  freq
0                   Park  0.33
1             Playground  0.33
2           Intersection  0.33
3      Accessories Store  0.00
4  Korean BBQ Restaurant  0.00


----Guildwood----
                   venue  freq
0             Playground   0.5
1            Pizza Place   0.5
2      Accessories Store   0.0
3                   Park   0.0
4  Korean

#### Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [156]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [157]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Cedarbrae,College Stadium,General Entertainment,Skating Rink,Café,Vietnamese Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
1,Clairlea,Fast Food Restaurant,Coffee Shop,Supermarket,Chinese Restaurant,Cosmetics Shop,Sandwich Place,Bank,Breakfast Spot,Pizza Place,Pharmacy
2,East Birchmount Park,Pizza Place,Noodle House,Bank,Pharmacy,Fast Food Restaurant,Chinese Restaurant,Italian Restaurant,Fried Chicken Joint,Thai Restaurant,Gas Station
3,Golden Mile,Intersection,Playground,Park,Vietnamese Restaurant,Coffee Shop,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
4,Guildwood,Playground,Pizza Place,Vietnamese Restaurant,Coffee Shop,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop


## 4. Cluster Neighborhoods


Run _k_-means to cluster the neighborhood into 5 clusters.


In [158]:
# set number of clusters
kclusters = 5

scarborough_grouped_clustering = scarborough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 0, 1, 1, 1, 2, 3])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [159]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarborough_merged = Borough_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
scarboroughn_merged = scarborough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


Finally, let's visualize the resulting clusters

In [154]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighborhood'], neighborhoods_venues_sorted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

In [168]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Cluster Labels'] == 0]

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,0,Guildwood,Playground,Pizza Place,Vietnamese Restaurant,Coffee Shop,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop


#### Cluster 2

In [169]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Cluster Labels'] == 1]

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,Cedarbrae,College Stadium,General Entertainment,Skating Rink,Café,Vietnamese Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
1,1,Clairlea,Fast Food Restaurant,Coffee Shop,Supermarket,Chinese Restaurant,Cosmetics Shop,Sandwich Place,Bank,Breakfast Spot,Pizza Place,Pharmacy
2,1,East Birchmount Park,Pizza Place,Noodle House,Bank,Pharmacy,Fast Food Restaurant,Chinese Restaurant,Italian Restaurant,Fried Chicken Joint,Thai Restaurant,Gas Station
3,1,Golden Mile,Intersection,Playground,Park,Vietnamese Restaurant,Coffee Shop,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
5,1,Highland Creek,Thai Restaurant,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Cosmetics Shop,Hobby Shop
6,1,Ionview,Skating Rink,Latin American Restaurant,Breakfast Spot,Lounge,Vietnamese Restaurant,College Stadium,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant
7,1,Kennedy Park,Vietnamese Restaurant,Shopping Mall,Auto Garage,Bakery,Middle Eastern Restaurant,Sandwich Place,Accessories Store,Smoke Shop,Skating Rink,Caribbean Restaurant
11,1,Rouge,Bar,Moving Target,Vietnamese Restaurant,College Stadium,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
12,1,Rouge Hill,Restaurant,Electronics Store,Medical Center,Donut Shop,Breakfast Spot,Rental Car Location,Mexican Restaurant,Bank,Intersection,Convenience Store
13,1,Scarborough Village,Indian Restaurant,Pet Store,Chinese Restaurant,Light Rail Station,Vietnamese Restaurant,Skating Rink,Shopping Mall,Gas Station,Fried Chicken Joint,Fast Food Restaurant


#### Cluster 3

In [170]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Cluster Labels'] == 2]

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,2,Malvern,Fast Food Restaurant,Vietnamese Restaurant,Ice Cream Shop,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Electronics Store,Donut Shop,Discount Store


#### Cluster 4

In [171]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Cluster Labels'] == 3]

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,3,Morningside,Coffee Shop,Convenience Store,Discount Store,Department Store,Hobby Shop,Bakery,Cosmetics Shop,Hakka Restaurant,General Entertainment,Gas Station
10,3,Port Union,Coffee Shop,Korean BBQ Restaurant,College Stadium,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop


#### Cluster 5

In [172]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Cluster Labels'] == 4]

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,4,Woburn,American Restaurant,Motel,Vietnamese Restaurant,College Stadium,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
