# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto Submit (Part III)

#### I. Codes from Part II

In [2]:
import numpy as np 
import pandas as pd
import requests 
from bs4 import BeautifulSoup 

url = "https://www.wikizeroo.org/index.php?q=aHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGlzdF9vZl9wb3N0YWxfY29kZXNfb2ZfQ2FuYWRhOl9N"
r = requests.get(url) 
  
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 

postalcodes = [];
boroughs= [];
neighborhoods = [];
columnNum = 1;
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            passVal = False
            if columnNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    postalcodes.append(cell.string);   
                    columnNum = 2
                else:
                    continue
            elif columnNum == 2 :
                if cell.string == 'Not assigned':
                    passVal = True
                    del postalcodes[-1]
                    columnNum = 1
                    continue
                else:
                    boroughs.append(cell.string);      
                    columnNum = 3
            elif columnNum == 3 :
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string); 
                columnNum = 1
import geocoder

colnames = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighbors = pd.DataFrame(columns=colnames)
coordinates = None


for data in range(0, len(postalcodes)-1):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postalcodes[data]))
    coordinates = g.latlng

    neighbors = neighbors.append({ 'PostalCode': postalcodes[data],
                                   'Borough': boroughs[data],
                                   'Neighborhood': neighborhoods[data],
                                   'Latitude': coordinates[0],
                                   'Longitude': coordinates[1]}, ignore_index=True)

neighbors.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75242,-79.329242
1,M4A,North York,Victoria Village,43.7306,-79.313265
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166
3,M6A,North York,Lawrence Heights,43.72327,-79.451286
4,M6A,North York,Lawrence Manor,43.72327,-79.451286
5,M7A,Queen's Park,Queen's Park,43.66115,-79.391715
6,M9A,Queen's Park,Queen's Park,43.662299,-79.528195
7,M1B,Scarborough,Rouge,43.811525,-79.195517
8,M1B,Scarborough,Malvern,43.811525,-79.195517
9,M3B,North York,Don Mills North,43.749055,-79.362227


In [3]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

#### II. Data Wrangling

In [4]:
version = '20180605'

neighborhood_name = neighbors.loc[0, 'Neighborhood']
neighborhood_latitude = neighbors.loc[0, 'Latitude'] 
neighborhood_longitude = neighbors.loc[0, 'Longitude']

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

radius = 500
LIMIT = 100 

url = 'https://api.foursquare.com/v2/venues/explore?&client_id=VS0KQ3WUCVWPQX5YRKH5VYTCGT04NWVZEGYUKOQX2TTJGWHA&client_secret=EANHPKTYB2FH4I3WRIHXMPZLHP4I0IR4KTS1BHYANYFQDA3J&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

results = requests.get(url).json()
results

Latitude and longitude values of Parkwoods are 43.75242000000003, -79.32924245299995.


{'meta': {'code': 200, 'requestId': '5dd94d261e152c001bfab144'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [5]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [6]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [7]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


In [8]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
toronto_venues = getNearbyVenues(names=neighborhoods,
                                 latitudes=neighbors['Latitude'],
                                 longitudes=neighbors['Longitude'])

Parkwoods
Victoria Village
Harbourfront
Lawrence Heights
Lawrence Manor
Queen's Park
Queen's Park
Rouge
Malvern
Don Mills North

Woodbine Gardens
Parkview Hill
Ryerson

Garden District

Glencairn

Cloverdale

Islington
Martin Grove

Princess Gardens
West Deane Park
Highland Creek
Rouge Hill
Port Union
Flemingdon Park
Don Mills South

Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens

Eringate

Markland Wood
Old Burnhamthorpe

Guildwood

Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks

Woburn
Leaside
Central Bay Street

Christie

Cedarbrae

Hillcrest Village
Bathurst Manor
Downsview North

Wilson Heights
Thorncliffe Park
Adelaide

King

Richmond

Dovercourt Village
Dufferin

Scarborough Village
Fairview

Henry Farm
Oriole

Northwood Park
York University
East Toronto
Harbourfront East

Toronto Islands
Union Station
Little Portugal
Trinity
East Birchmount Park

Ionview
Kennedy Park
Bayview Village
CFB Toronto
Downsview East

The Danforth West

Riverdal

In [10]:
print(toronto_venues.shape)
toronto_venues.head()

(836, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.75242,-79.329242,Glen Manor Ravine,43.676821,-79.293942,Trail
1,Parkwoods,43.75242,-79.329242,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,Parkwoods,43.75242,-79.329242,Grover Pub and Grub,43.679181,-79.297215,Pub
3,Parkwoods,43.75242,-79.329242,Upper Beaches,43.680563,-79.292869,Neighborhood
4,Victoria Village,43.7306,-79.313265,Glen Manor Ravine,43.676821,-79.293942,Trail


In [11]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,4,4,4,4,4,4
Agincourt,4,4,4,4,4,4
Agincourt North,4,4,4,4,4,4
Albion Gardens,4,4,4,4,4,4
Alderwood,4,4,4,4,4,4
Bathurst Manor,4,4,4,4,4,4
Bathurst Quay,4,4,4,4,4,4
Bayview Village,4,4,4,4,4,4
Beaumond Heights,4,4,4,4,4,4
Bedford Park,4,4,4,4,4,4


In [12]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 4 uniques categories.


In [13]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Trail,Health Food Store,Neighborhood,Pub
0,1,0,Parkwoods,0
1,0,1,Parkwoods,0
2,0,0,Parkwoods,1
3,0,0,Parkwoods,0
4,1,0,Victoria Village,0


In [14]:
toronto_onehot.shape

(836, 4)

In [15]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Trail,Health Food Store,Pub
0,Adelaide,0.25,0.25,0.25
1,Agincourt,0.25,0.25,0.25
2,Agincourt North,0.25,0.25,0.25
3,Albion Gardens,0.25,0.25,0.25
4,Alderwood,0.25,0.25,0.25
5,Bathurst Manor,0.25,0.25,0.25
6,Bathurst Quay,0.25,0.25,0.25
7,Bayview Village,0.25,0.25,0.25
8,Beaumond Heights,0.25,0.25,0.25
9,Bedford Park,0.25,0.25,0.25


In [19]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Agincourt----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Agincourt North----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Albion Gardens
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Alderwood----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Bathurst Manor----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Bathurst Quay
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Bayview Village----
               venue  freq
0              Trail  0.25
1  H

               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----High Park----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Highland Creek----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Hillcrest Village----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Humber Bay----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Humber Bay Shores
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Humber Summit----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Humbergate
----
               venue  freq
0              Trail  0.25
1  Health Food Store 

               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Richmond
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Richview Gardens
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Riverdale----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Roncesvalles----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Rosedale----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Roselawn
----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Rouge----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                

               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Willowdale West----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Wilson Heights----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Woburn----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Woodbine Gardens----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----Woodbine Heights----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----York Mills----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2                Pub  0.25


----York Mills West
----
               venue  freq
0              Trail  0.25
1  Health Food Store

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Adelaide,Pub,Health Food Store,Trail
1,Agincourt,Pub,Health Food Store,Trail
2,Agincourt North,Pub,Health Food Store,Trail
3,Albion Gardens,Pub,Health Food Store,Trail
4,Alderwood,Pub,Health Food Store,Trail


#### III. Clustering

In [22]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [23]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = neighbors
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,43.75242,-79.329242,0,Pub,Health Food Store,Trail
1,M4A,North York,Victoria Village,43.7306,-79.313265,0,Pub,Health Food Store,Trail
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166,0,Pub,Health Food Store,Trail
3,M6A,North York,Lawrence Heights,43.72327,-79.451286,0,Pub,Health Food Store,Trail
4,M6A,North York,Lawrence Manor,43.72327,-79.451286,0,Pub,Health Food Store,Trail
5,M7A,Queen's Park,Queen's Park,43.66115,-79.391715,0,Pub,Health Food Store,Trail
6,M9A,Queen's Park,Queen's Park,43.662299,-79.528195,0,Pub,Health Food Store,Trail
7,M1B,Scarborough,Rouge,43.811525,-79.195517,0,Pub,Health Food Store,Trail
8,M1B,Scarborough,Malvern,43.811525,-79.195517,0,Pub,Health Food Store,Trail
9,M3B,North York,Don Mills North,43.749055,-79.362227,0,Pub,Health Food Store,Trail


In [24]:
map_clusters = folium.Map(location=[43.67635739999999, -79.2930312], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,North York,0,Pub,Health Food Store,Trail
1,North York,0,Pub,Health Food Store,Trail
2,Downtown Toronto,0,Pub,Health Food Store,Trail
3,North York,0,Pub,Health Food Store,Trail
4,North York,0,Pub,Health Food Store,Trail
5,Queen's Park,0,Pub,Health Food Store,Trail
6,Queen's Park,0,Pub,Health Food Store,Trail
7,Scarborough,0,Pub,Health Food Store,Trail
8,Scarborough,0,Pub,Health Food Store,Trail
9,North York,0,Pub,Health Food Store,Trail
