#  Segmenting and Clustering Neighborhoods in Toronto Project

In [1]:
# Install the geocoding client used to look up Toronto's coordinates.
# Note: geopy is installed without a version pin, unlike folium below.
!conda install -c conda-forge geopy --yes

# Install the mapping library, pinned to 0.5.0.
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [2]:
import csv
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

## Part 1: Construct Toronto Neighborhood Dataframe

### 1A - Scrape Website for Data

In [3]:
# Download the Wikipedia page listing the postal codes that start with "M".
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook early instead of parsing an HTTP error page as if it were data.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
source = wiki_response.text

In [4]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(source, 'html.parser')

In [6]:
# Collect the three columns of the first table on the page into parallel lists.
postalCodeList = []
boroughList = []
neighborhoodList = []

postal_table = soup.find('table')
if postal_table is None:
    raise ValueError('No <table> element found in the downloaded page.')

for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # Header rows contain only <th> cells; rows with fewer than three <td>
    # cells cannot be unpacked into (postal code, borough, neighborhood).
    if len(cells) < 3:
        continue
    # Strip every cell, not just the last one: a stray newline on the borough
    # would make later comparisons such as `Borough != 'Not assigned'` fail.
    postal_code, borough, neighborhood = (cell.text.strip() for cell in cells[:3])
    postalCodeList.append(postal_code)
    boroughList.append(borough)
    neighborhoodList.append(neighborhood)

### 1B - Construct Initial pandas DataFrame of Toronto Neighborhood Data

In [7]:
# Pair each column label with the list scraped for it, then build the frame.
toronto_neighorhood = list(zip(
    ('PostalCode', 'Borough', 'Neighborhood'),
    (postalCodeList, boroughList, neighborhoodList),
))
toronto_df = pd.DataFrame(dict(toronto_neighorhood))
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 1C - Clean Toronto pandas DataFrame

In [23]:
# Keep only the postal codes that have a borough assigned to them.
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df_dropna = toronto_df.loc[has_borough].reset_index(drop=True)

In [24]:
# One row per (postal code, borough); the neighborhoods sharing a postal code
# are collapsed into a single comma-separated string.
postal_code_groups = toronto_df_dropna.groupby(['PostalCode', 'Borough'], as_index=False)
toronto_df_grouped = postal_code_groups.agg(','.join)

In [25]:
# Where no neighborhood is assigned, fall back to the borough name.
na_neigh_rows = toronto_df_grouped['Neighborhood'].eq('Not assigned')
toronto_df_grouped['Neighborhood'] = toronto_df_grouped['Neighborhood'].mask(
    na_neigh_rows, toronto_df_grouped['Borough']
)

In [27]:
# `toronto_df_cleaned` is another name for the grouped frame (no copy is made).
toronto_df_cleaned = toronto_df_grouped
# A bare `.shape` expression in the middle of a cell is silently discarded,
# so print it explicitly, as the other cells in this notebook do.
print(toronto_df_cleaned.shape)
toronto_df_cleaned.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Part 2: Adding Coordinates to the Toronto DataFrame

### 2A - Obtain Neighborhood Coordinates

In [28]:
# Download the postal-code coordinate table quietly (-q) to a local CSV file,
# then load that file into a DataFrame.
!wget -q -O "toronto_coordinates.csv" http://cocl.us/Geospatial_data
coors = pd.read_csv('toronto_coordinates.csv')

In [13]:
# Show the size of the coordinate table and preview its first rows.
print(coors.shape)
coors.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 2B - Adding Coordinates to Neighborhood Dataframe

In [14]:
# Index both frames by postal code (the key column is spelled differently in
# each) and align them side by side, keeping only codes present in both.
toronto_df_temp = toronto_df_cleaned.set_index('PostalCode')
coors_temp = coors.set_index('Postal Code')
frames_by_postal_code = [toronto_df_temp, coors_temp]
toronto_df_coors = pd.concat(frames_by_postal_code, join='inner', axis=1)

In [15]:
# Turn the postal-code index back into a regular 'PostalCode' column.
# The guard makes the cell safe to re-run: once the column exists, resetting
# the index a second time would raise because 'PostalCode' is already present.
if 'PostalCode' not in toronto_df_coors.columns:
    toronto_df_coors = toronto_df_coors.rename_axis('PostalCode').reset_index()

In [16]:
# Show the size of the combined table and preview its first rows.
print(toronto_df_coors.shape)
toronto_df_coors.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Explore and Cluster Toronto Neighborhoods

### 3A - Obtain Toronto Coordinates

In [17]:
# Look up the coordinates used to centre every map in this notebook.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


### 3B - Create Initial Map of Toronto Neighborhoods

In [18]:
# Map centred on Toronto with one circle marker per postal code.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, long, post, borough, neigh in toronto_df_coors[marker_columns].itertuples(index=False, name=None):
    label = "{} ({}): {}".format(borough, post, neigh)
    # parse_html=True escapes the label so names containing markup-like
    # characters are rendered as text inside the popup.
    popup = folium.Popup(label, parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is no
    # longer passed to the marker.
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### 3C - Narrow to Boroughs with Toronto in Name

In [30]:
# Restrict the analysis to the four boroughs whose name contains "Toronto".
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_selected_borough = toronto_df_coors['Borough'].isin(toronto_boroughs)
toronto_central_df = toronto_df_coors.loc[in_selected_borough].reset_index(drop=True)
print(toronto_central_df.shape)

(39, 5)


In [20]:
# Redraw the map, slightly zoomed in, with only the selected boroughs.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

central_marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, long, post, borough, neigh in toronto_central_df[central_marker_columns].itertuples(index=False, name=None):
    label = "{} ({}): {}".format(borough, post, neigh)
    # parse_html=True escapes the label so it is shown as plain text.
    popup = folium.Popup(label, parse_html=True)
    # `parse_html` belongs to Popup only; it is not a CircleMarker option and
    # has been dropped from the marker call.
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### 3C - Use FourSquare API to Explore Selected Boroughs

In [32]:
# Foursquare credentials are read from the environment so that they never end
# up in the notebook source or in its saved outputs. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
_missing_credentials = [
    name for name in ('FOURSQUARE_CLIENT_ID', 'FOURSQUARE_CLIENT_SECRET')
    if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        'Missing environment variable(s): {}'.format(', '.join(_missing_credentials))
    )

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Confirm that credentials were found without echoing their values.
print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [34]:
# Search parameters for the Foursquare "explore" endpoint.
radius = 500
LIMIT = 100
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'

# One tuple per venue: the postal-code row it was found for, then venue details.
venues = []

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'], toronto_central_df['PostalCode'], toronto_central_df['Borough'], toronto_central_df['Neighborhood']):
    # Let requests build and encode the query string instead of formatting
    # the credentials into the URL by hand.
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get(EXPLORE_URL, params=query, timeout=30)
    # Stop on HTTP errors (bad credentials, quota exceeded, ...) rather than
    # failing later with a KeyError on the JSON payload.
    response.raise_for_status()

    groups = response.json()['response'].get('groups') or []
    results = groups[0]['items'] if groups else []

    for item in results:
        venue = item['venue']
        # Some venues may carry no category; record None rather than raising
        # IndexError so the venue still counts towards its neighborhood.
        categories = venue.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            category_name))

In [35]:
# Build the venue table. Passing the column names to the constructor (rather
# than assigning `.columns` afterwards) also works when `venues` is empty,
# where assigning nine names to a zero-column frame would raise.
venue_columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()

(1726, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


### 3D - Count Venues by Neighborhood

In [36]:
# Number of venue names recorded for each (postal code, borough, neighborhood).
venues_df.groupby(['PostalCode', 'Borough', 'Neighborhood'])['VenueName'].count()

PostalCode  Borough           Neighborhood                                                                                        
M4E         East Toronto      The Beaches                                                                                               4
M4K         East Toronto      The Danforth West,Riverdale                                                                              42
M4L         East Toronto      The Beaches West,India Bazaar                                                                            21
M4M         East Toronto      Studio District                                                                                          42
M4N         Central Toronto   Lawrence Park                                                                                             3
M4P         Central Toronto   Davisville North                                                                                          7
M4R         Central Toronto   North Toron

In [37]:
# How many distinct venue categories were returned.
distinct_categories = venues_df['VenueCategory'].unique()
len(distinct_categories)

240

### 3E -  Convert Categorical Variables for Cluster Analysis

In [39]:
# One indicator column per venue category, named after the category itself.
toronto_central_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Append the identifying columns. The neighborhood column is stored under the
# plural name 'Neighborhoods'.
for target_column, source_column in (
    ('PostalCode', 'PostalCode'),
    ('Borough', 'Borough'),
    ('Neighborhoods', 'Neighborhood'),
):
    toronto_central_onehot[target_column] = venues_df[source_column]

# Rotate the three trailing identifier columns to the front of the frame.
all_columns = list(toronto_central_onehot.columns)
fixed_columns = all_columns[-3:] + all_columns[:-3]
toronto_central_onehot = toronto_central_onehot[fixed_columns]
print(toronto_central_onehot.shape)

(1726, 243)


### 3F - Group Venues

In [40]:
# Averaging the indicator columns per postal code gives, for every category,
# the share of that postal code's venues falling in the category.
venues_by_area = toronto_central_onehot.groupby(['PostalCode', 'Borough', 'Neighborhoods'])
toronto_central_venues_freq = venues_by_area.mean().reset_index()
print(toronto_central_venues_freq.shape)

(39, 243)


### 3G - Identify top ten venues

In [41]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return `rank` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, ...).

    11, 12 and 13 (and 111, 112, ...) take 'th'; otherwise the last digit
    selects 'st', 'nd' or 'rd' from `indicators`, and anything else is 'th'.
    """
    last_digit = rank % 10
    if rank % 100 in (11, 12, 13) or not 1 <= last_digit <= len(indicators):
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[last_digit - 1])


# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]
columns = areaColumns + freqColumns

# create a new dataframe holding the identifying columns of every postal code
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
for area_column in areaColumns:
    neighborhoods_venues_sorted[area_column] = toronto_central_venues_freq[area_column]

# For each postal code, rank the category frequencies (everything after the
# three identifying columns) and keep the names of the most frequent ones.
for ind in range(toronto_central_venues_freq.shape[0]):
    row_categories = toronto_central_venues_freq.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# Order rows by their top venues; rebinding instead of sorting in place keeps
# the cell free of hidden in-place mutation.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.sort_values(freqColumns)
neighborhoods_venues_sorted

Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Terminal,Airport,Bar,Coffee Shop,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry,Plane
31,M6H,West Toronto,"Dovercourt Village,Dufferin",Bakery,Supermarket,Pharmacy,Music Venue,Brazilian Restaurant,Café,Middle Eastern Restaurant,Recording Studio,Bar,Bank
32,M6J,West Toronto,"Little Portugal,Trinity",Bar,Asian Restaurant,Restaurant,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Men's Store,Pizza Place,Coffee Shop,Italian Restaurant
3,M4M,East Toronto,Studio District,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Gastropub,Brewery,Yoga Studio,Sandwich Place,Fish Market
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",Café,Nightclub,Coffee Shop,Breakfast Spot,Gym,Bakery,Performing Arts Venue,Pet Store,Grocery Store,Climbing Gym
25,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Restaurant,Sandwich Place,Bakery,Bookstore,Japanese Restaurant,Italian Restaurant,Theater,Bar,Chinese Restaurant
24,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",Café,Sandwich Place,Coffee Shop,Pizza Place,Burger Joint,Middle Eastern Restaurant,Cheese Shop,Indian Restaurant,Pub,BBQ Joint
26,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant,Dumpling Restaurant,Coffee Shop,Bar,Bakery,Mexican Restaurant,Cocktail Bar
19,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",Coffee Shop,Aquarium,Italian Restaurant,Café,Hotel,Scenic Lookout,Restaurant,Brewery,Fried Chicken Joint,Pizza Place
11,M4X,Downtown Toronto,"Cabbagetown,St. James Town",Coffee Shop,Bakery,Restaurant,Pizza Place,Pub,Café,Italian Restaurant,Convenience Store,Gastropub,Pharmacy


### 3H - Create Venue Clusters

In [54]:
kclusters = 3

# Cluster on the category frequencies only; `columns=` replaces the positional
# axis argument, which recent pandas versions no longer accept.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_central_venues_freq_clustering)

# kmeans.labels_ follows the row order of `toronto_central_venues_freq`, so
# pair each label with that frame's postal code and attach it by key rather
# than by position in `toronto_central_df`.
cluster_by_postal_code = pd.DataFrame({
    'PostalCode': toronto_central_venues_freq['PostalCode'].values,
    'Cluster': kmeans.labels_,
})
top_venues_by_postal_code = (
    neighborhoods_venues_sorted
    .drop(columns=['Borough', 'Neighborhoods'])
    .set_index('PostalCode')
)

# Build a new frame instead of aliasing (and mutating) `toronto_central_df`.
# The inner merge drops postal codes for which no venues, and therefore no
# cluster label, exist.
toronto_central_clustered_df = (
    toronto_central_df
    .drop(columns='Cluster', errors='ignore')  # tolerate a frame mutated by an older run
    .merge(cluster_by_postal_code, on='PostalCode', how='inner')
    .join(top_venues_by_postal_code, on='PostalCode')
    .sort_values(['Cluster'] + freqColumns)
)
toronto_central_clustered_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,0,Jewelry Store,Trail,Park,Sushi Restaurant,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Swim School,Bus Line,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442,1,Airport Service,Airport Terminal,Airport,Bar,Coffee Shop,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry,Plane
31,M6H,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259,1,Bakery,Supermarket,Pharmacy,Music Venue,Brazilian Restaurant,Café,Middle Eastern Restaurant,Recording Studio,Bar,Bank
32,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,1,Bar,Asian Restaurant,Restaurant,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Men's Store,Pizza Place,Coffee Shop,Italian Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Gastropub,Brewery,Yoga Studio,Sandwich Place,Fish Market
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191,1,Café,Nightclub,Coffee Shop,Breakfast Spot,Gym,Bakery,Performing Arts Venue,Pet Store,Grocery Store,Climbing Gym
25,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049,1,Café,Restaurant,Sandwich Place,Bakery,Bookstore,Japanese Restaurant,Italian Restaurant,Theater,Bar,Chinese Restaurant
24,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,1,Café,Sandwich Place,Coffee Shop,Pizza Place,Burger Joint,Middle Eastern Restaurant,Cheese Shop,Indian Restaurant,Pub,BBQ Joint


### 3I - Plot Clusters on Map of Toronto

In [55]:
# Empty map centred on the geocoded Toronto coordinates; cluster markers are
# added to it in a later cell.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

In [56]:
# Build one hex colour per cluster by sampling the rainbow colormap evenly.
# `ys` only contributes its length (one entry per cluster) to the palette.
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
palette_positions = np.linspace(0, 1, len(ys))
colors_array = cm.rainbow(palette_positions)
rainbow = list(map(colors.rgb2hex, colors_array))

In [57]:
# Add one marker per postal code, coloured by its cluster.
cluster_marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood', 'Cluster']
for lat, lon, post, bor, poi, cluster in toronto_central_clustered_df[cluster_marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # Cluster labels start at 0, so index the palette directly; `cluster - 1`
    # only worked because index -1 wraps around to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

In [47]:
# Render the cluster map as the cell output.
map_clusters

### The results suggest the following cluster names:

##### Cluster 0: Parks & Trails Neighborhoods
##### Cluster 1: Business Neighborhoods
##### Cluster 2: Active Recreational Activity Neighborhoods