# Segmenting And Clustering Neighborhoods in Toronto

Notebook for Toronto neighborhoods assignment in the capstone course of the IBM data science certificate.

## Part 1 - Preparing the neighborhood dataframe

In [1]:
# Our needed imports.
import folium
import ibm_boto3
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
import types
from botocore.client import Config
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from IPython.display import Image 
from sklearn.cluster import KMeans

In [2]:
# Prepare our neighborhood dataframe.
neighborhood_df = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])

In [3]:
# Load our Wikipedia page containing postal codes in Toronto.
crawl_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(crawl_url)

In [4]:
# Loop over table and add rows to our dataframe.
soup = BeautifulSoup(r.text, 'html.parser')
postal_codes_table = soup.find('table', {'class': 'wikitable'})
postal_codes_rows = postal_codes_table.find_all('tr')
for count, postal_code in enumerate(postal_codes_rows):
    # First row is header, skip.
    if count == 0:
        continue

    # Prepare our data.
    cells = postal_code.find_all('td')
    postal = cells[0].text.strip()
    borough = cells[1].text.strip()
    neighborhood = cells[2].text.strip()

    # Skip any postal code without an assigned borough.
    if borough == 'Not assigned':
        continue
    neighborhood_df = neighborhood_df.append(pd.Series([postal, borough, neighborhood], index=neighborhood_df.columns), ignore_index=True)

In [5]:
# Preview our neighborhood dataframe.
neighborhood_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# See our dataframe's shape.
print('Shape:', neighborhood_df.shape)

Shape: (103, 3)


## Part 2 - Adding latitude and longitude

Load in the CSV data from IBM's storage into notebook.

In [7]:
# The code was removed by Watson Studio for sharing.

In [8]:
# Turn CSV data into a DatFrame.
geospatial_df = pd.read_csv(geospatial_csv).rename(columns={'Postal Code': 'PostalCode'})

In [9]:
# Merge geospatial data into neighborhood data.
neighborhood_df = neighborhood_df.merge(geospatial_df)

In [10]:
# Preview enhanced DataFrame.
neighborhood_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3 - Analyzing the data

### Generating general map to see layout of neighborhoods

In [11]:
# Set up our general variables for analysis.
LATITUDE = 43.6529
LONGITUDE = -79.3849

In [12]:
# Create map of Toronto for seeing the neighborhoods.
general_map = folium.Map(location=[LATITUDE, LONGITUDE], zoom_start=9)

# Add markers to the map for each neighborhood.
for lat, lng, borough, neighborhood in zip(neighborhood_df['Latitude'], neighborhood_df['Longitude'], neighborhood_df['Borough'], neighborhood_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(general_map)  
    
general_map

### Getting venues for each neighborhood from Foursquare

In [13]:
# The code was removed by Watson Studio for sharing.

In [14]:
# Prepares our venue DataFrame.
neighborhood_venues = pd.DataFrame(columns=[
                            'Neighborhood', 
                            'Neighborhood Latitude',
                            'Neighborhood Longitude', 
                            'Venue', 
                            'Venue Latitude', 
                            'Venue Longitude', 
                            'Venue Category'])

In [15]:
# Set up our params for the API.
radius = 1000
limit = 100
neighborhood_count = len(neighborhood_df)

# Cycle through each neighborhood.
for index, row in neighborhood_df.iterrows():
    if index % 10 == 0:
        print('Neighborhood {} of {}...'.format(index, neighborhood_count))
    
    # create the API request URL.
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        row['Latitude'], 
        row['Longitude'], 
        radius, 
        limit)
    
    # Load our results.
    r = requests.get(url)
    results = r.json()
    
    # Get the venues.
    try:
        venues = results["response"]['groups'][0]['items']
    except KeyError:
        print('Trouble finding venues for {}. Returned response was:'.format(row['Neighborhood']), results["response"])
        
    # Add each venue to our DataFrame.
    for venue in venues:
        neighborhood_venues = neighborhood_venues.append(pd.Series([
            row['Neighborhood'],
            row['Latitude'],
            row['Longitude'],
            venue['venue']['name'],
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']], index=neighborhood_venues.columns), ignore_index=True)
print('Finished finding venues!')

Neighborhood 0 of 103...
Neighborhood 10 of 103...
Neighborhood 20 of 103...
Neighborhood 30 of 103...
Neighborhood 40 of 103...
Neighborhood 50 of 103...
Neighborhood 60 of 103...
Neighborhood 70 of 103...
Neighborhood 80 of 103...
Neighborhood 90 of 103...
Neighborhood 100 of 103...
Finished finding venues!


In [16]:
# Quick preview of our venues.
print(neighborhood_venues.shape)
neighborhood_venues.head()

(4901, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


In [17]:
print('There are {} uniques categories.'.format(len(neighborhood_venues['Venue Category'].unique())))

There are 328 uniques categories.


### Analyzing our venues to compare neighborhoods

First, get a dataframe with the top X venue categories in it.

In [18]:
# Get our dummified categories.
venue_dummified = pd.get_dummies(neighborhood_venues[['Venue Category']], prefix="", prefix_sep="")

# Add our neighborhood back to dataframe.
venue_dummified['Neighborhood'] = neighborhood_venues['Neighborhood'] 

# Move neighborhood column to the beginning.
# Thanks to https://stackoverflow.com/a/56479671 😅
venue_dummified = venue_dummified[ ['Neighborhood'] + [ col for col in venue_dummified.columns if col != 'Neighborhood' ] ]

In [19]:
# Review our dataframe.
print('Shape:', venue_dummified.shape)
venue_dummified.head()

Shape: (4901, 328)


Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Calculate our mean venue categories per neighborhood.
venue_groups = venue_dummified.groupby('Neighborhood').mean().reset_index()
print('Shape:',venue_groups.shape)
venue_groups.head()

Shape: (98, 328)


Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Agincourt,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,...,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,...,0.025,0.0,0.0,0.0,0.0,0.0,0.025,0.025,0.0,0.0


In [21]:
num_top_venues = 10

# Create columns according to number of top venues.
indicators = ['st', 'nd', 'rd']
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Create a new empty dataframe with our new columns and add in our neighborhoods.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = venue_groups['Neighborhood']

# Cycle over neighborhood groups...
for index, row in venue_groups.iterrows():
    # And add in num_top_venues of the top venue categories to each neighborhood.
    neighborhoods_venues_sorted.iloc[index, 1:] = row.iloc[1:].sort_values(ascending=False).index.values[0:num_top_venues]

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Shopping Mall,Bakery,Caribbean Restaurant,Sandwich Place,Japanese Restaurant,Park,Bank,Lounge,Coffee Shop
1,"Alderwood, Long Branch",Pharmacy,Discount Store,Convenience Store,Pizza Place,Park,Trail,Sandwich Place,Liquor Store,Garden Center,Gas Station
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Convenience Store,Sushi Restaurant,Diner,Middle Eastern Restaurant,Dog Run,Sandwich Place,Fried Chicken Joint,Supermarket
3,Bayview Village,Bank,Gas Station,Japanese Restaurant,Restaurant,Grocery Store,Shopping Mall,Trail,Park,Intersection,Chinese Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Bank,Sushi Restaurant,Park,Liquor Store,Bridal Shop,Sports Club,Restaurant


Now that we have our dataframe, time to use Kmeans clustering to group them together.

In [22]:
# Our number of clusters.
kclusters = 5

# Calculate our KMeans.
neighborhood_groups_clustering = venue_groups.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neighborhood_groups_clustering)

In [23]:
# Add our clustering labels to our dataframe.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [24]:
# Start preparing our final dataframe.
neighborhood_df_final = neighborhood_df.copy()

# Merge in our neighborhood clustering results.
neighborhood_df_final = neighborhood_df_final.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# If any neighborhood didn't have venues or ended with NaN scores, let's drop it.
neighborhood_df_final = neighborhood_df_final.dropna()

# Make sure the cluster labels are in int for our calculations.
neighborhood_df_final['Cluster Labels'] = neighborhood_df_final['Cluster Labels'].astype('int32')

neighborhood_df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Bus Stop,Pharmacy,Shopping Mall,Convenience Store,Fast Food Restaurant,Café,Food & Drink Shop,Skating Rink,Road
1,M4A,North York,Victoria Village,43.725882,-79.315572,4,Coffee Shop,Gym / Fitness Center,Sporting Goods Shop,Hockey Arena,Portuguese Restaurant,Golf Course,Men's Store,French Restaurant,Park,Intersection
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,Coffee Shop,Pub,Café,Diner,Theater,Restaurant,Italian Restaurant,Breakfast Spot,Bakery,Park
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,Coffee Shop,Restaurant,Fast Food Restaurant,Vietnamese Restaurant,Sushi Restaurant,Furniture / Home Store,Dessert Shop,Fried Chicken Joint,Cheese Shop,Sandwich Place
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4,Coffee Shop,Sushi Restaurant,Park,Japanese Restaurant,Ramen Restaurant,Gastropub,Café,Italian Restaurant,Burger Joint,Pizza Place


Now that we have created our clusters, let's visualize them.

In [25]:
# Create our map.
map_clusters = folium.Map(location=[LATITUDE, LONGITUDE], zoom_start=10)

# Set up different colors for each cluster
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add each neighborhood as a marker on the map.
markers_colors = []
for lat, lon, poi, cluster in zip(neighborhood_df_final['Latitude'], neighborhood_df_final['Longitude'], neighborhood_df_final['Neighborhood'], neighborhood_df_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Interesting observation

Something interesting about our clusters is that the follow the patterns of income levels across Toronto. The main 2 clusters along through the middle and along the ocean follow the groupings of middle to high income households while the 3rd cluster follows along areas that follow the lower income areas.

In [26]:
Image(url='http://spacing.ca/toronto/wp-content/uploads/sites/4/2014/10/2014-Mayoral-Vote-and-INCOME-and-THREE-CITIES-Table-and-NCRP-4-maps-3.jpg')