# Toronto Neighborhoods clustering and segmentation
#### Applied Data Science Capstone - Week 3 assignment

In [1]:
#Libraries to be used:

import json
import os
from io import StringIO

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas import json_normalize
from sklearn.cluster import KMeans

### Data collection and preparation

In [2]:
# Download the Wikipedia page that lists Toronto ("M") postal codes.
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# Fail fast on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

In [3]:
# Parse every <table> found on the page and keep the first one.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases.
tables_html = str(soup.find_all('table'))
df_raw = pd.read_html(StringIO(tables_html))[0]
df_raw.head()

Unnamed: 0,Post Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Discard postal codes without an assigned borough; the previous index is kept
# as an 'index' column by reset_index().
has_borough = df_raw['Borough'] != 'Not assigned'
df_raw = df_raw[has_borough].reset_index()

In [5]:
# Use the column name expected by the coordinates file merged in later.
df_raw = df_raw.rename(columns={'Post Code': 'Postal Code'})

In [6]:
df_raw['Postal Code'].nunique(dropna=False) #Number of postal codes

103

In [7]:
# Remove the leftover 'index' column created by the earlier reset_index().
df_raw = df_raw.drop(columns='index')

In [8]:
# Preview the first rows of df_raw.
df_raw.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


---
---
### Obtaining coordinates for each postal code

In [9]:
# Coordinates are read from a prepared CSV:
# using geocoder didn't work because of restrictions from Google :(
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
lat_lng = pd.read_csv(GEOSPATIAL_CSV_URL)

In [10]:
# Work on a copy of the cleaned table, attach the coordinates of each postal
# code (left join keeps every row of df_test) and index the result by postal code.
df_test = df_raw.copy()
df = df_test.merge(lat_lng, how='left', on='Postal Code').set_index('Postal Code')

In [11]:
#Just to show the same as the screenshot
sample_postal_codes = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M',
                       'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']
df.loc[sample_postal_codes]

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
M2H,North York,Hillcrest Village,43.803762,-79.363452
M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M4G,East York,Leaside,43.70906,-79.363452
M4M,East Toronto,Studio District,43.659526,-79.340923
M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
M9L,North York,Humber Summit,43.756303,-79.565963
M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


---
---
### Clustering and Map creation

In [12]:
#Foursquare credentials:
# Secrets must never be committed with the notebook. They are read from the
# environment instead; the key pair that used to be hardcoded here has been
# published and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605'

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'environment variables before querying the Foursquare API.')


In [13]:
# Choosing the boroughs with "Toronto" as part of the name; reset_index() turns
# the 'Postal Code' index back into a regular column.
is_toronto_borough = df['Borough'].str.contains('Toronto')
df_nameToronto = df[is_toronto_borough].reset_index()

In [14]:
# (rows, columns) of the Toronto-only selection.
df_nameToronto.shape

(39, 5)

In [15]:
# Function to retrieve venues near a location.

def getNearbyVenues(names, latitudes, longitudes, radius=800, limit=None):
    """Fetch venues around each location from the Foursquare "explore" endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one API request is made per (name, lat, lng) triple.
        Each name is printed as its request starts.
    radius : int, default 800
        Value sent as the ``radius`` query parameter.
    limit : int, optional
        Value sent as the ``limit`` query parameter. When omitted, the
        module-level ``LIMIT`` is used (the behaviour before this parameter
        existed).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Postal Code', 'PC Latitude',
        'PC Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue
        was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Postal Code',
               'PC Latitude',
               'PC Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Surface authentication/quota problems as a clear HTTP error instead
        # of an obscure KeyError while digging into the payload.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category is kept, with a missing category.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [16]:
LIMIT = 100  # read by getNearbyVenues as the per-location venue limit

# Using Postal code rather than Borough, since some boroughs have more than one postal code.
Toronto_venues = getNearbyVenues(
    df_nameToronto['Postal Code'],
    df_nameToronto['Latitude'],
    df_nameToronto['Longitude'],
)
Toronto_venues.tail()

M5A
M7A
M5B
M5C
M4E
M5E
M5G
M6G
M5H
M6H
M5J
M6J
M4K
M5K
M6K
M4L
M5L
M4M
M4N
M5N
M4P
M5P
M6P
M4R
M5R
M6R
M4S
M5S
M6S
M4T
M5T
M4V
M5V
M4W
M5W
M4X
M5X
M4Y
M7Y


Unnamed: 0,Postal Code,PC Latitude,PC Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
2801,M7Y,43.662744,-79.321558,It's the Icing on the Cake,43.663453,-79.329813,Bakery
2802,M7Y,43.662744,-79.321558,Best In Town,43.659546,-79.328708,Food Truck
2803,M7Y,43.662744,-79.321558,TTC Stop #03057,43.663314,-79.330099,Light Rail Station
2804,M7Y,43.662744,-79.321558,Lakeshore Blvd E & Leslie St,43.658763,-79.328988,Intersection
2805,M7Y,43.662744,-79.321558,Shoppers Drug Mart,43.658033,-79.328882,Pharmacy


In [17]:
# One-hot encode the venue categories (one row per venue, one column per category)
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add the postal code of each venue back to the dataframe
Toronto_onehot['Postal Code'] = Toronto_venues['Postal Code']

# rotate the column order so that the last column (the postal code) comes first
all_columns = list(Toronto_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Mean of the one-hot columns per postal code, i.e. the share of each venue category.
category_share = Toronto_onehot.groupby('Postal Code').mean()
Toronto_grouped = category_share.reset_index()
Toronto_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.020408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030612,0.0,...,0.0,0.0,0.0,0.0,0.020408,0.0,0.010204,0.0,0.0,0.020408
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#Function to sort top venues in descending order

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is ignored (callers pass rows whose first
    field is the postal code). The remaining values are sorted in descending
    order and the index labels of the first ``num_top_venues`` are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [20]:
# New dataframe with the top venue categories per postal code.
num_top_venues = 15

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then Nth
columns = ['Postal Code']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
CP_venues_sorted = pd.DataFrame(columns=columns)
CP_venues_sorted['Postal Code'] = Toronto_grouped['Postal Code']

# fill each row with the most frequent categories of the matching postal code
for ind in range(Toronto_grouped.shape[0]):
    CP_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

CP_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,M4E,Pub,Grocery Store,Breakfast Spot,Health Food Store,Café,Sandwich Place,Asian Restaurant,Japanese Restaurant,Bar,Coffee Shop,Pharmacy,Caribbean Restaurant,Cheese Shop,Chocolate Shop,Ramen Restaurant
1,M4K,Greek Restaurant,Coffee Shop,Pub,Café,Italian Restaurant,Fast Food Restaurant,Park,Furniture / Home Store,Ice Cream Shop,Flower Shop,Discount Store,Bakery,Grocery Store,Pizza Place,Restaurant
2,M4L,Indian Restaurant,Grocery Store,Park,Gym,Coffee Shop,Restaurant,Sandwich Place,Café,Bus Stop,Fast Food Restaurant,Light Rail Station,Brewery,Skate Park,Farmers Market,Fish & Chips Shop
3,M4M,Café,Bar,Coffee Shop,Bakery,Diner,Brewery,Sushi Restaurant,Sandwich Place,Park,American Restaurant,Gastropub,Pizza Place,Italian Restaurant,Latin American Restaurant,Arts & Crafts Store
4,M4N,Bookstore,Coffee Shop,Café,Park,Bus Line,Gym / Fitness Center,Restaurant,Event Space,Falafel Restaurant,Ethiopian Restaurant,Dive Bar,Farm,Farmers Market,Electronics Store,Eastern European Restaurant


In [29]:
#Clustering the venues

# set number of clusters
kclusters = 6

# Feature matrix: every column except the postal code.
# `columns=` is used because the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Postal Code')

# run k-means clustering (fixed random_state keeps the labels reproducible)
kmeans = KMeans(n_clusters=kclusters, n_init=1000, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]


array([1, 1, 1, 1, 0, 1, 1, 1, 4, 1])

In [30]:
# add clustering labels
# Idempotent on purpose: the column is created on the first run and simply
# refreshed on re-runs, so this cell works on a fresh kernel and when re-executed.
if 'Cluster Labels' in CP_venues_sorted.columns:
    CP_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    CP_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues to every postal code in df;
# postal codes that are absent from CP_venues_sorted get NaN in those columns
Toronto_merged = df.join(CP_venues_sorted.set_index('Postal Code'), on='Postal Code')

Toronto_merged.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
M3A,North York,Parkwoods,43.753259,-79.329656,,,,,,,,,,,,,,,,
M4A,North York,Victoria Village,43.725882,-79.315572,,,,,,,,,,,,,,,,
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Coffee Shop,Theater,Park,Café,Restaurant,Pub,Bakery,Breakfast Spot,Thai Restaurant,Italian Restaurant,Performing Arts Venue,Dance Studio,Distribution Center,Discount Store,Dessert Shop
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,,,,,,,,,,,,,,,,
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4.0,Coffee Shop,Italian Restaurant,Bookstore,Bubble Tea Shop,Park,Gastropub,Japanese Restaurant,Diner,Creperie,Office,Thai Restaurant,Ice Cream Shop,Clothing Store,Pizza Place,Café


In [31]:
# Dropping rows with postal codes without venues (any row with a missing value is removed)
Toronto_merged = Toronto_merged.dropna()


In [32]:
# Turn the 'Postal Code' index back into a regular column.
Toronto_merged = Toronto_merged.reset_index()
Toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Coffee Shop,Theater,Park,Café,...,Pub,Bakery,Breakfast Spot,Thai Restaurant,Italian Restaurant,Performing Arts Venue,Dance Studio,Distribution Center,Discount Store,Dessert Shop
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4.0,Coffee Shop,Italian Restaurant,Bookstore,Bubble Tea Shop,...,Gastropub,Japanese Restaurant,Diner,Creperie,Office,Thai Restaurant,Ice Cream Shop,Clothing Store,Pizza Place,Café
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,4.0,Coffee Shop,Clothing Store,Japanese Restaurant,Gastropub,...,Bubble Tea Shop,Pizza Place,Middle Eastern Restaurant,Theater,Plaza,Department Store,Poke Place,Electronics Store,Cosmetics Shop,Ramen Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3.0,Café,Restaurant,Coffee Shop,Clothing Store,...,Hotel,American Restaurant,Cosmetics Shop,Plaza,Farmers Market,Bakery,Beer Bar,Gym,Theater,Pizza Place
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.0,Pub,Grocery Store,Breakfast Spot,Health Food Store,...,Sandwich Place,Asian Restaurant,Japanese Restaurant,Bar,Coffee Shop,Pharmacy,Caribbean Restaurant,Cheese Shop,Chocolate Shop,Ramen Restaurant


In [33]:
#Getting coordinates for Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# Guard against a missing geocoding result (None) so the failure is explicit.
if location is None:
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [34]:
# Cluster labels became floats when NaN rows were present; cast them back to integers.
Toronto_merged = Toronto_merged.astype({'Cluster Labels': int})

In [35]:
#Create and visualize the map with clusters:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per postal code, colored by its cluster
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'],
                                  Toronto_merged['Longitude'],
                                  Toronto_merged['Postal Code'],
                                  Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (no reliance on negative-index wraparound for cluster 0).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters