In [1]:
import os
import warnings

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# 1. Download Wiki HTML Page, Parse data and Create DataFrame

In [2]:
def getPreppedDataFrame(htmldata_url):
    """Download a postal-code table and return one row per postcode.

    The page at ``htmldata_url`` is fetched, its first
    ``<table class="wikitable sortable">`` is parsed, rows whose Borough is
    'Not assigned' are dropped, and the remaining rows are collapsed so that
    every postcode appears once with its neighbourhoods joined by commas.

    Parameters
    ----------
    htmldata_url : str
        URL of the HTML page containing the table.

    Returns
    -------
    pandas.DataFrame
        Three columns named after the table headers (the code expects them to
        be "Postcode", "Borough" and "Neighbourhood"), sorted by postcode.

    Raises
    ------
    RuntimeError
        If the page cannot be downloaded (non-200 HTTP status).
    ValueError
        If the page does not contain the expected table.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', htmldata_url)
    if response.status != 200:
        raise RuntimeError(
            "Could not download {} (HTTP status {})".format(htmldata_url, response.status))

    # use BeautifulSoup to parse HTML page and locate the table
    soup = BeautifulSoup(response.data, 'html.parser')
    table = soup.find("table", class_="wikitable sortable")
    if table is None:
        raise ValueError(
            "No table with class 'wikitable sortable' found at {}".format(htmldata_url))

    # extract column headers
    columns = [header.text.rstrip() for header in table.find_all("th")]

    # extract (postcode, borough, neighbourhood) from every data row; rows
    # without three <td> cells (e.g. the header row) are skipped instead of
    # raising IndexError
    records = []
    for table_row in table.find_all("tr"):
        cells = table_row.find_all("td")
        if len(cells) < 3:
            continue
        records.append(tuple(cell.text.rstrip() for cell in cells[:3]))
    df = pd.DataFrame(records, columns=columns)

    # handle invalid values for Borough; the re-indexed frame is actually kept
    # (the result of reset_index used to be thrown away)
    df_clean = df[df["Borough"] != 'Not assigned'].reset_index(drop=True)

    # one output row per postcode: first borough seen, neighbourhoods comma-joined
    grouped_records = [
        (postcode, group_df["Borough"].iloc[0], ",".join(group_df["Neighbourhood"]))
        for postcode, group_df in df_clean.groupby("Postcode")
    ]
    df_final = pd.DataFrame(grouped_records, columns=columns)
    return df_final

In [3]:
# Shell escape: quietly download the geospatial CSV and save it as
# postcode_coordinates.csv, which a later cell loads with pd.read_csv.
!wget -q -O postcode_coordinates.csv https://cocl.us/Geospatial_data

In [4]:
# Load the downloaded coordinates file and key it by postal code so that
# rows can be looked up by label, e.g. df_coords.loc["M9W"].
df_coords = pd.read_csv("postcode_coordinates.csv").set_index("Postal Code")

In [5]:
# get previously created dataframe
df = getPreppedDataFrame('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# Attach coordinates by matching df["Postcode"] against the postal-code index
# of df_coords. A label-based join replaces the per-row loop and the
# positional row[0]/row[1] lookups, which pandas 2.x deprecates on a
# label-indexed Series.
df = df.join(df_coords[["Latitude", "Longitude"]], on="Postcode")

# Keep the previous failure mode (KeyError) for postcodes with no coordinates,
# but report every offending postcode at once.
missing_coords = df.loc[df["Latitude"].isna() | df["Longitude"].isna(), "Postcode"]
if not missing_coords.empty:
    raise KeyError("No coordinates found for postcodes: {}".format(list(missing_coords)))



# 2. Fetch Venues for each Postcode using FourSquare API

#### Note: Only boroughs that contain the word 'Toronto' will be considered

In [6]:
# Foursquare credentials are read from the environment so that secrets are not
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be committed here should be
# considered leaked and rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare API requests will be rejected.")
VERSION = '20180604'  # Foursquare API version
LIMIT = 100  # limit of number of venues returned by Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in lockstep.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. invalid credentials).
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        # surface API errors directly instead of a KeyError further down
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # some venues carry no category; keep the venue, leave category empty
            categories = venue.get('categories') or []
            venue_records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # naming the columns at construction keeps the schema for an empty result
    # (assigning .columns on an empty frame raised "Length mismatch")
    nearby_venues = pd.DataFrame(venue_records, columns=venue_columns)

    return nearby_venues

In [7]:
# Restrict the analysis to boroughs whose name contains "Toronto"
df_toronto = df.loc[df['Borough'].str.contains("Toronto")]

# Fetch the venues around every remaining neighbourhood (default radius)
df_toronto_venues = getNearbyVenues(
    names=df_toronto['Neighbourhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)

# 3. Clean and Prepare DataFrame for analysis/clustering

In [8]:
# one hot encoding: one indicator column per venue category
df_toronto_venues_onehot = pd.get_dummies(df_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called "Neighborhood", its indicator column
# would be overwritten by the neighbourhood label below. Rename it so the
# feature is kept and the label column is unambiguous.
if 'Neighborhood' in df_toronto_venues_onehot.columns:
    df_toronto_venues_onehot = df_toronto_venues_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood label back as the first column
df_toronto_venues_onehot.insert(0, 'Neighborhood', df_toronto_venues['Neighborhood'])

# mean of each indicator per neighbourhood = relative frequency of that category
df_toronto_grouped = df_toronto_venues_onehot.groupby('Neighborhood').mean().reset_index()

In [9]:
def return_most_common_venues(row, num_top_venues=10):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass a row whose first
    entry is the neighbourhood name); the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [10]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return ``rank`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11th-13th (and 111th-113th, ...) take 'th' despite ending in 1, 2 or 3
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[rank % 10 - 1])


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# rank the venue categories of every neighbourhood, then build the frame in
# one step instead of filling it row by row
top_venues = [
    return_most_common_venues(df_toronto_grouped.iloc[row_pos, :], num_top_venues)
    for row_pos in range(df_toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', df_toronto_grouped['Neighborhood'].values)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Restaurant,Café,Thai Restaurant,Steakhouse,Bar,Sushi Restaurant,Seafood Restaurant,Lounge,Cosmetics Shop
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Cheese Shop,Farmers Market,Beer Bar,Café,Bakery,Restaurant,Eastern European Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Café,Breakfast Spot,Bakery,Coffee Shop,Music Venue,Stadium,Bar,Intersection,Convenience Store,Italian Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Comic Shop,Skate Park,Brewery,Restaurant,Burrito Place,Recording Studio,Pizza Place,Moving Target,Spa
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Sculpture Garden,Boutique,Bar,Rental Car Location,Plane,Coffee Shop,Boat or Ferry


# 4. Cluster Toronto neighborhoods using K-Means

In [11]:
# set number of clusters
kclusters = 5

# feature matrix: category frequencies only, without the neighbourhood label
# (keyword form; the positional axis argument was removed in pandas 2.0)
toronto_grouped_clustering = df_toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state keeps the labels reproducible)
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=12, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows in the dataframe
print(kmeans.labels_[0:10])

# add clustering labels; drop a label column left by an earlier run first so
# that re-running this cell does not fail with "already exists"
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and ranked venues onto the Toronto rows (which hold
# latitude/longitude). Neighbourhoods without a match (no venues returned)
# have no label and are dropped so the labels stay integers.
df_toronto_merged = (
    df_toronto
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

df_toronto_merged.head() # check the last columns!

[0 0 0 0 0 0 0 0 0 0]


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Health Food Store,Pub,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Women's Store,Deli / Bodega
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Frozen Yogurt Shop,Pub,Pizza Place,Lounge,Liquor Store
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Pizza Place,Movie Theater,Fish & Chips Shop,Pub,Sushi Restaurant,Fast Food Restaurant,Italian Restaurant,Burrito Place,Steakhouse,Board Shop
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Yoga Studio,Bookstore,Sandwich Place,Cheese Shop
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Construction & Landscaping,Swim School,Bus Line,Dessert Shop,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [12]:
# Toronto
latitude = 43.651070
longitude = -79.347015
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_toronto_merged['Latitude'], df_toronto_merged['Longitude'], df_toronto_merged['Neighbourhood'], df_toronto_merged['Cluster Labels']):
    # a neighbourhood without a cluster label cannot be coloured; skip it
    if pd.isna(cluster):
        continue
    # labels may arrive as floats after a join; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run 0..kclusters-1, so index the palette directly
    # (cluster-1 only worked for label 0 through negative-index wraparound)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# 5. Explore resulting clusters

#### 5.1 Cluster 0 - Coffee Shops, Restaurants and Dining

In [13]:
# (rows, columns) of the neighbourhoods assigned to cluster 0
print(df_toronto_merged[df_toronto_merged['Cluster Labels'] == 0].shape)
# Borough (column 1) plus every column from the cluster label onwards, cluster 0 only
df_toronto_merged[df_toronto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, df_toronto_merged.shape[1]))]

(34, 16)


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,East Toronto,0,Trail,Health Food Store,Pub,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Women's Store,Deli / Bodega
41,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Frozen Yogurt Shop,Pub,Pizza Place,Lounge,Liquor Store
42,East Toronto,0,Pizza Place,Movie Theater,Fish & Chips Shop,Pub,Sushi Restaurant,Fast Food Restaurant,Italian Restaurant,Burrito Place,Steakhouse,Board Shop
43,East Toronto,0,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Yoga Studio,Bookstore,Sandwich Place,Cheese Shop
45,Central Toronto,0,Park,Breakfast Spot,Hotel,Gym,Gym / Fitness Center,Food & Drink Shop,Sandwich Place,Department Store,Discount Store,Dim Sum Restaurant
46,Central Toronto,0,Clothing Store,Coffee Shop,Spa,Salon / Barbershop,Restaurant,Café,Chinese Restaurant,Pet Store,Yoga Studio,Bagel Shop
47,Central Toronto,0,Sandwich Place,Dessert Shop,Pizza Place,Coffee Shop,Italian Restaurant,Gym,Café,Sushi Restaurant,Pharmacy,Brewery
49,Central Toronto,0,Pub,Coffee Shop,Supermarket,Sushi Restaurant,Sports Bar,Fried Chicken Joint,Restaurant,Pizza Place,American Restaurant,Liquor Store
51,Downtown Toronto,0,Coffee Shop,Restaurant,Pub,Pizza Place,Italian Restaurant,Café,Bakery,Bank,Snack Place,Chinese Restaurant
52,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Men's Store,Gastropub,Fast Food Restaurant,Hotel,Bubble Tea Shop


#### 5.2 Cluster 1 - Playgrounds and Trails

In [14]:
# (rows, columns) of the neighbourhoods assigned to cluster 1
print(df_toronto_merged[df_toronto_merged['Cluster Labels'] == 1].shape)
# Borough (column 1) plus every column from the cluster label onwards, cluster 1 only
df_toronto_merged[df_toronto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, df_toronto_merged.shape[1]))]

(1, 16)


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
48,Central Toronto,1,Playground,Trail,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


#### 5.3 Cluster 2 - Parks and Playgrounds

In [15]:
# (rows, columns) of the neighbourhoods assigned to cluster 2
print(df_toronto_merged[df_toronto_merged['Cluster Labels'] == 2].shape)
# Borough (column 1) plus every column from the cluster label onwards, cluster 2 only
df_toronto_merged[df_toronto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, df_toronto_merged.shape[1]))]

(2, 16)


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Downtown Toronto,2,Park,Playground,Trail,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
64,Central Toronto,2,Park,Jewelry Store,Trail,Sushi Restaurant,Women's Store,Department Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


#### 5.4 Cluster 3 - Home services and Gardens

In [16]:
# (rows, columns) of the neighbourhoods assigned to cluster 3
print(df_toronto_merged[df_toronto_merged['Cluster Labels'] == 3].shape)
# Borough (column 1) plus every column from the cluster label onwards, cluster 3 only
df_toronto_merged[df_toronto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, df_toronto_merged.shape[1]))]

(1, 16)


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Central Toronto,3,Home Service,Garden,Women's Store,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


#### 5.5 Cluster 4 - Parks and Construction

In [17]:
# (rows, columns) of the neighbourhoods assigned to cluster 4
print(df_toronto_merged[df_toronto_merged['Cluster Labels'] == 4].shape)
# Borough (column 1) plus every column from the cluster label onwards, cluster 4 only
df_toronto_merged[df_toronto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, df_toronto_merged.shape[1]))]

(1, 16)


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Central Toronto,4,Park,Construction & Landscaping,Swim School,Bus Line,Dessert Shop,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
