<h1> Import, Clean and Transform Dataset

In [2]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

from geopy.geocoders import Nominatim #coverts address into latitude and longitude values

import requests #handles requests
from pandas.io.json import json_normalize #json files to pandas dataframe

#plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#k-means
from sklearn.cluster import KMeans

import folium #map rendering library

from bs4 import BeautifulSoup #web scraping

print('Libraries imported.')

Libraries imported.


In [3]:
# #Web Scraping with BeautifulSoup
# url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# response = requests.get(url)
# print('Response Code: ', response)
# soup = BeautifulSoup(response.text, 'html.parser')
# soup.findAll('td')[:20]

In [4]:
# Use pandas to read in the HTML. Then renames the column names to the appropriate titles.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url)[0]
header = df.iloc[0]
df = df[1:]
df = df.rename(columns = header)

In [5]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [6]:
df['Borough'].value_counts()

Not assigned        77
Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [7]:
# Filtering out rows where Borough is 'Not assigned'
toronto = df[df['Borough'] != 'Not assigned']
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [8]:
# Grouping by the Postcodes and the Borough while aggregating the neighbourhood if within the same Postcode.
grouped_toronto = toronto.groupby(['Postcode', 'Borough'], as_index=False).agg(lambda x: ", ".join(x))

In [9]:
# For loop through the rows and grabbing any rows where the Neighbourhood is Not assigned
# If condition is found, assign the Neighbourhood the same value as the Borough of that row
for index,row in grouped_toronto.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']

In [11]:
# Checking the final dataframe
grouped_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
grouped_toronto.shape

(103, 3)

<h1> Getting Latitude and Longitude

In [13]:
lat_lng = pd.read_csv('Geospatial_Coordinates.csv')

In [14]:
lat_lng.columns = ['Postcode', 'Latitude', 'Longitude']
lat_lng.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
full_toronto = pd.merge(grouped_toronto, lat_lng, on='Postcode', how='left')
full_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Exploring Dataset

In [16]:
print('The dataframe has {} and {} neighbourhoods.'.format(full_toronto['Borough'].nunique(), full_toronto['Neighbourhood'].nunique()))

The dataframe has 11 and 103 neighbourhoods.


In [17]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical location for Toronto, Ontario is {}, {}.'.format(latitude, longitude))

The geographical location for Toronto, Ontario is 43.653963, -79.387207.


In [18]:
# Create map of Toronto using the latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Adding markers to the map
for lat, lng, borough, neighbourhood in zip(full_toronto['Latitude'], full_toronto['Longitude'],
                                           full_toronto['Borough'], full_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

In [19]:
CLIENT_ID = 'VHZM10UIGEKH1UMU2RG0E13CGHNYPUFBVQ3QU2DHBO0VA2QF'
CLIENT_SECRET = 'JGRJYABMNMV2LRIR3YOVVALEE2UBS5PBXEOIMMBWIT5HWLTU'
VERSION = '20180605'

print('Credentials:  ')
print('Client_ID: ' + CLIENT_ID)
print('Client_Secret: ' + CLIENT_SECRET)

Credentials:  
Client_ID: VHZM10UIGEKH1UMU2RG0E13CGHNYPUFBVQ3QU2DHBO0VA2QF
Client_Secret: JGRJYABMNMV2LRIR3YOVVALEE2UBS5PBXEOIMMBWIT5HWLTU


In [20]:
# Let's see which neighbourhood is sixth on the list
# print(full_toronto[full_toronto['Borough']=='Downtown Toronto'])
print('We will be exploring the neighbourhood of {}.'.format(full_toronto.loc[57,'Neighbourhood']))
hood = full_toronto.loc[57,'Neighbourhood']

# Get the latitude and the longitude of the neighbourhood being explored
hood_lat = full_toronto.loc[57,'Latitude']
hood_lng = full_toronto.loc[57,'Longitude']

print('The latitude and longitude values of {} is {}, {}.'.format(hood, hood_lat, hood_lng))

We will be exploring the neighbourhood of Central Bay Street.
The latitude and longitude values of Central Bay Street is 43.6579524, -79.3873826.


In [21]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, hood_lat, hood_lng, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=VHZM10UIGEKH1UMU2RG0E13CGHNYPUFBVQ3QU2DHBO0VA2QF&client_secret=JGRJYABMNMV2LRIR3YOVVALEE2UBS5PBXEOIMMBWIT5HWLTU&v=20180605&ll=43.6579524,-79.3873826&radius=500&limit=100'

In [24]:
results = requests.get(url).json()

In [25]:
results['response']['groups'][0]['items'][0]['venue']

{'id': '537d4d6d498ec171ba22e7fe',
 'name': "Jimmy's Coffee",
 'location': {'address': '82 Gerrard Street W',
  'crossStreet': 'Gerrard & LaPlante',
  'lat': 43.65842123574496,
  'lng': -79.38561319551111,
  'labeledLatLngs': [{'label': 'display',
    'lat': 43.65842123574496,
    'lng': -79.38561319551111}],
  'distance': 151,
  'postalCode': 'M5G 1Z4',
  'cc': 'CA',
  'city': 'Toronto',
  'state': 'ON',
  'country': 'Canada',
  'formattedAddress': ['82 Gerrard Street W (Gerrard & LaPlante)',
   'Toronto ON M5G 1Z4',
   'Canada']},
 'categories': [{'id': '4bf58dd8d48988d1e0931735',
   'name': 'Coffee Shop',
   'pluralName': 'Coffee Shops',
   'shortName': 'Coffee Shop',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/coffeeshop_',
    'suffix': '.png'},
   'primary': True}],
 'photos': {'count': 0, 'groups': []}}

In [26]:
def get_category_type(row):
    try:
        category_list = row['categories']
    except:
        category_list = row['venue.categories']
    
    if len(category_list) == 0:
        return None
    else:
        return category_list[0]['name']

In [27]:
venues = results['response']['groups'][0]['items']

nearby_venue = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venue = nearby_venue.loc[:, filtered_columns]

nearby_venue['venue.categories'] = nearby_venue.apply(get_category_type, axis=1)

nearby_venue.columns = [col.split('.')[-1] for col in nearby_venue.columns]
print(nearby_venue.info())
nearby_venue.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
name          81 non-null object
categories    81 non-null object
lat           81 non-null float64
lng           81 non-null float64
dtypes: float64(2), object(2)
memory usage: 2.6+ KB
None


Unnamed: 0,name,categories,lat,lng
0,Jimmy's Coffee,Coffee Shop,43.658421,-79.385613
1,Tim Hortons,Coffee Shop,43.65857,-79.385123
2,Hailed Coffee,Coffee Shop,43.658833,-79.383684
3,The Elm Tree Restaurant,Modern European Restaurant,43.657397,-79.383761
4,The Queen and Beaver Public House,Gastropub,43.657472,-79.383524


In [28]:
print('{} venues were returned by Foursquare for the neighbourhood of {}.'.format(nearby_venue.shape[0], hood))

81 venues were returned by Foursquare for the neighbourhood of Central Bay Street.


In [29]:
full_toronto['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 5
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [30]:
# Let's examine neighbourhoods where the borough contains the word 'Toronto'
toronto_boroughs = [x for x in full_toronto['Borough'] if 'york' in x.lower()]

In [32]:
toronto_boroughs[0:5]

['North York', 'North York', 'North York', 'North York', 'North York']

In [33]:
toronto_final = full_toronto[full_toronto['Borough'].isin(toronto_boroughs)].reset_index(drop=True)
toronto_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493


In [34]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # Create the API request URL to retrieve data and GET request
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
        results = requests.get(url).json()['response']['groups'][0]['items']
    
        # Return only relevant information for the nearby venues
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Neighbourhood',
                                 'Neighbourhood Latitude',
                                 'Neighbourhood Longitude',
                                 'Venue',
                                 'Venue Latitude',
                                 'Venue Longitude',
                                 'Venue Category'
                                ]
    return(nearby_venues)

In [35]:
toronto_venues = getNearbyVenues(names=toronto_final['Neighbourhood'],
                                 latitudes=toronto_final['Latitude'],
                                 longitudes=toronto_final['Longitude']
                                )

Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto
Bedford Park, Lawrence Manor East
Lawrence Heights, Lawrence Manor
Glencairn
Humewood-Cedarvale
Caledonia-Fairbanks
Downsview, North Park, Upwood Park
Del Ray, Keelesdale, Mount Dennis, Silverthorn
The Junction North, Runnymede
Humber Summit
Emery, Humberlea
Weston


In [36]:
toronto_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 7 columns):
Neighbourhood              339 non-null object
Neighbourhood Latitude     339 non-null float64
Neighbourhood Longitude    339 non-null float64
Venue                      339 non-null object
Venue Latitude             339 non-null float64
Venue Longitude            339 non-null float64
Venue Category             339 non-null object
dtypes: float64(4), object(3)
memory usage: 18.6+ KB


In [37]:
toronto_venues.groupby(['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,18,18,18,18
Bayview Village,43.786947,-79.385975,4,4,4,4
"Bedford Park, Lawrence Manor East",43.733283,-79.41975,22,22,22,22
"CFB Toronto, Downsview East",43.737473,-79.464763,2,2,2,2
Caledonia-Fairbanks,43.689026,-79.453512,5,5,5,5
"Del Ray, Keelesdale, Mount Dennis, Silverthorn",43.691116,-79.476013,4,4,4,4
Don Mills North,43.745906,-79.352188,4,4,4,4
Downsview Central,43.728496,-79.495697,2,2,2,2
Downsview Northwest,43.761631,-79.520999,5,5,5,5
Downsview West,43.739015,-79.506944,6,6,6,6


In [38]:
print('There are {} unique venue categories.'.format(toronto_venues['Venue Category'].nunique()))

There are 121 unique venue categories.


In [39]:
# one hot-encoding the venue category column, making them into column heads
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe with the dummy variables
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# sums up the number of venues in each category grouped by neighbourhoods
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beer Store,Bike Shop,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Butcher,Café,Candy Store,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Electronics Store,Empanada Restaurant,Event Space,Falafel Restaurant,Fast Food Restaurant,Field,Fish & Chips Shop,Food & Drink Shop,Food Court,Food Truck,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gastropub,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hockey Arena,Hotel,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Liquor Store,Lounge,Market,Massage Studio,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Record Shop,Restaurant,Salon / Barbershop,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Spa,Sporting Goods Shop,Sports Bar,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wings Joint,Women's Store,Yoga Studio
0,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.090909,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.090909,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto, Downsview East",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0


In [40]:
toronto_grouped.shape

(32, 122)

In [41]:
# print out the top 5 categories of each neighourhood
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print('-----'+hood+'-----')
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['category', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

-----Bathurst Manor, Downsview North, Wilson Heights-----
           category  freq
0       Coffee Shop  0.11
1       Supermarket  0.06
2  Sushi Restaurant  0.06
3          Pharmacy  0.06
4             Diner  0.06


-----Bayview Village-----
              category  freq
0                 Bank  0.25
1   Chinese Restaurant  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4                 Park  0.00


-----Bedford Park, Lawrence Manor East-----
               category  freq
0           Coffee Shop  0.09
1      Sushi Restaurant  0.09
2    Italian Restaurant  0.09
3  Fast Food Restaurant  0.05
4       Thai Restaurant  0.05


-----CFB Toronto, Downsview East-----
      category  freq
0      Airport   0.5
1         Park   0.5
2         Pool   0.0
3   Playground   0.0
4  Pizza Place   0.0


-----Caledonia-Fairbanks-----
               category  freq
0                  Park   0.4
1         Women's Store   0.2
2  Fast Food Restaurant   0.2
3                Market   0.2
4     Acce

In [42]:
# Create a function to retrieve the top venues of a specified number
def get_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [43]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# goes through the numbers in the specified number of top venues
# the list in indicators will make sure 1st, 2nd, and 3rd is correctly labeled
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
print(columns)

# create a new dataframe with the new top venues
neighbourhoods_top_venues = pd.DataFrame(columns=columns)
neighbourhoods_top_venues['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_top_venues.iloc[ind, 1:] = get_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighbourhoods_top_venues.head()

['Neighbourhood', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue', '6th Most Common Venue', '7th Most Common Venue', '8th Most Common Venue', '9th Most Common Venue', '10th Most Common Venue']


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Pizza Place,Supermarket,Frozen Yogurt Shop,Fast Food Restaurant,Diner,Deli / Bodega,Middle Eastern Restaurant,Pharmacy,Restaurant
1,Bayview Village,Chinese Restaurant,Japanese Restaurant,Café,Bank,Yoga Studio,Deli / Bodega,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
2,"Bedford Park, Lawrence Manor East",Sushi Restaurant,Italian Restaurant,Coffee Shop,Pizza Place,Pharmacy,Café,Butcher,Pub,Liquor Store,Restaurant
3,"CFB Toronto, Downsview East",Airport,Park,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio
4,Caledonia-Fairbanks,Park,Women's Store,Fast Food Restaurant,Market,Yoga Studio,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop


# Clustering Neighbourhoods

In [44]:
# set the number of clusters we would like to have
kclusters = 5

toronto_cluster = toronto_grouped.drop('Neighbourhood', axis=1)

# KMeans clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_cluster)

# label of the cluster for each row
kmeans.labels_

array([1, 1, 1, 3, 3, 1, 1, 4, 1, 3, 3, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       3, 1, 1, 1, 2, 1, 1, 1, 1, 3], dtype=int32)

In [45]:
# Create a dataframe with the top venue categories and the cluster labels
neighbourhoods_top_venues.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merge = toronto_final

# merge to get the data from both dataframe with added latitude and longitude
toronto_merge = toronto_merge.join(neighbourhoods_top_venues.set_index('Neighbourhood'), on='Neighbourhood').dropna()


In [47]:
toronto_merge.sort_values(by= 'Cluster Labels', ascending=False).head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,M9M,North York,"Emery, Humberlea",43.724766,-79.532242,4.0,Baseball Field,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega
15,M3M,North York,Downsview Central,43.728496,-79.495697,4.0,Food Truck,Baseball Field,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio
14,M3L,North York,Downsview West,43.739015,-79.506944,3.0,Grocery Store,Hotel,Park,Shopping Mall,Bank,Curling Ice,Dessert Shop,Department Store,Deli / Bodega,Dance Studio
22,M4J,East York,East Toronto,43.685347,-79.338106,3.0,Coffee Shop,Convenience Store,Park,Yoga Studio,Concert Hall,Construction & Landscaping,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega
6,M2P,North York,York Mills West,43.752758,-79.400049,3.0,Park,Convenience Store,Bank,Bar,Yoga Studio,Dog Run,Construction & Landscaping,Cosmetics Shop,Curling Ice,Dance Studio


# Visualizing the Clusters

In [48]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# cluster color scheme
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# adding markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merge['Latitude'], toronto_merge['Longitude'], toronto_merge['Neighbourhood'], toronto_merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat,lon],
        radius = 5,
        popup = label,
        color = rainbow[int(cluster-1)],
        fill = True,
        fill_color = rainbow[int(cluster-1)],
        fill_opacity = 0.7
    ).add_to(map_clusters)

map_clusters

In [49]:
toronto_merge.loc[toronto_merge['Cluster Labels'] == 0, toronto_merge.columns[[1] + list(range(5, toronto_merge.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,North York,0.0,Pizza Place,Italian Restaurant,Empanada Restaurant,Yoga Studio,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice


In [50]:
toronto_merge.loc[toronto_merge['Cluster Labels'] == 1, toronto_merge.columns[[1] + list(range(5, toronto_merge.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,1.0,Golf Course,Athletics & Sports,Fast Food Restaurant,Dog Run,Mediterranean Restaurant,Pool,Asian Restaurant,American Restaurant,Convenience Store,Cosmetics Shop
1,North York,1.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Japanese Restaurant,Bus Station,Asian Restaurant,Toy / Game Store,Tea Room,Food Court
2,North York,1.0,Chinese Restaurant,Japanese Restaurant,Café,Bank,Yoga Studio,Deli / Bodega,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
5,North York,1.0,Coffee Shop,Ramen Restaurant,Sandwich Place,Café,Restaurant,Sushi Restaurant,Pizza Place,Indonesian Restaurant,Shopping Mall,Bubble Tea Shop
7,North York,1.0,Coffee Shop,Discount Store,Pharmacy,Butcher,Pizza Place,Grocery Store,Food Truck,Dessert Shop,Frozen Yogurt Shop,Concert Hall
9,North York,1.0,Caribbean Restaurant,Gym / Fitness Center,Café,Japanese Restaurant,Yoga Studio,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
10,North York,1.0,Beer Store,Coffee Shop,Asian Restaurant,Gym,Sporting Goods Shop,Restaurant,Japanese Restaurant,Italian Restaurant,Discount Store,Concert Hall
11,North York,1.0,Coffee Shop,Pizza Place,Supermarket,Frozen Yogurt Shop,Fast Food Restaurant,Diner,Deli / Bodega,Middle Eastern Restaurant,Pharmacy,Restaurant
12,North York,1.0,Furniture / Home Store,Bar,Caribbean Restaurant,Miscellaneous Shop,Massage Studio,Falafel Restaurant,Coffee Shop,Fish & Chips Shop,Food Truck,Concert Hall
16,North York,1.0,Liquor Store,Discount Store,Grocery Store,Athletics & Sports,Gym / Fitness Center,Deli / Bodega,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


In [51]:
toronto_merge.loc[toronto_merge['Cluster Labels'] == 2, toronto_merge.columns[[1] + list(range(5, toronto_merge.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,York,2.0,Convenience Store,Yoga Studio,Gift Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega


In [52]:
toronto_merge.loc[toronto_merge['Cluster Labels'] == 3, toronto_merge.columns[[1] + list(range(5, toronto_merge.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,North York,3.0,Park,Convenience Store,Bank,Bar,Yoga Studio,Dog Run,Construction & Landscaping,Cosmetics Shop,Curling Ice,Dance Studio
8,North York,3.0,Park,Food & Drink Shop,Bus Stop,Yoga Studio,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
13,North York,3.0,Airport,Park,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio
14,North York,3.0,Grocery Store,Hotel,Park,Shopping Mall,Bank,Curling Ice,Dessert Shop,Department Store,Deli / Bodega,Dance Studio
22,East York,3.0,Coffee Shop,Convenience Store,Park,Yoga Studio,Concert Hall,Construction & Landscaping,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega
27,York,3.0,Park,Women's Store,Fast Food Restaurant,Market,Yoga Studio,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
28,North York,3.0,Construction & Landscaping,Park,Bakery,Basketball Court,Yoga Studio,Dog Run,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio


In [53]:
toronto_merge.loc[toronto_merge['Cluster Labels'] == 4, toronto_merge.columns[[1] + list(range(5, toronto_merge.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,North York,4.0,Food Truck,Baseball Field,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio
32,North York,4.0,Baseball Field,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega
