# Exploring and Cleaning the Dataset for Neighbourhoods in Toronto Obtained from a Wiki Table

In [1]:
# import required modules
import requests
import bs4 
import lxml.html as lh
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

In [2]:
# intended website
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Create a handle, page to handle the contents of the intended website
page = requests.get(url)

In [4]:
# Store the contents of the website under a doc
doc = lh.fromstring(page.content)

In [5]:
# Parse data that are stored between <tr>..<tr> of HTML
tr_elements = doc.xpath('//tr')

In [6]:
# Check the length of the first 15 rows for sanity check. This means that all rows have 3 columns which implies that operation is successful.
[len(T) for T in tr_elements[:15]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [7]:
# Parsing first row as header
tr_elements = doc.xpath('//tr')

# Create an empty list
col=[]
i=0

# Storing each element (header)
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    col.append((name,[]))

In [8]:
# Creating Pandas DataFrame
for j in range(1, len(tr_elements)):
    T=tr_elements[j]
    
    # if row is not of size 3, the //tr data is not from the table we wanted
    if len(T)!=3:
        break
        
    # i is the index of the column
    i=0
    
    # Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content().replace("\n", "")
        # check if row is empty
        if i>0:
        # convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        # append the data to the empty list of the 1th column
        col[i][1].append(data)
        # increment i for the next column
        i+=1

In [9]:
# Sanity check : this shows each of the 3 columns has exactly 289 values
[len(C) for (title,C) in col]

[289, 289, 289]

In [10]:
# Create DataFrame
Dict={title:column for (title, column) in col}
df=pd.DataFrame(Dict)

In [11]:
df.shape

(289, 3)

In [12]:
# Checking the 1st 5 items in the DataFrame
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [13]:
# Viewing the last 5 items in the DataFrame
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
288,M9Z,Not assigned,Not assigned


In [14]:
# Making a copy of df to make changes so as to retain original, in case required.
df1 = df.copy()

In [15]:
# Cleaning up column headers
df1.columns = df1.columns.str.strip()

In [16]:
# Checking column headers
df1.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [17]:
# Sanity check to see if df1 is same as df
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [18]:
# Drop rows where Borough is 'Not assigned'
df1 = df1[df1.Borough != 'Not assigned']

In [19]:
# Check shape of df1 to ensure that rows where Borough is Not assigned are dropped
df1.shape

(212, 3)

In [20]:
# Check 1st 5 items of dataframe after dropping 'Not assigned' rows.
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [21]:
# Comibining Neighbourhood with common postal code & Resetting Index
df1 = pd.DataFrame(df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join) )
df1 = df1.reset_index()

In [22]:
# Checking 1st 5 rows of the new df1 with combined neighbourhood for common postcode
df1.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [23]:
# For those not assigned neighbourhood, they will assume that of the borough
df1.Neighbourhood[df1.Neighbourhood == 'Not assigned'] = df1.Borough

In [24]:
# Checking the new df1 after accounting for the not assigned neighbourhood to be that of the borough 
# and to combine neighbourhoods for common postcode
df1.shape

(103, 3)

# Getting the Latitude and Longtitude Coordinates for the Neighbourhoods

In [26]:
# Reading the geospatial data csv file
geospatial_data = pd.read_csv('Geospatial_Coordinates.csv')

In [27]:
# Checking column headers
geospatial_data.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [28]:
# Making a DataFrame
geospatial_data = pd.DataFrame(geospatial_data)

In [29]:
# Checking first 5 lines of DataFrame
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Joining the 2 DataFrames
canada_loc = pd.concat([df1, geospatial_data], axis=1, join='inner')

In [31]:
# Checking first 5 lines of combined DataFrame
canada_loc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [32]:
# Dropping duplicate column 
canada_loc.drop(['Postal Code'], axis=1, inplace=True)

In [33]:
# Viewing the 1st 5 items of the Postal Code column
canada_loc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [34]:
# The 2 DataFrames can also be combined using Join instead of concat as above
canada_loc1 = df1.join(geospatial_data)

In [35]:
# Checking 1st 5 lines of joined DataFrame
canada_loc1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [36]:
# Dropping duplicate column 
canada_loc1.drop(['Postal Code'], axis=1, inplace=True)

In [37]:
# Viewing the 1st 5 items in the new DataFrame
canada_loc1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [38]:
# Checking shape of dropped DataFrame (103 rows and 3 columns)
canada_loc1.shape

(103, 5)

# Obtaining Boroughs with Toronto

In [39]:
# Understand what are the unique values for Borough
canada_loc.Borough.value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [40]:
# Selecting Boroughs with Toronto for further analysis
target = ['Downtown Toronto', 'Central Toronto','West Toronto', 'East Toronto']
toronto_data = canada_loc1.loc[canada_loc['Borough'].isin(target)].reset_index(drop=True)
toronto_data.shape

(38, 5)

In [41]:
# Review the DataFrame
toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


# Putting Data in a Map

In [42]:
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

In [43]:
# map rendering library
import folium 

In [45]:
# create map of Toronto, Canada using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Postcode']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [76]:
# Defining Foursquare Credentials and Version
CLIENT_ID = '' 
CLIENT_SECRET = '' 
VERSION = '20190108' # Foursquare API version


In [47]:
# Getting the Postcode for the 10th position in the list for further exploration
toronto_data.loc[9, 'Postcode']

'M4V'

In [48]:
postcode_latitude = toronto_data.loc[9, 'Latitude'] # postcode latitude value
postcode_longitude = toronto_data.loc[9, 'Longitude'] # postcode longitude value

neighbourhood_name = toronto_data.loc[9, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               postcode_latitude, 
                                                               postcode_longitude))

Latitude and longitude values of Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West are 43.68641229999999, -79.4000493.


In [49]:
# Explore the top 150 venues in Postcode M4V with a radius of 750m
LIMIT = 150 # limit of number of venues returned by Foursquare API

radius = 750 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    postcode_latitude, 
    postcode_longitude, 
    radius, 
    LIMIT)
# url # display URL

In [50]:
# Import library to handle JSON files
import json 

In [51]:
# Getting the results of the query for the top 150 evnues in Postcode M4V within a radius of 750m
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c36f32b351e3d26294c7a94'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-55c78cef498ec4095e9fba41-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/conveniencestore_',
          'suffix': '.png'},
         'id': '4d954b0ea243a5684a65b473',
         'name': 'Convenience Store',
         'pluralName': 'Convenience Stores',
         'primary': True,
         'shortName': 'Convenience Store'}],
       'id': '55c78cef498ec4095e9fba41',
       'location': {'address': '111 St. Clair West',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 91,
        'formattedAddress': ['111 St. Clair West', 'Toronto ON', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.68699

In [52]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [53]:
# Cleaning the JSON file and structuring it into a Pandas DataFrame

# Import json_normalize module from pandas.io.json
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,LCBO,Convenience Store,43.686991,-79.399238
1,Delica Kitchen,Café,43.687398,-79.393744
2,Scaramouche,French Restaurant,43.681293,-79.399492
3,The Bagel House,Bagel Shop,43.687374,-79.393696
4,Cava Restaurant,Tapas Restaurant,43.689809,-79.394932


In [54]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

57 venues were returned by Foursquare.


# Exploring Postcodes with Toronto in the Borough

In [55]:
def getNearbyVenues(names, latitudes, longitudes, radius=750):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [56]:
# Generating venues for toronto
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [57]:
# Checking the size of the DataFrame as well as the first 5 lines of data
print(toronto_venues.shape)
toronto_venues.head()

(2587, 7)


Unnamed: 0,Neighbourhood,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
1,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
2,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.67263,-79.287993,Ice Cream Shop
3,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
4,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub


In [58]:
# Checking number of venues returned by each neighbourhood
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",86,86,86,86,86,86
Business Reply Mail Processing Centre 969 Eastern,53,53,53,53,53,53
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",21,21,21,21,21,21
"Cabbagetown, St. James Town",71,71,71,71,71,71
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,27,27,27,27,27,27
Church and Wellesley,100,100,100,100,100,100


In [59]:
# Checking the uique categories returned
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 280 uniques categories.


# Analyzing each neighbourhood

In [60]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Checking new dataframe size
toronto_onehot.shape

(2587, 281)

In [62]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
2,"Brockton, Exhibition Place, Parkdale Village",0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,...,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018868,0.0,0.018868
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.047619,0.047619,0.095238,0.095238,0.142857,0.0,...,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.01,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.01
7,"Chinatown, Grange Park, Kensington Market",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.04,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.01,0.01


In [63]:
# What's the new size
toronto_grouped.shape

(38, 281)

In [64]:
# Finding out top 5 common venues in each neighbourhood
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                 Café  0.07
1          Coffee Shop  0.06
2           Steakhouse  0.04
3                Hotel  0.04
4  American Restaurant  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.06
1          Restaurant  0.06
2               Hotel  0.06
3                Café  0.05
4  Italian Restaurant  0.04


----Brockton, Exhibition Place, Parkdale Village----
            venue  freq
0     Coffee Shop  0.06
1            Café  0.05
2             Bar  0.05
3       Gift Shop  0.02
4  Sandwich Place  0.02


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0  Fast Food Restaurant  0.08
1    Light Rail Station  0.06
2                   Bar  0.06
3    Italian Restaurant  0.04
4       Harbor / Marina  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0  Airport Terminal  

In [65]:
# Function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [66]:
# Create a new dataframe, displaying the top 10 venues for each neighborhood.

import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,American Restaurant,Steakhouse,Hotel,Asian Restaurant,Thai Restaurant,Theater,Sushi Restaurant,Concert Hall
1,Berczy Park,Hotel,Coffee Shop,Restaurant,Café,Italian Restaurant,Pub,Bakery,Cocktail Bar,Park,Japanese Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Bar,Café,Mexican Restaurant,Arts & Crafts Store,Italian Restaurant,Sandwich Place,BBQ Joint,Thrift / Vintage Store,Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Fast Food Restaurant,Bar,Light Rail Station,Bakery,Park,Italian Restaurant,Harbor / Marina,Burrito Place,Yoga Studio,Sandwich Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Terminal,Harbor / Marina,Airport Lounge,Airport Service,Sculpture Garden,Boat or Ferry,Tunnel,Park,Rental Car Location,Airport
5,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Café,Grocery Store,Park,Pizza Place,Pub,Italian Restaurant,Bakery,Thai Restaurant
6,Central Bay Street,Coffee Shop,Café,Bubble Tea Shop,Burger Joint,Italian Restaurant,Chinese Restaurant,Clothing Store,Pizza Place,Bar,Tea Room
7,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Bakery,Mexican Restaurant,Chinese Restaurant,Dessert Shop,Coffee Shop,Dumpling Restaurant
8,Christie,Grocery Store,Café,Park,Indian Restaurant,Diner,Coffee Shop,Latin American Restaurant,Karaoke Bar,Restaurant,Baby Store
9,Church and Wellesley,Coffee Shop,Japanese Restaurant,Burger Joint,Gay Bar,Restaurant,Bubble Tea Shop,Café,Sushi Restaurant,Dance Studio,Sandwich Place


# Clustering the Neighbourhoods

In [67]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [68]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [69]:
# Create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood.

toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Bar,Breakfast Spot,Gastropub,Sandwich Place,Japanese Restaurant,Coffee Shop,Pet Store,Shoe Store,Café
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Pub,Fast Food Restaurant,Grocery Store,Café,Ice Cream Shop,Spa,Yoga Studio,Bakery
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Indian Restaurant,Coffee Shop,Sandwich Place,Brewery,Grocery Store,Fast Food Restaurant,Park,Café,Gym,Skate Park
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Bakery,Café,Bar,Sandwich Place,Diner,Sushi Restaurant,American Restaurant,Italian Restaurant,Convenience Store
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Swim School,Business Service,Coffee Shop,Bus Line,Park,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant


In [70]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examining the Clusters

In [71]:
# Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Pub,Bar,Breakfast Spot,Gastropub,Sandwich Place,Japanese Restaurant,Coffee Shop,Pet Store,Shoe Store,Café
1,East Toronto,0,Greek Restaurant,Coffee Shop,Pub,Fast Food Restaurant,Grocery Store,Café,Ice Cream Shop,Spa,Yoga Studio,Bakery
2,East Toronto,0,Indian Restaurant,Coffee Shop,Sandwich Place,Brewery,Grocery Store,Fast Food Restaurant,Park,Café,Gym,Skate Park
3,East Toronto,0,Coffee Shop,Bakery,Café,Bar,Sandwich Place,Diner,Sushi Restaurant,American Restaurant,Italian Restaurant,Convenience Store
4,Central Toronto,0,Swim School,Business Service,Coffee Shop,Bus Line,Park,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
5,Central Toronto,0,Coffee Shop,Park,Gym,Café,Pizza Place,Bakery,Taco Place,Dessert Shop,Sushi Restaurant,Supermarket
6,Central Toronto,0,Coffee Shop,Sporting Goods Shop,Bakery,Italian Restaurant,Restaurant,Diner,Café,Gym Pool,Grocery Store,Spa
7,Central Toronto,0,Italian Restaurant,Coffee Shop,Dessert Shop,Sandwich Place,Pizza Place,Gym,Restaurant,Fast Food Restaurant,Indian Restaurant,Café
8,Central Toronto,0,Park,Grocery Store,Candy Store,Café,Thai Restaurant,Sandwich Place,Bank,Gym / Fitness Center,Gym,Japanese Restaurant
9,Central Toronto,0,Coffee Shop,Sushi Restaurant,Italian Restaurant,Café,Bagel Shop,Skating Rink,Sandwich Place,Pub,Pizza Place,Gym


In [72]:
# Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,1,Coffee Shop,Café,Bubble Tea Shop,Burger Joint,Italian Restaurant,Chinese Restaurant,Clothing Store,Pizza Place,Bar,Tea Room


In [73]:
# Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,Downtown Toronto,2,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Japanese Restaurant,Bakery,Seafood Restaurant,Cocktail Bar,Beer Bar


In [74]:
# Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,Central Toronto,3,Coffee Shop,Pizza Place,History Museum,Vegetarian / Vegan Restaurant,Sandwich Place,Café,Gym,Jewish Restaurant,Burger Joint,Pub
27,Downtown Toronto,3,Airport Terminal,Harbor / Marina,Airport Lounge,Airport Service,Sculpture Garden,Boat or Ferry,Tunnel,Park,Rental Car Location,Airport


In [75]:
# Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Playground,Home Service,Business Service,Pet Store,Garden,Falafel Restaurant,Exhibit,Event Space,Ethiopian Restaurant,Electronics Store
