# Web Scraping Toronto neighbourhoods data from Wikipedia

__1. Web Scraping Wikipedia Table__

In [1]:
import requests
import lxml.html as lh
import pandas as pd

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url) #Create a handle, page, to handle the contents of the website
doc = lh.fromstring(page.content) #Store the contents of the website under doc
tr_elements = doc.xpath('//tr') #tr_elements = doc.xpath('//tr') 

In [3]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [4]:
tr_elements = doc.xpath('//tr')
col=[]
i=0
for t in tr_elements[0]: #For each row, store each first element (header) and an empty list
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [5]:
#Since the first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [6]:
[len(C) for (title,C) in col]

[288, 288, 288]

In [7]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

__2. Dataframe manipulation and computations - Data Transform__

In [8]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [9]:
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,288,288,288
unique,180,12,209
top,M9V,Not assigned,Not assigned\n
freq,8,77,78


In [10]:
for col in df.columns: 
    print(col) 
df.columns

Postcode
Borough
Neighbourhood



Index(['Postcode', 'Borough', 'Neighbourhood\n'], dtype='object')

In [11]:
df.rename(columns={'Neighbourhood\n':'Neighbourhood'}, inplace=True) #last column header had new line escape character, so rename that to remove new line character

In [12]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [13]:
df2 = df[df['Borough'] != 'Not assigned'] #copy a dataframe without unassigned boroughs to the main dataframe

In [14]:
df2 #to check all boroughs have been assigned

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
10,M9A,Etobicoke,Islington Avenue\n
11,M1B,Scarborough,Rouge\n
12,M1B,Scarborough,Malvern\n


In [15]:
df3 = df2[df2['Neighbourhood'] == 'Not assigned\n'] #copy a dataframe with assigned boroughs but unassigned neighborhoods to a new dataframe

In [16]:
df3

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned\n


In [17]:
df3['Neighbourhood'] = df3['Borough'] #assign unassigned neighborhood in new dataframe to its borough value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
df3 #check

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [19]:
com_df = pd.concat([df2, df3], axis=0) #combine original dataframe and previous data into a combined final dataframe

In [20]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
10,M9A,Etobicoke,Islington Avenue\n
11,M1B,Scarborough,Rouge\n
12,M1B,Scarborough,Malvern\n


___I noticed that all the neighborhood values of the dataframe also have a new line escape character. What follows is the code to remove the escape character at the end of all neighborhood column values___

In [21]:
l = com_df.ix[:,2].tolist() #assign all neighborhood column values to a list

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


In [22]:
for i in range(len(l)-1): #loop through contents of list (last element not required since it was the neighborhood we assigned to its borough) and slice out final two characters
    l[i] = l[i][0:-1]

In [23]:
l #check that all values have no escape characters at the end

['Parkwoods',
 'Victoria Village',
 'Harbourfront',
 'Regent Park',
 'Lawrence Heights',
 'Lawrence Manor',
 'Not assigned',
 'Islington Avenue',
 'Rouge',
 'Malvern',
 'Don Mills North',
 'Woodbine Gardens',
 'Parkview Hill',
 'Ryerson',
 'Garden District',
 'Glencairn',
 'Cloverdale',
 'Islington',
 'Martin Grove',
 'Princess Gardens',
 'West Deane Park',
 'Highland Creek',
 'Rouge Hill',
 'Port Union',
 'Flemingdon Park',
 'Don Mills South',
 'Woodbine Heights',
 'St. James Town',
 'Humewood-Cedarvale',
 'Bloordale Gardens',
 'Eringate',
 'Markland Wood',
 'Old Burnhamthorpe',
 'Guildwood',
 'Morningside',
 'West Hill',
 'The Beaches',
 'Berczy Park',
 'Caledonia-Fairbanks',
 'Woburn',
 'Leaside',
 'Central Bay Street',
 'Christie',
 'Cedarbrae',
 'Hillcrest Village',
 'Bathurst Manor',
 'Downsview North',
 'Wilson Heights',
 'Thorncliffe Park',
 'Adelaide',
 'King',
 'Richmond',
 'Dovercourt Village',
 'Dufferin',
 'Scarborough Village',
 'Fairview',
 'Henry Farm',
 'Oriole',
 'Nor

In [24]:
len(l) 

212

In [25]:
len(com_df) #check if length of df matches length of list with escape chars removed

212

In [26]:
com_df['Neighbourhood'] = l #assign neighbourhood column values to our list

In [27]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


___The following is the code to combine all neighbourhoods with the same postal code into 1 record in the dataframe. Using 'groupby' and 'agg'___

In [28]:
com_df = com_df.groupby('Postcode').agg({'Neighbourhood': ', '.join, 'Borough':'first' }).reset_index() #

In [29]:
com_df #check that neighborhoods have been combined

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough
5,M1J,Scarborough Village,Scarborough
6,M1K,"East Birchmount Park, Ionview, Kennedy Park",Scarborough
7,M1L,"Clairlea, Golden Mile, Oakridge",Scarborough
8,M1M,"Cliffcrest, Cliffside, Scarborough Village West",Scarborough
9,M1N,"Birch Cliff, Cliffside West",Scarborough


In [30]:
com_df = com_df[['Postcode', 'Borough', 'Neighbourhood']] #rearrange columns 

In [31]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [32]:
print(com_df.shape)

(103, 3)


___Read in csv file and put into dataframe___

In [33]:
coords = pd.read_csv('https://cocl.us/Geospatial_data')
coords

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [34]:
com_df['Longitude'] = coords['Longitude'].values #assign longitude values in csv dataframe to our dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [35]:
com_df['Latitude'] = coords['Latitude'].values #assign latitude values in csv dataframe to our dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [36]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Longitude,Latitude
0,M1B,Scarborough,"Rouge, Malvern",-79.194353,43.806686
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",-79.160497,43.784535
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",-79.188711,43.763573
3,M1G,Scarborough,Woburn,-79.216917,43.770992
4,M1H,Scarborough,Cedarbrae,-79.239476,43.773136
5,M1J,Scarborough,Scarborough Village,-79.239476,43.744734
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",-79.262029,43.727929
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",-79.284577,43.711112
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",-79.239476,43.716316
9,M1N,Scarborough,"Birch Cliff, Cliffside West",-79.264848,43.692657


In [38]:
import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

## 1. Explore Neighborhoods in Toronto

In [50]:
toronto_data = com_df[com_df['Neighbourhood'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Longitude,Latitude
0,M3K,North York,"CFB Toronto, Downsview East",-79.464763,43.737473
1,M4J,East York,East Toronto,-79.338106,43.685347
2,M4R,Central Toronto,North Toronto West,-79.405678,43.715383
3,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",-79.381752,43.640816
4,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",-79.381576,43.647177


#### Using geopy library to get the latitude and longitude values of Toronto

In [51]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [52]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [103]:
# %hidden_cell
CLIENT_ID = '4IVRXFDYLDSG3DS1DVO5AJNLDTQ440PLZKNWEQUEU2BKG3PH' # your Foursquare ID
CLIENT_SECRET = 'UR11KPXXW2RSBWGNW20YYCKI4LG3KQNYVEIFCF1IQ0UOQY1I' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4IVRXFDYLDSG3DS1DVO5AJNLDTQ440PLZKNWEQUEU2BKG3PH
CLIENT_SECRET:UR11KPXXW2RSBWGNW20YYCKI4LG3KQNYVEIFCF1IQ0UOQY1I


In [54]:
toronto_data.loc[0, 'Neighbourhood']

'CFB Toronto, Downsview East'

In [56]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of CFB Toronto, Downsview East are 43.737473200000004, -79.46476329999999.


#### Get the top 100 venues that are in CFB Toronto within a radius of 500 meters.

In [57]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL 

'https://api.foursquare.com/v2/venues/explore?&client_id=4IVRXFDYLDSG3DS1DVO5AJNLDTQ440PLZKNWEQUEU2BKG3PH&client_secret=UR11KPXXW2RSBWGNW20YYCKI4LG3KQNYVEIFCF1IQ0UOQY1I&v=20180605&ll=43.737473200000004,-79.46476329999999&radius=500&limit=100'

In [58]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5daeec332b274a0039927aa8'},
  'headerLocation': 'Clanton Park',
  'headerFullLocation': 'Clanton Park, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.741973204500006,
    'lng': -79.45854667937631},
   'sw': {'lat': 43.7329731955, 'lng': -79.47097992062366}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bcb76143740b7133e926265',
       'name': 'Toronto Downsview Airport (YZD)',
       'location': {'address': 'Garratt Blvd',
        'lat': 43.738882611749744,
        'lng': -79.47011109314882,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.738882611749744,
          'lng': -79.47011109314882}],
        'distance': 457,
        'cc': 'CA',
   

In [59]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [60]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Toronto Downsview Airport (YZD),Airport,43.738883,-79.470111
1,Ancaster Park,Park,43.734706,-79.464777


In [61]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


#### Creating a function to repeat the same process for all the neighborhoods in Toronto

In [62]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [65]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

CFB Toronto, Downsview East
East Toronto
North Toronto West
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Harbord, University of Toronto
Humber Bay Shores, Mimico South, New Toronto


In [66]:
print(toronto_venues.shape)
toronto_venues.head()

(278, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"CFB Toronto, Downsview East",43.737473,-79.464763,Toronto Downsview Airport (YZD),43.738883,-79.470111,Airport
1,"CFB Toronto, Downsview East",43.737473,-79.464763,Ancaster Park,43.734706,-79.464777,Park
2,East Toronto,43.685347,-79.338106,Aldwych Park,43.684901,-79.341091,Park
3,East Toronto,43.685347,-79.338106,The Path,43.683923,-79.335007,Park
4,East Toronto,43.685347,-79.338106,Sammon Convenience,43.686951,-79.335007,Convenience Store


In [71]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CFB Toronto, Downsview East",2,2,2,2,2,2
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100
East Toronto,4,4,4,4,4,4
"Harbord, University of Toronto",38,38,38,38,38,38
"Harbourfront East, Toronto Islands, Union Station",100,100,100,100,100,100
"Humber Bay Shores, Mimico South, New Toronto",14,14,14,14,14,14
North Toronto West,20,20,20,20,20,20


In [69]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 96 uniques categories.


## 2. Analyze Each Neighborhood

In [72]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,American Restaurant,Aquarium,Art Gallery,Asian Restaurant,Bakery,Bar,Baseball Stadium,Basketball Stadium,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
toronto_onehot.shape

(278, 96)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [74]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,American Restaurant,Aquarium,Art Gallery,Asian Restaurant,Bakery,Bar,Baseball Stadium,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar
0,"CFB Toronto, Downsview East",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Design Exchange, Toronto Dominion Centre",0.0,0.0,0.04,0.0,0.01,0.01,0.02,0.03,0.0,...,0.02,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.01
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Harbord, University of Toronto",0.026316,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.0,...,0.0,0.0,0.026316,0.0,0.0,0.026316,0.0,0.0,0.026316,0.0
4,"Harbourfront East, Toronto Islands, Union Station",0.0,0.0,0.0,0.05,0.01,0.0,0.02,0.02,0.02,...,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.01,0.0,0.01
5,"Humber Bay Shores, Mimico South, New Toronto",0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
toronto_grouped.shape

(7, 96)

#### Let's print each neighborhood along with the top 5 most common venues

In [76]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CFB Toronto, Downsview East----
               venue  freq
0            Airport   0.5
1               Park   0.5
2  Indian Restaurant   0.0
3                Pub   0.0
4      Poutine Place   0.0


----Design Exchange, Toronto Dominion Centre----
                 venue  freq
0          Coffee Shop  0.15
1                 Café  0.08
2                Hotel  0.06
3  American Restaurant  0.04
4           Restaurant  0.04


----East Toronto----
                 venue  freq
0                 Park  0.50
1    Convenience Store  0.25
2          Coffee Shop  0.25
3           Restaurant  0.00
4  Rental Car Location  0.00


----Harbord, University of Toronto----
                 venue  freq
0                 Café  0.16
1           Restaurant  0.05
2   Italian Restaurant  0.05
3  Japanese Restaurant  0.05
4               Bakery  0.05


----Harbourfront East, Toronto Islands, Union Station----
         venue  freq
0  Coffee Shop  0.13
1     Aquarium  0.05
2        Hotel  0.05
3         Café  0.04


In [77]:
def return_most_common_venues(row, num_top_venues): #sort venues 
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Creating the new dataframe to display the top 10 venues for each neighborhood.

In [78]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"CFB Toronto, Downsview East",Airport,Park,Wine Bar,Flower Shop,Convenience Store,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store
1,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Bar,Italian Restaurant,Gastropub,Deli / Bodega,Seafood Restaurant
2,East Toronto,Park,Coffee Shop,Convenience Store,Wine Bar,Comfort Food Restaurant,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store
3,"Harbord, University of Toronto",Café,Bar,Restaurant,Italian Restaurant,Japanese Restaurant,Bookstore,Bakery,Comfort Food Restaurant,Pub,Coffee Shop
4,"Harbourfront East, Toronto Islands, Union Station",Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Brewery,Scenic Lookout,Fried Chicken Joint,Restaurant,Sports Bar


## 3. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [79]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 2, 0, 0, 4, 3], dtype=int32)

In [91]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3K,North York,"CFB Toronto, Downsview East",-79.464763,43.737473,1,Airport,Park,Wine Bar,Flower Shop,Convenience Store,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store
1,M4J,East York,East Toronto,-79.338106,43.685347,2,Park,Coffee Shop,Convenience Store,Wine Bar,Comfort Food Restaurant,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store
2,M4R,Central Toronto,North Toronto West,-79.405678,43.715383,3,Coffee Shop,Clothing Store,Sporting Goods Shop,Yoga Studio,Burger Joint,Mexican Restaurant,Diner,Dessert Shop,Park,Pet Store
3,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",-79.381752,43.640816,0,Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Brewery,Scenic Lookout,Fried Chicken Joint,Restaurant,Sports Bar
4,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",-79.381576,43.647177,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Bar,Italian Restaurant,Gastropub,Deli / Bodega,Seafood Restaurant


___Finally, let's visualize the resulting clusters___

In [93]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 4. Examine Clusters

Now, we examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories.

In [95]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Downtown Toronto,0,Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Brewery,Scenic Lookout,Fried Chicken Joint,Restaurant,Sports Bar
4,Downtown Toronto,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Bar,Italian Restaurant,Gastropub,Deli / Bodega,Seafood Restaurant
5,Downtown Toronto,0,Café,Bar,Restaurant,Italian Restaurant,Japanese Restaurant,Bookstore,Bakery,Comfort Food Restaurant,Pub,Coffee Shop


In [96]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,1,Airport,Park,Wine Bar,Flower Shop,Convenience Store,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store


In [97]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East York,2,Park,Coffee Shop,Convenience Store,Wine Bar,Comfort Food Restaurant,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store


In [98]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Central Toronto,3,Coffee Shop,Clothing Store,Sporting Goods Shop,Yoga Studio,Burger Joint,Mexican Restaurant,Diner,Dessert Shop,Park,Pet Store


In [99]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Etobicoke,4,Café,Gym,Liquor Store,Restaurant,Sandwich Place,Seafood Restaurant,Pizza Place,Fast Food Restaurant,Flower Shop,Pharmacy
