In [211]:
import pandas as pd
import numpy as np

from html.parser import HTMLParser
from bs4 import BeautifulSoup
import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [65]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/72/ff/004bfe344150a064e558cb2aedeaa02ecbf75e60e148a55a9198f0c41765/folium-0.10.0-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 19.4MB/s 
Collecting jinja2>=2.9 (from folium)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/e7/fd8b501e7a6dfe492a433deb7b9d833d39ca74916fa8bc63dd1a4947a671/Jinja2-2.10.1-py2.py3-none-any.whl (124kB)
[K     |████████████████████████████████| 133kB 35.6MB/s 
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: jinja2, branca, folium
  Found existing installation: Jinja2 2.8
    Not uninstalling jinja2 at /usr/local/lib/python3.5/dist-packages, outside environment /resources/common/.virtualenv/python3
    Can't uninstall 'Jinja2'. No files were found to uninstall.
Successful

In [35]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 13.8MB/s 
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting click (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl (81kB)
[K     |████████████████████████████████| 81kB 18.5MB/s 
Installing collected packages: ratelim, click, geocoder
Successfully installed click-7.0 geocoder-1.38.1 ratelim-0.1.6


Load html page from wikipedia

In [149]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

Parse the table from html to pandas and save it to list

In [150]:
l = pd.read_html(page, header=0, attrs={"class":"wikitable sortable"})[0]

Drop all rows where Borough is Not assigned

In [151]:
l = l[l.Borough != 'Not assigned']

In [152]:
l = l.rename(columns={'Neighbourhood': 'Neighborhood'})

Group rows by Postcode and Borught and Neighborhood consolidate to string separated by ,.

In [153]:
l = l.groupby(['Postcode', 'Borough'],as_index=False)['Neighborhood'].agg(', '.join)

In [154]:
indexes = l[ ~l['Borough'].str.contains("Toronto") ].index


l.drop( indexes , inplace=True)

Search all rows where Neighborhood is not assigned and fill them with a value of Borough

In [155]:
for i in range(len(l)):
    if l['Neighborhood'].values[i] == 'Not assigned':
        l['Neighborhood'].values[i] = l['Borough'].values[i]

Add Latitude and Longitude columns to the list

In [156]:
l["Latitude"] = ""
l["Longitude"] = ""

Import geocoder library

In [157]:
import geocoder # import geocoder

Define a function for getting coordinates via geocoder.arcgis

In [158]:
def getCoordinates(postcode, borough, neighborhood):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis("%s, %s, %s" % (postcode, borough, neighborhood))
        lat_lng_coords = g.latlng
    
    coordinates = {
    "lat" : lat_lng_coords[0],
    "lon" : lat_lng_coords[1]
        }
    return coordinates

For each row on the list fill in latitide nad longitude by calling getCoordinates function

In [159]:
for i in range(len(l)):
    postcode = l['Postcode'].values[i]
    borough = l['Borough'].values[i]
    neighborhood = l['Neighborhood'].values[i]
    c = getCoordinates(postcode, borough, neighborhood)
    l['Latitude'].values[i] = c['lat']
    l['Longitude'].values[i] = c['lon']
        

Create a DataFrame from the list

In [160]:
neighborhoods = pd.DataFrame(l)

Show the result

In [161]:
neighborhoods

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.6741,-79.2964
41,M4K,East Toronto,"The Danforth West, Riverdale",40.4737,-80.7325
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.6741,-79.2964
43,M4M,East Toronto,Studio District,40.4737,-80.7325
44,M4N,Central Toronto,Lawrence Park,43.7255,-79.4023
45,M4P,Central Toronto,Davisville North,37.7992,-95.9526
46,M4R,Central Toronto,North Toronto West,43.724,-79.402
47,M4S,Central Toronto,Davisville,43.7018,-79.3835
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.6837,-79.389
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.6861,-79.4023


Show size of the DataFrame

In [162]:
neighborhoods.shape

(38, 5)

In [125]:
import folium # map rendering library

In [163]:
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis("Toronto")
        lat_lng_coords = g.latlng
    
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
        


In [128]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_toronto

In [164]:
CLIENT_ID = 'EJCQWYHVCHNTPBQIRO5AS3JDCRWELWCTRQ04MLO5L2VFJ0R1' # your Foursquare ID
CLIENT_SECRET = '5OC1XWIM3WCSZYJUQOFGQCTS4JAEILPA3BCIYOQTTJZN4E3W' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EJCQWYHVCHNTPBQIRO5AS3JDCRWELWCTRQ04MLO5L2VFJ0R1
CLIENT_SECRET:5OC1XWIM3WCSZYJUQOFGQCTS4JAEILPA3BCIYOQTTJZN4E3W


In [165]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [166]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [167]:
toronto_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                 
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [168]:
print(toronto_venues.shape)
toronto_venues.head()

(1615, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.67413,-79.29644,Mastermind Toys,43.671453,-79.293971,Toy / Game Store
1,The Beaches,43.67413,-79.29644,Glen Manor Ravine,43.676821,-79.293942,Trail
2,The Beaches,43.67413,-79.29644,Beacher Cafe,43.671938,-79.291238,Breakfast Spot
3,The Beaches,43.67413,-79.29644,Castro's Lounge,43.671104,-79.295107,Bar
4,The Beaches,43.67413,-79.29644,Sanna's Farmacia,43.670929,-79.295969,Juice Bar


In [169]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",67,67,67,67,67,67
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",100,100,100,100,100,100
"Cabbagetown, St. James Town",35,35,35,35,35,35
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,13,13,13,13,13,13
Church and Wellesley,84,84,84,84,84,84
"Commerce Court, Victoria Hotel",100,100,100,100,100,100


In [170]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 217 uniques categories.


In [179]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,...,Trail,Train Station,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
toronto_onehot.shape

(1615, 217)

In [181]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train Station,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.03,0.0,0.01,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.014925,0.0,0.014925,0.0,0.0,0.0,...,0.0,0.0,0.0,0.029851,0.0,0.0,0.014925,0.0,0.0,0.0
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.02,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0
4,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Central Bay Street,0.0,0.01,0.0,0.01,0.01,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0
6,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07,0.0,0.0,0.04,0.01,0.0,0.0
7,Christie,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Church and Wellesley,0.0,0.011905,0.0,0.0,0.0,0.011905,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,0.0,0.011905,0.0
9,"Commerce Court, Victoria Hotel",0.0,0.04,0.0,0.01,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0


In [182]:
toronto_grouped.shape

(34, 217)

In [184]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.09
1                 Café  0.06
2                Hotel  0.04
3  Japanese Restaurant  0.03
4           Restaurant  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.10
1                Café  0.06
2          Restaurant  0.05
3  Italian Restaurant  0.04
4              Bakery  0.04


----Brockton, Exhibition Place, Parkdale Village----
            venue  freq
0     Coffee Shop  0.10
1            Café  0.06
2          Bakery  0.04
3  Sandwich Place  0.04
4      Restaurant  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                venue  freq
0         Coffee Shop  0.08
1  Italian Restaurant  0.07
2          Restaurant  0.05
3                 Bar  0.04
4      Sandwich Place  0.03


----Cabbagetown, St. James Town----
         venue  freq
0         Café  0.09
1  Coffee Shop  0.09
2       Bakery  0.

In [185]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [214]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Hotel,Gym,Burger Joint,Restaurant,Steakhouse,Japanese Restaurant,Seafood Restaurant,Bar
1,Berczy Park,Coffee Shop,Café,Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Beer Bar,Japanese Restaurant,Cocktail Bar,Hotel
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Restaurant,Sandwich Place,Bakery,Furniture / Home Store,Vegetarian / Vegan Restaurant,Pub,Bar,Hotel
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",Coffee Shop,Italian Restaurant,Restaurant,Bar,French Restaurant,Hotel,Beer Bar,Sandwich Place,Spa,Steakhouse
4,"Cabbagetown, St. James Town",Coffee Shop,Café,Park,Pizza Place,Restaurant,Bakery,Pharmacy,Butcher,Jewelry Store,Breakfast Spot


In [215]:
# set number of clusters
kclusters = 5

del toronto_grouped_clustering
del kmeans

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [216]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.6741,-79.2964,2.0,Japanese Restaurant,Nail Salon,Bar,Women's Store,Restaurant,Breakfast Spot,Burger Joint,Café,Sandwich Place,Chocolate Shop
41,M4K,East Toronto,"The Danforth West, Riverdale",40.4737,-80.7325,,,,,,,,,,,
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.6741,-79.2964,2.0,Japanese Restaurant,Nail Salon,Bar,Women's Store,Restaurant,Breakfast Spot,Burger Joint,Café,Sandwich Place,Chocolate Shop
43,M4M,East Toronto,Studio District,40.4737,-80.7325,,,,,,,,,,,
44,M4N,Central Toronto,Lawrence Park,43.7255,-79.4023,2.0,Coffee Shop,Italian Restaurant,Ice Cream Shop,Bank,Yoga Studio,Deli / Bodega,Bus Line,Shoe Store,Seafood Restaurant,Pub


In [219]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters