In [1]:
import requests
import lxml.html as lh
import pandas as pd

# First get the webpage and find the table (marked in html with /tr)

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website

page = requests.get(url)

#Store the contents of the website under doc

doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML

tr_elements = doc.xpath('//tr')

# Loop and import the table rows into a python list, find the column names

In [3]:
#Create empty list

col=[]
i=0

#For each row, store each first element (header) and an empty list

for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


# Iterate through the list and clean up the data

In [4]:
#Since the first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0 and t!="Not assigned":
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

# Create a python dictionary from the clean data and make a pandas dataframe from that dictionary - shape the data as in the exercise instructions.

In [5]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

df.columns=['Postcode','Borough','Neighbourhood']

df.drop([0],axis=0,inplace=True)

df.reset_index()

# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)

# More than one neighborhood can exist in one postal code area. 
# For example, in the table on the Wikipedia page, 
# you will notice that M5A is listed twice and has two neighborhoods: 
# Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods 
# separated with a comma as shown in row 11 in the above table.

df=df.groupby("Postcode").agg(lambda x:','.join(set(x)))

# If a cell has a borough but a Not assigned neighborhood, 
# then the neighborhood will be the same as the borough. 
# So for the 9th cell in the table on the Wikipedia page, 
# the value of the Borough and the Neighborhood columns will be Queen's Park.

df.loc[df['Neighbourhood']=="Not assigned",'Neighbourhood']=df.loc[df['Neighbourhood']=="Not assigned",'Borough']

df.shape


(103, 2)

# Then just for the heck of it print the first ten lines of the resulting dataframe.

In [6]:
df.head(10)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge\n,Malvern\n"
M1C,Scarborough,"Port Union\n,Rouge Hill\n,Highland Creek\n"
M1E,Scarborough,"Guildwood\n,West Hill\n,Morningside\n"
M1G,Scarborough,Woburn\n
M1H,Scarborough,Cedarbrae\n
M1J,Scarborough,Scarborough Village\n
M1K,Scarborough,"East Birchmount Park\n,Kennedy Park\n,Ionview\n"
M1L,Scarborough,"Golden Mile\n,Clairlea\n,Oakridge\n"
M1M,Scarborough,"Scarborough Village West\n,Cliffside\n,Cliffcr..."
M1N,Scarborough,"Birch Cliff\n,Cliffside West\n"


# Get the geodata .csv file

In [7]:
geo_data=pd.read_csv("https://cocl.us/Geospatial_data")
geo_data


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [8]:
df['Latitude']=geo_data['Latitude'].values
df['Longitude']=geo_data['Longitude'].values

df

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge\n,Malvern\n",43.806686,-79.194353
M1C,Scarborough,"Port Union\n,Rouge Hill\n,Highland Creek\n",43.784535,-79.160497
M1E,Scarborough,"Guildwood\n,West Hill\n,Morningside\n",43.763573,-79.188711
M1G,Scarborough,Woburn\n,43.770992,-79.216917
M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
M1J,Scarborough,Scarborough Village\n,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park\n,Kennedy Park\n,Ionview\n",43.727929,-79.262029
M1L,Scarborough,"Golden Mile\n,Clairlea\n,Oakridge\n",43.711112,-79.284577
M1M,Scarborough,"Scarborough Village West\n,Cliffside\n,Cliffcr...",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff\n,Cliffside West\n",43.692657,-79.264848


# Import the libraries needed for folium 

In [9]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
# uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# Check the number of boroughs and neighborhoods

In [12]:
print('The dataframe has {} boroughs and {} Neighbourhood.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 Neighbourhood.


# Set up the geo coordinates to draw a map of Toronto and it's neighborhoods

In [13]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="can_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

print(df['Latitude'])

# add markers to map
#for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Borough']):
#    label = folium.Popup(label, parse_html=True)
#    folium.CircleMarker(
#        [lat, lng],
#        radius=5,
#        popup=label,
#        color='blue',
#        fill=True,
#        fill_color='#3186cc',
#        fill_opacity=0.7,
#        parse_html=False).add_to(map_toronto)  
    
map_toronto

Postcode
M1B    43.806686
M1C    43.784535
M1E    43.763573
M1G    43.770992
M1H    43.773136
M1J    43.744734
M1K    43.727929
M1L    43.711112
M1M    43.716316
M1N    43.692657
M1P    43.757410
M1R    43.750072
M1S    43.794200
M1T    43.781638
M1V    43.815252
M1W    43.799525
M1X    43.836125
M2H    43.803762
M2J    43.778517
M2K    43.786947
M2L    43.757490
M2M    43.789053
M2N    43.770120
M2P    43.752758
M2R    43.782736
M3A    43.753259
M3B    43.745906
M3C    43.725900
M3H    43.754328
M3J    43.767980
M3K    43.737473
M3L    43.739015
M3M    43.728496
M3N    43.761631
M4A    43.725882
M4B    43.706397
M4C    43.695344
M4E    43.676357
M4G    43.709060
M4H    43.705369
M4J    43.685347
M4K    43.679557
M4L    43.668999
M4M    43.659526
M4N    43.728020
M4P    43.712751
M4R    43.715383
M4S    43.704324
M4T    43.689574
M4V    43.686412
M4W    43.679563
M4X    43.667967
M4Y    43.665860
M5A    43.654260
M5B    43.657162
M5C    43.651494
M5E    43.644771
M5G    43.657952
M5H  

# Draw that map!

In [15]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Set up your Foursquare credentials

In [16]:
CLIENT_ID = 'ULNSFT4QOPZGUX51FNT2W0B4RCWLTF3PZWT1XQ5JTHIYG4OC' # your Foursquare ID
CLIENT_SECRET = '2FFFFUXOV5FSS2UA0O5SBHH55KJY5SYVGJ242QSZKWPY4RVI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ULNSFT4QOPZGUX51FNT2W0B4RCWLTF3PZWT1XQ5JTHIYG4OC
CLIENT_SECRET:2FFFFUXOV5FSS2UA0O5SBHH55KJY5SYVGJ242QSZKWPY4RVI


# Pick a borough to see what neighborhoods are there...  Let's look at York

In [17]:
toronto_data = df[df['Borough'] == 'York'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,York,Humewood-Cedarvale\n,43.693781,-79.428191
1,York,Caledonia-Fairbanks\n,43.689026,-79.453512
2,York,"Silverthorn\n,Keelesdale\n,Mount Dennis\n,Del ...",43.691116,-79.476013
3,York,"Runnymede\n,The Junction North\n",43.673185,-79.487262
4,York,Weston\n,43.706876,-79.518188


# Pick a neighborhood in that borough - #1 is Caledonia/Fairbanks

In [18]:
toronto_data.loc[1, 'Neighbourhood']

'Caledonia-Fairbanks\n'

# Get the latitude and longitude for Caledonia/Fairbanks

In [19]:
neighborhood_latitude = toronto_data.loc[1, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[1, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[1, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Caledonia-Fairbanks
 are 43.6890256, -79.453512.


# Construct the Foursquare search URL

In [20]:
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)


# Now use that url to get the Foursquare json search results

In [21]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ce2b8cd4434b92140ff6f0c'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 6,
  'suggestedBounds': {'ne': {'lat': 43.6935256045, 'lng': -79.44730040297749},
   'sw': {'lat': 43.6845255955, 'lng': -79.45972359702252}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b9ec940f964a520c70137e3',
       'name': 'Shoppers Drug Mart',
       'location': {'address': '2343 Eglinton Ave W',
        'lat': 43.690650720838846,
        'lng': -79.45631000555339,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.690650720838846,
          'lng': -79.45631000555339}],
        'distance': 288,
        'postalCode': 'M6E 2L6',
        'cc': 'CA',


# Define a function to extract the categories of each venue in the json data

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Now use that function to construct a dataset of nearby venues

In [23]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Shoppers Drug Mart,Pharmacy,43.690651,-79.45631
1,KFC,Fast Food Restaurant,43.690647,-79.456326
2,Nairn Park,Park,43.690654,-79.4563
3,Maximum Woman,Women's Store,43.690651,-79.456333
4,Walmart,Market,43.69066,-79.456317


# Count the number of nearby venues returned for this neighborhood

In [24]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


# Now define a function to return nearby venues for all the neighborhoods

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# Now call that function to create a dataframe of all Toronto venues

In [28]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )


Humewood-Cedarvale

Caledonia-Fairbanks

Silverthorn
,Keelesdale
,Mount Dennis
,Del Ray

Runnymede
,The Junction North

Weston



# Check the shape of the venues dataset

In [32]:
print(toronto_venues.shape)
toronto_venues.head()

(20, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale\n,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale\n,43.693781,-79.428191,Glen Cedar Park,43.695399,-79.429253,Playground
2,Humewood-Cedarvale\n,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena
3,Humewood-Cedarvale\n,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
4,Caledonia-Fairbanks\n,43.689026,-79.453512,Shoppers Drug Mart,43.690651,-79.45631,Pharmacy


# How many venues for each neighborhood?

In [33]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Caledonia-Fairbanks\n,6,6,6,6,6,6
Humewood-Cedarvale\n,4,4,4,4,4,4
"Runnymede\n,The Junction North\n",3,3,3,3,3,3
"Silverthorn\n,Keelesdale\n,Mount Dennis\n,Del Ray\n",5,5,5,5,5,5
Weston\n,2,2,2,2,2,2


# How many unique categories of venues are there?

In [34]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 18 uniques categories.


# Analyze each neighborhood's venues

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Bar,Brewery,Bus Line,Caribbean Restaurant,Convenience Store,Fast Food Restaurant,Field,Hockey Arena,Market,Park,Pharmacy,Playground,Restaurant,Sandwich Place,Skating Rink,Trail,Turkish Restaurant,Women's Store
0,Humewood-Cedarvale\n,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale\n,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Humewood-Cedarvale\n,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Humewood-Cedarvale\n,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Caledonia-Fairbanks\n,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [36]:
toronto_onehot.shape

(20, 19)

# Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [37]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Bar,Brewery,Bus Line,Caribbean Restaurant,Convenience Store,Fast Food Restaurant,Field,Hockey Arena,Market,Park,Pharmacy,Playground,Restaurant,Sandwich Place,Skating Rink,Trail,Turkish Restaurant,Women's Store
0,Caledonia-Fairbanks\n,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.333333,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
1,Humewood-Cedarvale\n,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0
2,"Runnymede\n,The Junction North\n",0.0,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Silverthorn\n,Keelesdale\n,Mount Dennis\n,Del ...",0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.2,0.0
4,Weston\n,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
toronto_grouped.shape

(5, 19)

# Let's print each neighborhood along with the top 10 most common venues

In [39]:
num_top_venues = 10

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks
----
                  venue  freq
0                  Park  0.33
1  Fast Food Restaurant  0.17
2              Pharmacy  0.17
3                Market  0.17
4         Women's Store  0.17
5     Convenience Store  0.00
6                 Field  0.00
7          Hockey Arena  0.00
8  Caribbean Restaurant  0.00
9               Brewery  0.00


----Humewood-Cedarvale
----
                venue  freq
0               Trail  0.25
1               Field  0.25
2        Hockey Arena  0.25
3          Playground  0.25
4                 Bar  0.00
5            Pharmacy  0.00
6  Turkish Restaurant  0.00
7        Skating Rink  0.00
8      Sandwich Place  0.00
9          Restaurant  0.00


----Runnymede
,The Junction North
----
                  venue  freq
0              Bus Line  0.33
1  Caribbean Restaurant  0.33
2               Brewery  0.33
3                   Bar  0.00
4            Playground  0.00
5    Turkish Restaurant  0.00
6                 Trail  0.00
7          Skating Rin

# Write a function to sort the venues in descending order.

In [40]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Now create a pandas dataframe of the sorted neighborhood/venue data

In [53]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Caledonia-Fairbanks\n,Park,Women's Store,Pharmacy,Fast Food Restaurant,Market,Restaurant,Playground,Sandwich Place,Skating Rink,Turkish Restaurant
1,Humewood-Cedarvale\n,Hockey Arena,Trail,Playground,Field,Women's Store,Brewery,Bus Line,Caribbean Restaurant,Convenience Store,Fast Food Restaurant
2,"Runnymede\n,The Junction North\n",Brewery,Bus Line,Caribbean Restaurant,Women's Store,Turkish Restaurant,Convenience Store,Fast Food Restaurant,Field,Hockey Arena,Market
3,"Silverthorn\n,Keelesdale\n,Mount Dennis\n,Del ...",Bar,Skating Rink,Sandwich Place,Restaurant,Turkish Restaurant,Field,Brewery,Bus Line,Caribbean Restaurant,Convenience Store
4,Weston\n,Convenience Store,Park,Women's Store,Hockey Arena,Brewery,Bus Line,Caribbean Restaurant,Fast Food Restaurant,Field,Market


# Run k-means to cluster the neighborhoods into 5 groups

In [54]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 4, 2, 1, 0])

# Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [55]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

#toronto_merged.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,York,Humewood-Cedarvale\n,43.693781,-79.428191,4,Hockey Arena,Trail,Playground,Field,Women's Store,Brewery,Bus Line,Caribbean Restaurant,Convenience Store,Fast Food Restaurant
1,York,Caledonia-Fairbanks\n,43.689026,-79.453512,3,Park,Women's Store,Pharmacy,Fast Food Restaurant,Market,Restaurant,Playground,Sandwich Place,Skating Rink,Turkish Restaurant
2,York,"Silverthorn\n,Keelesdale\n,Mount Dennis\n,Del ...",43.691116,-79.476013,1,Bar,Skating Rink,Sandwich Place,Restaurant,Turkish Restaurant,Field,Brewery,Bus Line,Caribbean Restaurant,Convenience Store
3,York,"Runnymede\n,The Junction North\n",43.673185,-79.487262,2,Brewery,Bus Line,Caribbean Restaurant,Women's Store,Turkish Restaurant,Convenience Store,Fast Food Restaurant,Field,Hockey Arena,Market
4,York,Weston\n,43.706876,-79.518188,0,Convenience Store,Park,Women's Store,Hockey Arena,Brewery,Bus Line,Caribbean Restaurant,Fast Food Restaurant,Field,Market


# Finally, let's visualize the resulting clusters

In [57]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters