# Assignment Week 3 Capstone Data Science

#### Import libraries required for assignment

In [1]:
#Required to open URL
import requests

In [2]:
#Install BeautifulSoup - comment out else when done
#!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

In [3]:
#Pandas library
import pandas as pd

In [4]:
#Install geocoder - comment out else when done
#!conda install -c conda-forge geocoder --yes
import geocoder

In [5]:
#Install geopy - comment out else when done
#!conda install -c conda-forge geopy --yes
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [6]:
import json # library to handle JSON files
# import k-means from clustering stage
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize


In [7]:
import numpy as np # library to handle data in a vectorized manner

In [8]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

### First stage is to scrape the wiki page for the data and format data into a data frame.

In [9]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate exception
# instead of letting error HTML flow into the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
# NOTE: despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text

#### Parse the HTML of the URL into a Beautiful Soup tree

In [10]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree could differ between machines.
soup = BeautifulSoup(website_url, 'html.parser')

In [11]:
#Find the title
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

#### Look through and find the required table. Data required starts with HTML tag "table"

In [12]:
#print(soup.prettify())

#### Bring back the table using find

In [13]:
# Grab the first <table> matching class_='wikitable sortable'. find() returns None
# when nothing matches, in which case the row loop in the next code cell would fail.
pcode_table=soup.find('table', class_='wikitable sortable')
# Uncomment to inspect the raw table HTML:
#pcode_table

#### Each row in the table starts with a "tr" tag, and the data within each row is held in "td" tags.

##### Loop through the pcode table and find the "tr" then the "td" tags. If the row has 3 columns then store the data in lists A (first column), B (second column) and C(third column).

In [14]:
# Column accumulators: A = first column, B = second column, C = third column.
A, B, C = [], [], []

for table_row in pcode_table.findAll('tr'):
    data_cells = table_row.findAll('td')
    # Only rows with exactly three <td> cells are data rows; anything else is skipped.
    if len(data_cells) != 3:
        continue
    # Store the first text node of each cell in the matching column list.
    for column_values, cell in zip((A, B, C), data_cells):
        column_values.append(cell.find(text=True))
        

In [15]:
#Have a quick check of the first element of the each list.
A[0]

'M1A'

In [16]:
B[0]

'Not assigned'

In [17]:
C[0]

'Not assigned\n'

#### There appear to be some trailing newlines in the columns. Use rstrip to go through each list and clean them.

In [18]:
# Strip trailing whitespace (including the newlines seen above) from every entry.
# Slice assignment updates each list in place, as the index-based loops did.
for column_values in (A, B, C):
    column_values[:] = [entry.rstrip() for entry in column_values]

#### Import the pandas library and create a dataframe df with columns PostalCode, Borough and Neighbourhood and assign the column lists to the dataframe.

In [19]:
# pandas is already imported as pd in the import section at the top of the notebook,
# so it is not imported again here.
# Build the frame in one step; the dict's insertion order fixes the column order.
df = pd.DataFrame({'Postal Code': A, 'Borough': B, 'Neighbourhood': C})

#### Check the top and bottom of the dataframe to ensure it's captured

In [20]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [21]:
df.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor
287,M9Z,Not assigned,Not assigned


#### Need to ensure that rows are removed that don't have a borough or neighbourhood assigned

In [22]:
# Keep only the rows whose borough has been assigned.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

In [23]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Combine neighbourhoods (comma separated), and remove duplicate rows of postal codes and boroughs.

In [24]:
# One group per (postal code, borough) pair; the neighbourhood names in each group
# are joined into a single comma-separated string.
neighbourhoods_by_code = df.groupby(['Postal Code','Borough'])['Neighbourhood']
dfnew = neighbourhoods_by_code.apply(','.join)

In [25]:
dfnew.head()

Postal Code  Borough    
M1B          Scarborough                           Rouge,Malvern
M1C          Scarborough    Highland Creek,Rouge Hill,Port Union
M1E          Scarborough         Guildwood,Morningside,West Hill
M1G          Scarborough                                  Woburn
M1H          Scarborough                               Cedarbrae
Name: Neighbourhood, dtype: object

#### Convert back to a dataframe and reset the index

In [26]:
# Wrap the grouped Series in a DataFrame and turn the (Postal Code, Borough)
# index levels back into ordinary columns.
dfnew2 = pd.DataFrame(dfnew).reset_index()

In [27]:
dfnew2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Set the Neighbourhood to the Borough name where the neighbourhood is 'Not assigned'.

In [28]:
# Rows whose neighbourhood is the placeholder text take their borough name instead.
unassigned = dfnew2['Neighbourhood'] == 'Not assigned'
dfnew2.loc[unassigned, 'Neighbourhood'] = dfnew2.loc[unassigned, 'Borough']

In [29]:
df = dfnew2

#### Shape of dataframe

In [30]:
df.shape

(103, 3)

### Second Stage of assignment is to get the Neighbourhood lats and longs

##### Have attempted to use geocoder but it continually returns 'None'. Therefore will proceed to get lats and longs via the CSV

In [31]:
# Single trial lookup of one postal code through geocoder's Google provider.
postal_code = "M1B"
# The retry loop is left disabled: the lookup kept returning None (see the note above
# and the printed result below), so looping until success would not terminate.
#while(lat_lng_coords is None):
g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
lat_lng_coords = g.latlng
# Unpacking stays disabled as well, because lat_lng_coords is None in this run.
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [32]:
print(lat_lng_coords)

None


#### Read in Geo data from CSV file

In [33]:
dfgeo=pd.read_csv('Geospatial_Coordinates.csv')

In [34]:
dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Now the lat long data is loaded will merge with scraped wiki data.

In [35]:
# Left join: every scraped postal code is kept, with coordinates attached where
# the CSV has a matching 'Postal Code'.
dfnew = pd.merge(df, dfgeo, how='left', on=['Postal Code'])

In [36]:
dfnew.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Check the size of the dataframe and change the name to 'neighbourhoods'

In [37]:
dfnew.shape

(103, 5)

In [38]:
neighbourhoods = dfnew

In [39]:
neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [40]:
#Write data to a CSV file -- comment as required
#neighbourhoods.to_csv('Neighbourhood_data.csv')

#### Create a map of Toronto with neighborhoods superimposed on top.

In [41]:
#Get the central lats and longs for Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [42]:
map_toronto = folium.Map(location=[latitude, longitude])
map_toronto

In [43]:
# add neighbourhood markers to map
for lat, lng, bor, nhood in zip(neighbourhoods['Latitude'], neighbourhoods['Longitude'], neighbourhoods['Borough'], neighbourhoods['Neighbourhood']):
    label = '{}, {}'.format(nhood, bor)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  


In [44]:
map_toronto

#### Going to explore the Scarborough borough in the following analysis

In [45]:
# Restrict the data to the Scarborough borough and renumber the rows from 0.
in_scarborough = neighbourhoods['Borough'] == 'Scarborough'
scarborough_data = neighbourhoods[in_scarborough].reset_index(drop=True)
scarborough_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Get the Lats and longs of the Scarborough borough

In [46]:
address = 'Scarborough, Ontario'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode {!r}'.format(address))
# NOTE: this overwrites the Toronto latitude/longitude set earlier; the maps
# created after this cell are centred on Scarborough.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.773077, -79.257774.


In [47]:
# create map of Scarborough using latitude and longitude values
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one marker per Scarborough postal-code area, labelled with its neighbourhood(s)
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighbourhood']):
    # parse_html is a Popup option; it is not a CircleMarker option, so it is
    # passed to the Popup only (the cluster map later in the notebook does the same).
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_scarborough)

map_scarborough

#### Use the FourSquare API to get venue data for the Toronto neighbourhoods

##### Define Foursquare credentials

In [48]:
import os

# Foursquare credentials are read from the environment so they are never stored in
# the notebook or echoed into its saved output. The key pair that used to be
# hard-coded here should be treated as exposed and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether the credentials are present, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before calling the Foursquare API.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


##### Define function that will retrieve venue data for a neighbourhood

In [49]:
def getNearbyVenues(names, latitudes, longitudes, radius=750, limit=100):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in parallel.
    radius : number, default 750
        Value sent as the request's ``radius`` parameter.
    limit : int, default 100
        Value sent as the request's ``limit`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET and VERSION values and prints
    each location name as a progress indicator.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # Bounded wait, and an explicit failure on HTTP errors rather than a
        # KeyError while digging through an error payload.
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # A venue may come back without any category; record None instead of
            # failing on categories[0].
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing columns= keeps the schema even when venue_rows is empty; assigning
    # .columns afterwards would raise a length mismatch on an empty frame.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [50]:

scarborough_venues = getNearbyVenues(names=scarborough_data['Neighbourhood'],
                                   latitudes=scarborough_data['Latitude'],
                                   longitudes=scarborough_data['Longitude']
                                  )

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge


In [51]:
#Have a look at the data
print(scarborough_venues.shape)
scarborough_venues

(205, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.198080,Fast Food Restaurant
3,"Rouge,Malvern",43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
4,"Rouge,Malvern",43.806686,-79.194353,Tim Hortons,43.802000,-79.198169,Coffee Shop
...,...,...,...,...,...,...,...
200,L'Amoreaux West,43.799525,-79.318389,A Buck or Two,43.798286,-79.318485,Thrift / Vintage Store
201,L'Amoreaux West,43.799525,-79.318389,Coffee Time,43.797952,-79.318678,Coffee Shop
202,L'Amoreaux West,43.799525,-79.318389,Presotea,43.799397,-79.319014,Bubble Tea Shop
203,L'Amoreaux West,43.799525,-79.318389,Warden Park,43.804789,-79.320449,Other Great Outdoors


#### Check how many venues were returned for each neighborhood

In [52]:
scarborough_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,14,14,14,14,14,14
"Agincourt North,L'Amoreaux East,Milliken,Steeles East",18,18,18,18,18,18
"Birch Cliff,Cliffside West",7,7,7,7,7,7
Cedarbrae,22,22,22,22,22,22
"Clairlea,Golden Mile,Oakridge",16,16,16,16,16,16
"Clarks Corners,Sullivan,Tam O'Shanter",20,20,20,20,20,20
"Cliffcrest,Cliffside,Scarborough Village West",5,5,5,5,5,5
"Dorset Park,Scarborough Town Centre,Wexford Heights",15,15,15,15,15,15
"East Birchmount Park,Ionview,Kennedy Park",16,16,16,16,16,16
"Guildwood,Morningside,West Hill",11,11,11,11,11,11


#### Let's find out how many unique categories can be curated from all the returned venues

In [53]:
print('There are {} unique categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 85 unique categories.


#### Analyse each neighbourhood

In [54]:
# one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
scarborough_onehot['Neighbourhood'] = scarborough_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.tail()

Unnamed: 0,Neighbourhood,African Restaurant,American Restaurant,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Badminton Court,Bakery,Bank,...,Spa,Sports Bar,Supermarket,Sushi Restaurant,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
200,L'Amoreaux West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
201,L'Amoreaux West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202,L'Amoreaux West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,L'Amoreaux West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
204,L'Amoreaux West,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
scarborough_onehot.shape

(205, 86)

#### Next, let's group rows by neighbourhood, taking the mean of the frequency of occurrence of each category

In [56]:
# Averaging the 0/1 indicators per neighbourhood gives, for each category, the
# share of that neighbourhood's venues falling in it.
venues_by_neighbourhood = scarborough_onehot.groupby('Neighbourhood')
scarborough_grouped = venues_by_neighbourhood.mean().reset_index()
scarborough_grouped

Unnamed: 0,Neighbourhood,African Restaurant,American Restaurant,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Badminton Court,Bakery,Bank,...,Spa,Sports Bar,Supermarket,Sushi Restaurant,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.071429,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,...,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0
1,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Birch Cliff,Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
3,Cedarbrae,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.090909,0.045455,...,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455
4,"Clairlea,Golden Mile,Oakridge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Clarks Corners,Sullivan,Tam O'Shanter",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
6,"Cliffcrest,Cliffside,Scarborough Village West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
7,"Dorset Park,Scarborough Town Centre,Wexford He...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0,0.0
8,"East Birchmount Park,Ionview,Kennedy Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Guildwood,Morningside,West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0


#### Let's print each neighbourhood along with the top 5 most common venues

In [57]:
num_top_venues = 5

for hood in scarborough_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's single row so that each category becomes a row.
    hood_profile = scarborough_grouped[scarborough_grouped['Neighbourhood'] == hood].T.reset_index()
    hood_profile.columns = ['venue','freq']
    # The first row holds the neighbourhood name rather than a frequency; drop it,
    # then make the remaining values numeric and round them for display.
    hood_profile = hood_profile.iloc[1:].assign(freq=lambda frame: frame['freq'].astype(float))
    hood_profile = hood_profile.round({'freq': 2})
    ranked = hood_profile.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Agincourt----
                      venue  freq
0  Mediterranean Restaurant  0.07
1                    Lounge  0.07
2                 Pool Hall  0.07
3            Sandwich Place  0.07
4        Seafood Restaurant  0.07


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
                  venue  freq
0           Pizza Place  0.11
1    Chinese Restaurant  0.11
2     Korean Restaurant  0.06
3  Fast Food Restaurant  0.06
4                  Park  0.06


----Birch Cliff,Cliffside West----
             venue  freq
0  College Stadium  0.14
1  Thai Restaurant  0.14
2             Café  0.14
3     Skating Rink  0.14
4            Diner  0.14


----Cedarbrae----
               venue  freq
0           Pharmacy  0.09
1        Coffee Shop  0.09
2  Indian Restaurant  0.09
3             Bakery  0.09
4        Yoga Studio  0.05


----Clairlea,Golden Mile,Oakridge----
          venue  freq
0  Intersection  0.19
1         Diner  0.12
2   Coffee Shop  0.12
3        Bakery  0.12
4      Bus Line

#### Let's put that into a *pandas* dataframe

In [58]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, largest first.

    The first element of `row` is skipped (callers pass rows whose first entry is
    the neighbourhood name); the remaining values are sorted in descending order
    and the index labels of the first `num_top_venues` of them are returned as a
    NumPy array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighbourhood.

In [59]:
num_top_venues = 10

def _ordinal(position):
    """Return `position` with its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th'."""
    # 11th, 12th and 13th are the exceptions to the last-digit rule.
    if 11 <= position % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)

# create columns according to number of top venues
columns = ['Neighbourhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)
]

# build one record per neighbourhood (its name followed by its top venue
# categories) and create the dataframe from all records at once
top_venue_records = [
    [scarborough_grouped['Neighbourhood'].iloc[ind]]
    + list(return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues))
    for ind in range(scarborough_grouped.shape[0])
]
neighbourhoods_venues_sorted = pd.DataFrame(top_venue_records, columns=columns)

neighbourhoods_venues_sorted.tail()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,L'Amoreaux West,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Pharmacy,Breakfast Spot,Other Great Outdoors,Pizza Place,Sandwich Place,Bubble Tea Shop,Grocery Store
12,"Maryvale,Wexford",Middle Eastern Restaurant,Pizza Place,Burger Joint,Grocery Store,Bar,Print Shop,Restaurant,Seafood Restaurant,Breakfast Spot,Fish Market
13,"Rouge,Malvern",Fast Food Restaurant,African Restaurant,Paper / Office Supplies Store,Hobby Shop,Bus Station,Spa,Coffee Shop,Fish Market,Convenience Store,Department Store
14,Scarborough Village,Fast Food Restaurant,Coffee Shop,Women's Store,Pizza Place,Convenience Store,Restaurant,Sandwich Place,College Stadium,Department Store,Diner
15,Woburn,Coffee Shop,Park,Convenience Store,Business Service,Yoga Studio,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant


### Cluster Neighbourhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [60]:
# set number of clusters
kclusters = 5

scarborough_grouped_clustering = scarborough_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([3, 0, 3, 3, 3], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [61]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarborough_merged = scarborough_data

# merge scarborough_grouped with scarborough_data to add latitude/longitude for each neighborhood
scarborough_merged = scarborough_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

scarborough_merged.tail() # check the last columns!


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,M1S,Scarborough,Agincourt,43.7942,-79.262029,3.0,Shanghai Restaurant,Breakfast Spot,Sandwich Place,Seafood Restaurant,Discount Store,Shopping Mall,Skating Rink,Badminton Court,Supermarket,Sushi Restaurant
13,M1T,Scarborough,"Clarks Corners,Sullivan,Tam O'Shanter",43.781638,-79.304302,0.0,Pizza Place,Bus Stop,Shopping Mall,Noodle House,Convenience Store,Golf Course,Pharmacy,Fast Food Restaurant,Sandwich Place,Seafood Restaurant
14,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel...",43.815252,-79.284577,0.0,Chinese Restaurant,Pizza Place,Caribbean Restaurant,Korean Restaurant,Park,Pharmacy,Malay Restaurant,Fast Food Restaurant,Noodle House,Shop & Service
15,M1W,Scarborough,L'Amoreaux West,43.799525,-79.318389,0.0,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Pharmacy,Breakfast Spot,Other Great Outdoors,Pizza Place,Sandwich Place,Bubble Tea Shop,Grocery Store
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,,,,,,,,,,,


In [62]:
#A neighbourhood for which no venues were returned (Upper Rouge in this run) has no
#cluster label after the join, so its 'Cluster Labels' value is NaN. Drop every such
#row instead of naming one neighbourhood: any remaining NaN would make the integer
#cast fail.
scarborough_merged = (
    scarborough_merged
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': 'int32'})
)

In [63]:
scarborough_merged.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849,0,Middle Eastern Restaurant,Pizza Place,Burger Joint,Grocery Store,Bar,Print Shop,Restaurant,Seafood Restaurant,Breakfast Spot,Fish Market
12,M1S,Scarborough,Agincourt,43.7942,-79.262029,3,Shanghai Restaurant,Breakfast Spot,Sandwich Place,Seafood Restaurant,Discount Store,Shopping Mall,Skating Rink,Badminton Court,Supermarket,Sushi Restaurant
13,M1T,Scarborough,"Clarks Corners,Sullivan,Tam O'Shanter",43.781638,-79.304302,0,Pizza Place,Bus Stop,Shopping Mall,Noodle House,Convenience Store,Golf Course,Pharmacy,Fast Food Restaurant,Sandwich Place,Seafood Restaurant
14,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel...",43.815252,-79.284577,0,Chinese Restaurant,Pizza Place,Caribbean Restaurant,Korean Restaurant,Park,Pharmacy,Malay Restaurant,Fast Food Restaurant,Noodle House,Shop & Service
15,M1W,Scarborough,L'Amoreaux West,43.799525,-79.318389,0,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Pharmacy,Breakfast Spot,Other Great Outdoors,Pizza Place,Sandwich Place,Bubble Tea Shop,Grocery Store


Finally, let's visualize the resulting clusters

In [64]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighbourhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Closer look at the cluster data

#### Cluster 1

In [65]:
# Show Borough (column 1) plus every column from position 5 onward
# (cluster label and ranked venues) for the rows labelled 0.
summary_columns = scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]
in_cluster = scarborough_merged['Cluster Labels'] == 0
scarborough_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Fast Food Restaurant,African Restaurant,Paper / Office Supplies Store,Hobby Shop,Bus Station,Spa,Coffee Shop,Fish Market,Convenience Store,Department Store
2,Scarborough,0,Fast Food Restaurant,Pizza Place,Coffee Shop,Grocery Store,Greek Restaurant,Thrift / Vintage Store,Sports Bar,Fried Chicken Joint,Beer Store,College Stadium
5,Scarborough,0,Fast Food Restaurant,Coffee Shop,Women's Store,Pizza Place,Convenience Store,Restaurant,Sandwich Place,College Stadium,Department Store,Diner
11,Scarborough,0,Middle Eastern Restaurant,Pizza Place,Burger Joint,Grocery Store,Bar,Print Shop,Restaurant,Seafood Restaurant,Breakfast Spot,Fish Market
13,Scarborough,0,Pizza Place,Bus Stop,Shopping Mall,Noodle House,Convenience Store,Golf Course,Pharmacy,Fast Food Restaurant,Sandwich Place,Seafood Restaurant
14,Scarborough,0,Chinese Restaurant,Pizza Place,Caribbean Restaurant,Korean Restaurant,Park,Pharmacy,Malay Restaurant,Fast Food Restaurant,Noodle House,Shop & Service
15,Scarborough,0,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Pharmacy,Breakfast Spot,Other Great Outdoors,Pizza Place,Sandwich Place,Bubble Tea Shop,Grocery Store


Good areas to eat out, especially pizza, fast food and Asian food.

#### Cluster 2

In [66]:
# Show Borough (column 1) plus every column from position 5 onward
# (cluster label and ranked venues) for the rows labelled 1.
summary_columns = scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]
in_cluster = scarborough_merged['Cluster Labels'] == 1
scarborough_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,1,Coffee Shop,Park,Convenience Store,Business Service,Yoga Studio,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant


Coffee shops and parks, more laid back.

#### Cluster 3

In [71]:
# Show Borough (column 1) plus every column from position 5 onward
# (cluster label and ranked venues) for the rows labelled 2.
summary_columns = scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]
in_cluster = scarborough_merged['Cluster Labels'] == 2
scarborough_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,2,Breakfast Spot,Italian Restaurant,Bar,Burger Joint,Yoga Studio,Food Court,Department Store,Diner,Discount Store,Electronics Store


Mix of food 

#### Cluster 4

In [72]:
# Show Borough (column 1) plus every column from position 5 onward
# (cluster label and ranked venues) for the rows labelled 3.
summary_columns = scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]
in_cluster = scarborough_merged['Cluster Labels'] == 3
scarborough_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Scarborough,3,Pharmacy,Indian Restaurant,Coffee Shop,Bakery,Burger Joint,Hakka Restaurant,Fried Chicken Joint,Flower Shop,Music Store,Chinese Restaurant
6,Scarborough,3,Coffee Shop,Discount Store,Pharmacy,Convenience Store,Grocery Store,Department Store,Rental Car Location,Bus Station,Sandwich Place,Light Rail Station
7,Scarborough,3,Intersection,Coffee Shop,Diner,Bakery,Bus Line,Fast Food Restaurant,Convenience Store,Metro Station,Soccer Field,Park
9,Scarborough,3,Café,General Entertainment,College Stadium,Diner,Bank,Skating Rink,Thai Restaurant,Fast Food Restaurant,Grocery Store,Convenience Store
10,Scarborough,3,Indian Restaurant,Electronics Store,Pet Store,Bakery,Furniture / Home Store,Latin American Restaurant,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Gym
12,Scarborough,3,Shanghai Restaurant,Breakfast Spot,Sandwich Place,Seafood Restaurant,Discount Store,Shopping Mall,Skating Rink,Badminton Court,Supermarket,Sushi Restaurant


Shopping and entertainment areas.

#### Cluster 5

In [73]:
# Show Borough (column 1) plus every column from position 5 onward
# (cluster label and ranked venues) for the rows labelled 4.
summary_columns = scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]
in_cluster = scarborough_merged['Cluster Labels'] == 4
scarborough_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Scarborough,4,Hardware Store,Burger Joint,Chinese Restaurant,Pizza Place,Wings Joint,Flower Shop,Convenience Store,Department Store,Diner,Discount Store
