## Segmenting and Clustering Neighbourhoods in Toronto

#### 3-part assignment to study neighbourhood trends in Toronto

Need to install "geocoder" if not already installed

In [0]:
!pip install geocoder

!pip install -c conda-forge geopy --yes  
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!pip install -c conda-forge folium=0.5.0 --yes  
import folium # map rendering library


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --yes

Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --yes


Import all packages

In [0]:
from bs4 import BeautifulSoup
import requests
import re 
import pandas as pd, numpy as np
import geocoder
import io

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


## References used:
#Lab: New York, FourSquare API and clustering
#https://stackoverflow.com/questions/50355577/scraping-wikipedia-tables-with-python-selectively
#https://stackoverflow.com/questions/42776834/get-the-title-of-a-link-using-beautifulsoup
#https://stackoverflow.com/questions/46275765/pandas-merge-row-data-with-multiple-values-to-python-list-for-a-column
#https://www.geeksforgeeks.org/python-program-split-join-string/
#https://stackoverflow.com/questions/32400867/pandas-read-csv-from-url

### Part 1 : creating the dataset from table in Wikipedia

Use a loop to read in all entries from the WikiTable into a list

In [0]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

res = requests.get(URL).content   
soup = BeautifulSoup(res,'lxml')

all_items = []
# Find all with "tr"
for items in soup.find('table', class_='wikitable').find_all('tr')[1::1]:
    # Find items in the table
    data = items.find_all(['th','td'])

    # Get the raw strings
    postcode = str(data[0].contents[0] )
    borough_1 = str(data[1].contents[0] )
    neighbourhood_1 = str(data[2].contents[0] )

    # Remove links in Borough, if any
    if re.search(   '</a>'  , borough_1 ) :
      borough = re.split('>|<|</a>',  borough_1  ) [-3:-2][0]  
    else : 
      borough = borough_1
    
    # Remove links in Neighbourhood, if any
    if re.search(   '</a>'  , neighbourhood_1 ) :
      neighbourhood = re.split('>|<|</a>',  neighbourhood_1  ) [-3:-2][0] 
    else :
      neighbourhood = neighbourhood_1
    
    # Remove Extra line breaks if they appear
    borough = re.sub(r'\n', '', borough)   
    neighbourhood = re.sub(r'(\n)', r'', neighbourhood) 
    if neighbourhood == "Not assigned" :
      neighbourhood = borough

    # Create the new row, and add to the list
    if borough != "Not assigned" :
      new_row = [postcode, borough, neighbourhood]
      all_items.append(new_row)

Convert the list to a pandas dataframe, and make sure there is one row per postal code

In [0]:
# Creating a dataframe object from list of lists
all_items_df = pd.DataFrame(all_items) 
all_items_df.columns =['PostalCode', 'Borough', 'Neighbourhood'] 

# For PostCode with multiple Neighborhood, collapse them into a list
grouped_df = all_items_df.groupby(['PostalCode', 'Borough']).agg(lambda x: ', '.join( tuple(x)  )  ).reset_index()    
 
# Sort by postal code
Toronto_df = grouped_df.sort_values(['PostalCode'])

Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Check the number of rows and columns in the dataframe

In [0]:
Toronto_df.shape

(103, 3)

### Part 2: adding Longitude and Latitude to the dataset

In [0]:
# Load data from the csv file
geo_URL="http://cocl.us/Geospatial_data"
geo_bytes=requests.get(geo_URL).content
geo_data=pd.read_csv(io.StringIO(geo_bytes.decode('utf-8')))

In [0]:
## Double checking results
# print(type(geo_data))
# print(type(geo_bytes))
# print(geo_data.head(), geo_data.shape )

In [0]:
Toronto_LatLong = pd.merge(Toronto_df, geo_data, left_on='PostalCode', right_on='Postal Code')[['PostalCode', 'Borough', 'Neighbourhood','Latitude','Longitude']]

Toronto_LatLong.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [0]:
# Check how many boroughs and neighbourhoods there are
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_LatLong['Borough'].unique()),
        Toronto_LatLong.shape[0]
    )
)
print("Boroughs are ", Toronto_LatLong['Borough'].unique())

The dataframe has 10 boroughs and 103 neighborhoods.
Boroughs are  ['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke']


### Part 3: clustering analysis of the data

Find the coordinates of Toronto to make a map

In [0]:
address = 'Toronto, ON'  

geolocator = Nominatim(user_agent="ON_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [0]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[  latitude, longitude ], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_LatLong['Latitude'], Toronto_LatLong['Longitude'], Toronto_LatLong['Borough'], Toronto_LatLong['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

Define FourSquare Credentials

In [0]:
CLIENT_ID = 'xxxxxxx' # your Foursquare ID
CLIENT_SECRET = 'xxxxxxxxxxxx' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


LIMIT = 5 #100 # limit of number of venues returned by Foursquare API

Your credentails:
CLIENT_ID: WLCPKLDTCHGABWDC4ZYERUR44LCZMHSTYRR2CHRYRWIFUURJ
CLIENT_SECRET:ZZXXL41D0BKQKTXBFQFMBZRZ34DN2YOFGIFOWFHPV533RVUM


Create a function to look up venues in each neighborhood from FourSquare using API

In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        print(type(results))
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run this function to get all venues for all available neighborhoods

In [0]:

toronto_venues = getNearbyVenues(names=Toronto_LatLong['Neighbourhood'],
                                   latitudes=Toronto_LatLong['Latitude'],
                                   longitudes=Toronto_LatLong['Longitude']
                                  )

Rouge, Malvern
<class 'list'>
Highland Creek, Rouge Hill, Port Union
<class 'list'>
Guildwood, Morningside, West Hill
<class 'list'>
Woburn
<class 'list'>
Cedarbrae
<class 'list'>
Scarborough Village
<class 'list'>
East Birchmount Park, Ionview, Kennedy Park
<class 'list'>
Clairlea, Golden Mile, Oakridge
<class 'list'>
Cliffcrest, Cliffside, Scarborough Village West
<class 'list'>
Birch Cliff, Cliffside West
<class 'list'>
Dorset Park, Scarborough Town Centre, Wexford Heights
<class 'list'>
Maryvale, Wexford
<class 'list'>
Agincourt
<class 'list'>
Clarks Corners, Sullivan, Tam O'Shanter
<class 'list'>
Agincourt North, L'Amoreaux East, Milliken, Steeles East
<class 'list'>
L'Amoreaux West
<class 'list'>
Upper Rouge
<class 'list'>
Hillcrest Village
<class 'list'>
Fairview, Henry Farm, Oriole
<class 'list'>
Bayview Village
<class 'list'>
Silver Hills, York Mills
<class 'list'>
Newtonbrook, Willowdale
<class 'list'>
Willowdale South
<class 'list'>
York Mills West
<class 'list'>
Willowdale 

Check results. Each venue has its name and also the category, and the category will be used for clustering

In [0]:
print(  toronto_venues.shape  )
toronto_venues.head()

(427, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


In [0]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 134 uniques categories.


Analyze each neighbourhood, and start by one-hot encoding the categories

In [0]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beer Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Stop,Cafeteria,Café,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cuban Restaurant,Curling Ice,Dance Studio,Deli / Bodega,...,Neighborhood,Noodle House,Organic Grocery,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Rental Car Location,Restaurant,River,Salad Place,Salon / Barbershop,Sandwich Place,Shopping Mall,Skating Rink,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Swim School,Tea Room,Thai Restaurant,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
toronto_onehot.shape

(427, 134)

Scale the one-hot encodings by the overall means

In [0]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beer Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Stop,Cafeteria,Café,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cuban Restaurant,Curling Ice,Dance Studio,...,Museum,Noodle House,Organic Grocery,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Rental Car Location,Restaurant,River,Salad Place,Salon / Barbershop,Sandwich Place,Shopping Mall,Skating Rink,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Swim School,Tea Room,Thai Restaurant,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.2,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.2,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.4,0.000000,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Willowdale West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.2,0.2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,"Woodbine Gardens, Parkview Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.2,0.2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.2,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.2,0.2,...,0.0,0.0,0.0,0.000000,0.0,0.2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
toronto_grouped.shape

(99, 134)

View the top 5 venues in each neighborhood

In [0]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                           venue  freq
0                          Plaza   0.2
1                     Restaurant   0.2
2                   Concert Hall   0.2
3  Vegetarian / Vegan Restaurant   0.2
4                          Hotel   0.2


----Agincourt----
                       venue  freq
0                     Lounge  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4                     Market  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
         venue  freq
0         Park  0.33
1  Coffee Shop  0.33
2   Playground  0.33
3  Yoga Studio  0.00
4        Motel  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0             Pharmacy   0.2
1        Grocery Store   0.2
2       Sandwich Place   0.2
3           Beer Store   0.2
4  Fried Chicken Joint   0.2


----Alderwood,

Create the top 5 venues for each neighborhood

In [0]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Run the funcion to find the 1st-5th most common venues by neighborhood

In [0]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Concert Hall,Vegetarian / Vegan Restaurant,Plaza,Restaurant,Hotel
1,Agincourt,Breakfast Spot,Latin American Restaurant,Lounge,Skating Rink,Curling Ice
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Coffee Shop,Department Store,Dog Run
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pharmacy,Grocery Store,Sandwich Place,Beer Store,Fried Chicken Joint
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Pub,Department Store


K-means clustering to group neighborhoods by what types of venues they have available

In [0]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 2, 0, 3, 1, 0, 1, 1, 0], dtype=int32)

In [0]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
neighborhoods_venues_sorted['Cluster Labels'] = neighborhoods_venues_sorted[['Cluster Labels']].astype(int)

#print(neighborhoods_venues_sorted.dtypes)
#print(    neighborhoods_venues_sorted[['Cluster Labels']].astype(int)    )
toronto_merged = Toronto_LatLong
toronto_merged.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Remove any that had no data
toronto_merged = toronto_merged.dropna()
toronto_merged['Cluster Labels'] = toronto_merged[['Cluster Labels']].astype(int)

toronto_merged.head() # check the last columns!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Fast Food Restaurant,Women's Store,Dessert Shop,Drugstore,Dog Run,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Construction & Landscaping,Bar,Diner,Eastern European Restaurant,Drugstore,0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Electronics Store,Mexican Restaurant,Bank,Spa,Rental Car Location,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Convenience Store,Korean Restaurant,Dessert Shop,Drugstore,3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Athletics & Sports,Bank,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,0


In [0]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],     #[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Analyze the clusters, and include a description of what makes that cluster stand out (for example, Cluster 2 is most likely to have a gym

Cluster 1 - Mostly Restaurants, although this is the most diverse cluster

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
0,Scarborough,Fast Food Restaurant,Women's Store,Dessert Shop,Drugstore,Dog Run,0
1,Scarborough,Construction & Landscaping,Bar,Diner,Eastern European Restaurant,Drugstore,0
2,Scarborough,Electronics Store,Mexican Restaurant,Bank,Spa,Rental Car Location,0
4,Scarborough,Athletics & Sports,Bank,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,0
7,Scarborough,Bakery,Bus Line,Intersection,Metro Station,Ice Cream Shop,0
8,Scarborough,Motel,American Restaurant,Department Store,Drugstore,Dog Run,0
9,Scarborough,Café,College Stadium,General Entertainment,Skating Rink,Cosmetics Shop,0
10,Scarborough,Indian Restaurant,Pet Store,Vietnamese Restaurant,Chinese Restaurant,Women's Store,0
11,Scarborough,Bakery,Middle Eastern Restaurant,Breakfast Spot,Sandwich Place,Shopping Mall,0
12,Scarborough,Breakfast Spot,Latin American Restaurant,Lounge,Skating Rink,Curling Ice,0


Cluster 2 - Gyms

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
27,North York,Gym,Restaurant,Clothing Store,Italian Restaurant,Discount Store,1
28,North York,Bridal Shop,Restaurant,Coffee Shop,Deli / Bodega,Middle Eastern Restaurant,1
49,Central Toronto,Liquor Store,Restaurant,American Restaurant,Supermarket,Sushi Restaurant,1
55,Downtown Toronto,Gym,Restaurant,Coffee Shop,Japanese Restaurant,Italian Restaurant,1
56,Downtown Toronto,Concert Hall,Restaurant,Vegetarian / Vegan Restaurant,Museum,Liquor Store,1
58,Downtown Toronto,Concert Hall,Vegetarian / Vegan Restaurant,Plaza,Restaurant,Hotel,1
60,Downtown Toronto,Gym,Pub,Restaurant,Hotel,Coffee Shop,1
61,Downtown Toronto,Gym,Coffee Shop,Restaurant,Pub,Café,1
62,North York,Café,Coffee Shop,Restaurant,Thai Restaurant,Indian Restaurant,1
66,Downtown Toronto,Bakery,Japanese Restaurant,Italian Restaurant,Dessert Shop,Restaurant,1


Cluster 3 - Parks, Playgrounds, and Dog Runs

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
5,Scarborough,Playground,Women's Store,Department Store,Drugstore,Dog Run,2
14,Scarborough,Park,Playground,Coffee Shop,Department Store,Dog Run,2
20,North York,Cafeteria,Park,Women's Store,Drugstore,Dog Run,2
23,North York,Park,Convenience Store,Bank,Dessert Shop,Drugstore,2
25,North York,Park,Food & Drink Shop,Women's Store,Dessert Shop,Drugstore,2
30,North York,Park,Airport,Bus Stop,Dessert Shop,Drugstore,2
40,East York,Park,Convenience Store,Coffee Shop,Dessert Shop,Drugstore,2
44,Central Toronto,Park,Swim School,Bus Line,Construction & Landscaping,Cuban Restaurant,2
45,Central Toronto,Breakfast Spot,Hotel,Food & Drink Shop,Department Store,Park,2
48,Central Toronto,Playground,Trail,Women's Store,Department Store,Dog Run,2


Cluster 4 - Coffee shops and pizza places

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
3,Scarborough,Coffee Shop,Convenience Store,Korean Restaurant,Dessert Shop,Drugstore,3
6,Scarborough,Discount Store,Coffee Shop,Hobby Shop,Department Store,Women's Store,3
24,North York,Pizza Place,Discount Store,Grocery Store,Coffee Shop,Pharmacy,3
29,North York,Caribbean Restaurant,Massage Studio,Coffee Shop,Furniture / Home Store,Bar,3
34,North York,Pizza Place,Coffee Shop,Portuguese Restaurant,Hockey Arena,Intersection,3
43,East Toronto,Pet Store,Ice Cream Shop,Sandwich Place,Bookstore,Coffee Shop,3
53,Downtown Toronto,Breakfast Spot,Coffee Shop,Distribution Center,Spa,Bakery,3
57,Downtown Toronto,Coffee Shop,Park,Women's Store,Dessert Shop,Drugstore,3
71,North York,Boutique,Vietnamese Restaurant,Coffee Shop,Clothing Store,Furniture / Home Store,3
78,West Toronto,Coffee Shop,Gym,Italian Restaurant,Bar,Women's Store,3


Cluster 5 - Baseball Fields, Drugstores, Diners, and Eastern European Restaurants (and this is the most specific cluster)

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
91,Etobicoke,Construction & Landscaping,Baseball Field,Diner,Eastern European Restaurant,Drugstore,4
97,North York,Baseball Field,Women's Store,Diner,Eastern European Restaurant,Drugstore,4
