In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from splinter import Browser
from bs4 import BeautifulSoup
import pprint as pp

## Part One

In [3]:
executable_path = {'executable_path': './chromedriver.exe'}

browser = Browser('chrome', **executable_path)

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
browser.visit(url)

In [5]:
html = browser.html
wiki_df = pd.read_html(html)

In [6]:
wiki_df[0]

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [7]:
browser.quit()

In [8]:
wiki_df = wiki_df[0]

In [9]:
# ignore cells with a borough that is "Not assigned"
wiki_df = wiki_df.loc[wiki_df['Borough'] != "Not assigned"]

In [10]:
wiki_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


check for duplicate postal codes:

In [11]:
postal_group = wiki_df.groupby(['Postal Code']).count()
postal_group.loc[postal_group['Neighborhood'] > 1]

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1


In [12]:
# check if there are anymore neighborhoods that are "Not assigned"
wiki_df.loc[wiki_df['Neighborhood'] == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [13]:
wiki_df.shape

(103, 3)

## Part Two

In [14]:
coords = pd.read_csv("Geospatial_Coordinates.csv")

In [15]:
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# check if the dataframes are the same sizes
wiki_df.loc[wiki_df['Postal Code'].isin(coords['Postal Code']),:].shape


(103, 3)

In [17]:
wiki_postal_sorted = wiki_df.sort_values(by = "Postal Code")
coords_postal_sorted = coords.sort_values(by='Postal Code')

In [18]:
print(wiki_postal_sorted.head())
print(coords_postal_sorted.head())


   Postal Code      Borough                            Neighborhood
9          M1B  Scarborough                          Malvern, Rouge
18         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek
27         M1E  Scarborough       Guildwood, Morningside, West Hill
36         M1G  Scarborough                                  Woburn
45         M1H  Scarborough                               Cedarbrae
  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


In [19]:
wiki_postal_sorted = wiki_postal_sorted.reset_index()
coords_postal_sorted = coords_postal_sorted.reset_index()

In [20]:
wiki_postal_sorted.drop(columns=['index'], inplace= True)
coords_postal_sorted.drop(columns=['index'], inplace=True)

In [21]:
# check if both table row orders are the same

sum(wiki_postal_sorted['Postal Code'] != coords_postal_sorted['Postal Code'])

0

In [22]:
wiki_with_coords = pd.concat([wiki_postal_sorted, coords_postal_sorted], axis=1)

In [23]:
wiki_with_coords.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [24]:
# check for null values
wiki_with_coords.isnull().sum()

Postal Code     0
Borough         0
Neighborhood    0
Postal Code     0
Latitude        0
Longitude       0
dtype: int64

In [25]:
from geopy.geocoders import Nominatim
import json
import requests
from pandas.io.json import json_normalize

# import k means from clustering stage
from sklearn.cluster import KMeans
# map rendering library
import folium


In [26]:
wiki_with_coords['latlng'] = wiki_with_coords[['Latitude','Longitude']].values.tolist()

In [27]:
wiki_with_coords.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude,latlng
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353,"[43.806686299999996, -79.19435340000001]"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497,"[43.7845351, -79.16049709999999]"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,"[43.7635726, -79.1887115]"
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917,"[43.7709921, -79.21691740000001]"
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,"[43.773136, -79.23947609999999]"


do an analysis on the Scarborough borough:

In [53]:
scarborough_data = wiki_with_coords[wiki_with_coords['Borough'] == "Scarborough"]
scarborough_data

Unnamed: 0,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude,latlng
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353,"[43.806686299999996, -79.19435340000001]"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497,"[43.7845351, -79.16049709999999]"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,"[43.7635726, -79.1887115]"
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917,"[43.7709921, -79.21691740000001]"
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,"[43.773136, -79.23947609999999]"
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476,"[43.7447342, -79.23947609999999]"
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",M1K,43.727929,-79.262029,"[43.7279292, -79.26202940000002]"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",M1L,43.711112,-79.284577,"[43.711111700000004, -79.2845772]"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",M1M,43.716316,-79.239476,"[43.716316, -79.23947609999999]"
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848,"[43.692657000000004, -79.2648481]"


In [None]:
scar_lat = scarborough_data['Latitude'][0]
scar_lng = scarborough_data['Longitude'][0]

In [54]:
map_toronto = folium.Map(location=scarborough_data['latlng'][0], zoom_start=10)

# add markers to map
for lat,lng,borough,neighborhood in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], \
                                        scarborough_data['Borough'], scarborough_data['Neighborhood']):
    label = f"{neighborhood}, {borough}"
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)

In [55]:
map_toronto

segment with foursquare API

In [36]:
# import foursquare client_id and secret
from config import client_id, client_secret

In [44]:

version = '20180605'
# version = '20200501'

# check if variables are stored correctly
print('Your credentails:')
print(f"client id is {client_id}")
print(f"client secret is {client_secret}")

Your credentails:
client id is ARVTTGYLHBSQZMEGZ21STN3DZMQIGE2QANNZOCBSESSBQJXO
client secret is HWGU3GO2ZBVYVDGQDIHWS3YDZF1QOLZEGKSOXYKNBKLY0IVX


In [56]:
neighborhood_lat = scarborough_data['Latitude'][0]
neighborhood_lng = scarborough_data['Longitude'][0]
scar_neighborhood = scarborough_data['Neighborhood'][0]
print(f"The latitude and longitude values of {scar_neighborhood} are {neighborhood_lat},{neighborhood_lng}")

The latitude and longitude values of Malvern, Rouge are 43.806686299999996,-79.19435340000001


Get the top 30 venues that are in Scarborough:

In [75]:
url = f"https://api.foursquare.com/v2/venues/explore?&client_id={client_id}&client_secret={client_secret}&v={version}&ll={neighborhood_lat},{neighborhood_lng}&radius={1000}&limit={30}"
print(url)

https://api.foursquare.com/v2/venues/explore?&client_id=ARVTTGYLHBSQZMEGZ21STN3DZMQIGE2QANNZOCBSESSBQJXO&client_secret=HWGU3GO2ZBVYVDGQDIHWS3YDZF1QOLZEGKSOXYKNBKLY0IVX&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=30


In [76]:
import pprint as pp

In [77]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f050c520bf45922e6338c40'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 17,
  'suggestedBounds': {'ne': {'lat': 43.81568630900001,
    'lng': -79.18190576146081},
   'sw': {'lat': 43.797686290999984, 'lng': -79.20680103853921}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d669cba83865481c948fa53',
       'name': 'Images Salon & Spa',
       'location': {'address': '8130 Sheppard Ave E',
        'crossStreet': 'Morningside Ave',
        'lat': 43.80228301948931,
        'lng': -79.19856472801668,
        'labeledLatLngs'

In [78]:
pp.pprint(results)

{'meta': {'code': 200, 'requestId': '5f050c520bf45922e6338c40'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
                                                 'items': [{'reasonName': 'globalInteractionReason',
                                                            'summary': 'This '
                                                                       'spot '
                                                                       'is '
                                                                       'popular',
                                                            'type': 'general'}]},
                                     'referralId': 'e-0-4d669cba83865481c948fa53-0',
                                     'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/spa_',
                                                                        'suffix': '.png'},
                                                               

                                                                                 'and '
                                                                                 'sheppard',
                                                                                 'Toronto '
                                                                                 'ON',
                                                                                 'Canada'],
                                                            'labeledLatLngs': [{'label': 'display',
                                                                                'lat': 43.80186301779314,
                                                                                'lng': -79.1992957809335}],
                                                            'lat': 43.80186301779314,
                                                            'lng': -79.1992957809335,
                                                            's

In [79]:
# check the number of results we received
len(results['response']['groups'][0]['items'])

17

In [92]:
# clean the json file and store it into a pandas dataframe
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

  This is separate from the ipykernel package so we can avoid doing imports until


In [93]:
nearby_venues.head()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.crossStreet,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,...,venue.location.postalCode,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.location.neighborhood
0,e-0-4d669cba83865481c948fa53-0,0,"[{'summary': 'This spot is popular', 'type': '...",4d669cba83865481c948fa53,Images Salon & Spa,8130 Sheppard Ave E,Morningside Ave,43.802283,-79.198565,"[{'label': 'display', 'lat': 43.80228301948931...",...,M1B 3W3,CA,Toronto,ON,Canada,"[8130 Sheppard Ave E (Morningside Ave), Toront...","[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",0,[],
1,e-0-4b6718c2f964a5203f3a2be3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4b6718c2f964a5203f3a2be3,Harvey's,853 Milner Ave,at Morningside Ave,43.80002,-79.198307,"[{'label': 'display', 'lat': 43.80002047583462...",...,M1B 5N6,CA,Scarborough,ON,Canada,"[853 Milner Ave (at Morningside Ave), Scarboro...","[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",0,[],
2,e-0-4bb6b9446edc76b0d771311c-2,0,"[{'summary': 'This spot is popular', 'type': '...",4bb6b9446edc76b0d771311c,Wendy’s,,Morningside & Sheppard,43.807448,-79.199056,"[{'label': 'display', 'lat': 43.80744841934756...",...,,CA,Toronto,ON,Canada,"[Toronto ON, Canada]","[{'id': '4bf58dd8d48988d16e941735', 'name': 'F...",0,[],
3,e-0-579a91b3498e9bd833afa78a-3,0,"[{'summary': 'This spot is popular', 'type': '...",579a91b3498e9bd833afa78a,Wendy's,8129 Sheppard Avenue,,43.802008,-79.19808,"[{'label': 'display', 'lat': 43.8020084, 'lng'...",...,M1B 6A3,CA,Scarborough,ON,Canada,"[8129 Sheppard Avenue, Scarborough ON M1B 6A3,...","[{'id': '4bf58dd8d48988d16e941735', 'name': 'F...",0,[],
4,e-0-4c6ac7de35d3be9a50bf2206-4,0,"[{'summary': 'This spot is popular', 'type': '...",4c6ac7de35d3be9a50bf2206,RBC Royal Bank,865 MILNER AVE,Morningside,43.798782,-79.19709,"[{'label': 'display', 'lat': 43.79878248056552...",...,M1B 5N6,CA,Scarborough,ON,Canada,"[865 MILNER AVE (Morningside), Scarborough ON ...","[{'id': '4bf58dd8d48988d10a951735', 'name': 'B...",0,[],


In [94]:
# filter columns 
filtered_columns = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues = nearby_venues[filtered_columns]

In [None]:
# functions that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except: 
        categories_list = row['venue.categories']
    
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [96]:
# filter the categories for each row using the predefined function "get_category_type"
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

In [97]:
nearby_venues.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Harvey's,Restaurant,43.80002,-79.198307
2,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
3,Wendy's,Fast Food Restaurant,43.802008,-79.19808
4,RBC Royal Bank,Bank,43.798782,-79.19709


In [104]:
# replace column names 
nearby_venues.columns = [a.split(".")[-1] for a in nearby_venues.columns]

In [106]:
# check if column names are changed
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Harvey's,Restaurant,43.80002,-79.198307
2,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
3,Wendy's,Fast Food Restaurant,43.802008,-79.19808
4,RBC Royal Bank,Bank,43.798782,-79.19709


In [109]:
print(f"{nearby_venues.shape[0]} venues were returned by FourSquare")

17 venues were returned by FourSquare


Create a function that find nearby venues to the rest of the neighborhods in Scarborough

In [118]:
def otherNearbyVenues(names, latitudes,longitudes,radius=600):
    venues_list = []
    for name, lat, lng, in zip(names, latitudes,longitudes):
        print(name)
    #     create the API request URL
        url = f"https://api.foursquare.com/v2/venues/explore?&client_id={client_id}&client_secret={client_secret}&v={version}&ll={lat},{lng}&radius={600}&limit={30}"

    #     make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']

    #     return only relevant information
        venues_list.append([(
        name,
        lat,
        lng,
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
#     for venue_list in venues_list:
#     for item in venue_list:
#         nearby_venues.append(item)
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude', 
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude', 
                            'Venue Longitude',
                            'Venue Category']
    
    return nearby_venues
    

In [119]:
scarborough_venues = otherNearbyVenues(names = scarborough_data['Neighborhood'],
                                      latitudes = scarborough_data['Latitude'],
                                      longitudes = scarborough_data['Longitude'])

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge


In [121]:
scarborough_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
2,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,"Malvern, Rouge",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
4,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar


In [122]:
# check the size of the dataframe
scarborough_venues.shape

(141, 7)

Check how many venues were returned for each neighborhood


In [123]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Birch Cliff, Cliffside West",5,5,5,5,5,5
Cedarbrae,12,12,12,12,12,12
"Clarks Corners, Tam O'Shanter, Sullivan",16,16,16,16,16,16
"Cliffside, Cliffcrest, Scarborough Village West",6,6,6,6,6,6
"Dorset Park, Wexford Heights, Scarborough Town Centre",12,12,12,12,12,12
"Golden Mile, Clairlea, Oakridge",14,14,14,14,14,14
"Guildwood, Morningside, West Hill",15,15,15,15,15,15
"Kennedy Park, Ionview, East Birchmount Park",9,9,9,9,9,9
"Malvern, Rouge",4,4,4,4,4,4


In [125]:
print(f"There are {len(scarborough_venues['Venue Category'].unique())} unique categories")

There are 74 unique categories


Analyze Each Neighborhood

In [126]:
scarborough_venues.head(2)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant


In [129]:
# one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add a neighborhood column to onehot dataframe
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood']

In [137]:
scarborough_onehot.columns

Index(['American Restaurant', 'Asian Restaurant', 'Athletics & Sports',
       'Bakery', 'Bank', 'Bar', 'Breakfast Spot', 'Bus Line', 'Bus Station',
       'Bus Stop', 'Business Service', 'Café', 'Caribbean Restaurant',
       'Chinese Breakfast Place', 'Chinese Restaurant', 'Clothing Store',
       'Coffee Shop', 'College Stadium', 'Construction & Landscaping',
       'Convenience Store', 'Department Store', 'Diner', 'Discount Store',
       'Electronics Store', 'Fast Food Restaurant', 'Fish Market',
       'Fried Chicken Joint', 'Furniture / Home Store', 'Gas Station',
       'General Entertainment', 'Greek Restaurant', 'Grocery Store', 'Gym',
       'Hakka Restaurant', 'Hardware Store', 'Hobby Shop', 'Hockey Arena',
       'Ice Cream Shop', 'Indian Restaurant', 'Intersection',
       'Italian Restaurant', 'Korean Restaurant', 'Latin American Restaurant',
       'Laundromat', 'Light Rail Station', 'Lounge', 'Medical Center',
       'Metro Station', 'Mexican Restaurant', 'Middle Easte

In [143]:
# make the neighborhoods column the first column of the dataframe
new_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
new_columns
scarborough_onehot = scarborough_onehot[new_columns]
scarborough_onehot.head()

Unnamed: 0,Wings Joint,Neighborhood,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Shopping Plaza,Skating Rink,Smoke Shop,Soccer Field,Spa,Supermarket,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant,Wine Shop
0,0,"Malvern, Rouge",0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,"Malvern, Rouge",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,"Malvern, Rouge",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,"Malvern, Rouge",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [144]:
# check the shape of the dataframe
scarborough_onehot.shape

(141, 75)

In [147]:
# group rows by taking the frequency of occurance of each category
scarborough_grouped = scarborough_onehot.groupby(['Neighborhood']).mean().reset_index()

In [151]:
print(scarborough_grouped.shape)
scarborough_grouped.head()


(16, 75)


Unnamed: 0,Neighborhood,Wings Joint,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Shopping Plaza,Skating Rink,Smoke Shop,Soccer Field,Spa,Supermarket,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant,Wine Shop
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.0,0.0,0.0,0.083333,0.083333,0.083333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0
3,"Clarks Corners, Tam O'Shanter, Sullivan",0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0
4,"Cliffside, Cliffcrest, Scarborough Village West",0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Print each neighborhood along with the top 5 common venues

In [183]:
scarborough_grouped['Neighborhood']

0                                             Agincourt
1                           Birch Cliff, Cliffside West
2                                             Cedarbrae
3               Clarks Corners, Tam O'Shanter, Sullivan
4       Cliffside, Cliffcrest, Scarborough Village West
5     Dorset Park, Wexford Heights, Scarborough Town...
6                       Golden Mile, Clairlea, Oakridge
7                     Guildwood, Morningside, West Hill
8           Kennedy Park, Ionview, East Birchmount Park
9                                        Malvern, Rouge
10    Milliken, Agincourt North, Steeles East, L'Amo...
11               Rouge Hill, Port Union, Highland Creek
12                                  Scarborough Village
13                        Steeles West, L'Amoreaux West
14                                    Wexford, Maryvale
15                                               Woburn
Name: Neighborhood, dtype: object

In [191]:
scarborough_grouped[scarborough_grouped['Neighborhood'] == scarborough_grouped['Neighborhood'][0]].T.reset_index()

Unnamed: 0,index,0
0,Neighborhood,Agincourt
1,Wings Joint,0
2,American Restaurant,0
3,Asian Restaurant,0
4,Athletics & Sports,0
...,...,...
70,Supermarket,0
71,Thai Restaurant,0
72,Thrift / Vintage Store,0
73,Vietnamese Restaurant,0


In [195]:
num_top_venues = 5

for neigh in scarborough_grouped['Neighborhood']:
    print(neigh)
#   transpose each dataframe and reset index so that neighborhood names are not indexes
#   [1:] to get rid of the first index which tells us the neighborhood name
    temp = scarborough_grouped[scarborough_grouped['Neighborhood'] == neigh].T.reset_index()[1:]
#   change column names of each dataframe
    temp.columns = ['venue', 'freq']
#   change datatype of the freq column to float
    temp['freq'] = temp['freq'].astype(float)
#   round values to 2 decimal points
    temp['freq'] = round(temp['freq'],3)
#   sort values and take the top 5 venues 
    top5 = temp.sort_values(by='freq', ascending = False).reset_index(drop=True).head(num_top_venues)
    print(top5)
    print("--------------------")

Agincourt
                       venue  freq
0             Clothing Store   0.2
1  Latin American Restaurant   0.2
2             Breakfast Spot   0.2
3                     Lounge   0.2
4             Sandwich Place   0.2
--------------------
Birch Cliff, Cliffside West
                   venue  freq
0  General Entertainment   0.2
1           Skating Rink   0.2
2        College Stadium   0.2
3                   Café   0.2
4                  Diner   0.2
--------------------
Cedarbrae
                 venue   freq
0    Indian Restaurant  0.167
1     Hakka Restaurant  0.083
2   Chinese Restaurant  0.083
3  Fried Chicken Joint  0.083
4   Athletics & Sports  0.083
--------------------
Clarks Corners, Tam O'Shanter, Sullivan
                 venue   freq
0          Pizza Place  0.125
1       Sandwich Place  0.062
2          Gas Station  0.062
3         Noodle House  0.062
4  Fried Chicken Joint  0.062
--------------------
Cliffside, Cliffcrest, Scarborough Village West
                venue   

Put the top 10 venues for each neighborhood into a dataframe

In [372]:
num_top_venues = 10

## Create column names
indicators = ['st','nd','rd']
columns = ['Neighborhood']
# make the first 3 number end with st, nd, and rd respectively.
# else, end with "th"
for ind in np.arange(num_top_venues):
    try:
        columns.append(f"{ind+1}{indicators[ind]} Most Common Venue")
    except:
        columns.append(f"{ind+1}th Most Common Venue")


## Create the dataframe    
neighborhood_venues_sorted = pd.DataFrame(columns=columns)
neighborhood_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

for a in np.arange(scarborough_grouped.shape[0]):
#     take only the top 10 venues
    row_data = np.array(scarborough_grouped.iloc[a,:][1:].\
                        sort_values(ascending=False).keys()[:10])
    neighborhood_venues_sorted.iloc[a,1:] = row_data
    
        
    

In [373]:
neighborhood_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Sandwich Place,Clothing Store,Lounge,Breakfast Spot,Latin American Restaurant,Wine Shop,Department Store,Diner,Discount Store,Electronics Store
1,"Birch Cliff, Cliffside West",Diner,General Entertainment,Café,Skating Rink,College Stadium,Wine Shop,Fast Food Restaurant,Convenience Store,Department Store,Discount Store
2,Cedarbrae,Indian Restaurant,Coffee Shop,Bank,Gas Station,Fried Chicken Joint,Chinese Restaurant,Caribbean Restaurant,Hakka Restaurant,Bakery,Thai Restaurant
3,"Clarks Corners, Tam O'Shanter, Sullivan",Pizza Place,Italian Restaurant,Noodle House,Bus Stop,Rental Car Location,Sandwich Place,Intersection,Discount Store,Fast Food Restaurant,Bank
4,"Cliffside, Cliffcrest, Scarborough Village West",Wings Joint,Chinese Restaurant,Motel,Hardware Store,Park,American Restaurant,Bakery,Gym,Grocery Store,Greek Restaurant
5,"Dorset Park, Wexford Heights, Scarborough Town...",Indian Restaurant,Bakery,Wine Shop,Chinese Restaurant,Vietnamese Restaurant,Light Rail Station,Fast Food Restaurant,Electronics Store,Pet Store,Gym
6,"Golden Mile, Clairlea, Oakridge",Intersection,Bus Line,Diner,Bakery,Soccer Field,Convenience Store,Park,Ice Cream Shop,Coffee Shop,General Entertainment
7,"Guildwood, Morningside, West Hill",Restaurant,Fast Food Restaurant,Medical Center,Park,Pizza Place,Electronics Store,Rental Car Location,Mexican Restaurant,Intersection,Greek Restaurant
8,"Kennedy Park, Ionview, East Birchmount Park",Convenience Store,Light Rail Station,Coffee Shop,Department Store,Metro Station,Hobby Shop,Hockey Arena,Bus Station,Gas Station,Furniture / Home Store
9,"Malvern, Rouge",Fast Food Restaurant,Spa,Print Shop,Wine Shop,Construction & Landscaping,Convenience Store,Department Store,Diner,Discount Store,Electronics Store


In [264]:
# testing codes
test = np.array(scarborough_grouped.iloc[1,:][1:].sort_values(ascending=False).keys()[:10])
neighborhood_venues_sorted.iloc[0,1:] = test


## Cluster Neighborhoods
run k means to cluster neighborhods into 3 clusters

In [368]:
scarborough_grouped.head(3)

Unnamed: 0,Neighborhood,Wings Joint,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Shopping Plaza,Skating Rink,Smoke Shop,Soccer Field,Spa,Supermarket,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant,Wine Shop
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.0,0.0,0.0,0.083333,0.083333,0.083333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0


In [278]:
from sklearn.cluster import KMeans

In [369]:
# set number of clusters
k_clusters = 3

scarborough_grouped_clustering = scarborough_grouped.drop('Neighborhood',axis = 1)

kmeans = KMeans(n_clusters = k_clusters, random_state=0)

# fit the k-means model
kmeans.fit(scarborough_grouped_clustering)

# Grab the labels for each point in the model (view first 10)
k_means_labels = kmeans.labels_[:10]

# check cluster labels generated for each row in the dataframe
k_means_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

create dataframe that includes the cluster and the top 10 venues for each neighborhood

In [374]:
# insert the cluster labels to the first column
neighborhood_venues_sorted.insert(0, "Cluster Labels", kmeans.labels_)

In [375]:
# scarborough_data.head()
scarborough_latlng_data = scarborough_data[['Neighborhood',"Borough", "Latitude", "Longitude"]]
scarborough_latlng_data.head(3)

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
0,"Malvern, Rouge",Scarborough,43.806686,-79.194353
1,"Rouge Hill, Port Union, Highland Creek",Scarborough,43.784535,-79.160497
2,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711


In [376]:
# add the neighborhood lat and lngs 
neighborhood_merged = scarborough_latlng_data.join(neighborhood_venues_sorted.set_index("Neighborhood"), on = "Neighborhood")
# drop null value (upper rouge)
neighborhood_merged.dropna(inplace=True)
# change datatype of cluster labels to integer (for graphing)
neighborhood_merged['Cluster Labels'] = neighborhood_merged['Cluster Labels'].astype(int)
neighborhood_merged.head(4)

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Malvern, Rouge",Scarborough,43.806686,-79.194353,0,Fast Food Restaurant,Spa,Print Shop,Wine Shop,Construction & Landscaping,Convenience Store,Department Store,Diner,Discount Store,Electronics Store
1,"Rouge Hill, Port Union, Highland Creek",Scarborough,43.784535,-79.160497,2,Construction & Landscaping,Bar,Fried Chicken Joint,Convenience Store,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant,Fish Market
2,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711,0,Restaurant,Fast Food Restaurant,Medical Center,Park,Pizza Place,Electronics Store,Rental Car Location,Mexican Restaurant,Intersection,Greek Restaurant
3,Woburn,Scarborough,43.770992,-79.216917,1,Coffee Shop,Korean Restaurant,Business Service,Fish Market,Convenience Store,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant


## Visualize the clusters on a map


In [379]:
map_clusters = folium.Map(location=scarborough_data['latlng'][0], zoom_start=10)

# set colors for each clusters
cluster_colors = ['blue','orange','green']

# add markers to map
for lat,lng,borough,neighborhood,cluster in zip(neighborhood_merged['Latitude'], neighborhood_merged['Longitude'], \
                                        neighborhood_merged['Borough'], neighborhood_merged['Neighborhood'],\
                                       neighborhood_merged['Cluster Labels']):
    label = f"{neighborhood}, Cluster:{cluster}"
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color=cluster_colors[cluster-1],
    fill=True,
    fill_color= cluster_colors[cluster-1],
    fill_opacity=0.7,
    parse_html=False).add_to(map_clusters)

In [380]:
map_clusters

Analyze each clusters and define categories that distinguish each cluster

### Cluster 1

In [381]:
neighborhood_merged.head(3)

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Malvern, Rouge",Scarborough,43.806686,-79.194353,0,Fast Food Restaurant,Spa,Print Shop,Wine Shop,Construction & Landscaping,Convenience Store,Department Store,Diner,Discount Store,Electronics Store
1,"Rouge Hill, Port Union, Highland Creek",Scarborough,43.784535,-79.160497,2,Construction & Landscaping,Bar,Fried Chicken Joint,Convenience Store,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant,Fish Market
2,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711,0,Restaurant,Fast Food Restaurant,Medical Center,Park,Pizza Place,Electronics Store,Rental Car Location,Mexican Restaurant,Intersection,Greek Restaurant


In [382]:
neighborhood_merged.loc[neighborhood_merged['Cluster Labels'] == 0, \
                       neighborhood_merged.columns[[0] + list(range(4,neighborhood_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Malvern, Rouge",0,Fast Food Restaurant,Spa,Print Shop,Wine Shop,Construction & Landscaping,Convenience Store,Department Store,Diner,Discount Store,Electronics Store
2,"Guildwood, Morningside, West Hill",0,Restaurant,Fast Food Restaurant,Medical Center,Park,Pizza Place,Electronics Store,Rental Car Location,Mexican Restaurant,Intersection,Greek Restaurant
4,Cedarbrae,0,Indian Restaurant,Coffee Shop,Bank,Gas Station,Fried Chicken Joint,Chinese Restaurant,Caribbean Restaurant,Hakka Restaurant,Bakery,Thai Restaurant
5,Scarborough Village,0,Laundromat,Pizza Place,Middle Eastern Restaurant,Furniture / Home Store,Fast Food Restaurant,Convenience Store,Department Store,Diner,Discount Store,Electronics Store
6,"Kennedy Park, Ionview, East Birchmount Park",0,Convenience Store,Light Rail Station,Coffee Shop,Department Store,Metro Station,Hobby Shop,Hockey Arena,Bus Station,Gas Station,Furniture / Home Store
7,"Golden Mile, Clairlea, Oakridge",0,Intersection,Bus Line,Diner,Bakery,Soccer Field,Convenience Store,Park,Ice Cream Shop,Coffee Shop,General Entertainment
8,"Cliffside, Cliffcrest, Scarborough Village West",0,Wings Joint,Chinese Restaurant,Motel,Hardware Store,Park,American Restaurant,Bakery,Gym,Grocery Store,Greek Restaurant
9,"Birch Cliff, Cliffside West",0,Diner,General Entertainment,Café,Skating Rink,College Stadium,Wine Shop,Fast Food Restaurant,Convenience Store,Department Store,Discount Store
10,"Dorset Park, Wexford Heights, Scarborough Town...",0,Indian Restaurant,Bakery,Wine Shop,Chinese Restaurant,Vietnamese Restaurant,Light Rail Station,Fast Food Restaurant,Electronics Store,Pet Store,Gym
11,"Wexford, Maryvale",0,Grocery Store,Breakfast Spot,Vietnamese Restaurant,Gas Station,Fish Market,Korean Restaurant,Coffee Shop,Pizza Place,Seafood Restaurant,Outdoor Supply Store


### Cluster 2

In [383]:
neighborhood_merged.loc[neighborhood_merged['Cluster Labels'] == 1, \
                       neighborhood_merged.columns[[0] + list(range(4,neighborhood_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Woburn,1,Coffee Shop,Korean Restaurant,Business Service,Fish Market,Convenience Store,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant


### Cluster 3

In [384]:
neighborhood_merged.loc[neighborhood_merged['Cluster Labels'] == 2, \
                       neighborhood_merged.columns[[0] + list(range(4,neighborhood_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Rouge Hill, Port Union, Highland Creek",2,Construction & Landscaping,Bar,Fried Chicken Joint,Convenience Store,Department Store,Diner,Discount Store,Electronics Store,Fast Food Restaurant,Fish Market
