In [1]:
import numpy as np
import pandas as pd
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Part 1 - Data Preparation

In [3]:
# scrape the following Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wikipage = requests.get(url)
data = pd.read_html(wikipage.content)[0]
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [29]:
df = data [data ["Borough"] != "Not assigned"]
df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [37]:
df = df.groupby(['Postal Code','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [38]:
df.shape

(103, 3)

### Part 2 Geocode

In [39]:
# extract geocodes from csv file
csv = 'http://cocl.us/Geospatial_data'
df_geocodes = pd.read_csv(csv)
df_geocodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [41]:
combined = df.join(df_geocodes.set_index('Postal Code'), on = 'Postal Code')
combined.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Part 3 Neighbourhood Clustering

In [49]:
# Toronto coordinates
address = 'Downtown Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


In [52]:
# create a map
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=10)

for latitude, longtitude, borough, neighbourhood in zip(combined['Latitude'], combined['Longitude'], combined['Borough'], combined['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude, longtitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

In [112]:
# Foursquare credential
CLIENT_ID = 'LWHBU2HYHPRXHQ4VTSU1HMCL0FEVXBQSNGW3QFFS4KYELAW5'
CLIENT_SECRET = 'XXUEMEXQLCW4M0MZSVZHBKLQB22VMYW1YBFKGTIMP5ANWIXS'
VERSION = '20180605'

In [93]:
# get the neighbourhood geocodes
address = 'Scarborough,Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
neighbourhood_latitude = combined.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = combined.loc[0, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Malvern, Rouge are 43.806686299999996, -79.19435340000001.


In [106]:
# top 100 venues within 500m
LIMIT = 100
radius = 1000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    Client_Id, 
    Client_Secret, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
results = requests.get(url).json()

In [101]:
import json
from pandas.io.json import json_normalize 
venues=results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
nearby_venues.columns

Index(['referralId', 'reasons.count', 'reasons.items', 'venue.id',
       'venue.name', 'venue.location.address', 'venue.location.crossStreet',
       'venue.location.lat', 'venue.location.lng',
       'venue.location.labeledLatLngs', 'venue.location.distance',
       'venue.location.postalCode', 'venue.location.cc', 'venue.location.city',
       'venue.location.state', 'venue.location.country',
       'venue.location.formattedAddress', 'venue.categories',
       'venue.photos.count', 'venue.photos.groups',
       'venue.location.neighborhood', 'venue.venuePage.id'],
      dtype='object')

In [102]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Part 4 Venues

In [103]:
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Images Salon & Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",43.802283,-79.198565
1,African Rainforest Pavilion,"[{'id': '58daa1558bbb0b01f18ec1fd', 'name': 'Z...",43.817725,-79.183433
2,Toronto Pan Am Sports Centre,"[{'id': '4f4528bc4b90abdf24c9de85', 'name': 'A...",43.790623,-79.193869
3,Polar Bear Exhibit,"[{'id': '4bf58dd8d48988d17b941735', 'name': 'Z...",43.823372,-79.185145
4,Toronto Zoo,"[{'id': '4bf58dd8d48988d17b941735', 'name': 'Z...",43.820582,-79.181551


In [104]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

In [105]:
a=pd.Series(nearby_venues.categories)
a.value_counts()[:10]

Zoo Exhibit             17
Fast Food Restaurant     3
Gas Station              2
Pizza Place              2
Zoo                      2
Athletics & Sports       2
Restaurant               2
Other Great Outdoors     1
Liquor Store             1
Tram Station             1
Name: categories, dtype: int64

In [125]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # making GET request
        venue_results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in venue_results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [113]:
downtown_venues = getNearbyVenues(names=combined['Neighbourhood'],
                                   latitudes=combined['Latitude'],
                                   longitudes=combined['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [115]:
print('There are {} Uniques Categories.'.format(len(downtown_venues['Venue Category'].unique())))
downtown_venues.groupby('Neighborhood').count().head()

There are 325 Uniques Categories.


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,8,8,8,8,8,8
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,8,8,8,8,8,8
"Bedford Park, Lawrence Manor East",30,30,30,30,30,30


In [119]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_onehot.head(5)

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
num_top_venues = 5
for hood in downtown_grouped['Neighborhood']:
    print("---- "+hood+" ----")
    temp =downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Agincourt ----
             venue  freq
0        Pool Hall  0.12
1           Lounge  0.12
2   Sandwich Place  0.12
3  Badminton Court  0.12
4        Newsagent  0.12


---- Alderwood, Long Branch ----
                venue  freq
0   Convenience Store  0.22
1         Pizza Place  0.22
2                 Pub  0.11
3  Athletics & Sports  0.11
4         Coffee Shop  0.11


---- Bathurst Manor, Wilson Heights, Downsview North ----
           venue  freq
0           Bank  0.09
1    Coffee Shop  0.09
2           Park  0.09
3    Gas Station  0.04
4  Shopping Mall  0.04


---- Bayview Village ----
                 venue  freq
0                 Bank  0.25
1         Skating Rink  0.12
2  Japanese Restaurant  0.12
3                 Café  0.12
4   Chinese Restaurant  0.12


---- Bedford Park, Lawrence Manor East ----
                venue  freq
0  Italian Restaurant  0.10
1         Coffee Shop  0.10
2      Sandwich Place  0.07
3          Hobby Shop  0.07
4           Juice Bar  0.03


---- Berczy

         venue  freq
0         Park   0.6
1   Playground   0.2
2        Trail   0.2
3  Yoga Studio   0.0
4    Nightclub   0.0


---- Roselawn ----
                     venue  freq
0               Playground  0.17
1                      Spa  0.17
2     Fast Food Restaurant  0.17
3  Health & Beauty Service  0.17
4                Pet Store  0.17


---- Rouge Hill, Port Union, Highland Creek ----
            venue  freq
0  Breakfast Spot  0.50
1    Burger Joint  0.25
2             Bar  0.25
3     Yoga Studio  0.00
4          Office  0.00


---- Runnymede, Swansea ----
                venue  freq
0                Café  0.09
1         Coffee Shop  0.07
2         Pizza Place  0.05
3  Italian Restaurant  0.05
4                 Pub  0.05


---- Runnymede, The Junction North ----
                venue  freq
0         Coffee Shop  0.12
1             Brewery  0.12
2  Athletics & Sports  0.12
3                Park  0.12
4   Convenience Store  0.06


---- Scarborough Village ----
                  v

In [121]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [128]:
# most common venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(20)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Newsagent,Pool Hall,Breakfast Spot,Badminton Court,Lounge,Latin American Restaurant,Sandwich Place,Dumpling Restaurant,Dog Run
1,"Alderwood, Long Branch",Pizza Place,Convenience Store,Athletics & Sports,Pub,Gym,Gas Station,Coffee Shop,Farm,Distribution Center,Dive Bar
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Park,Pharmacy,Sushi Restaurant,Supermarket,Intersection,Deli / Bodega,Gift Shop,Ice Cream Shop
3,Bayview Village,Bank,Grocery Store,Japanese Restaurant,Chinese Restaurant,Skating Rink,Intersection,Café,Women's Store,Dog Run,Doner Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Hobby Shop,Comfort Food Restaurant,Bank,Pub,Bagel Shop,Bakery,Juice Bar
5,Berczy Park,Coffee Shop,Hotel,Restaurant,Café,Beer Bar,Japanese Restaurant,Cocktail Bar,Grocery Store,Park,Lounge
6,"Birch Cliff, Cliffside West",Park,Café,General Entertainment,Skating Rink,Diner,Thai Restaurant,College Stadium,Event Space,Donut Shop,Discount Store
7,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Bar,Restaurant,Nightclub,Thrift / Vintage Store,Bakery,Supermarket,Japanese Restaurant,Performing Arts Venue
8,"Business reply mail Processing Centre, South C...",Fast Food Restaurant,Light Rail Station,Coffee Shop,Restaurant,Harbor / Marina,Brewery,Bakery,Steakhouse,Beer Store,Board Shop
9,"CN Tower, King and Spadina, Railway Lands, Har...",Rental Car Location,Harbor / Marina,Airport Terminal,Airport Lounge,Sculpture Garden,Coffee Shop,Boat or Ferry,Airport Gate,Boutique,Airport Food Court
