# 1 - Build & clean dataframe 
---

In [101]:
import pandas as pd

## 1a - Scrape Web page content into dataframe

In [102]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dfs = pd.read_html(url)

In [103]:
df = dfs[0] # Read first table in HTML document

In [104]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 1b - Dataframe Cleanup

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [105]:
df = df.drop(df[df.Borough == 'Not assigned'].index)

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [106]:
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


==> No cell meets this criteria

In [107]:
print("Number of rows:", df.shape[0])

Number of rows: 103


<p style="margin-top: 6em"></p>

# 2 - Add geographical coordinates to dataframe
___

## 2a - Load geolocation coordinates

In [108]:
#Using csv file because geolocation API too unreliable
geo_df = pd.read_csv("geospatial_coordinates.csv")

def get_coordinates_from_postal_code(postal_code):
    rec = geo_df[geo_df['Postal Code'] == postal_code]
    lat = rec['Latitude'].item()
    lng = rec['Longitude'].item()
    return lat, lng


## 2b - Add latitude & longitude to dataframe

In [109]:
postal_codes = df['Postal Code'].values
latitudes = []
longitudes = []

for postal_code in postal_codes:
    lat, lng = get_coordinates_from_postal_code(postal_code)
    latitudes.extend([lat])
    longitudes.extend([lng])
    
df['Latitude'] = latitudes
df['Longitude'] = longitudes


In [110]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<p style="margin-top: 6em"></p>

# 3 - Explore and cluster neighborhoods in Toronto
___

## 3a - Create Downtown Toronto data subset

In [111]:
downtown_toronto_data = df[df.Borough == 'Downtown Toronto']
downtown_toronto_data

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
31,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
40,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
41,M6G,Downtown Toronto,Christie,43.669542,-79.422564
49,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
58,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
67,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


In [112]:
downtown_toronto_latitude = downtown_toronto_data['Latitude'].mean()
downtown_toronto_longitude = downtown_toronto_data['Longitude'].mean()

## 3b - Display Downtown Toronto map with Neighborhood markers

In [113]:
map_downtown_toronto = folium.Map(location=[downtown_toronto_latitude, downtown_toronto_longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(downtown_toronto_data['Latitude'], downtown_toronto_data['Longitude'], downtown_toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)  
    
map_downtown_toronto

In [114]:
CLIENT_ID = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
CLIENT_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

In [115]:
VERSION = '20180605'
LIMIT = 100 # limit of number of venues returned by Foursquare API
RADIUS = 500 # define radius in meters

In [116]:
import json
import requests
import numpy as np

In [117]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
      'Neighborhood Latitude', 
      'Neighborhood Longitude', 
      'Venue', 
      'Venue Latitude', 
      'Venue Longitude', 
      'Venue Category']
    
    return(nearby_venues)

In [118]:
downtown_toronto_venues = getNearbyVenues(
    names=downtown_toronto_data['Neighborhood'],
    latitudes=downtown_toronto_data['Latitude'],
    longitudes=downtown_toronto_data['Longitude']
)

In [119]:
print("Downtown Toronto Venues:", downtown_toronto_venues.shape[0])
downtown_toronto_venues.head()

Downtown Toronto Venues: 1218


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


### Venue counts per Downtown Toronto neighborhood

In [120]:
downtown_toronto_venues.groupby('Neighborhood').count()[['Venue']]

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Berczy Park,55
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15
Central Bay Street,64
Christie,17
Church and Wellesley,78
"Commerce Court, Victoria Hotel",100
"First Canadian Place, Underground city",100
"Garden District, Ryerson",100
"Harbourfront East, Union Station, Toronto Islands",100
"Kensington Market, Chinatown, Grange Park",58


In [121]:
print('There are {} uniques categories.'.format(len(downtown_toronto_venues['Venue Category'].unique())))

There are 210 uniques categories.


## 3c - Analyze each neighborhood

In [122]:
# one hot encoding
downtown_toronto_onehot = pd.get_dummies(downtown_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_toronto_onehot['Neighborhood'] = downtown_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_toronto_onehot.columns[-1]] + list(downtown_toronto_onehot.columns[:-1])
downtown_toronto_onehot = downtown_toronto_onehot[fixed_columns]

downtown_toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
downtown_toronto_grouped = downtown_toronto_onehot.groupby('Neighborhood').mean().reset_index()
downtown_toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.025641,0.012821,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0


In [124]:
def get_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [125]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_toronto_grouped['Neighborhood']

for ind in np.arange(downtown_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = get_most_common_venues(downtown_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Beer Bar,Café,Cheese Shop,Bakery,Breakfast Spot,Pub
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Terminal,Airport Lounge,Sculpture Garden,Rental Car Location,Boat or Ferry,Harbor / Marina,Airport Food Court,Airport,Boutique
2,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Japanese Restaurant,Ice Cream Shop,Thai Restaurant,Burger Joint,Bubble Tea Shop,Salad Place
3,Christie,Grocery Store,Café,Park,Baby Store,Nightclub,Coffee Shop,Italian Restaurant,Diner,Restaurant,Athletics & Sports
4,Church and Wellesley,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Café,Pub,Hotel,Gastropub,Yoga Studio


## 3d - Cluster Downtown Toronto neighborhoods

In [126]:
from sklearn.cluster import KMeans

In [127]:
# set number of clusters
kclusters = 5

downtown_toronto_grouped_clustering = downtown_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_toronto_grouped_clustering)

In [128]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_toronto_merged = downtown_toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_toronto_merged = downtown_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtown_toronto_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Café,Theater,Yoga Studio,Cosmetics Shop
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Yoga Studio,Burrito Place,Smoothie Shop,Beer Bar,Italian Restaurant,Sandwich Place,Distribution Center,Diner,Café
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Clothing Store,Coffee Shop,Cosmetics Shop,Restaurant,Bubble Tea Shop,Café,Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Fast Food Restaurant
22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Café,Coffee Shop,Cocktail Bar,American Restaurant,Gastropub,Restaurant,Lingerie Store,Creperie,Art Gallery,Hotel
31,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Beer Bar,Café,Cheese Shop,Bakery,Breakfast Spot,Pub
40,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Japanese Restaurant,Ice Cream Shop,Thai Restaurant,Burger Joint,Bubble Tea Shop,Salad Place
41,M6G,Downtown Toronto,Christie,43.669542,-79.422564,2,Grocery Store,Café,Park,Baby Store,Nightclub,Coffee Shop,Italian Restaurant,Diner,Restaurant,Athletics & Sports
49,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,1,Coffee Shop,Café,Restaurant,Deli / Bodega,Gym,Hotel,Thai Restaurant,Clothing Store,Sushi Restaurant,American Restaurant
58,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,1,Coffee Shop,Aquarium,Hotel,Café,Restaurant,Scenic Lookout,Italian Restaurant,Fried Chicken Joint,Sporting Goods Shop,Brewery
67,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,1,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Japanese Restaurant,Italian Restaurant,Salad Place,Seafood Restaurant,Deli / Bodega


### Visualizing clusters

In [130]:
# create map
map_clusters = folium.Map(location=[downtown_toronto_latitude, downtown_toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_toronto_merged['Latitude'], downtown_toronto_merged['Longitude'], downtown_toronto_merged['Neighborhood'], downtown_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 3e - Examining Clusters

In [152]:
def show_cluster(cluster_label):
    return downtown_toronto_merged.loc[
        downtown_toronto_merged['Cluster Labels'] == cluster_label, 
        downtown_toronto_merged.columns[[1] + list(range(5,downtown_toronto_merged.shape[1]))]
    ]

### 1st Cluster

In [153]:
show_cluster(0)

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
139,Downtown Toronto,0,Airport Service,Airport Terminal,Airport Lounge,Sculpture Garden,Rental Car Location,Boat or Ferry,Harbor / Marina,Airport Food Court,Airport,Boutique


### 2nd Cluster

In [144]:
show_cluster(1)

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Downtown Toronto,1,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Café,Theater,Yoga Studio,Cosmetics Shop
6,Downtown Toronto,1,Coffee Shop,Yoga Studio,Burrito Place,Smoothie Shop,Beer Bar,Italian Restaurant,Sandwich Place,Distribution Center,Diner,Café
13,Downtown Toronto,1,Clothing Store,Coffee Shop,Cosmetics Shop,Restaurant,Bubble Tea Shop,Café,Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Fast Food Restaurant
22,Downtown Toronto,1,Café,Coffee Shop,Cocktail Bar,American Restaurant,Gastropub,Restaurant,Lingerie Store,Creperie,Art Gallery,Hotel
31,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Beer Bar,Café,Cheese Shop,Bakery,Breakfast Spot,Pub
40,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Japanese Restaurant,Ice Cream Shop,Thai Restaurant,Burger Joint,Bubble Tea Shop,Salad Place
49,Downtown Toronto,1,Coffee Shop,Café,Restaurant,Deli / Bodega,Gym,Hotel,Thai Restaurant,Clothing Store,Sushi Restaurant,American Restaurant
58,Downtown Toronto,1,Coffee Shop,Aquarium,Hotel,Café,Restaurant,Scenic Lookout,Italian Restaurant,Fried Chicken Joint,Sporting Goods Shop,Brewery
67,Downtown Toronto,1,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Japanese Restaurant,Italian Restaurant,Salad Place,Seafood Restaurant,Deli / Bodega
76,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Italian Restaurant,Japanese Restaurant,Deli / Bodega,Seafood Restaurant


### 3rd Cluster

In [146]:
show_cluster(2)

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,Downtown Toronto,2,Grocery Store,Café,Park,Baby Store,Nightclub,Coffee Shop,Italian Restaurant,Diner,Restaurant,Athletics & Sports


### 4th Cluster

In [147]:
show_cluster(3)

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
147,Downtown Toronto,3,Park,Playground,Trail,Creperie,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


### 5th Cluster

In [149]:
show_cluster(4)

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
121,Downtown Toronto,4,Café,Italian Restaurant,Bar,Japanese Restaurant,Bookstore,Bakery,Restaurant,Sandwich Place,Beer Bar,Beer Store
130,Downtown Toronto,4,Café,Mexican Restaurant,Vietnamese Restaurant,Bakery,Coffee Shop,Grocery Store,Bar,Dessert Shop,Gaming Cafe,Vegetarian / Vegan Restaurant


## 3f - Observations

1. The largest cluster is one where coffe shops/cafes are most common - West side of Downtown Toronto
2. The following general names might be assigned to each cluster:
    1. Airport services
    2. Coffee shops
    3. Groceries & convenience
    4. Urban outdoor
    5. Restaurants and eateries