# Step 1: Create a table of the neighborhoods in Toronto

### Load packages.

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the source file of the Wiki page.

In [2]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
source = response.text
wiki_page = BeautifulSoup(source, 'lxml')

### Find the text related to the table.

In [25]:
# Take the text of the first <table> on the page, break it into lines and
# discard the empty ones, leaving a flat list of cell strings.
table_lines = wiki_page.find('table').text.split('\n')
find_table = list(filter(lambda cell: cell != '', table_lines))
print(find_table[:10])

['Postcode', 'Borough', 'Neighbourhood', 'M1A', 'Not assigned', 'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A']


### Create a pd dataframe.

In [119]:
# Fold the flat list of cells into rows of three; the first row still holds
# the table's header text at this point.
n_rows = int(len(find_table) / 3)
table_cells = np.array(find_table).reshape((n_rows, 3))
df = pd.DataFrame(table_cells)
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Promote the first row to column headers and rename the first column to `PostalCode`.

In [120]:
# The first row holds the header text: use it as the column names, with the
# first column called 'PostalCode', then drop that row from the data.
# Assigning a fresh list of labels replaces writing into `df.columns.values`,
# which mutates the Index's backing array in place and is not supported.
header = df.iloc[0].tolist()
header[0] = 'PostalCode'
df = df.iloc[1:].copy()
df.columns = header
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Clean this dataframe.

In [121]:
# Keep only the rows whose Borough has actually been assigned.
df = df[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [122]:
# Where Neighbourhood is 'Not assigned', fall back to the row's Borough.
# Building a new column with Series.mask avoids the chained assignment
# `df[col][mask] = ...`, which raises SettingWithCopyWarning and has no effect
# when pandas copy-on-write is enabled.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(neighbourhood_missing, df['Borough']))
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [124]:
# Collapse to one row per (PostalCode, Borough) pair, joining the names of
# all neighbourhoods that share the pair into one comma-separated string.
neighbourhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].agg(','.join)
df = neighbourhoods_by_code.reset_index(name='Neighbourhood')
print(df.head())
print(df.shape)

  PostalCode      Borough                         Neighbourhood
0        M1B  Scarborough                         Rouge,Malvern
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2        M1E  Scarborough       Guildwood,Morningside,West Hill
3        M1G  Scarborough                                Woburn
4        M1H  Scarborough                             Cedarbrae
(103, 3)


# Step 2: Get Location of each postal code

In [126]:
# Load the latitude/longitude table and call its first column 'PostalCode'
# so that it lines up with `df`. rename() returns a frame with a new column
# Index rather than writing into `columns.values`, which mutates the Index's
# backing array in place and is not supported.
geodata = pd.read_csv('http://cocl.us/Geospatial_data')
geodata = geodata.rename(columns={geodata.columns[0]: 'PostalCode'})
geodata.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [128]:
# Attach the coordinates to each postal code (inner join on the shared key).
df = pd.merge(df, geodata, on='PostalCode')
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Step 3: Cluster the neighborhoods in boroughs that contain the word Toronto 

### Select rows where Borough contains the word 'Toronto'

In [133]:
# Boolean mask: True where the borough name contains "Toronto".
in_toronto = df['Borough'].str.contains("Toronto")
df2 = df.loc[in_toronto]
print(df2.head())
print(df2.shape)

   PostalCode          Borough                  Neighbourhood   Latitude  \
37        M4E     East Toronto                    The Beaches  43.676357   
41        M4K     East Toronto    The Danforth West,Riverdale  43.679557   
42        M4L     East Toronto  The Beaches West,India Bazaar  43.668999   
43        M4M     East Toronto                Studio District  43.659526   
44        M4N  Central Toronto                  Lawrence Park  43.728020   

    Longitude  
37 -79.293031  
41 -79.352188  
42 -79.315572  
43 -79.340923  
44 -79.388790  
(38, 5)


### Import packages.

In [134]:
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### API configurations.

In [None]:
# Foursquare API settings. The credentials are read from the environment so
# they never have to be typed into (and saved with) the notebook; when the
# variables are not set, the original '???' placeholders are used.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '???') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '???') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

### A function to get the category of the venue.

In [136]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping
        Record indexed by string keys. The category list is read from
        ``row['categories']``; if that key is missing, ``row['venue.categories']``
        is used instead.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a
        # genuine problem and is allowed to propagate.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### A function to repeatedly get information for each neighborhood.

In [137]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped, so element ``i`` of each one
        describes the same location.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()  # surface bad credentials / quota errors clearly

        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # None (not an IndexError) when the venue has no category
                get_category_type(venue)))

    # Passing the names at construction keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

### Get the data from Foursquare.

In [139]:
# One request per postal code; the postal code plays the role of the
# neighborhood name in the resulting table.
toronto_venues = getNearbyVenues(
    names=df2['PostalCode'],
    latitudes=df2['Latitude'],
    longitudes=df2['Longitude'],
)

M4E
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6G
M6H
M6J
M6K
M6P
M6R
M6S
M7Y


In [140]:
# Size of the venue table, followed by a preview of its first rows.
venue_table_shape = toronto_venues.shape
print(venue_table_shape)
toronto_venues.head()

(1699, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
1,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,M4E,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4E,43.676357,-79.293031,Beaches Fitness,43.680319,-79.290991,Gym / Fitness Center


### Check the number of categories.

In [141]:
# Number of distinct venue categories across all returned venues.
n_categories = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(n_categories))

There are 236 uniques categories.


### Calculate the relative frequency of each venue category for every postal code.

In [156]:
# One indicator column per venue category (column name = category name).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the postal code of each venue in front of the indicator columns.
toronto_onehot.insert(0, 'PostalCode', toronto_venues['Neighborhood'])

# Averaging the indicators per postal code gives, for every category, the
# share of that postal code's venues falling in it.
toronto_grouped = toronto_onehot.groupby('PostalCode', as_index=False).mean()

toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use k-means for clustering and display the results.

In [157]:
# set number of clusters
kclusters = 5

# Features only: drop the identifier column. The axis is given by keyword
# because pandas 2.x no longer accepts it positionally, as in drop('PostalCode', 1).
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# run k-means clustering; n_init is pinned so the result does not depend on
# which default the installed scikit-learn version uses
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 2, 2, 3, 2, 2, 2, 4, 2])

In [166]:
# Pair every clustered postal code with its k-means label.
cluster_result = toronto_grouped[['PostalCode']].copy()
cluster_result['Cluster Labels'] = kmeans.labels_

# Attach the labels to df2. The result has to be assigned back, otherwise
# df2 is left without the column. Dropping an existing 'Cluster Labels'
# column first keeps the cell re-runnable, and join(on=...) preserves df2's
# row index.
df2 = (
    df2.drop(columns='Cluster Labels', errors='ignore')
    .join(cluster_result.set_index('PostalCode'), on='PostalCode')
)
df2

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,2
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,2
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,2
43,M4M,East Toronto,Studio District,43.659526,-79.340923,2
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,4
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,2
