In [436]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [437]:
import lxml
import html5lib

# Part 1 -- Web Scrap 

In [438]:
#Use panads read_html to scrape the Toronto data in Wiki 
wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
wiki = wiki[0] #The table store in the [0]

In [439]:
#Only process the cells that have an assigned borough
wiki = wiki[wiki['Borough'] != 'Not assigned'].reset_index(drop = True)

#Merge rows that have repeated Postcode
temp_list = []
for i in range(1,wiki.shape[0]):
    if wiki.iloc[i,0] == wiki.iloc[i-1,0]:
        wiki.iloc[i-1,2] += ',' + wiki.iloc[i,2]
        temp_list.append(i)

wiki = wiki.drop(temp_list).reset_index(drop = True)

#Use temp to store the row that doesn't have an assigned Neighbourhood
temp = wiki[wiki['Neighbourhood'] == 'Not assigned']
temp['Neighbourhood'] = temp['Borough'] #Replace the 'Not assigned' to its Borough
wiki[wiki['Neighbourhood'] == 'Not assigned'] = temp #Assign the processed data into the original wiki table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [440]:
wiki.shape

(103, 3)

In [441]:
wiki

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park"


# Part 2 -- Get Coordinates of Each Neighborhood


In [393]:
import geocoder

In [394]:
#Since Geocoder didn't work, I used the csv file
extra_data = pd.read_csv('Geospatial_Coordinates.csv')
extra_data.shape

(103, 3)

In [395]:
#Create coordiates columns 
wiki['Latitude'] = None
wiki['Longitude'] = None

In [396]:
#Assign location data into wiki dataset
for coord_data in range(extra_data.shape[0]):
    for ori in range(wiki.shape[0]):
        if (extra_data.iloc[coord_data,0] == wiki.iloc[ori,0]):
            wiki.iloc[ori,3] = extra_data.iloc[coord_data,1]
            wiki.iloc[ori,4] = extra_data.iloc[coord_data,2]
            
wiki

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.7185,-79.4648
4,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road",43.6537,-79.5069
99,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.6627,-79.3216
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park",43.6363,-79.4985


# Part 3 -- Explore and Cluster the Neighborhoods in Toronto

In [397]:
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import folium
import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

In [398]:
#Only keep boroughs that contain the word Toronto
temp_list = []

for i in range(wiki.shape[0]):
    if ('Toronto' not in wiki.iloc[i,1].split(' ')):
        temp_list.append(i)

Torondo_wiki = wiki.drop(temp_list).reset_index(drop = True)

In [399]:
#Get Toronto coordinates

address = 'Toronto'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [247]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11.5)

for lat, lng, borough, neighbourhood in zip(Torondo_wiki['Latitude'], Torondo_wiki['Longitude'], Torondo_wiki['Borough'], Torondo_wiki['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [264]:
CLIENT_ID = 'CR0OYVFLX2GO0ABW5RDUEYOLDENDY0JUABRL1OBKXHEWSXHX' # your Foursquare ID
CLIENT_SECRET = 'DHUCYB5DFBS2TNUYYOUPUEV3A0FHEDMLOM0RLNZ0K3XJXU53' # your Foursquare Secret
VERSION = '20200101' # Foursquare API version
radius = 5
LIMIT = 5

In [400]:
#Function to find near venues for every data point
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [401]:
#Toronto venues 
Toronto_venues = getNearbyVenues(names=Torondo_wiki['Neighbourhood'],
                                   latitudes=Torondo_wiki['Latitude'],
                                   longitudes=Torondo_wiki['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson,Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide,King
Dovercourt Village,Dufferin
Harbourfront East,Toronto Islands
Little Portugal,Trinity
The Danforth West,Riverdale
Design Exchange,Toronto Dominion Centre
Brockton,Exhibition Place
The Beaches West,India Bazaar
Commerce Court,Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North,Forest Hill West
High Park,The Junction South
North Toronto West
The Annex,North Midtown
Parkdale,Roncesvalles
Davisville
Harbord,University of Toronto
Runnymede,Swansea
Moore Park,Summerhill East
Chinatown,Grange Park
Deer Park,Forest Hill SE


KeyError: 'groups'

In [402]:
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

In [403]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [404]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [442]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King",Vegetarian / Vegan Restaurant,Concert Hall,Hotel,Plaza,Steakhouse,Yoga Studio,Diner,Cosmetics Shop,Creperie,Cuban Restaurant
1,Berczy Park,Concert Hall,Vegetarian / Vegan Restaurant,Museum,Steakhouse,Liquor Store,Diner,Cosmetics Shop,Creperie,Cuban Restaurant,Dance Studio
2,"Brockton,Exhibition Place",Coffee Shop,Italian Restaurant,Café,Bar,Gym,Food & Drink Shop,Dessert Shop,Cosmetics Shop,Creperie,Greek Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Pizza Place,Farmers Market,Burrito Place,Brewery,Comic Shop,Food & Drink Shop,Department Store,Concert Hall,Cosmetics Shop,Creperie
4,"CN Tower,Bathurst Quay",Airport,Airport Lounge,Airport Terminal,Harbor / Marina,Airport Food Court,Eastern European Restaurant,Creperie,Cuban Restaurant,Dance Studio,Department Store


In [443]:
k = 4

data_for_clustering = Toronto_grouped.drop('Neighborhood',1)

kmeans = KMeans(n_clusters = k).fit(data_for_clustering)

kmeans.labels_

array([0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 3, 1, 1, 0, 1, 3, 1, 1, 3, 3,
       3, 0, 2, 1, 0, 3, 3, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0], dtype=int32)

In [444]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Torondo_wiki

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

Toronto_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606,1,Gym / Fitness Center,Bakery,Restaurant,Coffee Shop,Spa,Food & Drink Shop,Food,Concert Hall,Cosmetics Shop,Creperie
1,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895,3,Coffee Shop,Portuguese Restaurant,Italian Restaurant,Park,Gym,American Restaurant,Eastern European Restaurant,Creperie,Cuban Restaurant,Dance Studio
2,M5B,Downtown Toronto,"Ryerson,Garden District",43.6572,-79.3789,0,Pizza Place,Café,Comic Shop,Plaza,Clothing Store,Food & Drink Shop,Food,Greek Restaurant,Concert Hall,Gastropub
3,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754,1,Gym,Creperie,Restaurant,Italian Restaurant,Japanese Restaurant,Dog Run,Cosmetics Shop,Cuban Restaurant,Dance Studio,Department Store
4,M4E,East Toronto,The Beaches,43.6764,-79.293,0,Other Great Outdoors,Trail,Pizza Place,Health Food Store,Pub,Department Store,Comic Shop,Concert Hall,Cosmetics Shop,Creperie


In [445]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Part 4 -- Observation

In [446]:
Toronto_merged.sort_values('Cluster Labels', ascending = False)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M6P,West Toronto,"High Park,The Junction South",43.6616,-79.4648,3,Park,Gastropub,Italian Restaurant,Bar,Speakeasy,Diner,Concert Hall,Cosmetics Shop,Creperie,Cuban Restaurant
21,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.6969,-79.4113,3,Park,Trail,Sushi Restaurant,Jewelry Store,Diner,Comic Shop,Concert Hall,Cosmetics Shop,Creperie,Cuban Restaurant
20,M4P,Central Toronto,Davisville North,43.7128,-79.3902,3,Breakfast Spot,Park,Hotel,Food & Drink Shop,Department Store,Yoga Studio,Cosmetics Shop,Creperie,Cuban Restaurant,Dance Studio
1,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895,3,Coffee Shop,Portuguese Restaurant,Italian Restaurant,Park,Gym,American Restaurant,Eastern European Restaurant,Creperie,Cuban Restaurant,Dance Studio
18,M4N,Central Toronto,Lawrence Park,43.728,-79.3888,3,Photography Studio,Park,Swim School,Bus Line,Dessert Shop,Comic Shop,Concert Hall,Cosmetics Shop,Creperie,Cuban Restaurant
34,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.6464,-79.3748,3,Park,Vegetarian / Vegan Restaurant,Tea Room,Museum,Steakhouse,Dessert Shop,Comic Shop,Concert Hall,Cosmetics Shop,Creperie
33,M4W,Downtown Toronto,Rosedale,43.6796,-79.3775,3,Park,Trail,Playground,Dessert Shop,Comic Shop,Concert Hall,Cosmetics Shop,Creperie,Cuban Restaurant,Dance Studio
10,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands",43.6408,-79.3818,3,Supermarket,Sporting Goods Shop,Park,Salad Place,Yoga Studio,Dessert Shop,Comic Shop,Concert Hall,Cosmetics Shop,Creperie
29,M4T,Central Toronto,"Moore Park,Summerhill East",43.6896,-79.3832,2,Playground,Diner,Comic Shop,Concert Hall,Cosmetics Shop,Creperie,Cuban Restaurant,Dance Studio,Department Store,Dessert Shop
0,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606,1,Gym / Fitness Center,Bakery,Restaurant,Coffee Shop,Spa,Food & Drink Shop,Food,Concert Hall,Cosmetics Shop,Creperie


Cluster 0 could be about dinning. Most common venuues are restaurants-related, like 'Pizza'.


Cluster 1 is highly related to the Gym industry. Most common venues are 'Gym / Fitness Center', 'Yoga Studio', 'Coffee Shop' etc.

Cluster 2 is a special cluster because its 1st most common venus is 'Playground', which is very different with others.

Cluster 3 is like a living area. Most common venues are 'Park', 'Supermarket', 'Breakfast Spot' etc.