# Capstone Project Notebook - Week 3 - Toronto clustering

## Step 1 : Get data from Wikipedia webpage

In [1]:
#Import Library
import requests
import pandas as pd
import wikipedia as wp

In [2]:
# Ignore Unverified HTTPS request
import os
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
os.environ['CURL_CA_BUNDLE'] = ""
os.environ['PYTHONWARNINGS']="ignore:Unverified HTTPS request"

In [3]:
#Get the html source in Wikipedia
html = wp.page("List_of_postal_codes_of_Canada: M").html().encode("UTF-8")

In [4]:
# Read with pandapd.DataFrame(
df_wiki = pd.read_html(html)
df_wiki = pd.DataFrame(df_wiki[0])
df_wiki.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park / Ontario Provincial Government,M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [5]:
#Initialize the final dataframe
df_toronto = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])

In [6]:
# Fill in the final dataframe

# Loop over the lines
for ii in range(df_wiki.shape[0]):
    # Loop over the columns
    for jj in range(df_wiki.shape[1]):
        
        # Cell content
        cell=df_wiki.iloc[ii,jj]
        
        # Postcode
        postcode=cell[0:3]
        
        # Remaining info
        name=cell[3:]
        
        # Ignore Not assigned values
        if name != 'Not assigned':
            
            #print(postcode)
            split_name = name.split('(')
            
            # Borough list
            borough_list = split_name[0].split('/')
            #print(borough_list)
            
            # Neighbourhood list
            if len(split_name)>1:
                neighbourhood_list = name.split('(')[1].split('/')
                neighbourhood_list[-1] = neighbourhood_list[-1].split(')')[0]
            else:
                neighbourhood_list=borough_list
            #print(neighbourhood_list)    
            
            for kk in range(len(borough_list)):
                
                borough = borough_list[kk]
                
                for ll in range(len(neighbourhood_list)):
                        
                    neighbourhood = neighbourhood_list[ll]
                            
                    # Add the row to the dataframe
                    df_toronto.loc[len(df_toronto)]=[postcode,borough,neighbourhood]

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Manor


In [7]:
# Group by
df_toronto = pd.DataFrame(df_toronto.groupby(by=['PostalCode','Borough'])['Neighborhood']\
                          .apply(lambda x: "%s" % ', '.join(x)))

In [8]:
#Reset index
df_toronto= df_toronto.reset_index()
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
df_toronto.shape

(104, 3)

## Step 2 : Get latitude and longitude of Neighborhood

In [10]:
# import Libraries
import geocoder 
import numpy as np

In [11]:
def lookfor_coordinates(neighborhood,postal_code):

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        str_lookfor=postal_code +' , '+ neighborhood
        print(str_lookfor)
        g = geocoder.google(str_lookfor)
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude,longitude

In [12]:
## Calls to geocoder are not working well, infinite loop
#lookfor_coordinates(df_toronto['Neighborhood'][0],df_toronto['PostalCode'][0])

In [13]:
# Read lat,long csv
df_loc = pd.read_csv('Geospatial_Coordinates.csv')
df_loc.columns=['PostalCode','Latitude','Longitude']
df_loc.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Merge datasets
df_toronto = pd.merge(df_toronto,df_loc,on='PostalCode')
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Step 3 : Cluster analysis

In [15]:
## Import librairies
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors

### Filter on Toronto data

In [16]:
list_borough=list(df_toronto['Borough'].unique())
list_toronto=[ii for ii in list_borough if 'Toronto' in ii]
print(list_toronto)
neighborhoods = df_toronto[df_toronto['Borough'].isin(list_toronto)]
neighborhoods.head()

['East Toronto', 'East YorkEast Toronto', 'Central Toronto', 'Downtown Toronto', 'Downtown TorontoStn A PO Boxes25 The Esplanade', 'West Toronto', 'East TorontoBusiness reply mail Processing Centre969 Eastern']


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
40,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
41,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923


### Use geopy library to get the latitude and longitude values of Toronto

In [17]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top.

In [18]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Define Foursquare Credentials and Version

In [19]:
CLIENT_ID = 'AKC0LB0R50HQDF5ZJNKXKADAVJDRVJLKSOOOUT3GLWMIVRHM' # your Foursquare ID
CLIENT_SECRET = '4JYPUHE2U5TZBMO0ULPBP22A53NFHO32MQFSCVKN0AYECMM2' # your Foursquare Secret
VERSION = '20200301'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: AKC0LB0R50HQDF5ZJNKXKADAVJDRVJLKSOOOUT3GLWMIVRHM
CLIENT_SECRET:4JYPUHE2U5TZBMO0ULPBP22A53NFHO32MQFSCVKN0AYECMM2


### Get the top 100 venues that are in neighbourhoods within a radius of 500 meters.

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url,verify=False).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
# type your answer here
LIMIT=100
toronto_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

The Beaches
The Danforth East
The Danforth West ,  Riverdale
India Bazaar ,  The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park ,  Summerhill East
Summerhill West ,  Rathnelly ,  South Hill ,  Forest Hill SE ,  Deer Park
Rosedale
St. James Town ,  Cabbagetown
Church and Wellesley
Regent Park ,  Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond ,  Adelaide ,  King
Harbourfront East ,  Union Station ,  Toronto Islands
Toronto Dominion Centre ,  Design Exchange
Commerce Court ,  Victoria Hotel
Roselawn
Forest Hill North & West
The Annex ,  North Midtown ,  Yorkville
University of Toronto ,  Harbord
Kensington Market ,  Chinatown ,  Grange Park
CN Tower ,  King and Spadina ,  Railway Lands ,  Harbourfront West ,  Bathurst Quay ,  South Niagara ,  Island airport
Enclave of M5E
First Canadian Place ,  Underground city
Christie
Dufferin ,  Dovercourt Village
Little Portugal ,  Trinity
Brockton ,  

In [22]:
print(toronto_venues.shape)
toronto_venues.head()

(1688, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth East,43.685347,-79.338106,The Path,43.683923,-79.335007,Park


In [23]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 240 uniques categories.


### Analyze Each Neighborhood

#### One hot encoding

In [24]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group by

In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower , King and Spadina , Railway Lands ...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cluster Neighborhoods

In [26]:
from sklearn.cluster import KMeans 

In [27]:
# Drop Neighbourhood column
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

In [28]:
# Fit model
K=6
neigh = KMeans(init = "k-means++", n_clusters = K, n_init = 20).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
neigh.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 4,
       0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0])

In [29]:
# add clustering labels
toronto_grouped.insert(0, 'Cluster Labels', neigh.labels_)

In [30]:
toronto_merged = toronto_grouped

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = pd.merge(toronto_grouped,neighborhoods, on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Cluster Labels,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,PostalCode,Borough,Latitude,Longitude
0,0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M5E,Downtown Toronto,43.644771,-79.373306
1,0,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M6K,West Toronto,43.636847,-79.428191
2,0,"CN Tower , King and Spadina , Railway Lands ...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,M5V,Downtown Toronto,43.628947,-79.39442
3,0,Central Bay Street,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012048,0.0,0.0,0.0,M5G,Downtown Toronto,43.657952,-79.387383
4,0,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M6G,Downtown Toronto,43.669542,-79.422564


### Map clusters

In [31]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(K)
ys = [i + x + (i*x)**2 for i in range(K)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], \
                                  toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters