## Wrangle and Clean the Data

In [1]:
import os

import numpy as np
import pandas as pd
import requests

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#!conda install -c conda-forge folium=0.5.0
!pip install folium
import folium



In [2]:
# Scrape every HTML table from the Wikipedia page and keep the first one,
# which is the postal-code table used throughout this notebook.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_url)
df = wiki_tables[0]

In [3]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# (rows, columns) of the scraped table before any filtering.
df.shape

(180, 3)

In [5]:
# Drop every row whose Borough is the placeholder 'Not assigned'.

has_borough = df['Borough'].ne('Not assigned')
Neighborhoods = df[has_borough]
Neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# (rows, columns) remaining after removing rows without an assigned Borough.
Neighborhoods.shape

(103, 3)

In [7]:
# Count the remaining rows whose Neighbourhood is still the placeholder value.
unassigned_rows = Neighborhoods[Neighborhoods['Neighbourhood'] == 'Not assigned']
'{} neighbourhoods had a non assigned value.'.format(len(unassigned_rows))

'0 neighbourhoods had a non assigned value.'

In [8]:
# Report how many rows are left in the filtered table.
'Our dataframe has {} rows.'.format(len(Neighborhoods))

'Our dataframe has 103 rows.'

## Find the exact location of each Neighborhood

In [9]:
# The code was removed by Watson Studio for sharing.

In [10]:
#geographical data for each postal code - retrieved from a csv file
# NOTE(review): `body` is not defined in any visible cell; presumably it is the
# file-like object created by the Watson Studio cell whose code was removed
# for sharing - confirm before running on a fresh kernel.

geographical_data = pd.read_csv(body)
geographical_data.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# (rows, columns) of the coordinates table.
geographical_data.shape

(103, 3)

In [12]:
# Attach Latitude/Longitude to each postal code via an inner join on 'Postal Code'.
# Only the three descriptive columns are taken from the left-hand frame: because
# this cell overwrites `Neighborhoods`, merging the full frame again on a re-run
# would duplicate the coordinates as Latitude_x / Latitude_y. Selecting the
# columns first makes the cell safe to execute more than once.
Neighborhoods = pd.merge(
    Neighborhoods[['Postal Code', 'Borough', 'Neighbourhood']],
    geographical_data,
    on='Postal Code',
)

In [13]:
# Preview the first 11 rows of the merged table (postal code, borough, neighbourhood, coordinates).
Neighborhoods.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [14]:
# (rows, columns) after the merge with the coordinates table.
Neighborhoods.shape

(103, 5)

## Explore the neighborhoods of Toronto via Foursquare API

In [15]:
# Keep only the rows whose Borough name contains the word 'Toronto'.
is_toronto_borough = Neighborhoods['Borough'].str.contains('Toronto')
Toronto_Neigh = Neighborhoods[is_toronto_borough]
'There are {} Boroughs that contain the word Toronto. We will continue with only these ones.'.format(len(Toronto_Neigh))

'There are 39 Boroughs that contain the word Toronto. We will continue with only these ones.'

In [16]:
# Renumber the rows 0..n-1 after filtering.
# Reassign instead of using inplace=True: Toronto_Neigh was produced by
# filtering another frame, and rebinding the name keeps the cell free of
# in-place mutation of an object an earlier cell created.
Toronto_Neigh = Toronto_Neigh.reset_index(drop=True)
Toronto_Neigh.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [17]:
# connect to the Foursquare API to explore the Neighborhoods

# Credentials are read from the environment so they never have to be typed into
# (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; the '****' placeholder is
# only a fallback and will be rejected by the API.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '****')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '****')
VERSION = '20180605'  # value sent as the `v` query parameter
LIMIT = 100           # value sent as the `limit` query parameter
RADIUS = 500          # value sent as the `radius` query parameter

In [18]:
#retrieve venues for each neighborhood of Toronto - using data from Foursquare

def getNearbyVenues(Names, Latitudes, Longitudes, radius=None, limit=None, timeout=30):
    """Query the Foursquare `venues/explore` endpoint once per location.

    Parameters
    ----------
    Names, Latitudes, Longitudes : iterables
        Parallel sequences; they are zipped together, so iteration stops at the
        shortest one.
    radius : optional
        Value for the `radius` query parameter. Defaults to the module-level
        RADIUS.
    limit : optional
        Value for the `limit` query parameter. Defaults to the module-level
        LIMIT.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame has
        these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
               'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    search_radius = RADIUS if radius is None else radius
    result_limit = LIMIT if limit is None else limit

    rows = []
    for name, LATITUDE, LONGITUDE in zip(Names, Latitudes, Longitudes):
        # Let requests build and encode the query string instead of formatting
        # the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(LATITUDE, LONGITUDE),
                'radius': search_radius,
                'limit': result_limit,
            },
            timeout=timeout,
        )
        # Fail with a clear HTTP error instead of a KeyError on the payload.
        response.raise_for_status()

        results = response.json()['response']['groups'][0]['items']

        for item in results:
            venue = item['venue']
            # A venue may come back without any category; record it as missing
            # rather than crashing on categories[0].
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            rows.append((name, LATITUDE, LONGITUDE,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category_name))

    # Passing `columns` up front keeps the schema stable for an empty result;
    # assigning .columns afterwards fails on a frame with zero columns.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [19]:
# Fetch the venues around every Toronto neighbourhood centre.
Toronto_venues = getNearbyVenues(
    Toronto_Neigh['Neighbourhood'],
    Toronto_Neigh['Latitude'],
    Toronto_Neigh['Longitude'],
)

In [20]:
# Preview the first venues returned by getNearbyVenues.
Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [21]:
# Summarise how many venue rows were collected and for how many neighbourhood rows.
'We acquired {} different venues for the {} Toronto Neighborhoods.'.format(
    len(Toronto_venues), len(Toronto_Neigh))

'We acquired 1624 different venues for the 39 Toronto Neighborhoods.'

In [22]:
# Number of distinct values in the 'Venue Category' column.
distinct_categories = Toronto_venues['Venue Category'].unique()
'Also there are {} unique venue categories!'.format(distinct_categories.size)

'Also there are 237 unique venue categories!'

### Split the venues into their specific categories using pandas dummy variables

In [23]:
# One indicator column per venue category, one row per venue.
venue_categories = Toronto_venues['Venue Category']
split_categories = pd.get_dummies(venue_categories)

In [24]:
# Preview the indicator (dummy) columns.
split_categories.head()

Unnamed: 0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
#add the name of the neighborhood as the first column

# Collect the category columns explicitly instead of rotating the column list:
# rotating moves a different column to the front every time the cell is re-run,
# whereas this ordering is the same no matter how often the cell executes.
category_columns = [col for col in split_categories.columns if col != 'Neighborhood Name']
split_categories['Neighborhood Name'] = Toronto_venues['Neighborhood']
cols = ['Neighborhood Name'] + category_columns
split_categories = split_categories[cols]
split_categories.head()

Unnamed: 0,Neighborhood Name,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Average the indicator columns per neighborhood: each cell becomes the share
# of that neighborhood's venues falling into the category.

neighborhood_groups = split_categories.groupby('Neighborhood Name')
Grouped_categories = neighborhood_groups.mean().reset_index()
Grouped_categories.head()

Unnamed: 0,Neighborhood Name,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.014706,0.0,0.014706


## Cluster Neighborhoods by their type of venues

In [27]:
# Feature matrix for clustering: the category frequencies without the name column.
toronto_grouped = Grouped_categories.drop(columns='Neighborhood Name')

In [28]:
from sklearn.cluster import KMeans

kclusters = 5
# k-means starts from random centroids; fixing random_state makes the cluster
# labels (and therefore the map colours below) reproducible across runs.
kmeans = KMeans(init='k-means++', n_clusters=kclusters, n_init=15, random_state=0)
kmeans.fit(toronto_grouped)
# One label per row of toronto_grouped, in the same row order.
labels = kmeans.labels_
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 4 0 0 0 0 0 2 3 0 0 0 0 0 0 0 0 0
 0 0]


## Visualize neighborhoods and the cluster they belong to

In [29]:
# find the location of toronto - using geopy library

address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude, longitude = location.latitude, location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [30]:
# Base map centred on the geocoded Toronto coordinates.
toronto_center = (latitude, longitude)
map_toronto = folium.Map(location=toronto_center, zoom_start=12)
map_toronto

In [31]:
# One coordinate pair per neighborhood, ordered by neighborhood name
# (groupby sorts its keys).

# Select the numeric coordinate columns *before* aggregating: averaging the
# whole frame also tries to average the text columns ('Venue',
# 'Venue Category'), which raises a TypeError on recent pandas versions.
coordinate_columns = ['Neighborhood Latitude', 'Neighborhood Longitude']
sorted_coordinates = (
    Toronto_venues
    .groupby('Neighborhood')[coordinate_columns]
    .mean()
    .reset_index(drop=True)
)
sorted_coordinates.head()

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude
0,43.644771,-79.373306
1,43.636847,-79.428191
2,43.662744,-79.321558
3,43.628947,-79.39442
4,43.657952,-79.387383


In [32]:

# Combine name, cluster label and coordinates into one frame.
# The three inputs are matched by position/index, so they must share the same
# row order.
d = {
    'Neighborhood Name': Grouped_categories['Neighborhood Name'],
    'Cluster Labels': labels,
    'Latitude': sorted_coordinates['Neighborhood Latitude'],
    'Longitude': sorted_coordinates['Neighborhood Longitude'],
}
Neighborhoods_Labeling = pd.DataFrame(d)
Neighborhoods_Labeling.head(20)

Unnamed: 0,Neighborhood Name,Cluster Labels,Latitude,Longitude
0,Berczy Park,0,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",0,43.636847,-79.428191
2,"Business reply mail Processing Centre, South C...",0,43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",0,43.628947,-79.39442
4,Central Bay Street,0,43.657952,-79.387383
5,Christie,0,43.669542,-79.422564
6,Church and Wellesley,0,43.66586,-79.38316
7,"Commerce Court, Victoria Hotel",0,43.648199,-79.379817
8,Davisville,0,43.704324,-79.38879
9,Davisville North,0,43.712751,-79.390197


In [33]:
# Add markers to the map with each color corresponding to a different cluster

colorList = ['red' , 'green' , 'blue' , 'yellow' , 'cyan']

for cluster, name, lat, lon in zip( Neighborhoods_Labeling['Cluster Labels'], Neighborhoods_Labeling['Neighborhood Name'],
                                   Neighborhoods_Labeling['Latitude'], Neighborhoods_Labeling['Longitude']):

    text = name + ' Cluster: ' + str(cluster)
    # parse_html=True escapes the text, so names containing quotes or other
    # special characters (e.g. "Queen's Park") cannot break the popup.
    label = folium.Popup(text, parse_html=True)
    # Wrap around the palette so a cluster count larger than the number of
    # colours reuses colours instead of raising IndexError.
    cluster_color = colorList[cluster % len(colorList)]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_toronto)

map_toronto