## Notebook for clustering neighbourhoods in Toronto

#### Installing prerequisites

In [1]:
#importing required libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

In [2]:
#installing BeautifulSoup library to help scrape WikiPage (if not installed before)
!pip install beautifulsoup4
#installing lxml parser for BeautifulSoup to use (if not installed before)
!pip install lxml
#installing requests to work with URLs (if not installed before)
!pip install requests

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
#importing BeautifulSoup and requests
from bs4 import BeautifulSoup
import requests

#### Scrapping wiki page and loading data to dataset

In [4]:
#load WikiPage with list of PostalCodes for Toronto
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

In [5]:
#cut a table from page to work with
table = soup.find('table', class_='wikitable sortable')

In [27]:
#create columns out of headers in table
column_names = []
for header in table.find_all('th'):
    column_names.append(header.text.strip())
#create new dataframe with extracted columns
df = pd.DataFrame(columns=column_names)

In [28]:
#extract rows from table based on <tr> tag
rows = table.find_all('tr')
for row in rows[1:]: #skipping zero element as it's header
    cols = row.find_all('td')
    if cols[1].text.strip() == 'Not assigned': #skipping lines with not assigned borough
        continue
    elif cols[2].text.strip() == 'Not assigned': #if neighbourhood not assigned make it equal to borough
        df = df.append([{'Postcode':cols[0].text.strip(), 'Borough':cols[1].text.strip(), 'Neighbourhood':cols[1].text.strip()}], ignore_index=True)
    else:
        df = df.append([{'Postcode':cols[0].text.strip(), 'Borough':cols[1].text.strip(), 'Neighbourhood':cols[2].text.strip()}], ignore_index=True) 

#### Adding coordinates from provided csv (because geocoder failed)

In [29]:
df1 = pd.read_csv('https://cocl.us/Geospatial_data')

In [30]:
df1.rename(columns={'Postal Code':'Postcode'}, inplace=True)

#### Merging coordinates from df1(that was created from csv) with our df neighbourhood dataframe

In [31]:
df2 = pd.merge(df, df1, on='Postcode')
df2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353


In [32]:
df2.shape

(212, 5)

#### Simplify dataframe by slicing and leaving only neighbourhoods that contains Toronto in Borough.

In [33]:
toronto_df = df2[df2['Borough'].str.contains('Toronto')].reset_index(drop=True)

In [34]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


#### Importing additional dependencies to show neighbourhoods in the map

In [16]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Install and import completed')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Install and import completed


#### Use geopy library to get the latitude and longitude values of Toronto.

In [35]:
address = 'Toronto, Ontario'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [36]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

### Now I'm going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define Foursquare Credentials and Version

In [37]:
CLIENT_ID = '0GPMCSTZ5SOR20WJQNQWRXM0YHXQDMIKAQKHMTPQM2FPHPES' #Foursquare ID
CLIENT_SECRET = 'N2VIOQ1Y45VH2MCATYXQJQZS2AEALSIC4UPVWKUGBFU0GSZS' #Foursquare Secret
VERSION = '20181020' # Foursquare API version
LIMIT = 100

#### Creating a function to get nearby venues (within 500 meters) for all the neighborhoods in Toronto using Foursquare

In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Runing the above function on each neighborhood and create a new dataframe called toronto_venues

In [39]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighbourhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

Harbourfront
Regent Park
Ryerson


KeyError: 'groups'

In [40]:
#checking the size of the resulting dataframe
print(toronto_venues.shape)
toronto_venues.head()

(1705, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront,Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront,Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront,Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
3,"Harbourfront,Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"Harbourfront,Regent Park",43.65426,-79.360636,Cooper Koo YMCA,43.653191,-79.357947,Gym / Fitness Center


#### Checking how many venues were returned for each neighborhood

In [41]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,53,53,53,53,53,53
"Brockton,Exhibition Place,Parkdale Village",21,21,21,21,21,21
Business reply mail Processing Centre969 Eastern,17,17,17,17,17,17
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",14,14,14,14,14,14
"Cabbagetown,St. James Town",48,48,48,48,48,48
Central Bay Street,82,82,82,82,82,82
"Chinatown,Grange Park,Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,88,88,88,88,88,88
