# New Business Supply Analysis in Toronto - Solution

In [1]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/55/6f/c87dffdd88a54dd26a3a9fef1d14b6384a9933c455c54ce3ca7d64a84c88/lxml-4.5.1-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 6.8MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
#Import necessary libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans   # import k-means from clustering stage
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import wget

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

## 1. Extract and Clean Data

In [4]:
#Import dataframe from Wikipedia having every Neighborhood in Toronto with its respective Borough
data = pd.read_html('http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df = pd.DataFrame(data=data)
# Rows whose Borough is 'Not assigned' are removed before anything else is done.
df_dummy = df[df['Borough'] == 'Not assigned'].index.values
df.drop(labels=df_dummy, axis=0, inplace=True)
df = df.reset_index()
df.drop(['index','Postal Code'], axis=1, inplace=True)
# Split each comma-separated 'Neighborhood' cell and stack the pieces into one row per
# neighborhood, keeping the remaining columns (the index built by set_index) on every row.
# split(',') keeps whatever whitespace follows a comma, so each piece is stripped; otherwise
# ' Harbourfront' and 'Harbourfront' would be treated as two different neighborhoods later on.
df_toronto = df.set_index(df.columns.drop('Neighborhood').tolist()).Neighborhood.str.split(',', expand=True).stack().str.strip().reset_index().rename(columns={0:'Neighborhood'}).loc[:, df.columns]
df_toronto.head(5)

Unnamed: 0,Borough,Neighborhood
0,North York,Parkwoods
1,North York,Victoria Village
2,Downtown Toronto,Regent Park
3,Downtown Toronto,Harbourfront
4,North York,Lawrence Manor


In [5]:
#Define function to get coordinates of Neighborhoods given a dataframe
def getCoordinates(data, geolocator=None):
    """Return a copy of ``data`` with geocoded 'Latitude' and 'Longitude' columns.

    Every value of ``data['Neighborhood']`` is looked up as
    ``'<neighborhood>, Toronto'``. Lookups that fail or return no result get
    NaN coordinates, and rows containing NaN in any column are then dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Neighborhood' column. It is not modified.
    geolocator : object, optional
        Any object with a ``geocode(address)`` method whose result is either
        ``None`` or exposes ``latitude`` and ``longitude``. Defaults to a
        ``Nominatim`` client with user agent "to_explorer".

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index and float
        'Latitude' / 'Longitude' columns.
    """
    if geolocator is None:
        # Build the client once instead of once per row.
        geolocator = Nominatim(user_agent="to_explorer")

    latitudes = []
    longitudes = []
    # Iterate over the values themselves so the lookup does not depend on the
    # frame having a default 0..n-1 index.
    for neighborhood in data['Neighborhood']:
        address = str(neighborhood) + ', Toronto'
        try:
            location = geolocator.geocode(address)
        except Exception:
            # Best effort: a failed lookup only costs this row. Catching
            # Exception (not a bare except) lets KeyboardInterrupt through.
            location = None

        if location is None:
            latitudes.append(np.nan)
            longitudes.append(np.nan)
        else:
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)

    # Whole-column assignment on a copy avoids chained indexing
    # (data['Latitude'][i] = ...) and leaves the caller's frame untouched.
    result = data.copy()
    result['Latitude'] = np.array(latitudes, dtype=float)
    result['Longitude'] = np.array(longitudes, dtype=float)

    return result.dropna(axis=0).reset_index(drop=True)

In [6]:
# Geocode every Toronto Neighborhood: the result carries Borough, Neighborhood, Latitude and Longitude
toronto = pd.DataFrame(getCoordinates(df_toronto))
print('Shape of dataframe: ', toronto.shape)
toronto.head(5)

Shape of dataframe:  (205, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.7588,-79.3202
1,North York,Victoria Village,43.7327,-79.3112
2,Downtown Toronto,Regent Park,43.6607,-79.3605
3,Downtown Toronto,Harbourfront,43.6401,-79.3801
4,North York,Lawrence Manor,43.7221,-79.4375


## 2. Explore Venues Using Foursquare API

In [7]:
#Define credentials for Foursquare API
import os  # imported here because only this cell needs it

# Read the credentials from the environment so real secrets never have to be saved in the
# notebook; the 'hidden' placeholders remain the fallback when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'hidden') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'hidden') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [8]:
#Define a function that gets the top venues of every neighborhood in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius, limit):
    """Collect the venues Foursquare's ``venues/explore`` endpoint returns around each point.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : number
        Passed to the API as ``radius``.
    limit : int
        Passed to the API as ``limit``.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,  # do not hang the notebook forever on a stalled connection
        )
        # Fail with the HTTP error itself rather than a later KeyError on the payload.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            # A venue with no categories gets an empty string: it stays a string value,
            # so keyword matching on this column simply finds nothing for it.
            category = categories[0]['name'] if categories else ''
            rows.append((name, lat, lng,
                         venue['name'], venue['location']['lat'], venue['location']['lng'], category))

    # Passing the column names to the constructor keeps the schema when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Ask for up to 100 venues within 500 meters of every neighborhood
toronto_venues = getNearbyVenues(
    toronto['Neighborhood'], toronto['Latitude'], toronto['Longitude'],
    radius=500, limit=100,
)

In [10]:
# Report the size of the venue table, then preview its first rows
print('Shape of dataframe: ', toronto_venues.shape)
toronto_venues.head(5)

Shape of dataframe:  (5975, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7588,-79.320197,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.7588,-79.320197,LCBO,43.757774,-79.314257,Liquor Store
2,Parkwoods,43.7588,-79.320197,Petro-Canada,43.75795,-79.315187,Gas Station
3,Parkwoods,43.7588,-79.320197,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy
4,Parkwoods,43.7588,-79.320197,Pizza Pizza,43.760231,-79.325666,Pizza Place


## 3. Find Neighborhoods with High and Low Supply

In [11]:
#Define function that counts, for each Neighborhood, the venues whose category
#matches each of the given 'key words'
def getSupply(data, key_words):
    """Count, per neighborhood, the venues whose category matches each key word.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Neighborhood' and 'Venue Category'.
    key_words : sequence of str
        Patterns searched for inside 'Venue Category'. They are handed to
        pandas' ``str.contains`` and are therefore treated as regular
        expressions.

    Returns
    -------
    pandas.DataFrame
        One row per distinct neighborhood, in order of first appearance, with
        the columns 'Neighborhood', one count column per key word, and
        'Total' (the sum of the count columns). A venue whose category matches
        several key words is counted once under each of them, so 'Total' can
        exceed the number of distinct venues.
    """
    key_words = list(key_words)
    # Defined up front so the schema exists even when `data` has no rows.
    col_list = ['Neighborhood'] + key_words

    rows = []
    for neighborhood in data['Neighborhood'].unique():
        # Filter once per neighborhood rather than once per key word.
        categories = data.loc[data['Neighborhood'] == neighborhood, 'Venue Category']
        # na=False: a missing category is never a match.
        counts = [int(categories.str.contains(word, na=False).sum()) for word in key_words]
        rows.append([neighborhood] + counts)

    supply = pd.DataFrame(rows, columns=col_list)
    # numeric_only keeps the string 'Neighborhood' column out of the row sum.
    supply['Total'] = supply.sum(axis=1, numeric_only=True)

    return supply

In [12]:
#Get number of venues for each Neighborhood that match the 'key words' specified
words = ['Restaurant','Café','Coffee','Place','Food','Deli']
Supply = pd.DataFrame(data=getSupply(toronto_venues, words))
# Attach the coordinates already geocoded in `toronto` instead of geocoding every
# neighborhood a second time over the network. `toronto` may list a name more than once,
# so it is de-duplicated first to keep exactly one row per neighborhood in Supply.
Supply = Supply.merge(
    toronto[['Neighborhood','Latitude','Longitude']].drop_duplicates(subset='Neighborhood'),
    on='Neighborhood',
    how='inner',
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
# Order the neighborhoods from the largest 'Total' to the smallest and show the first ten
HighSupply = Supply.sort_values('Total', ascending=False)
print('Top 10 Neighborhoods in Toronto with the HIGHEST SUPPLY of specified venues:')
HighSupply.head(n=10)

Top 10 Neighborhoods in Toronto with the HIGHEST SUPPLY of specified venues:


Unnamed: 0,Neighborhood,Restaurant,Café,Coffee,Place,Food,Deli,Total,Latitude,Longitude
90,Willowdale,45,0,24,12,6,0,87,43.7615,-79.4109
81,Commerce Court,34,6,11,3,2,3,59,43.6481,-79.379
6,Queen's Park,23,8,19,7,0,0,57,43.6597,-79.3903
70,Design Exchange,27,7,13,3,3,3,56,43.6477,-79.3801
175,First Canadian Place,28,7,9,5,1,3,53,43.6488,-79.3817
69,Toronto Dominion Centre,27,6,10,6,2,2,53,43.6474,-79.3814
11,Don Mills,26,0,10,6,10,0,52,43.7753,-79.3459
46,King,32,5,7,2,3,2,51,43.6489,-79.3778
45,Adelaide,28,6,6,5,3,2,50,43.6505,-79.3795
141,Grange Park,29,5,8,5,0,0,47,43.6522,-79.3923


In [14]:
# Order the neighborhoods from the smallest 'Total' to the largest and show the first ten
LowSupply = Supply.sort_values('Total', ascending=True)
print('Top 10 Neighborhoods in Toronto with the LOWEST SUPPLY of specified venues:')
LowSupply.head(n=10)

Top 10 Neighborhoods in Toronto with the LOWEST SUPPLY of specified venues:


Unnamed: 0,Neighborhood,Restaurant,Café,Coffee,Place,Food,Deli,Total,Latitude,Longitude
133,Swansea,0,0,0,0,0,0,0,43.6449,-79.4783
21,Port Union,0,0,0,0,0,0,0,43.7755,-79.135
22,Highland Creek,0,0,0,0,0,0,0,43.7901,-79.1733
23,Woodbine Heights,0,0,0,0,0,0,0,43.6999,-79.3191
25,Eringate,0,0,0,0,0,0,0,43.6623,-79.5765
27,Old Burnhamthorpe,0,0,0,0,0,0,0,43.6394,-79.5844
28,Markland Wood,0,0,0,0,0,0,0,43.6312,-79.5854
160,South Steeles,0,0,0,0,0,0,0,43.8162,-79.3145
29,Guildwood,0,0,0,0,0,0,0,43.7552,-79.1982
167,Steeles West,0,0,0,0,0,0,0,43.8162,-79.3145


## 4. Create Map of Toronto's Neighborhoods with High and Low Supply

In [15]:
# Look up the coordinates of Toronto itself
geolocator = Nominatim(user_agent="cn_explorer")
location = geolocator.geocode('Toronto, Canada')
toronto_lat, toronto_lon = location.latitude, location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(toronto_lat, toronto_lon))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [16]:
#Create map of neighborhoods with high and low supply
map_toronto = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=10)

# One loop for both groups: the first ten rows of LowSupply are added as green markers,
# then the first ten rows of HighSupply as red markers. Only the frame and the color differ.
for supply, color in ((LowSupply, 'green'), (HighSupply, 'red')):
    top = supply.head(10)
    for lat, lng, label in zip(top['Latitude'], top['Longitude'], top['Neighborhood']):
        # The popup shows the neighborhood name when the marker is clicked.
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=color,
            fill=True,
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)

map_toronto