### Assignment Toronto - Part 1: Build Dataframe

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline

from sklearn.cluster import KMeans # import k-means from clustering stage
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Read values from HTML. After inspection it is clear that the first dataframe [0] contains the required data; thus read this dataframe.

In [2]:
# read html
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_postal_h = pd.read_html(url)
df_postal = df_postal_h[0] # after inspection it is clear that the first dataframe [0] contains the required data
print(df_postal.shape)

(287, 3)


In [3]:
df_postal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Cleaning the data: filter out records with Borough = 'Not assigned' and set Neighbourhood = 'Not assigned' to the value of the Borough.

In [4]:
# filter out records with Borouh = 'Not assigned'
df_postal_a = df_postal[df_postal.Borough != 'Not assigned']
print(df_postal_a.shape)
print(df_postal_a.shape[0])

(210, 3)
210


In [5]:
# split datafram with 'Neighbourhood' == 'Not assigned' and others
dfnb1 = df_postal_a[df_postal_a.Neighbourhood == 'Not assigned'] 
dfnb2 = df_postal_a[df_postal_a.Neighbourhood != 'Not assigned']

print(dfnb1.shape[0])
print(dfnb2.shape[0])

0
210


In [6]:
# if 'Neighbourhood' == 'Not assigned', set 'Neighbourhood' to 'Borough'
# combine split frames 
if dfnb1.shape[0] > 0:
    dfnb1.Neighbourhood = dfnb1.Borough
    frames = [dfnb1, dfnb2]
    df_postal_a = pd.concat(frames, ignore_index=True)
    
print(df_postal_a.shape)

(210, 3)


### Assignment Toronto - Part 2: Add Latitude & Longtitude to Dataframe

#### Read the csv file with the geo locations and combine the lang & lat values with the neighbourhood data

In [7]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')

In [8]:
print(df_geo.shape)

(103, 3)


In [28]:
# merge dataframes on Postal Code
df_toronto = pd.merge(df_postal_a, df_geo, left_on='Postcode', right_on='Postal Code', how='inner')
df_toronto = df_toronto.drop(columns=['Postal Code'])
print(df_toronto.shape)
print(df_toronto.columns)

(210, 5)
Index(['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'], dtype='object')


### Assignment Toronto - Part 3: Explore and Cluster the Toronto neighbourhood.

#### Create a map of Toronto with neighborhoods superimposed on top; starting with "Old Toronto".

In [29]:
address = 'Old Toronto, Toronto, ON'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Old Toronto is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Old Totonto is 43.67368315, -79.37984349253107.


In [30]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], 
                                           df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(df_toronto, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Utilise the Foursquare API to explore the neighbourhoods and segment them.

Define Foursquare Credentials and Version.

In [32]:
CLIENT_ID = 'OS0OYL2ILDKRIOMARYDUNNESHSDFRJ3SGR33URBYJ0RXNGU3' # your Foursquare ID
CLIENT_SECRET = 'KMCIXHUWPS4AVU12TOPMGATQIDE1G5OJZ4OGJKOPSO0RQ3DH' # your Foursquare Secret
VERSION = '20200101' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OS0OYL2ILDKRIOMARYDUNNESHSDFRJ3SGR33URBYJ0RXNGU3
CLIENT_SECRET:KMCIXHUWPS4AVU12TOPMGATQIDE1G5OJZ4OGJKOPSO0RQ3DH


Let's explore the neighborhoods in our dataframe.

In [36]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OS0OYL2ILDKRIOMARYDUNNESHSDFRJ3SGR33URBYJ0RXNGU3&client_secret=KMCIXHUWPS4AVU12TOPMGATQIDE1G5OJZ4OGJKOPSO0RQ3DH&v=20200101&ll=43.67368315,-79.37984349253107&radius=500&limit=100'

Get the results from the Foursqaure API using the avobe URL into a json file.

In [42]:
results = requests.get(url).json()

In [43]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [44]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Pie Squared,Pie Shop,43.672143,-79.377856
1,Starbucks,Coffee Shop,43.671082,-79.380756
2,Rooster Coffee House,Coffee Shop,43.669654,-79.379871
3,Manulife Financial,Office,43.67207,-79.382449
4,Maison Selby,Bistro,43.671232,-79.376618


In [40]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

20 venues were returned by Foursquare.


Create a function to repeat the same process for all the neighborhoods in Toronto.

In [50]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                  
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the above function on each neighborhood and create a new dataframe called toronto_venues.

In [51]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

What is the size of the dataframe?

In [52]:
print(toronto_venues.shape)
toronto_venues.head()

(4366, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Careful & Reliable Painting,43.752622,-79.331957,Construction & Landscaping
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


#### How many unique categories can be curated from all the returned venues.

In [54]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 uniques categories.


 #### Analyze Each Neighborhood

In [56]:
# create Toronto data
toronto_data = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_data['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_data.columns[-1]] + list(toronto_data.columns[:-1])
toronto_data = toronto_data[fixed_columns]

print(toronto_data.shape)
toronto_data.head()

(4366, 270)


Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.

In [57]:
toronto_grouped = toronto_data.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

(206, 270)


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.020000,0.0,0.000000,0.0,0.0,0.01,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.111111,0.0,0.0,0.00,0.0,0.00
202,York Mills,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
203,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00
204,York University,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00


#### Create a dataframe with each neighborhood and its top 5 most common venues

In [58]:
# Function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [59]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Adelaide,Coffee Shop,Thai Restaurant,Bar,Restaurant,Café
1,Agincourt,Skating Rink,Breakfast Spot,Lounge,Latin American Restaurant,Women's Store
2,Agincourt North,Park,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
3,Albion Gardens,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Pharmacy,Pizza Place
4,Alderwood,Pizza Place,Coffee Shop,Pharmacy,Sandwich Place,Dance Studio


#### Cluster the Neighborhoods by using k-means to cluster the neighborhood into 5 clusters.

In [61]:
# set number of clusters
kclusters = 5

toronto_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 4, 1, 1, 1, 1, 1, 1, 1])

Create a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood.

In [64]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_dataset = df_toronto

ValueError: cannot insert Cluster Labels, already exists

In [71]:
# merge toronto grouped with toronto data to add latitude/longitude for each neighborhood
toronto_dataset = toronto_dataset.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_dataset.head()

ValueError: columns overlap but no suffix specified: Index(['Cluster Labels', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue'],
      dtype='object')

#### Visualise the resulting clusters - this map shows the cluste with the number of venues in the cluster.

In [84]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
clusteri = 0

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_dataset['Latitude'], toronto_dataset['Longitude'], toronto_dataset['Neighbourhood'], 
                                  toronto_dataset['Cluster Labels']):
    # nan values were found - replace with 0
    if  pd.notnull(cluster):
        clusteri = int(cluster)
    else:
        clusteri = 0
        
    label = folium.Popup(str(poi) + ' Cluster ' + str(clusteri), parse_html=True)
       
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[clusteri-1],
        fill=True,
        fill_color=rainbow[clusteri-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### End of Assignment