## Imports

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import geocoder
from geopy.geocoders import Nominatim # convert an address into coordinates

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

from sklearn.cluster import KMeans # import K-Means algorithm

%matplotlib inline

## Define user defined functions that to be used later.

In [2]:
# Define a function to return the coordinates of a location
def get_latlng(arcgis_geocoder, location):
    
    # initialise a variable to None
    lat_lng_coords = None
    
    # While loop helps to create a continous run until all the location coordinates are geocoded
    # loop until the coordinates returned
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, {}'.format(arcgis_geocoder, location))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# Define a function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Define a function that return nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# Define a function to sort the venues in descending order
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Data Acquisition - Toronto
### Acquire borough and neighbourhood data for Torono through web scraping.

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tbl = pd.read_html(url)
len(tbl)

3

In [4]:
df_toronto = tbl[0]
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Wrangling - Toronto

In [5]:
# Display number of rows and columns
df_toronto.shape

(180, 3)

### Remove rows with that the borough is having 'Not assigned'.

In [6]:
df_toronto.drop(df_toronto[df_toronto['Borough'] == 'Not assigned'].index, axis = 0, inplace = True)

### To double check that all rows which borough is 'Not assigned' are removed.

In [7]:
df_toronto[df_toronto['Borough'] == 'Not assigned'].any()

Postal Code      False
Borough          False
Neighbourhood    False
dtype: bool

### Check if there's duplicated 'Postal Code' row.

In [8]:
df_toronto['Postal Code'].duplicated().any()

False

### Display number of rows and columns after the dataframe is cleaned.

In [9]:
df_toronto.reset_index(drop = True, inplace = True)

In [10]:
df_toronto.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
df_toronto.shape

(103, 3)

### Get coordinates of all postcodes into a dataframe.

In [12]:
df_toronto1 = pd.read_csv('Geospatial_Coordinates.csv')
df_toronto1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the dataframes

In [13]:
df_toronto2 = df_toronto.merge(df_toronto1)
df_toronto2.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Verify that the dataframes merged successfully

In [14]:
df_toronto2.shape

(103, 5)

### Check number of unique borough.

In [15]:
df_toronto2['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

## Exploratory Data Analysis - Toronto
### I will focus on neighbourhoods in Toronto only.

In [16]:
toronto = ['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']
df_toronto3 = df_toronto2.loc[df_toronto2['Borough'].isin(toronto)].reset_index(drop = True)

In [17]:
df_toronto3.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [18]:
# Drop the column 'Postal Code' as I got the coorindates
df_toronto3.drop('Postal Code', axis = 1, inplace = True)

### Get the coorindates for Toronto, Ontario.

In [19]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="my_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontraio are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontraio are 43.6534817, -79.3839347.


### Visualise the Toronto neighbourhoods by using Folium.

In [20]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(df_toronto3['Latitude'], df_toronto3['Longitude'], df_toronto3['Neighbourhood']):
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)
    
map_toronto

### Define Foursquare credentials and version.
### Due to security concern, the credentials is stored in a json file. So, the credentials information will be read and loaded from the file.

In [21]:
json_file = '4squarecredential.json'
with open(json_file) as f:
    data = json.load(f)

CLIENT_ID = data['credential']['CLIENT_ID']
CLIENT_SECRET = data['credential']['CLIENT_SECRET']
VERSION = data['credential']['VERSION']

LIMIT = 100 # A default Foursquare API limit value

### What's the first neighbourhood?

In [22]:
df_toronto3.loc[0, 'Neighbourhood']

'Regent Park, Harbourfront'

### Store the coorindates and the name of the neighbourhood.

In [23]:
nb_name = df_toronto3.loc[0, 'Neighbourhood']
nb_lat = df_toronto3.loc[0, 'Latitude']
nb_lng = df_toronto3.loc[0, 'Longitude']
print('The latitude and longitude of {} are {} and {}'.format(nb_name, nb_lat, nb_lng))

The latitude and longitude of Regent Park, Harbourfront are 43.6542599 and -79.3606359


### Let's get the top 100 venues that are within radius of 500 metres of Regent Park, Harbourfront.

In [24]:
# build the url string
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, nb_lat, nb_lng, VERSION, radius, LIMIT)

### Send the GET request and examine the results.

In [25]:
results = requests.get(url).json()
results

ss3.4sqi.net/img/categories_v2/shops/gym_yogastudio_',
          'suffix': '.png'},
         'primary': True}],
       'photos': {'count': 0, 'groups': []}},
      'referralId': 'e-0-4b58dd55f964a5208f6f28e3-26'},
     {'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d84d98181fdb1f7d4a704c0',
       'name': 'Caffe Furbo',
       'location': {'address': '12 case goods lane',
        'lat': 43.649969882303814,
        'lng': -79.35884946388191,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.649969882303814,
          'lng': -79.35884946388191}],
        'distance': 498,
        'postalCode': 'M5A 3C4',
        'cc': 'CA',
        'city': 'Toronto',
        'state': 'ON',
        'country': 'Canada',
        'formattedAddress': ['12 case goods lane',
         'Toronto ON M5A 3C4',
         'Canada']},
       'categories': [{'id': '4bf58

### Create a dataframe to store the venues, the corresponding categories and coordinates.


In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698
5,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
6,Corktown Common,Park,43.655618,-79.356211
7,The Extension Room,Gym / Fitness Center,43.653313,-79.359725
8,The Distillery Historic District,Historic Site,43.650244,-79.359323
9,SOMA chocolatemaker,Chocolate Shop,43.650622,-79.358127


### How many venues returned by Foursquare?

In [27]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

44 venues were returned by Foursquare.


### What are the top 10 categories?

In [28]:
nearby_venues_by_categories = nearby_venues['categories'].value_counts().to_frame(name = 'Count')
nearby_venues_by_categories.head(10)

Unnamed: 0,Count
Coffee Shop,8
Park,3
Bakery,3
Pub,3
Theater,2
Breakfast Spot,2
Café,2
Spa,1
Performing Arts Venue,1
Bank,1


### What are the neighbourhoods?

In [29]:
toronto_venues = getNearbyVenues(names = df_toronto3['Neighbourhood'], latitudes = df_toronto3['Latitude'], longitudes = df_toronto3['Longitude'])

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

### Let's examine how many venues retrieved

In [30]:
print(toronto_venues.shape)
toronto_venues.head()

(1624, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### Let's examine number of venues by neighbourhood.

In [31]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,68,68,68,68,68,68
Christie,16,16,16,16,16,16
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9


### Now let's see how many unique categories that can be curated.

In [32]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 235 uniques categories.


## Analysing Each Neighbourhood
### Before analysing the neighbourhoods, I will apply one-hot encoding technique to normalise the dataset first.

In [33]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix = "", prefix_sep = "")

# add neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
toronto_onehot.shape

(1624, 236)

### Group the rows by neighbourhood and by taking the mean of the frequency of occurance of each category.

In [35]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.014706,0.014706
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,...,0.013333,0.013333,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.026667
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
toronto_grouped.shape

(39, 236)

### What are the top 5 most common venues of each neighbourhood?

In [37]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----" + hood + "----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1     Cheese Shop  0.04
2      Restaurant  0.04
3          Bakery  0.04
4  Farmers Market  0.04


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.13
1       Nightclub  0.09
2     Coffee Shop  0.09
3  Breakfast Spot  0.09
4             Gym  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
           venue  freq
0           Park  0.06
1  Auto Workshop  0.06
2        Brewery  0.06
3     Skate Park  0.06
4  Burrito Place  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2             Plane  0.06
3   Harbor / Marina  0.06
4  Sculpture Garden  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.18
1                Café  0.06
2      Sandwich Pl

### Create a dataframe and Display the top 10 venues for each neighbourhood.

In [38]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
nbh_venues_sorted = pd.DataFrame(columns = columns)
nbh_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    nbh_venues_sorted.iloc[ind, 1:] = most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

nbh_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Bakery,Cocktail Bar,Beer Bar,Cheese Shop,Seafood Restaurant,Restaurant,Farmers Market,Sandwich Place,Breakfast Spot
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Nightclub,Coffee Shop,Climbing Gym,Burrito Place,Restaurant,Italian Restaurant,Intersection,Bar
2,"Business reply mail Processing Centre, South C...",Skate Park,Pizza Place,Brewery,Burrito Place,Restaurant,Farmers Market,Fast Food Restaurant,Butcher,Recording Studio,Auto Workshop
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boutique,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Bar,Coffee Shop,Plane
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Bubble Tea Shop,Salad Place,Burger Joint,Department Store,Thai Restaurant


## Cluster Neighbourhoods - Toronto
### Run K-Means to cluster the neighbourhoods into 5 clusters.

In [39]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Create a dataframe to store the cluster as well as the top 10 venues of each neighbourhood.

In [40]:
# add clustering labels
nbh_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto3

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(nbh_venues_sorted.set_index('Neighbourhood'), on = 'Neighbourhood')

toronto_merged.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Event Space,Shoe Store,Hotel
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Clothing Store,Coffee Shop,Café,Japanese Restaurant,Bubble Tea Shop,Cosmetics Shop,Diner,Lingerie Store,Ramen Restaurant,Italian Restaurant
3,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Café,Cocktail Bar,Restaurant,Beer Bar,Gastropub,American Restaurant,Farmers Market,Hotel,Japanese Restaurant
4,East Toronto,The Beaches,43.676357,-79.293031,3,Pub,Trail,Health Food Store,Neighborhood,Yoga Studio,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center


### Visualise the resulting cluster.

In [41]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set colour scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x)**2 for i in range(kclusters)]
colours_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colours_array]

# add markers to the map
markers_colours = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html = True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_clusters)
       
map_clusters

### Now, let's examine the categories for each cluster.
#### Cluster 1

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Lawrence Park,0,Park,Bus Line,Swim School,Colombian Restaurant,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
21,"Forest Hill North & West, Forest Hill Road Park",0,Park,Sushi Restaurant,Jewelry Store,Trail,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
33,Rosedale,0,Park,Trail,Playground,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


#### Cluster 2

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,"Moore Park, Summerhill East",1,Trail,Playground,Yoga Studio,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


### Cluster 3

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",2,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Event Space,Shoe Store,Hotel
1,"Queen's Park, Ontario Provincial Government",2,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
2,"Garden District, Ryerson",2,Clothing Store,Coffee Shop,Café,Japanese Restaurant,Bubble Tea Shop,Cosmetics Shop,Diner,Lingerie Store,Ramen Restaurant,Italian Restaurant
3,St. James Town,2,Coffee Shop,Café,Cocktail Bar,Restaurant,Beer Bar,Gastropub,American Restaurant,Farmers Market,Hotel,Japanese Restaurant
5,Berczy Park,2,Coffee Shop,Bakery,Cocktail Bar,Beer Bar,Cheese Shop,Seafood Restaurant,Restaurant,Farmers Market,Sandwich Place,Breakfast Spot
6,Central Bay Street,2,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Bubble Tea Shop,Salad Place,Burger Joint,Department Store,Thai Restaurant
7,Christie,2,Grocery Store,Café,Park,Candy Store,Italian Restaurant,Nightclub,Baby Store,Coffee Shop,Athletics & Sports,Restaurant
8,"Richmond, Adelaide, King",2,Coffee Shop,Café,Gym,Restaurant,Hotel,Thai Restaurant,Bar,Clothing Store,Cosmetics Shop,Concert Hall
9,"Dufferin, Dovercourt Village",2,Pharmacy,Bakery,Grocery Store,Brewery,Bank,Middle Eastern Restaurant,Bar,Café,Supermarket,Music Venue
10,"Harbourfront East, Union Station, Toronto Islands",2,Coffee Shop,Aquarium,Café,Hotel,Scenic Lookout,Brewery,Pizza Place,Restaurant,Fried Chicken Joint,Italian Restaurant


### Cluster 4

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,The Beaches,3,Pub,Trail,Health Food Store,Neighborhood,Yoga Studio,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center


### Cluster 5

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Roselawn,4,Music Venue,Garden,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


## Daa Acquisition - London

In [47]:
url = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'
tbl = pd.read_html(url)
len(tbl)

5

## Data Wrangling - London
### According to the table sequence displayed on the Wiki page, the table which stored the location data is the 2nd table.
### So, I will load the 2nd table into a dataframe.

In [48]:
df_london = tbl[1]
df_london.head()

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [49]:
# Rename column names
columns = ['Location', 'Borough', 'Post Town', 'Postcode', 'Dial Code', 'OS Grid Ref']
df_london.columns = columns

In [50]:
# Drop unused columns
df_london.drop(['Dial Code', 'OS Grid Ref'], axis = 1, inplace = True)
df_london.head()

Unnamed: 0,Location,Borough,Post Town,Postcode
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4"
2,Addington,Croydon[8],CROYDON,CR0
3,Addiscombe,Croydon[8],CROYDON,CR0
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14"


In [51]:
# Remove the reference number from Borough
df_london['Borough'] = df_london['Borough'].map(lambda x: x.rstrip(']').rstrip('0123456789').rstrip('['))
df_london.head()

Unnamed: 0,Location,Borough,Post Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4"
2,Addington,Croydon,CROYDON,CR0
3,Addiscombe,Croydon,CROYDON,CR0
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14"


### Since there are more than one postcode for a location, e.g., Acton as seen above, I'm going to split it into multiple rows so that there's one postcode in a row only.

In [52]:
df_london1 = df_london.drop('Postcode', axis=1).join(df_london['Postcode'].str.split(',', expand = True).stack().reset_index(level = 1, drop = True).rename('Postcode'))

In [53]:
df_london1.head()

Unnamed: 0,Location,Borough,Post Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,W3
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,W4
2,Addington,Croydon,CROYDON,CR0
3,Addiscombe,Croydon,CROYDON,CR0


### Since it's about compare how similar or dissimilar between London and Toronoto, I will filter the post town to 'London' only.

In [54]:
df_london2 =  df_london1[df_london1['Post Town'].str.contains('LONDON')]
df_london2

Unnamed: 0,Location,Borough,Post Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,W3
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,W4
6,Aldgate,City,LONDON,EC3
7,Aldwych,Westminster,LONDON,WC2
...,...,...,...,...
522,Woodford,Redbridge,LONDON,E18
523,Woodford Green,"Redbridge, Waltham Forest","LONDON, WOODFORD GREEN",IG8
526,Woodside Park,Barnet,LONDON,N12
527,Woolwich,Greenwich,LONDON,SE18


### I noticed that there are more than one post town for a location, e.g. row 524 (e.g. index = 523) in the above screenshot. I'm going to split it into multiple rows so that that's one post town per row only.

In [55]:
df_london3 = df_london2.drop('Post Town', axis=1).join(df_london2['Post Town'].str.split(',', expand = True).stack().reset_index(level = 1, drop = True).rename('Post Town'))

In [56]:
df_london3['Post Town'] = df_london3['Post Town'].str.strip()
df_london3.reset_index(drop = True, inplace = True)
df_london3.tail()

Unnamed: 0,Location,Borough,Postcode,Post Town
632,Woodford Green,"Redbridge, Waltham Forest",IG8,LONDON
633,Woodford Green,"Redbridge, Waltham Forest",IG8,WOODFORD GREEN
634,Woodside Park,Barnet,N12,LONDON
635,Woolwich,Greenwich,SE18,LONDON
636,Wormwood Scrubs,Hammersmith and Fulham,W12,LONDON


### I'm going to drop rows which the post town not equals to 'London'.

In [57]:
df_london3.drop(df_london3[df_london3['Post Town'] != 'LONDON'].index, axis = 0, inplace = True)
df_london3.reset_index(drop = True, inplace = True)
df_london3.tail()

Unnamed: 0,Location,Borough,Postcode,Post Town
560,Woodford,Redbridge,E18,LONDON
561,Woodford Green,"Redbridge, Waltham Forest",IG8,LONDON
562,Woodside Park,Barnet,N12,LONDON
563,Woolwich,Greenwich,SE18,LONDON
564,Wormwood Scrubs,Hammersmith and Fulham,W12,LONDON


In [58]:
df_london3.drop_duplicates(ignore_index = True, inplace = True)
df_london3.head()

Unnamed: 0,Location,Borough,Postcode,Post Town
0,Abbey Wood,"Bexley, Greenwich",SE2,LONDON
1,Acton,"Ealing, Hammersmith and Fulham",W3,LONDON
2,Acton,"Ealing, Hammersmith and Fulham",W4,LONDON
3,Aldgate,City,EC3,LONDON
4,Aldwych,Westminster,WC2,LONDON


In [59]:
df_london3['Postcode'].is_unique

False

In [60]:
df_london4 = df_london3.groupby(['Post Town', 'Postcode', 'Borough'])['Location'].apply(', '.join).reset_index()
df_london4.head(20)

Unnamed: 0,Post Town,Postcode,Borough,Location
0,LONDON,DA14,Bexley,Longlands
1,LONDON,DA15,Bexley,Longlands
2,LONDON,DA16,"Bexley, Greenwich",Falconwood
3,LONDON,DA18,"Bexley, Greenwich",Thamesmead
4,LONDON,DA7,Bexley,Bexleyheath (also Bexley New Town)
5,LONDON,E13,Newham,Upton Park
6,LONDON,E15,Newham,West Ham
7,LONDON,E15,Waltham Forest,Leyton
8,LONDON,E16,Newham,Beckton
9,LONDON,E18,Redbridge,Woodford


In [61]:
# df_london4[df_london4['Postcode'].isin(df_london4['Postcode'].value_counts()[df_london4['Postcode'].value_counts() > 1].index)]

In [62]:
df_london4.groupby('Postcode').filter(lambda x: len(x) > 1)

Unnamed: 0,Post Town,Postcode,Borough,Location
6,LONDON,E15,Newham,West Ham
7,LONDON,E15,Waltham Forest,Leyton
17,LONDON,N1,Hackney,Hackney
18,LONDON,N1,Islington,Angel
36,LONDON,SE11,Lambeth,Oval
...,...,...,...,...
216,LONDON,W4,"Hounslow, Ealing, Hammersmith and Fulham",Chiswick
222,LONDON,WC1,Camden,"Bloomsbury, Holborn, St Pancras"
223,LONDON,WC1,Camden and Islington,King's Cross
224,LONDON,WC2,Camden,St Giles


### According to Wikipedia https://en.wikipedia.org/wiki/London_postal_district, the postcode of City of London begins with either E, EC, N, NW, SE, SW, W or WC. So, I will drop all rows which the postcode doesn't start with any of these characters.
### But before I drop the rows, I will trim all leading and trailing space for the Postcode column first.

In [63]:
df_london4['Postcode'] = df_london4['Postcode'].str.strip()

In [64]:
# create a new dataframe and store all rows for City of London
city_of_london = ['E', 'EC', 'N', 'NW', 'SE', 'SW', 'W', 'WC']
df_london5 = df_london4[df_london4['Postcode'].str[:2].isin(city_of_london)]

### Before carrying on, I need to test if there's any duplicate rows. If there is, and to simplify the subsequent tasks, I will keep the first occurence of duplicated rows only.

In [65]:
# check if there's duplicated rows
df_london5['Postcode'].is_unique

False

### Since there're duplicated rows, I'm going to keep the first occurence of the duplicated rows and drop the rest.

In [66]:
df_london5.drop_duplicates(subset = ['Postcode'], inplace = True, ignore_index = True)
df_london5.head()

Unnamed: 0,Post Town,Postcode,Borough,Location
0,LONDON,EC2,City,Barbican
1,LONDON,NW10,Brent,Neasden
2,LONDON,NW3,Camden,"Gospel Oak, Primrose Hill"
3,LONDON,NW4,Barnet,Brent Cross
4,LONDON,NW6,Brent,Kensal Green


In [67]:
df_london5['Postcode'].is_unique

True

In [68]:
df_london5.shape

(65, 4)

### Now the dataframe is cleaned. I will get coordinates for all postcodes into a dataframe. To do this, I've prepared a CSV file which contains required coordinates for the postcodes.

In [69]:
df_london_coord = pd.read_csv('London_Coordinates.csv')

### Merge the dataframes.

In [70]:
df_london6 = df_london5.merge(df_london_coord)
df_london6.head(12)

Unnamed: 0,Post Town,Postcode,Borough,Location,Latitude,Longitude
0,LONDON,EC2,City,Barbican,51.518,-0.084
1,LONDON,NW10,Brent,Neasden,51.541,-0.2531
2,LONDON,NW3,Camden,"Gospel Oak, Primrose Hill",51.5517,-0.1706
3,LONDON,NW4,Barnet,Brent Cross,51.5937,-0.2181
4,LONDON,NW6,Brent,Kensal Green,51.5438,-0.1971
5,LONDON,NW7,Barnet,Arkley,51.6147,-0.2301
6,LONDON,NW8,Camden,Primrose Hill,51.5333,-0.1734
7,LONDON,SE11,Lambeth,Oval,51.4913,-0.1085
8,LONDON,SE12,Greenwich,Blackheath Royal Standard,51.4467,-0.0176
9,LONDON,SE13,Lewisham,Ladywell,51.4572,-0.0059


### I will rename the column name from 'Location' to 'Neighbourhood' to align with the naming convention of Toronto's dataframe.

In [71]:
df_london6.rename(columns = {'Location': 'Neighbourhood'}, inplace = True)
df_london6.head()

Unnamed: 0,Post Town,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,LONDON,EC2,City,Barbican,51.518,-0.084
1,LONDON,NW10,Brent,Neasden,51.541,-0.2531
2,LONDON,NW3,Camden,"Gospel Oak, Primrose Hill",51.5517,-0.1706
3,LONDON,NW4,Barnet,Brent Cross,51.5937,-0.2181
4,LONDON,NW6,Brent,Kensal Green,51.5438,-0.1971


### Verify that the dataframes merged successfully.

In [72]:
df_london6.shape

(65, 6)

## Exploratory Data Analysis - London
### Get the coordinates for London.

In [73]:
address = 'London, England'

geolocator = Nominatim(user_agent="my_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of London, England are 51.5073219, -0.1276474.


### Visualise the London neighbourhoods by using Folium.

In [74]:
# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(df_london6['Latitude'], df_london6['Longitude'], df_london6['Neighbourhood']):
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_london)
    
map_london

### What's the first neighbourhood?

In [75]:
df_london6.loc[0, 'Neighbourhood']

'Barbican'

### Store the coorindates and the name of the neighbourhood.

In [76]:
nb_name = df_london6.loc[0, 'Neighbourhood']
nb_lat = df_london6.loc[0, 'Latitude']
nb_lng = df_london6.loc[0, 'Longitude']
print('The latitude and longitude of {} are {} and {}'.format(nb_name, nb_lat, nb_lng))

The latitude and longitude of Barbican are 51.518 and -0.084


### Let's get the top 100 venues that are within radius of 500 metres of Barbican.

In [77]:
# build the url string
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, nb_lat, nb_lng, VERSION, radius, LIMIT)

### Send the GET request and examine the results.

In [78]:
results = requests.get(url).json()
results

      'United Kingdom']},
       'categories': [{'id': '4bf58dd8d48988d10f941735',
         'name': 'Indian Restaurant',
         'pluralName': 'Indian Restaurants',
         'shortName': 'Indian',
         'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/indian_',
          'suffix': '.png'},
         'primary': True}],
       'photos': {'count': 0, 'groups': []}},
      'referralId': 'e-0-4b7d87fdf964a520dbc42fe3-64'},
     {'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '57a39d44498e208b977e3fba',
       'name': 'The Astronomer',
       'location': {'address': '125-129 Middlesex Street',
        'lat': 51.51793950933359,
        'lng': -0.07880447055231343,
        'labeledLatLngs': [{'label': 'display',
          'lat': 51.51793950933359,
          'lng': -0.07880447055231343}],
        'distance': 359,
        'postalCode': 'E1 7JF',
 

### Create a dataframe to store the venues, the corresponding categories and coordinates.

In [79]:
venues = results['response']['groups'][0]['items']
    
nearby_venues_london = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues_london = nearby_venues_london.loc[:, filtered_columns]

# filter the category for each row
nearby_venues_london['venue.categories'] = nearby_venues_london.apply(get_category_type, axis = 1)

# clean columns
nearby_venues_london.columns = [col.split(".")[-1] for col in nearby_venues_london.columns]

nearby_venues_london.head()

Unnamed: 0,name,categories,lat,lng
0,1Rebel,Boxing Gym,51.518378,-0.083861
1,Kobox,Boxing Gym,51.516845,-0.085335
2,Yauatcha,Chinese Restaurant,51.518828,-0.083498
3,Andaz London Liverpool Street - a concept by H...,Hotel,51.517213,-0.081645
4,Franco Manca,Pizza Place,51.51878,-0.083448


### How many venues returned by Foursquare?

In [80]:
print('{} venues were returned by Foursquare.'.format(nearby_venues_london.shape[0]))

81 venues were returned by Foursquare.


### What are the top 10 categories?

In [81]:
nearby_venues_by_categories = nearby_venues_london['categories'].value_counts().to_frame(name = 'Count')
nearby_venues_by_categories.head(10)

Unnamed: 0,Count
Hotel,6
Gym / Fitness Center,4
Food Truck,3
Pub,3
Coffee Shop,3
English Restaurant,3
Pizza Place,3
Cocktail Bar,3
Middle Eastern Restaurant,2
Wine Bar,2


### What are the neighbourhoods?

In [82]:
london_venues = getNearbyVenues(names = df_london6['Neighbourhood'], latitudes = df_london6['Latitude'], longitudes = df_london6['Longitude'])

Barbican
Neasden
Gospel Oak, Primrose Hill
Brent Cross
Kensal Green
Arkley
Primrose Hill
Oval
Blackheath Royal Standard
Ladywell
Elephant and Castle, Newington
Bexleyheath (also Bexley New Town)
Beckenham, Crystal Palace
Crystal Palace
Gipsy Hill, Tulse Hill
Brixton
Earls Court
Southfields
Wimbledon
South Kensington
South Kensington
Brixton, Oval, Stockwell
Holborn
Barbican
Aldgate
Blackfriars
Camden Town, Chalk Farm, Primrose Hill, Somerstown
Golders Green
Brent Cross, Childs Hill
Gospel Oak, Kentish Town
Colindale, Grahame Park, The Hyde, West Hendon
Lambeth
Greenwich, Maze Hill
New Cross
Nunhead, Peckham
Rotherhithe, Surrey Quays
Plumstead, Shooter's Hill, Woolwich
Crystal Palace
Sydenham Hill
East Dulwich
Forest Hill, Honor Oak
Herne Hill, Tulse Hill
Selhurst, South Norwood
Thamesmead
Blackheath Royal Standard, Kidbrooke, Westcombe Park
Brockley, Crofton Park, Ladywell, St Johns
Bellingham, Catford, Southend
Charlton
Deptford
Longlands
Belgravia, Knightsbridge, Millbank, Pimlico, S

### What's the size of the resulting dataframe?

In [83]:
print(london_venues.shape)
london_venues.head()

(2203, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barbican,51.518,-0.084,1Rebel,51.518378,-0.083861,Boxing Gym
1,Barbican,51.518,-0.084,Kobox,51.516845,-0.085335,Boxing Gym
2,Barbican,51.518,-0.084,Yauatcha,51.518828,-0.083498,Chinese Restaurant
3,Barbican,51.518,-0.084,Andaz London Liverpool Street - a concept by H...,51.517213,-0.081645,Hotel
4,Barbican,51.518,-0.084,Franco Manca,51.51878,-0.083448,Pizza Place


### How many venues are there for each of the neighbourhoods?

In [84]:
london_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aldgate,49,49,49,49,49,49
Arkley,2,2,2,2,2,2
Balham,59,59,59,59,59,59
Barbican,170,170,170,170,170,170
"Barnes, Castelnau",16,16,16,16,16,16
...,...,...,...,...,...,...
Southfields,26,26,26,26,26,26
Sydenham Hill,5,5,5,5,5,5
Thamesmead,74,74,74,74,74,74
"Tooting, Tooting Bec",16,16,16,16,16,16


### How many unique categories that can be curated?

In [85]:
print('There are {} uniques categories.'.format(len(london_venues['Venue Category'].unique())))

There are 269 uniques categories.


## Analysing Each Neighbourhood

In [86]:
# one hot encoding
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix = "", prefix_sep = "")

# add neighbourhood column back to dataframe
london_onehot['Neighbourhood'] = london_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [london_onehot.columns[-1]] + list(london_onehot.columns[:-1])
london_onehot = london_onehot[fixed_columns]

london_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,Barbican,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Barbican,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Barbican,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Barbican,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Barbican,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
london_onehot.shape

(2203, 270)

### Group the rows by neighbourhood and by taking the mean of the frequency of occurrence of each category.

In [88]:
london_grouped = london_onehot.groupby('Neighbourhood').mean().reset_index()
london_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,Aldgate,0.0,0.0,0.0,0.0,0.0,0.0,0.020408,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,Arkley,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
2,Balham,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.016949,0.0
3,Barbican,0.0,0.0,0.0,0.0,0.0,0.0,0.005882,0.011765,0.0,...,0.011765,0.0,0.005882,0.011765,0.005882,0.0,0.005882,0.0,0.005882,0.0
4,"Barnes, Castelnau",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Southfields,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.000000,0.0,...,0.038462,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
57,Sydenham Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
58,Thamesmead,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.013514,0.000000,0.0,0.000000,0.0,0.000000,0.0
59,"Tooting, Tooting Bec",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0


In [89]:
london_grouped.shape

(61, 270)

### What are the top 5 most common venues of each neighbourhood?

In [90]:
num_top_venues = 5

for hood in london_grouped['Neighbourhood']:
    print("----" + hood + "----")
    temp = london_grouped[london_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

----Aldgate----
                  venue  freq
0                 Hotel  0.10
1           Coffee Shop  0.08
2        Scenic Lookout  0.06
3  Gym / Fitness Center  0.06
4     French Restaurant  0.06


----Arkley----
                        venue  freq
0                    Pharmacy   0.5
1  Construction & Landscaping   0.5
2                 Opera House   0.0
3       Performing Arts Venue   0.0
4            Pedestrian Plaza   0.0


----Balham----
         venue  freq
0          Pub  0.12
1  Coffee Shop  0.12
2  Pizza Place  0.07
3  Supermarket  0.05
4       Bakery  0.05


----Barbican----
                  venue  freq
0           Coffee Shop  0.09
1  Gym / Fitness Center  0.06
2    Italian Restaurant  0.05
3            Food Truck  0.05
4                 Hotel  0.04


----Barnes, Castelnau----
                 venue  freq
0    Food & Drink Shop  0.12
1  Indie Movie Theater  0.06
2      Thai Restaurant  0.06
3        Movie Theater  0.06
4      Nature Preserve  0.06


----Battersea----
       

### Create a dataframe and Display the top 10 venues for each neighbourhood.

In [91]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
nbh_venues_sorted = pd.DataFrame(columns = columns)
nbh_venues_sorted['Neighbourhood'] = london_grouped['Neighbourhood']

for ind in np.arange(london_grouped.shape[0]):
    nbh_venues_sorted.iloc[ind, 1:] = most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

nbh_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aldgate,Hotel,Coffee Shop,Scenic Lookout,French Restaurant,Gym / Fitness Center,Museum,Castle,Hotel Bar,History Museum,Italian Restaurant
1,Arkley,Pharmacy,Construction & Landscaping,Zoo Exhibit,Farmers Market,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Fast Food Restaurant
2,Balham,Coffee Shop,Pub,Pizza Place,Bakery,Italian Restaurant,Indian Restaurant,Supermarket,Grocery Store,Sandwich Place,Bar
3,Barbican,Coffee Shop,Gym / Fitness Center,Food Truck,Italian Restaurant,Pub,Hotel,Bar,Cocktail Bar,English Restaurant,Sushi Restaurant
4,"Barnes, Castelnau",Food & Drink Shop,Indie Movie Theater,Thai Restaurant,Pub,Pizza Place,Recreation Center,Restaurant,Community Center,Coffee Shop,Park


## Cluster Neighbourhoods
### Run K-Means to cluster the neighbourhoods into 5 clusters.

In [92]:
# set number of clusters
kclusters = 5

london_grouped_clustering = london_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(london_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 0, 4, 0, 4, 0])

### Create a dataframe to store the cluster as well as the top 10 venues of each neighbourhood.

In [93]:
# add clustering labels
nbh_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

london_merged = df_london6

# merge london_grouped with london_data to add latitude/longitude for each neighbourhood
london_merged = london_merged.join(nbh_venues_sorted.set_index('Neighbourhood'), on = 'Neighbourhood')

london_merged.head()

Unnamed: 0,Post Town,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,LONDON,EC2,City,Barbican,51.518,-0.084,0,Coffee Shop,Gym / Fitness Center,Food Truck,Italian Restaurant,Pub,Hotel,Bar,Cocktail Bar,English Restaurant,Sushi Restaurant
1,LONDON,NW10,Brent,Neasden,51.541,-0.2531,2,Discount Store,Liquor Store,Caribbean Restaurant,Chinese Restaurant,Zoo Exhibit,Exhibit,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant
2,LONDON,NW3,Camden,"Gospel Oak, Primrose Hill",51.5517,-0.1706,0,Café,Bakery,Coffee Shop,Pub,Greek Restaurant,Bagel Shop,Pizza Place,Museum,Burger Joint,Convenience Store
3,LONDON,NW4,Barnet,Brent Cross,51.5937,-0.2181,4,Hotel,Bus Stop,Entertainment Service,Zoo Exhibit,Fast Food Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Farmers Market
4,LONDON,NW6,Brent,Kensal Green,51.5438,-0.1971,0,Pub,Indian Restaurant,Café,Middle Eastern Restaurant,Park,Coffee Shop,Brazilian Restaurant,Portuguese Restaurant,Thai Restaurant,Korean Restaurant


### Visualise the resulting cluster.

In [94]:
# create map
map_london_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set colour scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x)**2 for i in range(kclusters)]
colours_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colours_array]

# add markers to the map
markers_colours = []
for lat, lon, poi, cluster in zip(london_merged['Latitude'], london_merged['Longitude'], london_merged['Neighbourhood'], london_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html = True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_london_clusters)
       
map_london_clusters

### Now, let's examine the categories for each cluster.
#### Cluster 1

In [95]:
london_merged.loc[london_merged['Cluster Labels'] == 0, london_merged.columns[[1] + list(range(4, london_merged.shape[1]))]]

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EC2,51.518,-0.084,0,Coffee Shop,Gym / Fitness Center,Food Truck,Italian Restaurant,Pub,Hotel,Bar,Cocktail Bar,English Restaurant,Sushi Restaurant
2,NW3,51.5517,-0.1706,0,Café,Bakery,Coffee Shop,Pub,Greek Restaurant,Bagel Shop,Pizza Place,Museum,Burger Joint,Convenience Store
4,NW6,51.5438,-0.1971,0,Pub,Indian Restaurant,Café,Middle Eastern Restaurant,Park,Coffee Shop,Brazilian Restaurant,Portuguese Restaurant,Thai Restaurant,Korean Restaurant
6,NW8,51.5333,-0.1734,0,Cricket Ground,Café,Coffee Shop,Deli / Bodega,French Restaurant,Fast Food Restaurant,Sandwich Place,Salad Place,Restaurant,Recording Studio
7,SE11,51.4913,-0.1085,0,Pub,Café,Fish & Chips Shop,Indian Restaurant,Pizza Place,Gastropub,Park,Italian Restaurant,Bar,Museum
8,SE12,51.4467,-0.0176,0,Grocery Store,Supermarket,Coffee Shop,Italian Restaurant,Shopping Mall,Furniture / Home Store,Sandwich Place,Theater,Cocktail Bar,Bakery
9,SE13,51.4572,-0.0059,0,Pub,Electronics Store,Supermarket,Park,Coffee Shop,Turkish Restaurant,Video Game Store,Café,Bus Stop,Fast Food Restaurant
10,SE17,51.4874,-0.0924,0,Café,Bus Stop,Food & Drink Shop,Pub,Fast Food Restaurant,Pharmacy,Bakery,Middle Eastern Restaurant,Caribbean Restaurant,Brewery
11,SE2,51.486,-0.1203,0,Café,Park,Gay Bar,Italian Restaurant,Gym / Fitness Center,Cricket Ground,Pub,Street Food Gathering,Tennis Court,Japanese Restaurant
13,SE26,51.4283,-0.0556,0,Coffee Shop,Pub,Italian Restaurant,Café,Gastropub,Pizza Place,Thai Restaurant,Park,Japanese Restaurant,Portuguese Restaurant


### Cluster 2

In [96]:
london_merged.loc[london_merged['Cluster Labels'] == 1, london_merged.columns[[1] + list(range(4, london_merged.shape[1]))]]

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
55,SW15,51.457,-0.2288,1,Food & Drink Shop,Zoo Exhibit,English Restaurant,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


### Cluster 3

In [97]:
london_merged.loc[london_merged['Cluster Labels'] == 2, london_merged.columns[[1] + list(range(4, london_merged.shape[1]))]]

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,NW10,51.541,-0.2531,2,Discount Store,Liquor Store,Caribbean Restaurant,Chinese Restaurant,Zoo Exhibit,Exhibit,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant


### Cluster 4

In [98]:
london_merged.loc[london_merged['Cluster Labels'] == 3, london_merged.columns[[1] + list(range(4, london_merged.shape[1]))]]

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,NW7,51.6147,-0.2301,3,Pharmacy,Construction & Landscaping,Zoo Exhibit,Farmers Market,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Fast Food Restaurant


### Cluster 5

In [99]:
london_merged.loc[london_merged['Cluster Labels'] == 4, london_merged.columns[[1] + list(range(4, london_merged.shape[1]))]]

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,NW4,51.5937,-0.2181,4,Hotel,Bus Stop,Entertainment Service,Zoo Exhibit,Fast Food Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Farmers Market
12,SE20,51.4126,-0.0614,4,Park,Hotel,Gas Station,Train Station,Fast Food Restaurant,Supermarket,Hardware Store,Grocery Store,Exhibit,Entertainment Service
36,SE18,51.4782,-0.0762,4,Park,Hotel,Bus Station,Bus Stop,Café,Bar,Farmers Market,Event Space,Exhibit,Falafel Restaurant
45,SE4,51.4598,-0.0322,4,Café,Brewery,Pizza Place,Fish & Chips Shop,Furniture / Home Store,Gastropub,Park,Cocktail Bar,Grocery Store,Flower Shop
46,SE6,51.4362,-0.0175,4,Grocery Store,Bus Station,Shopping Plaza,Train Station,Discount Store,Pizza Place,Turkish Restaurant,Furniture / Home Store,Gym,Zoo Exhibit
57,SW17,51.4256,-0.1581,4,Pizza Place,Grocery Store,Fish & Chips Shop,Supermarket,Lebanese Restaurant,Asian Restaurant,Italian Restaurant,Bus Stop,Coffee Shop,Hotel
