In [3]:
import pandas as pd
import numpy as np
import math
import re

In [4]:
import requests
from pandas.io.json import json_normalize
import geopandas as gpd
from sklearn.cluster import KMeans
from shapely.geometry import Polygon, LineString, Point
from shapely.ops  import transform

In [5]:
# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 geopandas=0.4.1 --yes 
import folium # map rendering library

Create a map of Edinburgh, split into its natural neighbourhoods.
https://python-visualization.github.io/folium/quickstart.html This is a link for my use later when I want to colour code the map based on similarity.

In [6]:
#url = ("https://opendata.arcgis.com/datasets/9961be54aa5649aebfb5f2cde53fa432_27.geojson")  #dataset for natural neighbourhoods in Edinburgh from Edinburgh City Council
#neighbourhoods = gpd.read_file(url)
#print(neighbourhoods.head())

In [7]:
url = (
    "https://opendata.arcgis.com/datasets/9961be54aa5649aebfb5f2cde53fa432_27.geojson" #dataset for natural neighbourhoods in Edinburgh from Edinburgh City Council
)
natural_neighbourhoods = f"{url}"


m = folium.Map(
    location=[55.9533, -3.1883],
    tiles="cartodbpositron",
    zoom_start=13,
)

shapes = folium.GeoJson(natural_neighbourhoods, name="geojson")

shapes.add_to(m)
    
folium.LayerControl().add_to(m)

m

https://geopandas.readthedocs.io/en/latest/gallery/polygon_plotting_with_folium.html example I used to help with the next cell.

In [8]:
path = "https://opendata.arcgis.com/datasets/9961be54aa5649aebfb5f2cde53fa432_27.geojson"
df = gpd.read_file(path)

m = folium.Map(location=[55.9533, -3.1883], zoom_start=10, tiles='CartoDB positron')

for _, r in df.iterrows():
    sim_geo = gpd.GeoSeries(r['geometry'])
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    folium.Popup(r['NATURALCOM']).add_to(geo_j)
    geo_j.add_to(m)

df = df.to_crs(epsg=2163) #set to a projected crs for centroid calculation accuracy.
df['centroids']=df.centroid.to_crs(epsg=4326)
df['lat'] = df['centroids'].y
df['lon'] = df['centroids'].x
df = df.to_crs(epsg=4326)

for _, r in df.iterrows():
    yo = folium.Marker(location=[r['lat'], r['lon']], popup='Name: {}'.format(r['NATURALCOM']))
    yo.add_to(m)
m

The following must be entered to use the foursquare API. The results of this call have been saved in a file for future use to avoid repeated API calls.

In [175]:
CLIENT_ID = ""
CLIENT_SECRET = ""
VERSION = ''
LIMIT = 100

In [176]:


def getNearbyVenues(names, latitudes, longitudes, area):
    
    venues_list=[]
    for name, lat, lng, area2 in zip(names, latitudes, longitudes, area):
        
        radius = 2*math.sqrt(area2/3) #factor of two to account for weirdly shaped areas. This makes a circle with twice the radius of the circle of the same area as the shape.
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId=4d4b7105d754a06374d81259'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng,
            radius,
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']
        #print(results)
        
        # return only relevant information for each nearby venue
        venues_list.append([[name, lat, lng, v['id'],v['name'], v['location']['lat'], v['location']['lng'], v['categories'][0]['name']] for v in results])
        
        
            

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude',
                  'Venue ID',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [177]:
edinburgh_venues = getNearbyVenues(names=df['NATURALCOM'],
                                   latitudes=df['lat'],
                                   longitudes=df['lon'],
                                   area = df['Shapearea']
                                  )
edinburgh_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Brunstane/Gilberstoun,55.940545,-3.09246,4c417b3aaf052d7fd7707d79,Cuddie Brae,55.934582,-3.094717,Restaurant
1,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub
2,Brunstane/Gilberstoun,55.940545,-3.09246,51483ee8e4b06af4b55605bb,kingsminor bistrot,55.945272,-3.096003,Breakfast Spot
3,Brunstane/Gilberstoun,55.940545,-3.09246,4c4c67949e6dbe9a15c9700d,Porto Restaurant,55.946333,-3.084878,Fast Food Restaurant
4,Brunstane/Gilberstoun,55.940545,-3.09246,4bd0525e046076b02fb36f71,Costa Coffee,55.933625,-3.105237,Coffee Shop


The DataFrame is transformed into a GeoDataFrame and then is cleaned up by removing any venues that are not in the correct neighbourhood, as up until this point neighbourhoods have been assumed to be a large circle focused on their centres, for the purposes of the API call. 

In [178]:
edinburgh_venues_geo = gpd.GeoDataFrame(edinburgh_venues, geometry = gpd.points_from_xy(edinburgh_venues["Venue Longitude"],edinburgh_venues["Venue Latitude"]))
edinburgh_venues_geo.head()


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry
0,Brunstane/Gilberstoun,55.940545,-3.09246,4c417b3aaf052d7fd7707d79,Cuddie Brae,55.934582,-3.094717,Restaurant,POINT (-3.09472 55.93458)
1,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.09608 55.94454)
2,Brunstane/Gilberstoun,55.940545,-3.09246,51483ee8e4b06af4b55605bb,kingsminor bistrot,55.945272,-3.096003,Breakfast Spot,POINT (-3.09600 55.94527)
3,Brunstane/Gilberstoun,55.940545,-3.09246,4c4c67949e6dbe9a15c9700d,Porto Restaurant,55.946333,-3.084878,Fast Food Restaurant,POINT (-3.08488 55.94633)
4,Brunstane/Gilberstoun,55.940545,-3.09246,4bd0525e046076b02fb36f71,Costa Coffee,55.933625,-3.105237,Coffee Shop,POINT (-3.10524 55.93363)


Define a function to find which neighbourhood the venue is really in.

In [179]:
def which_neighbourhood(test_point):
    for i in range(154):
        if df["geometry"].contains(test_point)[i] == True:
            return df["NATURALCOM"][i]

Run the function for every row of the data frame and drop any rows where the neighbourhood label is not accurate in terms of the natural neighbourhood boundary.

In [181]:
print(edinburgh_venues_geo.shape)
for ind in edinburgh_venues_geo.index:
    location = edinburgh_venues_geo['geometry'][ind]
    if edinburgh_venues_geo['Neighbourhood'][ind] != which_neighbourhood(location):
        edinburgh_venues_geo.drop(ind, inplace = True)
edinburgh_venues_geo.reset_index(inplace = True)
edinburgh_venues_geo.drop(["index"], axis=1, inplace = True)
print(edinburgh_venues_geo.shape)
edinburgh_venues_geo.head()

(4089, 9)
(835, 9)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry
0,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.09608 55.94454)
1,Brunstane/Gilberstoun,55.940545,-3.09246,4e0cd9d27d8bfe35bbc5d23d,eh15 Restaurant & Bar,55.943865,-3.098488,Diner,POINT (-3.09849 55.94386)
2,Brunstane/Gilberstoun,55.940545,-3.09246,4e0b733ab0fbc7fb0be0d383,Options,55.944675,-3.096049,Restaurant,POINT (-3.09605 55.94467)
3,Brunstane/Gilberstoun,55.940545,-3.09246,4cdd6bd114119eb02a35ed33,Burger King,55.942378,-3.101092,Fast Food Restaurant,POINT (-3.10109 55.94238)
4,Brunstane/Gilberstoun,55.940545,-3.09246,5ecfd256fadd010008813b03,Costa Coffee,55.941996,-3.101744,Coffee Shop,POINT (-3.10174 55.94200)


Map all venues that are still being used.

In [182]:
m = folium.Map(location=[55.9533, -3.1883], zoom_start=10, tiles='CartoDB positron')

for _, r in df.iterrows():
    sim_geo = gpd.GeoSeries(r['geometry'])
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    folium.Popup(r['NATURALCOM']).add_to(geo_j)
    geo_j.add_to(m)

for _, r in edinburgh_venues_geo.iterrows():
    yo = folium.Marker(location=[r['Venue Latitude'], r['Venue Longitude']], popup='name: {}'.format(r['Venue']))
    yo.add_to(m)
m

Now to add a column with additional data about each venue.

In [183]:
#returns the price tier and likes count of the venues.

def furtherDetails(venue_id):
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(venue_id,CLIENT_ID, CLIENT_SECRET,VERSION)
    results = requests.get(url).json()
    try:
        return results['response']['venue']['attributes']['groups'][0]['items'][0]['priceTier'], results['response']['venue']['likes']['count']
    except:
        return "missing price tier", "missing likes" #if the data is strangely formatted for this venue.

In [184]:
edinburgh_venues_geo['price tier','likes'] = edinburgh_venues_geo['Venue ID'].apply(furtherDetails) #apply the above function
edinburgh_venues_geo.head()


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,"(price tier, likes)"
0,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.09608 55.94454),"(3, 0)"
1,Brunstane/Gilberstoun,55.940545,-3.09246,4e0cd9d27d8bfe35bbc5d23d,eh15 Restaurant & Bar,55.943865,-3.098488,Diner,POINT (-3.09849 55.94386),"(1, 3)"
2,Brunstane/Gilberstoun,55.940545,-3.09246,4e0b733ab0fbc7fb0be0d383,Options,55.944675,-3.096049,Restaurant,POINT (-3.09605 55.94467),"(2, 0)"
3,Brunstane/Gilberstoun,55.940545,-3.09246,4cdd6bd114119eb02a35ed33,Burger King,55.942378,-3.101092,Fast Food Restaurant,POINT (-3.10109 55.94238),"(1, 0)"
4,Brunstane/Gilberstoun,55.940545,-3.09246,5ecfd256fadd010008813b03,Costa Coffee,55.941996,-3.101744,Coffee Shop,POINT (-3.10174 55.94200),"(1, 0)"


In [186]:
edinburgh_venues_geo.columns
missing = edinburgh_venues_geo[edinburgh_venues_geo[('price tier', 'likes')] == ('missing price tier', 'missing likes')]
print(missing.shape)
print(edinburgh_venues_geo.shape)

(46, 10)
(835, 10)


The  number of rows without price tier and likes is relatively small. Need to check if they are evenly or unevenly distributed.

In [187]:
print(missing['Neighbourhood'].value_counts().head())
print(missing['Neighbourhood'].value_counts().tail())

Portobello                   5
Tollcross                    3
Newington                    3
Hillside/Easter Rd           2
South Gyle/Edinburgh Park    2
Name: Neighbourhood, dtype: int64
Braids                       1
Calton Hill                  1
Gracemount                   1
South Queensferry/Dalmeny    1
Corstorphine                 1
Name: Neighbourhood, dtype: int64


The rows with missing data are pretty much evenly distributed  between  the neighbourhoods and so will be dropped from the dataframe.

In [188]:
indexNames = edinburgh_venues_geo[edinburgh_venues_geo[('price tier', 'likes')] == ('missing price tier', 'missing likes')].index
edinburgh_venues_geo.drop(indexNames , inplace=True)
edinburgh_venues_geo.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,"(price tier, likes)"
0,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.09608 55.94454),"(3, 0)"
1,Brunstane/Gilberstoun,55.940545,-3.09246,4e0cd9d27d8bfe35bbc5d23d,eh15 Restaurant & Bar,55.943865,-3.098488,Diner,POINT (-3.09849 55.94386),"(1, 3)"
2,Brunstane/Gilberstoun,55.940545,-3.09246,4e0b733ab0fbc7fb0be0d383,Options,55.944675,-3.096049,Restaurant,POINT (-3.09605 55.94467),"(2, 0)"
3,Brunstane/Gilberstoun,55.940545,-3.09246,4cdd6bd114119eb02a35ed33,Burger King,55.942378,-3.101092,Fast Food Restaurant,POINT (-3.10109 55.94238),"(1, 0)"
4,Brunstane/Gilberstoun,55.940545,-3.09246,5ecfd256fadd010008813b03,Costa Coffee,55.941996,-3.101744,Coffee Shop,POINT (-3.10174 55.94200),"(1, 0)"


Check the above has done the expected operation:

In [189]:
edinburgh_venues_geo.columns
oopsy = edinburgh_venues_geo[edinburgh_venues_geo[('price tier', 'likes')] == ('missing price tier', 'missing likes')]
print(oopsy.shape)
print(edinburgh_venues_geo.shape)

(0, 10)
(789, 10)


The following splits the price tier and likes column into separate columns. 

In [190]:
edinburgh_venues_geo[('price tier','likes')]=edinburgh_venues_geo[('price tier','likes')].astype('string')

re.findall("[0-9]*",edinburgh_venues_geo[('price tier','likes')][0])
#print(re.findall("[0-9]+","(12, 0)"))

def price_extractor(text):
    numberList = re.findall("[0-9]+",text)
    return numberList[0]

def likes_extractor(text):
    numberList = re.findall("[0-9]+",text)
    return numberList[0]

edinburgh_venues_geo['price tier'] = edinburgh_venues_geo[('price tier','likes')].apply(lambda x: price_extractor(x))
edinburgh_venues_geo['likes'] = edinburgh_venues_geo[('price tier','likes')].apply(lambda x: likes_extractor(x))
edinburgh_venues_geo.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,"(price tier, likes)",price tier,likes
0,Brunstane/Gilberstoun,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.09608 55.94454),"(3, 0)",3,3
1,Brunstane/Gilberstoun,55.940545,-3.09246,4e0cd9d27d8bfe35bbc5d23d,eh15 Restaurant & Bar,55.943865,-3.098488,Diner,POINT (-3.09849 55.94386),"(1, 3)",1,1
2,Brunstane/Gilberstoun,55.940545,-3.09246,4e0b733ab0fbc7fb0be0d383,Options,55.944675,-3.096049,Restaurant,POINT (-3.09605 55.94467),"(2, 0)",2,2
3,Brunstane/Gilberstoun,55.940545,-3.09246,4cdd6bd114119eb02a35ed33,Burger King,55.942378,-3.101092,Fast Food Restaurant,POINT (-3.10109 55.94238),"(1, 0)",1,1
4,Brunstane/Gilberstoun,55.940545,-3.09246,5ecfd256fadd010008813b03,Costa Coffee,55.941996,-3.101744,Coffee Shop,POINT (-3.10174 55.94200),"(1, 0)",1,1


Save data to avoid uneccessary calls to the API.

In [191]:
edinburgh_venues_geo.to_csv('/Users/emmaelley/Documents/GeoBackUpEdinFood.csv') #Fill in with a backup location to avoid unecessary API calls.

**Start here to avoid calls to API**

In [11]:
edinburgh_venues_geo = pd.read_csv('/Users/emmaelley/Documents/GeoBackUpEdinFood.csv')
edinburgh_venues_geo.shape

(650, 14)

Remove  neighbourhoods with less than 5 venues from the analysis.

In [12]:
list_to_discard = edinburgh_venues_geo.value_counts('Neighbourhood')[edinburgh_venues_geo.value_counts('Neighbourhood')<5].index.tolist()
print(list_to_discard)
edinburgh_venues_geo.set_index('Neighbourhood', inplace=True)
edinburgh_venues_geo.drop(list_to_discard, axis = 0, inplace=True)
edinburgh_venues_geo.reset_index(inplace=True)
print(edinburgh_venues_geo.shape)
edinburgh_venues_geo.head()

[]
(650, 14)


Unnamed: 0.2,Neighbourhood,Unnamed: 0,Unnamed: 0.1,Neighbourhood Latitude,Neighbourhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,"('price tier', 'likes')",price tier,likes
0,Brunstane/Gilberstoun,0,0,55.940545,-3.09246,54b97471498e26d1ad55ee73,Bar Zest,55.944542,-3.096077,Gastropub,POINT (-3.0960774421691895 55.944541931152344),"(3, 0)",3,3
1,Brunstane/Gilberstoun,1,1,55.940545,-3.09246,4e0cd9d27d8bfe35bbc5d23d,eh15 Restaurant & Bar,55.943865,-3.098488,Diner,POINT (-3.0984878540039062 55.943864908788534),"(1, 3)",1,1
2,Brunstane/Gilberstoun,2,2,55.940545,-3.09246,4e0b733ab0fbc7fb0be0d383,Options,55.944675,-3.096049,Restaurant,POINT (-3.096049 55.944675),"(2, 0)",2,2
3,Brunstane/Gilberstoun,3,3,55.940545,-3.09246,4cdd6bd114119eb02a35ed33,Burger King,55.942378,-3.101092,Fast Food Restaurant,POINT (-3.1010922900120104 55.942377593318554),"(1, 0)",1,1
4,Brunstane/Gilberstoun,4,4,55.940545,-3.09246,5ecfd256fadd010008813b03,Costa Coffee,55.941996,-3.101744,Coffee Shop,POINT (-3.101744055747986 55.94199631334696),"(1, 0)",1,1


In [13]:
edin_onehot = pd.get_dummies(edinburgh_venues_geo['Venue Category'])
edin_onehot['likes'] = edinburgh_venues_geo['likes']
edin_onehot['price tier'] = edinburgh_venues_geo['price tier']
edin_onehot['Neighbourhood'] = edinburgh_venues_geo['Neighbourhood']

# shift column 'Name' to first position
first_column = edin_onehot.pop('Neighbourhood')
second_column = edin_onehot.pop('likes')
third_column = edin_onehot.pop('price tier')
  
# insert column using insert(position,column_name,
# first_column) function
edin_onehot.insert(0, 'Neighbourhood', first_column)
edin_onehot.insert(1, 'likes', second_column)
edin_onehot.insert(2, 'price tier', third_column)
edin_onehot['price tier']=edin_onehot['price tier']/edin_onehot['price tier'].max()
edin_onehot.head()

Unnamed: 0,Neighbourhood,likes,price tier,African Restaurant,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bar,...,Spanish Restaurant,Steakhouse,Sushi Restaurant,Taco Place,Tapas Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Brunstane/Gilberstoun,3,0.75,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Brunstane/Gilberstoun,1,0.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Brunstane/Gilberstoun,2,0.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Brunstane/Gilberstoun,1,0.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Brunstane/Gilberstoun,1,0.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
edin_onehot.to_csv('') # Backup opportunity following further API calls.

FileNotFoundError: [Errno 2] No such file or directory: ''

Sum over each type of venue and only keep venues with more than 5 occurences. 

In [15]:
summed = edin_onehot.sum().drop(["Neighbourhood","likes", "price tier"])

list_to_use = ["Neighbourhood", "price tier"]

for i in range(len(summed)):
    if summed[i]>3:
        list_to_use.append(edin_onehot.columns[i+3])
list_to_use
edin_grouped = edin_onehot[list_to_use]

Take mean of each column for each neighbourhood.

In [16]:
edin_grouped = edin_grouped.groupby('Neighbourhood', as_index=False).mean()
#print(edin_grouped.head())

Add columns giving the top 3 venues for each Neighbourhood. 

In [17]:

num_top_venues = 3

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[2:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    print(row_categories_sorted)
    data_to_return = [row_categories_sorted.index.values[0]]
    if row_categories_sorted[1] != 0:
        data_to_return.append(row_categories_sorted.index.values[1])
    else:
        data_to_return.append('....')
    if row_categories_sorted[2] != 0:
        data_to_return.append(row_categories_sorted.index.values[2])
    else:
        data_to_return.append('....')
    print(data_to_return)
    return data_to_return

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = edin_grouped['Neighbourhood']

for ind in np.arange(edin_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(edin_grouped.iloc[ind, :], num_top_venues)
    
neighbourhoods_venues_sorted.head()


Chinese Restaurant       0.333333
Café                     0.166667
Fast Food Restaurant     0.166667
Pizza Place              0.166667
Vietnamese Restaurant    0.083333
Dessert Shop             0.083333
Italian Restaurant            0.0
Turkish Restaurant            0.0
Thai Restaurant               0.0
Steakhouse                    0.0
Snack Place                   0.0
Seafood Restaurant            0.0
Sandwich Place                0.0
Restaurant                    0.0
Pub                           0.0
Bakery                        0.0
Indian Restaurant             0.0
Ice Cream Shop                0.0
Bar                           0.0
Food Truck                    0.0
Fish & Chips Shop             0.0
Diner                         0.0
Deli / Bodega                 0.0
Coffee Shop                   0.0
Burger Joint                  0.0
Breakfast Spot                0.0
Gastropub                     0.0
Name: 0, dtype: object
['Chinese Restaurant', 'Café', 'Fast Food Restaurant']
Café

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Abbeyhill,Chinese Restaurant,Café,Fast Food Restaurant
1,Balgreen/Saughtonhall,Café,Chinese Restaurant,Fish & Chips Shop
2,Bankhead,Bakery,Fast Food Restaurant,Coffee Shop
3,Barnton,Gastropub,Indian Restaurant,Café
4,Blackhall,Indian Restaurant,Café,Chinese Restaurant


Perform clustering based on the proportion of venues of each type in each neighbourhood.

In [18]:
# set number of clusters
kclusters = 10

edin_grouped_clustering = edin_grouped
edin_grouped_clustering = edin_grouped.drop(['Neighbourhood', 'price tier'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(edin_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

  edin_grouped_clustering = edin_grouped.drop(['Neighbourhood', 'price tier'], 1)


array([9, 5, 3, 6, 5, 9, 8, 3, 8, 3], dtype=int32)

In [19]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
edin_merged = df[['NATURALCOM','lat','lon','geometry']]

# merge edin_grouped with the top 3 venues and price  tier dataframes.
edin_merged = edin_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='NATURALCOM')
edin_merged = edin_merged.join(edin_grouped[['Neighbourhood','price tier']].set_index('Neighbourhood'), on='NATURALCOM')

#print(edin_merged.shape)
edin_merged.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

edin_merged.head(5) # check the last columns!

Unnamed: 0,NATURALCOM,lat,lon,geometry,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,price tier
0,Brunstane/Gilberstoun,55.940545,-3.09246,"POLYGON ((-3.10276 55.94435, -3.10276 55.94435...",3.0,Gastropub,Coffee Shop,Diner,0.4
3,Newbridge,55.939694,-3.421174,"POLYGON ((-3.41296 55.95159, -3.41414 55.95153...",0.0,Bakery,Restaurant,Sandwich Place,0.333333
5,Muirhouse,55.976491,-3.259727,"POLYGON ((-3.26528 55.96935, -3.26462 55.96832...",0.0,Bakery,Turkish Restaurant,Café,0.35
6,Granton/West Pilton,55.977386,-3.246877,"POLYGON ((-3.24218 55.97880, -3.24279 55.98026...",9.0,Chinese Restaurant,Café,Coffee Shop,0.285714
8,Ratho Station/Ingliston/Gogar,55.937809,-3.359178,"POLYGON ((-3.32042 55.94104, -3.32275 55.94160...",8.0,Coffee Shop,Sandwich Place,Fast Food Restaurant,0.277778


In [20]:
json = edin_merged[['NATURALCOM','geometry']].to_json() # Convert the geometry data to a geojson format for use with folium.

In [21]:
from folium.features import DivIcon

In [30]:
# create map
map_clusters = folium.Map(location=[55.9533, -3.1883], zoom_start=12, tiles='CartoDB positron')

folium.Choropleth(
            geo_data=json,
            data=edin_merged,
            columns=['NATURALCOM','price tier'],  #Here we tell folium to get the county fips and plot new_cases_7days metric for each county
            key_on='feature.properties.NATURALCOM', #Here we grab the geometries/county boundaries from the geojson file using the key 'coty_code' which is the same as county fips
            fill_color='YlOrRd',
            nan_fill_color="White", #Use white color if there is no data available for the county
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name='Average price', #title of the legend
            highlight=True,
            line_color='black').add_to(map_clusters) 


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.tab20(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster, top1, top2, top3, sim_geo, price in zip(edin_merged['lat'], edin_merged['lon'], edin_merged['NATURALCOM'], edin_merged['Cluster Labels'], edin_merged['1st Most Common Venue'], edin_merged['2nd Most Common Venue'], edin_merged['3rd Most Common Venue'],edin_merged['geometry'], edin_merged['price tier']):
    
    geo_j = gpd.GeoSeries(sim_geo).to_json()
    geo_j = folium.GeoJson(data=geo_j)
    folium.Popup(poi).add_to(geo_j)
    #geo_j.add_to(map_clusters)
    popup_text = """{}<br>
                Cluster: {}<br>
                Top  venue categories:<br>
                1) {}<br>
                2) {}<br>
                3) {}"""
    popup_text = popup_text.format(poi,int(cluster),top1,top2,top3)

    label = folium.Popup(popup_text)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=1).add_to(map_clusters)

    #number = '<div style="font-size: 9pt">{}</div>'
    #folium.map.Marker(
    #[lat,lon],
    #icon=DivIcon(
        #icon_size=(150,36),
        #icon_anchor=(4,8),
        #html=number.format(str(int(cluster))),
        #)).add_to(map_clusters)
folium.LayerControl().add_to(map_clusters)
        
map_clusters

In [23]:
map_clusters.save("/Users/emmaelley/Documents/clusterMap.html")