<h1>Where in Bristol?</h1>

<h3>In this notebook, we will find the best postcode area of Bristol in which to open an Italian restaurant. We will do this in stages:</h3>

<p><b>Stages</b></p>
<ul>
    <li>Get information about the different postcode areas in Bristol</li>
    <li>Get information about the venues within these areas</li>
    <li>Create clusters based on the different restaurants in these areas</li>
    <li>Look at the clusters, and make a recommendation based on them</li>
</ul> 

In [1]:
#Import required packages

from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import numpy as np
!pip install folium
import folium

#Create dataframe (column order of the final frame)

column_list = ['Prefix', 'PostTown', 'Coverage']

#Get webpage BS_postcode_area

url = 'https://en.wikipedia.org/wiki/BS_postcode_area'
page = BeautifulSoup(urllib.request.urlopen(url), 'lxml')

#Get the right table, after inspecting the element on the page

right_table = page.find('table', {'class':'wikitable sortable'})
if right_table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError('Could not find the postcode district table on ' + url)

Prefixes = []
Post_Towns = []
Coverages = []

#Extract the data from each table row.
#The <td> cells give the post town and coverage; the first <th> cell gives the
#postcode prefix (the header row, whose first <th> is 'Postcode district', is skipped).

for row in right_table.findAll('tr'):
    data_cells = row.findAll('td')
    if len(data_cells) != 0:
        Post_Towns.append(data_cells[0].find(text=True))
        Coverages.append(data_cells[1].find(text=True))

    header_cells = row.findAll('th')
    if len(header_cells) != 0 and header_cells[0].find(text=True) != 'Postcode district\n':
        Prefixes.append(header_cells[0].find(text=True))

#Clean endings (remove \n)
#rstrip only removes a newline that is actually there, so a value without one is
#left intact; an empty cell (find returned None) becomes an empty string.

New_Prefixes = [(item or '').rstrip('\n') for item in Prefixes]
New_Post_Towns = [(item or '').rstrip('\n') for item in Post_Towns]

#Coverage is deliberately left raw: a blank cell is stored as '\n'
df = pd.DataFrame(
    {'Prefix': New_Prefixes, 'PostTown': New_Post_Towns, 'Coverage': Coverages},
    columns=column_list)

df.head()

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 8.4MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


Unnamed: 0,Prefix,PostTown,Coverage
0,BS0,BRISTOL,
1,BS1,BRISTOL,Bristol city centre
2,BS2,BRISTOL,Kingsdown
3,BS3,BRISTOL,Bedminster
4,BS4,BRISTOL,Brislington


In [2]:
#Keep only rows whose post town is Bristol and whose coverage is not blank
#(a coverage of just '\n' marks a non geographical postcode)

is_bristol = df['PostTown'] == 'BRISTOL'
has_coverage = df['Coverage'] != '\n'

#reset_index() keeps the old row labels in an 'index' column
bristol = df[is_bristol & has_coverage].reset_index()
bristol.head()

Unnamed: 0,index,Prefix,PostTown,Coverage
0,1,BS1,BRISTOL,Bristol city centre
1,2,BS2,BRISTOL,Kingsdown
2,3,BS3,BRISTOL,Bedminster
3,4,BS4,BRISTOL,Brislington
4,5,BS5,BRISTOL,Easton


In [3]:
#Get coordinates for postcodes, with the 'postcode' column renamed to 'Prefix' for the merge
coords = pd.read_csv(
    'https://www.freemaptools.com/download/outcode-postcodes/postcode-outcodes.csv'
).rename(columns={'postcode':'Prefix'})

#Left-merge onto the bristol frame so every Bristol prefix is kept,
#then remove the unwanted 'index' and 'id' columns
merged_with_coords = bristol.merge(coords, on='Prefix', how='left')
result = merged_with_coords.drop(['index', 'id'], axis=1)
result.head(50)

Unnamed: 0,Prefix,PostTown,Coverage,latitude,longitude
0,BS1,BRISTOL,Bristol city centre,51.45309,-2.593
1,BS2,BRISTOL,Kingsdown,51.45945,-2.58013
2,BS3,BRISTOL,Bedminster,51.43776,-2.60144
3,BS4,BRISTOL,Brislington,51.43437,-2.56111
4,BS5,BRISTOL,Easton,51.4614,-2.54992
5,BS6,BRISTOL,Cotham,51.47015,-2.59898
6,BS7,BRISTOL,Bishopston,51.48643,-2.58047
7,BS8,BRISTOL,Clifton,51.45745,-2.62041
8,BS9,BRISTOL,Coombe Dingle,51.4871,-2.62513
9,BS10,BRISTOL,Brentry,51.50606,-2.60954


In [4]:
#Quick map to show areas, using bristol coords

latitude = 51.4545
longitude = -2.5879
map_bristol = folium.Map(location=[latitude, longitude], zoom_start=12)

#One blue circle per postcode area; clicking it shows the prefix
area_points = zip(result['latitude'], result['longitude'], result['Prefix'])
for lat, long, pre in area_points:
    popup = folium.Popup('Prefix {}'.format(pre), parse_html=True)
    marker = folium.CircleMarker(
        [lat, long],
        radius=8,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_bristol)

map_bristol

In [5]:
#Remove postcode areas too far from city centre
import math  
def calculateDistance(x1,y1,x2,y2):
    """Return the straight-line (Euclidean) distance between (x1, y1) and (x2, y2).

    The result is in the same units as the inputs; no geographic
    projection or unit conversion is applied.
    """
    delta_x = x2 - x1
    delta_y = y2 - y1
    return math.sqrt(delta_x**2 + delta_y**2)

#calculate distances from centre for each area, add to frame
#
#The coordinates are read by column name, so this does not depend on the
#column order of `result` or on it having a 0..n-1 index.
#Note: the inputs are latitude/longitude values, so the distance is in
#degrees, not kilometres.
distances = [
    calculateDistance(latitude, longitude, area_lat, area_long)
    for area_lat, area_long in zip(result['latitude'], result['longitude'])
]

result['CentProx'] = distances
result.head()

Unnamed: 0,Prefix,PostTown,Coverage,latitude,longitude,CentProx
0,BS1,BRISTOL,Bristol city centre,51.45309,-2.593,0.005291
1,BS2,BRISTOL,Kingsdown,51.45945,-2.58013,0.009213
2,BS3,BRISTOL,Bedminster,51.43776,-2.60144,0.02153
3,BS4,BRISTOL,Brislington,51.43437,-2.56111,0.03351
4,BS5,BRISTOL,Easton,51.4614,-2.54992,0.038602


In [6]:
# Remove areas too far from centre (threshold found by trial and error)

# Largest allowed 'CentProx' value; same units as that column
MAX_CENT_PROX = 0.09

# Filter and renumber in one expression: this builds a fresh frame instead of
# mutating a slice of `result` in place, which triggered SettingWithCopyWarning
central_bristol = result[result['CentProx'] <= MAX_CENT_PROX].reset_index(drop=True)
central_bristol.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Prefix,PostTown,Coverage,latitude,longitude,CentProx
0,BS1,BRISTOL,Bristol city centre,51.45309,-2.593,0.005291
1,BS2,BRISTOL,Kingsdown,51.45945,-2.58013,0.009213
2,BS3,BRISTOL,Bedminster,51.43776,-2.60144,0.02153
3,BS4,BRISTOL,Brislington,51.43437,-2.56111,0.03351
4,BS5,BRISTOL,Easton,51.4614,-2.54992,0.038602


In [7]:
#New map to show areas, using bristol coords

map_bristol = folium.Map(location=[latitude, longitude], zoom_start=12)

#Red circle at the reference city-centre coordinates
centre_marker = folium.CircleMarker(
    [latitude, longitude],
    radius=8,
    color='red',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.5,
    parse_html=False)
centre_marker.add_to(map_bristol)

#Smaller blue circle for each remaining postcode area, with the prefix as popup
central_points = zip(central_bristol['latitude'], central_bristol['longitude'], central_bristol['Prefix'])
for lat, long, pre in central_points:
    popup = folium.Popup('Prefix {}'.format(pre), parse_html=True)
    area_marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    area_marker.add_to(map_bristol)

map_bristol

<h2>Get venue details for postcode areas</h2>
<p>Now that we have chosen our potential areas for the restaurant, we will find similar venues in the area to give an idea of competition</p>

In [36]:
#Foursquare API credentials
#
#Secrets must not live in the notebook: they are read from the environment.
#Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the
#kernel; a missing variable raises KeyError naming it.
#NOTE(review): the id/secret previously committed in this cell should be
#treated as leaked and revoked.

import os

import requests

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

RADIUS = 2000 # 'radius' value sent with each venue query
LIMIT = 100 # 'limit' value sent with each venue query

In [37]:
#function to get all venues for given area

def getNearbyVenues(names, latitudes, longitudes, radius=None):
    """Query the Foursquare 'explore' endpoint around each point and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each search centre; iterated in parallel.
    radius : number, optional
        Search radius sent to the API. Defaults to the module-level RADIUS,
        which is the value this function always used before (the argument
        was previously ignored).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Prefix', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. Empty (but with those columns) if nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Prefix',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    if radius is None:
        radius = RADIUS

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; a timeout stops one stalled call from hanging the notebook
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing columns here keeps the schema even when no venue was returned
    return pd.DataFrame(venue_rows, columns=columns)

In [10]:
#Run query, see how many results we got for each area
#(one search per remaining postcode area, centred on its coordinates)

bristol_venues = getNearbyVenues(
    names=central_bristol['Prefix'],
    latitudes=central_bristol['latitude'],
    longitudes=central_bristol['longitude'],
)
bristol_venues.head()

BS1
BS2
BS3
BS4
BS5
BS6
BS7
BS8
BS9
BS10
BS13
BS14
BS15
BS16
BS34
BS41


Unnamed: 0,Prefix,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,BS1,51.45309,-2.593,Small Street Espresso,51.454766,-2.5944,Coffee Shop
1,BS1,51.45309,-2.593,Small Bar,51.451791,-2.594948,Pub
2,BS1,51.45309,-2.593,BrewDog Bristol,51.453659,-2.592372,Beer Bar
3,BS1,51.45309,-2.593,St. Nicholas Market,51.454248,-2.593286,Market
4,BS1,51.45309,-2.593,Bristol Old Vic,51.45209,-2.594154,Theater


In [13]:
#One hot encoding for analysis: one indicator column per venue category,
#named after the category itself (empty prefix and separator)

venue_categories = bristol_venues[['Venue Category']]
bristol_onehot = pd.get_dummies(venue_categories, prefix='', prefix_sep='')

In [14]:
#Attach the postcode prefix of each venue to its one-hot row
bristol_onehot['Prefix'] = bristol_venues['Prefix']

#Move 'Prefix' to the front.
#The column is selected by name rather than "whatever is last", so running
#this cell a second time leaves the order unchanged.

cols = ['Prefix'] + [col for col in bristol_onehot.columns if col != 'Prefix']
bristol_onehot = bristol_onehot[cols]

bristol_onehot.head()

Unnamed: 0,Prefix,American Restaurant,Art Gallery,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bar,Beer Bar,Beer Garden,...,Theater,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse,Warehouse Store,Waterfront,Wine Shop
0,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,BS1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,BS1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [15]:
#We are only interested in looking at restaurants, so keep 'Prefix' plus the
#columns whose header contains the word 'Restaurant'.
#A substring test is used on purpose: str.find() returns the match position,
#so comparing it with "> 1" wrongly discarded headers where the word starts
#at position 0 or 1 (for example a column named just 'Restaurant').

restaurant_cols = [col for col in bristol_onehot.columns
                   if col != 'Prefix' and 'Restaurant' in col]
bristol_onehot = bristol_onehot[['Prefix'] + restaurant_cols]

bristol_onehot.head()

Unnamed: 0,Prefix,American Restaurant,Asian Restaurant,Chinese Restaurant,English Restaurant,Falafel Restaurant,Fast Food Restaurant,French Restaurant,Greek Restaurant,Indian Restaurant,...,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Persian Restaurant,Portuguese Restaurant,Scandinavian Restaurant,Tapas Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Removing unnecessary (not a restaurant!) rows and reindexing

#Sum only the category indicator columns: the text 'Prefix' column cannot be
#added to numbers, and a 'Total' left over from an earlier run must not be
#counted again.
category_cols = [col for col in bristol_onehot.columns if col not in ('Prefix', 'Total')]
bristol_onehot['Total'] = bristol_onehot[category_cols].sum(axis=1)

#Keep rows flagged in at least one restaurant category, then drop the helper
#column and renumber; chaining builds a new frame instead of editing a slice in place
bristol_result = (
    bristol_onehot[bristol_onehot['Total'] > 0]
    .drop(['Total'], axis=1)
    .reset_index(drop=True)
)
bristol_result.head()

Unnamed: 0,Prefix,American Restaurant,Asian Restaurant,Chinese Restaurant,English Restaurant,Falafel Restaurant,Fast Food Restaurant,French Restaurant,Greek Restaurant,Indian Restaurant,...,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Persian Restaurant,Portuguese Restaurant,Scandinavian Restaurant,Tapas Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,BS1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BS1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,BS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# Average every indicator column within each postcode prefix; the mean of a
# 0/1 column is the share of that area's restaurants in the category

rows_by_prefix = bristol_result.groupby('Prefix')
bristol_mean = rows_by_prefix.mean().reset_index()
bristol_mean.shape

(15, 24)

In [18]:
# Print top 5 venues for each prefix

number_top_venues = 5

for area in bristol_mean['Prefix']:
    print('---'+area+'---')
    # Transpose the single matching row so each column becomes a (venue, freq) record
    area_row = bristol_mean[bristol_mean['Prefix']==area]
    freqs = area_row.T.reset_index()
    freqs.columns = ['venue','freq']
    # The first record is the 'Prefix' value itself, not a category
    freqs = freqs.iloc[1:]
    freqs['freq'] = freqs['freq'].astype(float)
    freqs = freqs.round({'freq': 2})
    ranked = freqs.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(number_top_venues))
    print('\n')

---BS1---
                venue  freq
0   Indian Restaurant  0.15
1  Italian Restaurant  0.15
2  English Restaurant  0.10
3  Falafel Restaurant  0.10
4   French Restaurant  0.10


---BS10---
                       venue  freq
0        American Restaurant  0.33
1       Fast Food Restaurant  0.17
2         Italian Restaurant  0.17
3         Mexican Restaurant  0.17
4  Latin American Restaurant  0.17


---BS13---
                           venue  freq
0           Fast Food Restaurant  0.67
1            American Restaurant  0.33
2      Latin American Restaurant  0.00
3  Vegetarian / Vegan Restaurant  0.00
4                Thai Restaurant  0.00


---BS14---
                           venue  freq
0            American Restaurant  0.33
1             English Restaurant  0.33
2           Fast Food Restaurant  0.33
3       Mediterranean Restaurant  0.00
4  Vegetarian / Vegan Restaurant  0.00


---BS15---
                           venue  freq
0            American Restaurant  0.33
1             

In [19]:
# Function to return the top venues

def return_top_venues(row, number_top_venues):
    """Return the labels of the highest-valued entries of `row`, best first.

    The first entry of `row` is skipped (the callers pass rows whose first
    entry is the postcode prefix). Entries that are not greater than 0 are
    discarded, so fewer than `number_top_venues` labels may be returned.
    """
    # Rank first, then filter, so the relative order of the survivors is the sort order
    ranked = row.iloc[1:].sort_values(ascending=False)
    positive = ranked[ranked > 0]

    return positive.index.values[0: number_top_venues]

In [20]:
# Create DF with most common restaurant types by postcode prefix

number_top_venue = 6
indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then 'th' for every later rank.
# The suffix is chosen with an explicit bounds check rather than a bare
# except, which would also have hidden unrelated errors.
columns = ['Prefix']
for rank in range(1, number_top_venue + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

bristol_venues_sorted = pd.DataFrame(columns=columns)
bristol_venues_sorted['Prefix'] = bristol_mean['Prefix']

for y in np.arange(bristol_mean.shape[0]):
    # Compute the ranking once per area; it can be shorter than number_top_venue,
    # in which case the remaining cells of the row stay empty
    top_venues = return_top_venues(bristol_mean.iloc[y, :], number_top_venue)
    bristol_venues_sorted.iloc[y, 1:(top_venues.shape[0] + 1)] = top_venues

bristol_venues_sorted.head(50)

Unnamed: 0,Prefix,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
0,BS1,Italian Restaurant,Indian Restaurant,Tapas Restaurant,English Restaurant,Falafel Restaurant,French Restaurant
1,BS10,American Restaurant,Italian Restaurant,Fast Food Restaurant,Mexican Restaurant,Latin American Restaurant,
2,BS13,Fast Food Restaurant,American Restaurant,,,,
3,BS14,American Restaurant,English Restaurant,Fast Food Restaurant,,,
4,BS15,American Restaurant,Chinese Restaurant,Portuguese Restaurant,,,
5,BS16,Thai Restaurant,English Restaurant,Fast Food Restaurant,Greek Restaurant,Indian Restaurant,
6,BS2,Indian Restaurant,Falafel Restaurant,French Restaurant,Italian Restaurant,Vietnamese Restaurant,Greek Restaurant
7,BS3,English Restaurant,Falafel Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Vietnamese Restaurant
8,BS34,Fast Food Restaurant,American Restaurant,Chinese Restaurant,English Restaurant,Portuguese Restaurant,Italian Restaurant
9,BS4,Fast Food Restaurant,,,,,


<h1>Clustering</h1>
<p>Next, we want to cluster the venues to see if we can find an easy way to determine where the restaurant should be opened from here</p>

In [21]:
# Import the right package, define number of clusters

from sklearn.cluster import KMeans
# k for the k-means run: passed as n_clusters to KMeans and used to size the marker colour palette
number_clusters = 3

In [22]:
# Run clustering algorithm, add labels into data frame for final map

bristol_clustering = bristol_mean.drop('Prefix', axis=1)

# n_init is pinned to 10 so the result does not depend on the installed
# scikit-learn version's default (NOTE(review): 10 was the long-standing
# default; newer releases use a different one - confirm against the docs)
kmeans = KMeans(n_clusters=number_clusters, n_init=10, random_state=0).fit(bristol_clustering)

# Use every label: there is exactly one per row of bristol_mean, so no
# hard-coded row count is needed
cluster_labels = kmeans.labels_

bristol_venues_sorted['Cluster'] = cluster_labels

#Also add count of number of restaurants in each area
#(rows per prefix, matched on the prefix itself rather than on row position)

restaurants_per_prefix = bristol_result.groupby('Prefix').size()
bristol_venues_sorted['Number of Restaurants'] = bristol_venues_sorted['Prefix'].map(restaurants_per_prefix)

bristol_venues_sorted.head()

Unnamed: 0,Prefix,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,Cluster,Number of Restaurants
0,BS1,Italian Restaurant,Indian Restaurant,Tapas Restaurant,English Restaurant,Falafel Restaurant,French Restaurant,1,20
1,BS10,American Restaurant,Italian Restaurant,Fast Food Restaurant,Mexican Restaurant,Latin American Restaurant,,1,6
2,BS13,Fast Food Restaurant,American Restaurant,,,,,0,3
3,BS14,American Restaurant,English Restaurant,Fast Food Restaurant,,,,1,3
4,BS15,American Restaurant,Chinese Restaurant,Portuguese Restaurant,,,,1,3


In [23]:
# Create the final DF used for the map and the decision, by merging in the
# latitude and longitude of each prefix

with_coords = bristol_venues_sorted.merge(coords, on='Prefix', how='left')
final_result = with_coords.drop('id', axis=1)
final_result.head(50)

Unnamed: 0,Prefix,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,Cluster,Number of Restaurants,latitude,longitude
0,BS1,Italian Restaurant,Indian Restaurant,Tapas Restaurant,English Restaurant,Falafel Restaurant,French Restaurant,1,20,51.45309,-2.593
1,BS10,American Restaurant,Italian Restaurant,Fast Food Restaurant,Mexican Restaurant,Latin American Restaurant,,1,6,51.50606,-2.60954
2,BS13,Fast Food Restaurant,American Restaurant,,,,,0,3,51.41163,-2.61116
3,BS14,American Restaurant,English Restaurant,Fast Food Restaurant,,,,1,3,51.41278,-2.56084
4,BS15,American Restaurant,Chinese Restaurant,Portuguese Restaurant,,,,1,3,51.4589,-2.50527
5,BS16,Thai Restaurant,English Restaurant,Fast Food Restaurant,Greek Restaurant,Indian Restaurant,,1,5,51.48496,-2.50988
6,BS2,Indian Restaurant,Falafel Restaurant,French Restaurant,Italian Restaurant,Vietnamese Restaurant,Greek Restaurant,1,18,51.45945,-2.58013
7,BS3,English Restaurant,Falafel Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Vietnamese Restaurant,1,18,51.43776,-2.60144
8,BS34,Fast Food Restaurant,American Restaurant,Chinese Restaurant,English Restaurant,Portuguese Restaurant,Italian Restaurant,1,8,51.52338,-2.56364
9,BS4,Fast Food Restaurant,,,,,,0,3,51.43437,-2.56111


In [35]:
#Create final map, colour and size of markers based on cluster labels and number of restaurants

import matplotlib.cm as cm
import matplotlib.colors as colors

final_map = folium.Map([latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, number_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Marker radius = (restaurant count / RESTAURANT_COUNT_SCALE) * MARKER_RADIUS_SCALE
# NOTE(review): 22 is presumably the largest restaurant count in the data - confirm
RESTAURANT_COUNT_SCALE = 22
MARKER_RADIUS_SCALE = 35

for lat, lon, poi, cluster, numres in zip(final_result['latitude'], final_result['longitude'], final_result['Prefix'], final_result['Cluster'], final_result['Number of Restaurants']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 maps label 0 to the last palette entry (index -1); each
    # cluster still gets its own colour
    cluster_colour = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=(numres/RESTAURANT_COUNT_SCALE)*MARKER_RADIUS_SCALE,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(final_map)
    
final_map

<h1>Conclusion</h1>
<p>From looking at the makeup of the clusters and at the map, I think BS4 or BS13 would be the best places to open an Italian restaurant: neither of them is flooded with restaurants, and both are in a cluster with no other Italian restaurants.</p>