In [340]:
import pandas as pd 
import numpy as np
import requests 
import matplotlib.cm as cm 
import matplotlib.colors as colors
import folium 

from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [189]:
london_areas = pd.read_csv('london_postal_districts.csv')
london_areas.postal_district = london_areas.postal_district.str.strip()

In [306]:
client_id = '<THIS VALUE HAS BEEN MASKED>' # your Foursquare ID
client_secret = '<THIS VALUE HAS BEEN MASKED>' # your Foursquare Secret

version = '20181031' # Foursquare API version
limit = 50
radius = 0

url_search = 'https://api.foursquare.com/v2/venues/explore?'
url_params_static= '&client_id={}&client_secret={}&v={}'.format(client_id, client_secret, version)

In [186]:
def getNearbyVenues(districts, names, section):
    
    venues_list=[]
    
    for district, name in zip(districts, names):
      
        url_params_dynamic ='&near={},London,UK&radius={}&section={}&limit={}'.format(district, radius, section, limit)
        results = requests.get(url_search+url_params_static+url_params_dynamic).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            district, 
            name,
            v['venue']['id'], 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['shortName']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['postal_district', 
                  'district_description', 
                  'venue_id',
                  'venue_name', 
                  'venue_lat', 
                  'venue_long',
                  'venue_category']
        
    return(nearby_venues) 

In [191]:
london_food = getNearbyVenues(london_areas['postal_district'], london_areas['district_description'], 'food')
london_coffee = getNearbyVenues(london_areas['postal_district'], london_areas['district_description'], 'coffee')

In [196]:
london_food_coffee = pd.concat([london_food, london_coffee]) #Add coffee and food places to one DF
london_dedupe = london_food_coffee.sample(frac=1).reset_index(drop=True).drop_duplicates('venue_id') #Shuffle rows, and drop duplicated venues

In [343]:
#Drop districts with fewer than 30 venues, drop categories with fewer than 3 venues
london_filtered = london_dedupe.groupby('postal_district').filter(lambda x: x['venue_name'].count() > 30).reset_index(drop=True) 
london_filtered = london_filtered = london_dedupe.groupby('venue_category').filter(lambda x: x['venue_name'].count() > 2).reset_index(drop=True) 

In [522]:
pd.DataFrame(london_filtered.venue_category.value_counts()).head(15)

Unnamed: 0,venue_category
Café,1406
Coffee Shop,1175
Italian,383
Pizza,291
Indian,275
Sandwiches,237
Bakery,227
Restaurant,198
Fast Food,193
Burgers,134


In [345]:
#Approximate coordinates for each ditrict through mean of its venues
average_venue_coordinates = london_filtered.groupby(['postal_district', 'district_description'], as_index = False)[['venue_lat', 'venue_long']].mean()

In [346]:
average_venue_coordinates.head()

Unnamed: 0,postal_district,district_description,venue_lat,venue_long
0,E1,Eastern Head district,51.515294,-0.069507
1,E10,Leyton,51.55208,-0.006394
2,E11,Leytonstone,51.571478,-0.004237
3,E12,Manor Park,51.557584,0.060603
4,E13,Plaistow,51.537468,0.001598


In [406]:
#One-hot encoding of restaraunt attributes, and summarise to postcode district level
onehot = pd.get_dummies(london_filtered['venue_category'])
onehot.insert(0, 'postal_district', london_filtered['postal_district'])
features = onehot.groupby('postal_district', as_index=False).sum()

In [407]:
features.head()

Unnamed: 0,postal_district,African,American,Argentinian,Asian,Australian,BBQ,Bagels,Bakery,Bar,Bike Shop,Bistro,Bookstore,Brazilian,Breakfast,Burgers,Burritos,Cafeteria,Café,Cantonese,Caribbean,Caucasian,Chinese,Cocktail,Coffee Shop,Creperie,Cuban,Cupcakes,Deli / Bodega,Desserts,Dim Sum,Diner,Doner,Donuts,Dumplings,Eastern European,English,Ethiopian,Event Space,Falafel,Fast Food,Fish & Chips,Food,Food & Drink,Food Court,Food Stand,Food Truck,French,Fried Chicken,Gastropub,German,Gourmet,Greek,Halal,Hookah Bar,Ice Cream,Indian,Irish,Italian,Japanese,Jewish,Juice Bar,Kebab,Korean,Latin American,Lebanese,Malay,Mediterranean,Mexican,Middle Eastern,Modern European,Moroccan,New American,Noodles,Organic Grocery,Pakistani,Persian,Peruvian,Pie Shop,Pizza,Poke Place,Polish,Portuguese,Pub,Ramen,Restaurant,Salad,Sandwiches,Scandinavian,Seafood,Snacks,South American,South Indian,Spanish,Sri Lankan,Steakhouse,Sushi,Szechuan,Tacos,Tapas,Tea Room,Thai,Theater,Turkish,Vegetarian / Vegan,Vietnamese,Wine Bar,Wings
0,E1,0,0,1,1,0,1,0,2,0,0,0,0,0,0,2,0,0,19,0,0,0,2,0,28,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,2,0,0,0,1,0,0,2,1,1,0,0,1,0,0,0,13,0,2,1,1,0,0,2,0,0,1,1,1,1,1,0,0,0,0,2,0,0,0,6,0,0,1,0,0,1,2,4,0,0,1,0,0,0,0,2,1,2,0,0,0,3,0,3,2,2,0,1
1,E10,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,12,0,1,0,1,0,7,1,0,0,0,2,0,0,1,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,E11,1,0,0,1,0,0,0,4,1,0,0,0,0,0,1,0,0,8,0,1,0,1,0,7,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,5,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,E12,0,0,0,0,0,0,0,5,0,0,0,0,0,0,1,0,0,5,0,0,0,3,0,6,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,1,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,E13,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,4,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,4,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0


In [476]:
def top_categories(district, dataset):

    temp = dataset.loc[district].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    #temp['freq'] = temp['freq'].astype(float)
    #temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(10))

In [392]:
print(top_categories('E1'))

                venue  freq
0         Coffee Shop    28
1                Café    19
2              Indian    13
3               Pizza     6
4          Sandwiches     4
5             Turkish     3
6                Thai     3
7  Vegetarian / Vegan     2
8              Bakery     2
9              Korean     2
None


In [411]:
X_sparse = csr_matrix(features.iloc[:,1:]) #transforming into sparse matrix
tsvd = TruncatedSVD(n_components=10) #Instantiating truncated SVD to 10 components
X_sparse_tsvd = tsvd.fit(X_sparse).transform(X_sparse) #Fit-transform

In [424]:
print('Original number of features:', X_sparse.shape[1])
print('Reduced number of features:', X_sparse_tsvd.shape[1])
print('Explained variance ratio {0:.0%}'.format(tsvd.explained_variance_ratio_[0:10].sum()))

Original number of features: 107
Reduced number of features: 10
Explained variance ratio 85%


In [502]:
components = pd.DataFrame(X_sparse_tsvd)

In [505]:
kmeans = KMeans(n_clusters=4, random_state=0).fit(pd.DataFrame(components))
components['cluster'] = pd.Series(kmeans.predict(components))

In [506]:
components.cluster.value_counts()

1    68
3    24
2    21
0     7
Name: cluster, dtype: int64

In [526]:
final.to_csv('final.csv')

In [507]:
components['postal_district'] = features['postal_district']

In [519]:
final = components.merge(average_venue_coordinates, how='left', on = 'postal_district').iloc[:,10:]
cluster_profile = final[['cluster', 'postal_district']].merge(features, how = 'left', on = 'postal_district').groupby('cluster').mean()

In [521]:
clus = (cluster_profile.T)

for i in range(4):
    
    print('Top categories for cluster {} '.format(i))
    print(clus[i].sort_values(ascending = False).head(10))
    print('\n')

Top categories for cluster 0 
Coffee Shop    27.714286
Café           19.000000
Italian         7.428571
Sandwiches      7.285714
Indian          5.714286
Restaurant      4.142857
Pizza           4.000000
Burgers         2.714286
Sushi           2.714286
Bakery          2.714286
Name: 0, dtype: float64


Top categories for cluster 1 
Café           7.882353
Coffee Shop    5.647059
Pizza          1.926471
Italian        1.852941
Indian         1.735294
Fast Food      1.676471
Bakery         1.602941
Sandwiches     1.205882
Restaurant     0.955882
Turkish        0.941176
Name: 1, dtype: float64


Top categories for cluster 2 
Café           22.190476
Coffee Shop    12.095238
Italian         4.857143
Pizza           2.904762
Restaurant      2.809524
Bakery          2.476190
Gastropub       1.857143
French          1.761905
Sandwiches      1.761905
Thai            1.619048
Name: 2, dtype: float64


Top categories for cluster 3 
Coffee Shop    14.291667
Café           11.291667
Italian     

In [515]:
map_clusters = folium.Map(location=[51.531217, -0.130755], zoom_start=12)

In [516]:
# set color scheme for the clusters
x = np.arange(5)
ys = [i+x+(i*x)**2 for i in range(4)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(
        final['venue_lat'], 
        final['venue_long'], 
        final['district_description'], 
        final['cluster']):
    label = folium.Popup(str(poi) + ' cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters