# This notebook will be used for the Capstone project

### Import all packages

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Import the list of adjacent postal sector combinations in Sutton

In [2]:
# Load the sector adjacency table (target sector, adjacent sector and the
# adjacent sector's coordinates) from the CSV next to the notebook.
sutton_adjacent = pd.read_csv(filepath_or_buffer="sutton_postcode.csv")

# show the first five rows as a sanity check
sutton_adjacent.head()

Unnamed: 0,target_sector,adjacent_sector,latitude,longitude
0,SM11,SM12,51.3644,-0.202339
1,SM11,SM13,51.3721,-0.187179
2,SM11,SM14,51.3636,-0.187645
3,SM11,SM25,51.3542,-0.189184
4,SM11,SM26,51.3541,-0.198247


### Generate a list of all postal sectors in Sutton with latitude and longitude

In [3]:
# Collapse the adjacency table to one row per distinct
# (adjacent_sector, latitude, longitude) combination and call the sector
# column simply 'sector'. reset_index() keeps the original row labels in an
# 'index' column, which a later cell drops by name.
sutton = (
    sutton_adjacent.loc[:, ['adjacent_sector', 'latitude', 'longitude']]
    .drop_duplicates()
    .rename(columns={'adjacent_sector': 'sector'})
    .reset_index()
)

# display up to 50 sectors
sutton.head(n=50)

Unnamed: 0,index,sector,latitude,longitude
0,0,SM12,51.3644,-0.202339
1,1,SM13,51.3721,-0.187179
2,2,SM14,51.3636,-0.187645
3,3,SM25,51.3542,-0.189184
4,4,SM26,51.3541,-0.198247
5,5,SM39,51.3759,-0.211163
6,6,SM11,51.3646,-0.19466
7,8,SM27,51.3499,-0.211821
8,9,SM38,51.3629,-0.218128
9,14,SM46,51.3908,-0.189243


### Foursquare Details (personal info masked)

In [4]:
import os  # only needed here, to read the credentials from the environment

# Credentials come from environment variables so real keys never have to be
# typed into (and committed with) the notebook. The masked 'XXX' placeholders
# remain the fallback when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'XXX') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'XXX') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # search radius for venue lookups (getNearbyVenues has its own default of 500)

### Call the Foursquare API and extract venues in Sutton

In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query Foursquare's venues/explore endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with zip(); each triple describes one location.
        The name is printed as a progress marker before its request is sent.
    radius : int, default 500
        Sent unchanged as the `radius` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP response before requests gives up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        response = requests.get(
            explore_url,
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # fail loudly on quota/auth errors instead of a KeyError further down
        response.raise_for_status()

        # a response without groups simply contributes no venues
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # venues without any category would otherwise raise IndexError
            category = categories[0]['name'] if categories else 'Unknown'
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # naming the columns in the constructor also works for zero rows
    return pd.DataFrame(rows, columns=columns)

In [7]:
# Query Foursquare once per postal sector. The notebook-level `radius` setting
# is passed explicitly; without it the function silently uses its own default
# and editing the setting would have no effect.
sutton_venues = getNearbyVenues(
    names=sutton['sector'],
    latitudes=sutton['latitude'],
    longitudes=sutton['longitude'],
    radius=radius,
)

SM12
SM13
SM14
SM25
SM26
SM39
SM11
SM27
SM38
SM46
SM51
SM52
SM53
SM54
SM71
SM72
SM73
SM44
SM45
SM60
SM67
SM68
SM69


In [8]:
# preview the first five venue rows
sutton_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,SM12,51.3644,-0.202339,Gander Green Lane (Borough Sports Ground),51.36751,-0.204132,Soccer Stadium
1,SM12,51.3644,-0.202339,Holiday Inn London - Sutton,51.361713,-0.196645,Hotel
2,SM12,51.3644,-0.202339,Blue Inc Clothing,51.364114,-0.201108,Clothing Store
3,SM12,51.3644,-0.202339,West Sutton Railway Station (WSU),51.366471,-0.204285,Train Station
4,SM12,51.3644,-0.202339,Fitness4less,51.360453,-0.205412,Athletics & Sports


### Rename and group categories together

In [9]:
# Ordered (pattern, label) rules for collapsing Foursquare categories into
# broader groups. Rules run top to bottom and each one sees the labels written
# by the rules before it, so the order is significant (e.g. 'Coffee' must run
# before 'Shop' for a "Coffee Shop" to end up as 'Café').
# NOTE(review): patterns are case-sensitive substring matches, so 'Bus' also
# hits any category containing "Business" and 'Park' any containing "Parking";
# confirm this is intended.
category_rules = [
    ('Gym', 'Sports Club'),
    ('Soccer', 'Sports Club'),
    ('Restaurant', 'Restaurant'),
    ('Fish', 'Restaurant'),
    ('Pizza', 'Restaurant'),
    ('Pub', 'Bar'),
    ('Bar', 'Bar'),
    ('Bakery', 'Café'),
    ('Breakfast Spot', 'Café'),
    ('Coffee', 'Café'),
    ('Sandwich', 'Café'),
    ('Bus', 'Transport'),
    ('Station', 'Transport'),
    ('Convenience', 'Supermarket'),
    ('Grocery', 'Supermarket'),
    ('Store', 'Shops'),
    ('Shop', 'Shops'),
    ('Bookstore', 'Shops'),
    ('Pharmacy', 'Shops'),
    ('Construction', 'Service'),
    ('Services', 'Service'),
    ('Cleaner', 'Service'),
    ('Garden', 'Other'),
    ('Park', 'Other'),
    ('Platform', 'Other'),
]

for pattern, group_label in category_rules:
    # na=False: a missing category never matches, instead of poisoning the
    # boolean mask with NaN and making .loc raise
    matches = sutton_venues['Venue Category'].str.contains(pattern, na=False)
    sutton_venues.loc[matches, 'Venue Category'] = group_label

### One Hot Encoding on the venue categories

In [10]:
# One-hot encode the venue category: one indicator column per category, named
# exactly after the category because prefix and separator are both empty.
# The sector each venue belongs to is then attached as a 'Neighborhood' column.
sutton_onehot = (
    pd.get_dummies(sutton_venues[['Venue Category']], prefix="", prefix_sep="")
    .assign(Neighborhood=sutton_venues['Neighborhood'])
)

### Append the latitude and longitude, and normalise them

In [11]:
# Average the indicator columns per sector (giving the share of each venue
# category in that sector), then join the sector coordinates from `sutton`.
sutton_grouped = pd.merge(
    sutton_onehot.groupby('Neighborhood').mean().reset_index(),
    sutton,
    left_on='Neighborhood',
    right_on='sector',
)

In [12]:
# Standardise both coordinates: subtract the column mean and divide by the
# column's standard deviation as computed by Series.std().
for coord_name in ('latitude', 'longitude'):
    coord = sutton_grouped[coord_name]
    sutton_grouped['normalised_' + coord_name] = (coord - coord.mean()) / coord.std()

# 'index' and 'sector' are leftovers from the merge with `sutton`.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
sutton_grouped = sutton_grouped.drop(columns=['index', 'sector'], errors='ignore')
sutton_grouped.head(10)

Unnamed: 0,Neighborhood,Athletics & Sports,Bar,Café,Diner,Event Service,Flea Market,Hotel,Other,Restaurant,Service,Shops,Sports Club,Supermarket,Transport,Veterinarian,latitude,longitude,normalised_latitude,normalised_longitude
0,SM11,0.0,0.111111,0.277778,0.0,0.0,0.0,0.027778,0.027778,0.083333,0.0,0.361111,0.027778,0.083333,0.0,0.0,51.3646,-0.19466,0.141346,-0.444626
1,SM12,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,51.3644,-0.202339,0.131598,-0.755066
2,SM14,0.0,0.151515,0.333333,0.0,0.0,0.0,0.0,0.030303,0.151515,0.0,0.272727,0.0,0.060606,0.0,0.0,51.3636,-0.187645,0.092606,-0.161031
3,SM25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.25,0.0,0.0,51.3542,-0.189184,-0.36555,-0.223248
4,SM26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,51.3541,-0.198247,-0.370424,-0.589638
5,SM38,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.285714,0.142857,0.0,0.0,0.0,0.285714,0.142857,0.0,51.3629,-0.218128,0.058488,-1.393369
6,SM39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.25,0.25,0.0,51.3759,-0.211163,0.692108,-1.111794
7,SM44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,51.392,-0.214729,1.476823,-1.255957
8,SM45,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.4,0.0,0.0,0.0,0.2,0.0,51.3945,-0.197055,1.598673,-0.541449
9,SM46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,51.3908,-0.189243,1.418335,-0.225633


### K Means Clustering

In [16]:
kclusters = 5

# Feature matrix: everything except the sector name and the raw coordinates.
# - `columns=` replaces the positional axis argument, which pandas 2.0 removed.
# - 'Cluster Labels' is excluded too, so re-running this cell after the labels
#   were attached does not feed the previous clustering back in as a feature;
#   errors='ignore' covers the first run, when that column does not exist yet.
sutton_grouped_clustering = sutton_grouped.drop(
    columns=['Neighborhood', 'latitude', 'longitude', 'Cluster Labels'],
    errors='ignore',
)

# run k-means clustering; n_init is pinned because the library default changed
# between scikit-learn releases, which would otherwise alter the result
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(sutton_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([2, 2, 2, 2, 2, 2, 2, 4, 4, 4], dtype=int32)

In [17]:
# Attach the k-means assignment of every sector as a 'Cluster Labels' column
# (kmeans.labels_ is in the same row order as the frame that was clustered).
sutton_grouped = sutton_grouped.assign(**{'Cluster Labels': kmeans.labels_})

# preview the labelled frame
sutton_grouped.head(n=10)

Unnamed: 0,Neighborhood,Athletics & Sports,Bar,Café,Diner,Event Service,Flea Market,Hotel,Other,Restaurant,Service,Shops,Sports Club,Supermarket,Transport,Veterinarian,latitude,longitude,normalised_latitude,normalised_longitude,Cluster Labels
0,SM11,0.0,0.111111,0.277778,0.0,0.0,0.0,0.027778,0.027778,0.083333,0.0,0.361111,0.027778,0.083333,0.0,0.0,51.3646,-0.19466,0.141346,-0.444626,2
1,SM12,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,51.3644,-0.202339,0.131598,-0.755066,2
2,SM14,0.0,0.151515,0.333333,0.0,0.0,0.0,0.0,0.030303,0.151515,0.0,0.272727,0.0,0.060606,0.0,0.0,51.3636,-0.187645,0.092606,-0.161031,2
3,SM25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.25,0.0,0.0,51.3542,-0.189184,-0.36555,-0.223248,2
4,SM26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,51.3541,-0.198247,-0.370424,-0.589638,2
5,SM38,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.285714,0.142857,0.0,0.0,0.0,0.285714,0.142857,0.0,51.3629,-0.218128,0.058488,-1.393369,2
6,SM39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.25,0.25,0.0,51.3759,-0.211163,0.692108,-1.111794,2
7,SM44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,51.392,-0.214729,1.476823,-1.255957,4
8,SM45,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.4,0.0,0.0,0.0,0.2,0.0,51.3945,-0.197055,1.598673,-0.541449,4
9,SM46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,51.3908,-0.189243,1.418335,-0.225633,4


### Plot the clusters on a map

In [18]:
address = 'Sutton'

# Nominatim requires an application-specific user agent; recent geopy releases
# refuse to construct the geocoder without one.
geolocator = Nominatim(user_agent='sutton_capstone_notebook')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the next line
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sutton are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sutton are 51.3575114, -0.173640164220496.




In [19]:
# create map centred on the geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one evenly spaced colour from the rainbow colormap per cluster, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per sector, coloured by its cluster
for lat, lon, poi, cluster in zip(sutton_grouped['latitude'], sutton_grouped['longitude'], sutton_grouped['Neighborhood'], sutton_grouped['Cluster Labels']):
    # labels run 0..kclusters-1, so they index the palette directly
    # (the former `cluster-1` only worked for label 0 via negative-index wraparound)
    cluster_colour = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters