# Fit and Healthy Sydney


An analysis of the fittest suburbs in Sydney, using the following data:
- Sydney Postcodes from: https://www.prospectshop.com.au/Files/SydneyMetro_Postcodes.xls'
- Australian Postcodes location and population data - From ABS
- Foursquare to identify Gyms, Bars and takeaway location in nearby location

Methods to analyse fittest locations:
- Algorithm 1: 10 x Gyms - Bars - Takeaway
- K means Cluster using Gyms, Bars and Takeaways
- K means Cluster using Gyms, Bars and Takeaways per capita


### Import Libraries 

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
import xlrd 
from sklearn.cluster import KMeans
print('Libraries imported.')

Libraries imported.


## 1. IMPORT AND CREATE DATAFRAME OF SYDNEY POSTCODES

We create our master list of postcodes and suburb names

In [2]:
# Find Postcodes in Sydney

link ='https://www.prospectshop.com.au/Files/SydneyMetro_Postcodes.xls'
sydney_pc = pd.read_excel(link,'Sheet1')
sydney_pc.drop(sydney_pc.columns[2], axis=1)
sydney_pc.columns = ['Suburb','Postcode','Region']
del sydney_pc['Region']
sydney_pc = sydney_pc[np.isfinite(sydney_pc['Postcode'])]
sydney_pc.dropna()
sydney_pc = sydney_pc.astype({'Postcode': int})

#Output is Postcode and Suburb

#WARNING - Due to limitations in foursquare, I am only going to analyse the first 75 cells
sydney_pc = sydney_pc[:75]
sydney_pc.head()

Unnamed: 0,Suburb,Postcode
1,Sydney City,2000
2,Ultimo,2007
3,Chippendale,2008
4,Pyrmont,2009
5,Surry Hills,2010


## 2.FIND LATITUDE AND LONGITUDE OF EACH POSTCODE

In [3]:
#Get Latitude and Longitide for each postcode in Australia- Open File

import requests, zipfile, io
zip_file_url = 'http://www.corra.com.au/downloads/Australian_Post_Codes_Lat_Lon.zip'

r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

loc_pc = pd.read_csv(z.open('Australian_Post_Codes_Lat_Lon.csv'))


In [4]:
#Add Empty Latitude, Longitude Columns 
sydney_pc = sydney_pc.assign(Latitude="",Longitude="") 

#Empty Arrays for Lat and Long
mlatitude = []
mlongitude = []

#Loop each row (postcode) to get Longitude/Latutude
for rows in sydney_pc['Postcode']:
    a = loc_pc.loc[loc_pc['postcode'] == rows]
    mlatitude.append(a.iloc[0]['lat'])
    mlongitude.append(a.iloc[0]['lon'])
  
sydney_pc['Latitude'] = mlatitude
sydney_pc['Longitude'] = mlongitude

sydney_pc.head()

Unnamed: 0,Suburb,Postcode,Latitude,Longitude
1,Sydney City,2000,-33.855601,151.20822
2,Ultimo,2007,-33.884366,151.196502
3,Chippendale,2008,-33.886844,151.201715
4,Pyrmont,2009,-33.869709,151.19393
5,Surry Hills,2010,-33.879825,151.21956


## 3. FIND POPULATION PER POSTCODE

In [5]:
pop_pc = pd.read_csv('Postcode_pop.csv')
pop_pc.head()

#Add Population
sydney_pc = sydney_pc.assign(Population="")

#Empty Arrays for Lat and Long
mpop = []

#Loop each row (postcode) to get Longitude/Latutude
for rows in sydney_pc['Postcode']:
    a = pop_pc.loc[pop_pc['Postcode'] == rows]
    mpop.append(a.iloc[0]['Population'])

sydney_pc['Population'] = mpop
sydney_pc.head()

Unnamed: 0,Suburb,Postcode,Latitude,Longitude,Population
1,Sydney City,2000,-33.855601,151.20822,22758
2,Ultimo,2007,-33.884366,151.196502,7111
3,Chippendale,2008,-33.886844,151.201715,6296
4,Pyrmont,2009,-33.869709,151.19393,11617
5,Surry Hills,2010,-33.879825,151.21956,25404


## 4. MAP SYDNEY SUBURBS

In [6]:
#Map the Suburbs that are included in the study
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
print('Mapping Libraries imported.')

Mapping Libraries imported.


In [7]:
#Use Paramatta as centre of Map
latitude = -33.886166
longitude = 151.139472
print('The geograpical coordinate of Sydney are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney are -33.886166, 151.139472.


In [8]:
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, suburb, pcode in zip(sydney_pc['Latitude'], sydney_pc['Longitude'], sydney_pc['Suburb'], sydney_pc['Postcode']):
    
    label = '{},{}'.format(suburb, pcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    

map_sydney

In [9]:
#Get Shape of Table
print("The shape of the table is: ")
sydney_pc.shape

The shape of the table is: 


(75, 5)

## 5 GET VENUE DATA FROM FOURSQUARE

In [10]:
#Foursquare Credentials
#CLIENT_ID = '2NV0PGZHBOTMGTXM1HBKZNKSQ5N1DAH5DXWE0J0OY03HHMUH' # your Foursquare ID
#CLIENT_SECRET = 'VHELQMWDHGSLBE1BLA52CXKVWYWBS4DJWMRDPZPTSW1CKMMU' # your Foursquare Secret
CLIENT_ID = 'WP2N5OFAOZSE1KV0FBUF1C4QQ5LCPKUTCHX0KDY1VIO34XCN' # your Foursquare ID
CLIENT_SECRET = '2K4ZXTRTOWBTVDNWKVCRNMAYBMQRMGJR3R0MQFM5MYWOGW1P' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 700

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WP2N5OFAOZSE1KV0FBUF1C4QQ5LCPKUTCHX0KDY1VIO34XCN
CLIENT_SECRET:2K4ZXTRTOWBTVDNWKVCRNMAYBMQRMGJR3R0MQFM5MYWOGW1P


In [11]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [12]:
#function to search neighbourhoods and filter on categories
def getNearbyVenuesX(names, latitudes, longitudes, categories, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                 
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            categories)
            
        # make the GET request
        results = requests.get(url).json()["response"]["venues"]
                     
        venues_list.append([(
            name, 
            v['name'], 
            v['categories'][0]['name']) for v in results])
            

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Suburb','Venue','Category']
    return(nearby_venues)


# Get Gyms and Sporting fields

In [13]:
# Now write the code to run the above function on each neighborhood and create a new dataframe called sydney_venues.

gym_cat = '4bf58dd8d48988d175941735' #from Foursquare website

sydney_venues_gym = getNearbyVenuesX(names=sydney_pc['Suburb'],
                                   latitudes=sydney_pc['Latitude'],
                                   longitudes=sydney_pc['Longitude'],
                                   categories = gym_cat
                                   )                                

sydney_venues_gym.groupby('Suburb').count()

Unnamed: 0_level_0,Venue,Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Alexandria,9,9
Annandale,9,9
Artarmon,4,4
Balgowlah,8,8
Balmain,8,8
Bellevue Hill,5,5
Belrose,2,2
Berowra Waters,2,2
Bondi,11,11
Bondi Junction,44,44


# Get Bars

In [14]:
# Now write the code to run the above function on each neighborhood and create a new dataframe called sydney_venues.

bar_cat = "4bf58dd8d48988d116941735"

sydney_venues_bar = getNearbyVenuesX(names=sydney_pc['Suburb'],
                                   latitudes=sydney_pc['Latitude'],
                                   longitudes=sydney_pc['Longitude'],
                                   categories = bar_cat
                                   )                                
sydney_venues_bar.groupby('Suburb').count()

Unnamed: 0_level_0,Venue,Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Alexandria,7,7
Annandale,4,4
Balgowlah,1,1
Balmain,16,16
Bellevue Hill,3,3
Berowra Waters,1,1
Bondi,2,2
Bondi Junction,13,13
Botany,2,2
Cammeray,5,5


# Get Fast Food Outlets

In [15]:
# Now write the code to run the above function on each neighborhood and create a new dataframe called sydney_venues.

ff_cat = '4bf58dd8d48988d16e941735,4edd64a0c7ddd24ca188df1a,4bf58dd8d48988d16c941735,4bf58dd8d48988d1d0941735,4d4ae6fc7a7b7dea34424761,4bf58dd8d48988d1ca941735'
#ff_cat = '4bf58dd8d48988d16e941735'

sydney_venues_ff = getNearbyVenuesX(names=sydney_pc['Suburb'],
                                   latitudes=sydney_pc['Latitude'],
                                   longitudes=sydney_pc['Longitude'],
                                   categories = ff_cat
                                   )     
sydney_venues_ff.groupby('Suburb').count()



Unnamed: 0_level_0,Venue,Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Alexandria,15,15
Annandale,8,8
Artarmon,7,7
Balgowlah,4,4
Balmain,16,16
Bellevue Hill,1,1
Belrose,3,3
Berowra Waters,3,3
Bondi,12,12
Bondi Junction,41,41


In [16]:
sydney_venues_ff.groupby('Suburb').count()

Unnamed: 0_level_0,Venue,Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Alexandria,15,15
Annandale,8,8
Artarmon,7,7
Balgowlah,4,4
Balmain,16,16
Bellevue Hill,1,1
Belrose,3,3
Berowra Waters,3,3
Bondi,12,12
Bondi Junction,41,41


In [17]:
# Check out the dataframe
print(sydney_venues_gym.shape)
print(sydney_venues_bar.shape)
print(sydney_venues_ff.shape)





(499, 3)
(480, 3)
(829, 3)


In [18]:
# Count Gyms, Fast Food Takeaway and 
gymcount = sydney_venues_gym.groupby('Suburb').count()
gymcount = pd.DataFrame(gymcount)
barcount = sydney_venues_bar.groupby('Suburb').count()
barcount = pd.DataFrame(barcount)
ffcount = sydney_venues_ff.groupby('Suburb').count()
ffcount = pd.DataFrame(ffcount)

In [24]:
import numpy

#Add Bar, Gym and Fast food counts to our table
sydney_pc = sydney_pc.assign(gyms="",bars="",ffood="",total="",algorithm="",normed="",gymcap="",barcap="",ffoodcap="",gratio="",bratio="",fratio="") 

#Create empty matrices
mbar = []    
mgym = []
mffood = []
malgorithm = []
mnormed = []

for  rows in sydney_pc['Suburb']:
    try:
        gymno = gymcount.loc[rows, 'Venue']
    except KeyError:
        gymno = 0
    try:
        barno = barcount.loc[rows, 'Venue']
    except KeyError:
        barno = 0
    try:
        ffno = ffcount.loc[rows, 'Venue']
    except KeyError:
        ffno = 0
 
    
    mgym.append(gymno)
    mbar.append (barno)  
    mffood.append(ffno)
    malgorithm.append(4*gymno-2*barno-ffno)
    
    
    
    
sydney_pc['gyms'] = mgym
sydney_pc['bars'] = mbar
sydney_pc['ffood'] = mffood 
sydney_pc['total'] = sydney_pc['gyms'] + sydney_pc['bars'] + sydney_pc['ffood']
sydney_pc['algorithm'] = malgorithm
sydney_pc['normed'] = sydney_pc['algorithm']/sydney_pc['Population']
sydney_pc['gymcap'] = 1000*sydney_pc['gyms']/sydney_pc['Population'] #Divided by population in postcode
sydney_pc['barcap'] =1000*sydney_pc['bars']/sydney_pc['Population']
sydney_pc['ffoodcap'] =1000*sydney_pc['ffood']/sydney_pc['Population']
sydney_pc['gratio'] = sydney_pc['gyms']/sydney_pc['total']
sydney_pc['bratio'] = sydney_pc['bars']/sydney_pc['total']
sydney_pc['fratio'] = sydney_pc['ffood']/sydney_pc['total']

sydney_pc.head()    

Unnamed: 0,Suburb,Postcode,Latitude,Longitude,Population,gyms,bars,ffood,total,algorithm,normed,gymcap,barcap,ffoodcap,gratio,bratio,fratio
1,Sydney City,2000,-33.855601,151.20822,22758,5,28,16,49,-52,-0.002285,0.219703,1.230337,0.703049,0.102041,0.571429,0.326531
2,Ultimo,2007,-33.884366,151.196502,7111,21,20,47,88,-3,-0.000422,2.953171,2.812544,6.609478,0.238636,0.227273,0.534091
3,Chippendale,2008,-33.886844,151.201715,6296,26,25,41,92,13,0.002065,4.129606,3.970775,6.512071,0.282609,0.271739,0.445652
4,Pyrmont,2009,-33.869709,151.19393,11617,31,22,34,87,46,0.00396,2.668503,1.893776,2.926745,0.356322,0.252874,0.390805
5,Surry Hills,2010,-33.879825,151.21956,25404,24,50,50,124,-54,-0.002126,0.944733,1.968194,1.968194,0.193548,0.403226,0.403226


In [25]:
sydney_pc['ffood'].sum()

829

In [26]:
sydney_pc

Unnamed: 0,Suburb,Postcode,Latitude,Longitude,Population,gyms,bars,ffood,total,algorithm,normed,gymcap,barcap,ffoodcap,gratio,bratio,fratio
1,Sydney City,2000,-33.855601,151.20822,22758,5,28,16,49,-52,-0.002285,0.219703,1.230337,0.703049,0.102041,0.571429,0.326531
2,Ultimo,2007,-33.884366,151.196502,7111,21,20,47,88,-3,-0.000422,2.953171,2.812544,6.609478,0.238636,0.227273,0.534091
3,Chippendale,2008,-33.886844,151.201715,6296,26,25,41,92,13,0.002065,4.129606,3.970775,6.512071,0.282609,0.271739,0.445652
4,Pyrmont,2009,-33.869709,151.19393,11617,31,22,34,87,46,0.00396,2.668503,1.893776,2.926745,0.356322,0.252874,0.390805
5,Surry Hills,2010,-33.879825,151.21956,25404,24,50,50,124,-54,-0.002126,0.944733,1.968194,1.968194,0.193548,0.403226,0.403226
6,Kings Cross,2011,-33.872829,151.226593,18040,18,35,48,101,-46,-0.00255,0.997783,1.940133,2.660754,0.178218,0.346535,0.475248
7,Alexandria,2015,-33.897571,151.195567,8427,9,7,15,31,7,0.000831,1.067996,0.830663,1.779993,0.290323,0.225806,0.483871
8,Redfern,2016,-33.892778,151.203901,12033,15,22,18,55,-2,-0.000166,1.246572,1.828305,1.495886,0.272727,0.4,0.327273
9,Waterloo,2017,-33.9004,151.206144,14469,16,5,11,32,43,0.002972,1.105812,0.345566,0.760246,0.5,0.15625,0.34375
10,Rosebery,2018,-33.925133,151.213199,15397,0,1,2,3,-4,-0.00026,0.0,0.064948,0.129895,0.0,0.333333,0.666667


In [27]:
sydney_pc['bars'].sum()

480

In [28]:
sydney_pc['gyms'].sum()

499

# Method 3 – Weighted Number



In [29]:
#sydney_pc['bucket'] = pd.cut(sydney_pc['Population'], 4)
#sydney_pc

# set number of clusters
kclusters = 4

sydney_pc_clusteringA = sydney_pc[['algorithm']]
sydney_pc_clusteringA                              

# run k-means clustering
kmeansA = KMeans(n_clusters=kclusters, random_state=0).fit(sydney_pc_clusteringA)

# check cluster labels generated for each row in the dataframe
kmeansA.labels_[:10] 


array([3, 0, 1, 1, 3, 3, 0, 0, 1, 0], dtype=int32)

In [30]:
##Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
sydney_venues_sortedA = sydney_pc[['Suburb','Latitude','Longitude','gyms','bars','ffood','algorithm']]

# add clustering labels
sydney_venues_sortedA.insert(0, 'Cluster Labels', kmeansA.labels_)
sydney_venues_sortedA.head()

Unnamed: 0,Cluster Labels,Suburb,Latitude,Longitude,gyms,bars,ffood,algorithm
1,3,Sydney City,-33.855601,151.20822,5,28,16,-52
2,0,Ultimo,-33.884366,151.196502,21,20,47,-3
3,1,Chippendale,-33.886844,151.201715,26,25,41,13
4,1,Pyrmont,-33.869709,151.19393,31,22,34,46
5,3,Surry Hills,-33.879825,151.21956,24,50,50,-54


In [31]:
#Visualise clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sydney_venues_sortedA['Latitude'], sydney_venues_sortedA['Longitude'], sydney_venues_sortedA['Suburb'], sydney_venues_sortedA['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

###  Examine Clusters

In [32]:
#Cluster 1
sydney_venues_sortedA.loc[sydney_venues_sortedA['Cluster Labels'] == 0, sydney_venues_sortedA.columns[[1] + list(range(4, sydney_venues_sortedA.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,algorithm
2,Ultimo,21,20,47,-3
7,Alexandria,9,7,15,7
8,Redfern,15,22,18,-2
10,Rosebery,0,1,2,-4
11,Botany,0,2,2,-6
12,Mascot,2,3,9,-7
13,Paddington,3,4,4,0
16,Waverley,1,3,7,-9
17,Woollahra,2,2,0,4
20,Double Bay,9,9,7,11


In [33]:
#Cluster 2
sydney_venues_sortedA.loc[sydney_venues_sortedA['Cluster Labels'] == 1, sydney_venues_sortedA.columns[[1] + list(range(4, sydney_venues_sortedA.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,algorithm
3,Chippendale,26,25,41,13
4,Pyrmont,31,22,34,46
9,Waterloo,16,5,11,43
15,Bellevue Hill,5,3,1,13
18,Bondi,11,2,12,28
19,Edgecliff,5,2,2,14
29,Glebe,9,4,8,20
30,Annandale,9,4,8,20
31,Rozelle,15,13,14,20
36,St Peters,9,2,4,28


In [34]:
#Cluster 3
sydney_venues_sortedA.loc[sydney_venues_sortedA['Cluster Labels'] == 2, sydney_venues_sortedA.columns[[1] + list(range(4, sydney_venues_sortedA.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,algorithm
14,Bondi Junction,44,13,41,109


In [35]:
#Cluster 4
sydney_venues_sortedA.loc[sydney_venues_sortedA['Cluster Labels'] == 3, sydney_venues_sortedA.columns[[1] + list(range(4, sydney_venues_sortedA.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,algorithm
1,Sydney City,5,28,16,-52
5,Surry Hills,24,50,50,-54
6,Kings Cross,18,35,48,-46
34,Newtown,4,15,25,-39
75,Manly,17,42,42,-58


# A2. ANALYSIS - Weighted Score per Capita (# Gym, Bar and Fast Food)


In [67]:
# set number of clusters
kclusters = 4

sydney_pc_clusteringA2 = sydney_pc[['normed']]
sydney_pc_clusteringA2                              

# run k-means clustering
kmeansA2 = KMeans(n_clusters=kclusters, random_state=0).fit(sydney_pc_clusteringA2)

# check cluster labels generated for each row in the dataframe
kmeansA2.labels_[:10] 

array([0, 2, 1, 1, 0, 0, 2, 2, 1, 2], dtype=int32)

In [68]:
##Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
sydney_venues_sortedA2 = sydney_pc[['Suburb','Latitude','Longitude','gyms','bars','ffood','normed']]

# add clustering labels
sydney_venues_sortedA2.insert(0, 'Cluster Labels', kmeansA2.labels_)
sydney_venues_sortedA2.head()

Unnamed: 0,Cluster Labels,Suburb,Latitude,Longitude,gyms,bars,ffood,normed
1,0,Sydney City,-33.855601,151.20822,5,28,16,-0.002285
2,2,Ultimo,-33.884366,151.196502,21,20,47,-0.000422
3,1,Chippendale,-33.886844,151.201715,26,25,41,0.002065
4,1,Pyrmont,-33.869709,151.19393,31,22,34,0.00396
5,0,Surry Hills,-33.879825,151.21956,24,50,50,-0.002126


In [69]:
#Visualise clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sydney_venues_sortedA2['Latitude'], sydney_venues_sortedA2['Longitude'], sydney_venues_sortedA2['Suburb'], sydney_venues_sortedA2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [70]:
#Cluster 1
sydney_venues_sortedA2.loc[sydney_venues_sortedA['Cluster Labels'] == 0, sydney_venues_sortedA2.columns[[1] + list(range(4, sydney_venues_sortedA2.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,normed
2,Ultimo,21,20,47,-0.000422
7,Alexandria,9,7,15,0.000831
8,Redfern,15,22,18,-0.000166
10,Rosebery,0,1,2,-0.00026
11,Botany,0,2,2,-0.000674
12,Mascot,2,3,9,-0.000688
13,Paddington,3,4,4,0.0
16,Waverley,1,3,7,-0.000812
17,Woollahra,2,2,0,0.000557
20,Double Bay,9,9,7,0.002347


In [71]:
#Cluster 2
sydney_venues_sortedA2.loc[sydney_venues_sortedA['Cluster Labels'] == 1, sydney_venues_sortedA2.columns[[1] + list(range(4, sydney_venues_sortedA2.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,normed
3,Chippendale,26,25,41,0.002065
4,Pyrmont,31,22,34,0.00396
9,Waterloo,16,5,11,0.002972
15,Bellevue Hill,5,3,1,0.001208
18,Bondi,11,2,12,0.000921
19,Edgecliff,5,2,2,0.001916
29,Glebe,9,4,8,0.001445
30,Annandale,9,4,8,0.002307
31,Rozelle,15,13,14,0.002525
36,St Peters,9,2,4,0.003893


In [72]:
#Cluster 3
sydney_venues_sortedA2.loc[sydney_venues_sortedA['Cluster Labels'] == 2, sydney_venues_sortedA2.columns[[1] + list(range(4, sydney_venues_sortedA2.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,normed
14,Bondi Junction,44,13,41,0.009401


In [42]:
#Cluster 4
sydney_venues_sortedA2.loc[sydney_venues_sortedA['Cluster Labels'] == 3, sydney_venues_sortedA2.columns[[1] + list(range(4, sydney_venues_sortedA2.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood,normed
1,Sydney City,5,28,16,-0.002285
5,Surry Hills,24,50,50,-0.002126
6,Kings Cross,18,35,48,-0.00255
34,Newtown,4,15,25,-0.002201
75,Manly,17,42,42,-0.003848


# Method 1 – Absolute Number

In [43]:
# set number of clusters
kclusters = 4

sydney_pc_clusteringB = sydney_pc[['gyms','bars','ffood']]
sydney_pc_clusteringB                              

# run k-means clustering
kmeansB = KMeans(n_clusters=kclusters, random_state=0).fit(sydney_pc_clusteringB)

# check cluster labels generated for each row in the dataframe
kmeansB.labels_[:10] 


array([3, 1, 1, 1, 2, 2, 3, 3, 3, 0], dtype=int32)

In [44]:
##Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
sydney_venues_sortedB = sydney_pc[['Suburb','Latitude','Longitude','gyms','bars','ffood']]

# add clustering labels
sydney_venues_sortedB.insert(0, 'Cluster Labels', kmeansB.labels_)
sydney_venues_sortedB.head()


Unnamed: 0,Cluster Labels,Suburb,Latitude,Longitude,gyms,bars,ffood
1,3,Sydney City,-33.855601,151.20822,5,28,16
2,1,Ultimo,-33.884366,151.196502,21,20,47
3,1,Chippendale,-33.886844,151.201715,26,25,41
4,1,Pyrmont,-33.869709,151.19393,31,22,34
5,2,Surry Hills,-33.879825,151.21956,24,50,50


In [45]:
#Visualise clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sydney_venues_sortedB['Latitude'], sydney_venues_sortedB['Longitude'], sydney_venues_sortedB['Suburb'], sydney_venues_sortedB['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


###  Examine Clusters

In [46]:
#Cluster 1
sydney_venues_sortedB.loc[sydney_venues_sortedB['Cluster Labels'] == 0, sydney_venues_sortedB.columns[[1] + list(range(4, sydney_venues_sortedB.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood
10,Rosebery,0,1,2
11,Botany,0,2,2
12,Mascot,2,3,9
13,Paddington,3,4,4
15,Bellevue Hill,5,3,1
16,Waverley,1,3,7
17,Woollahra,2,2,0
19,Edgecliff,5,2,2
21,Rose Bay,0,2,1
22,Vaucluse,2,0,0


In [47]:
#Cluster 2
sydney_venues_sortedB.loc[sydney_venues_sortedB['Cluster Labels'] == 1, sydney_venues_sortedB.columns[[1] + list(range(4, sydney_venues_sortedB.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood
2,Ultimo,21,20,47
3,Chippendale,26,25,41
4,Pyrmont,31,22,34
14,Bondi Junction,44,13,41
49,Crows Nest,26,13,28
51,Chatswood,13,5,49


In [48]:
#Cluster 3
sydney_venues_sortedB.loc[sydney_venues_sortedB['Cluster Labels'] == 2, sydney_venues_sortedB.columns[[1] + list(range(4, sydney_venues_sortedB.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood
5,Surry Hills,24,50,50
6,Kings Cross,18,35,48
75,Manly,17,42,42


In [49]:
#Cluster 4
sydney_venues_sortedB.loc[sydney_venues_sortedB['Cluster Labels'] == 3, sydney_venues_sortedB.columns[[1] + list(range(4, sydney_venues_sortedB.shape[1]))]]

Unnamed: 0,Suburb,gyms,bars,ffood
1,Sydney City,5,28,16
7,Alexandria,9,7,15
8,Redfern,15,22,18
9,Waterloo,16,5,11
18,Bondi,11,2,12
20,Double Bay,9,9,7
26,Coogee,11,13,27
31,Rozelle,15,13,14
32,Leichhardt,5,10,25
33,Balmain,8,16,16


# Method 2 – Per Capita 

In [50]:
# set number of clusters
kclusters = 4

sydney_pc_clusteringC = sydney_pc[['gymcap','barcap','ffoodcap']]
sydney_pc_clusteringC                              

# run k-means clustering
kmeansC = KMeans(n_clusters=kclusters, random_state=0).fit(sydney_pc_clusteringC)

# check cluster labels generated for each row in the dataframe
kmeansC.labels_[:10] 


array([3, 2, 2, 1, 1, 1, 3, 1, 3, 0], dtype=int32)

In [51]:
##Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
sydney_venues_sortedC = sydney_pc[['Suburb','Latitude','Longitude','gymcap','barcap','ffoodcap']]

# add clustering labels
sydney_venues_sortedC.insert(0, 'Cluster Labels', kmeansC.labels_)
sydney_venues_sortedC.head()


Unnamed: 0,Cluster Labels,Suburb,Latitude,Longitude,gymcap,barcap,ffoodcap
1,3,Sydney City,-33.855601,151.20822,0.219703,1.230337,0.703049
2,2,Ultimo,-33.884366,151.196502,2.953171,2.812544,6.609478
3,2,Chippendale,-33.886844,151.201715,4.129606,3.970775,6.512071
4,1,Pyrmont,-33.869709,151.19393,2.668503,1.893776,2.926745
5,1,Surry Hills,-33.879825,151.21956,0.944733,1.968194,1.968194


In [52]:
#Visualise clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sydney_venues_sortedC['Latitude'], sydney_venues_sortedC['Longitude'], sydney_venues_sortedC['Suburb'], sydney_venues_sortedC['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


###  Examine Clusters


In [53]:
#Cluster 1
sydney_venues_sortedC.loc[sydney_venues_sortedC['Cluster Labels'] == 0, sydney_venues_sortedC.columns[[1] + list(range(4, sydney_venues_sortedC.shape[1]))]]

Unnamed: 0,Suburb,gymcap,barcap,ffoodcap
10,Rosebery,0.0,0.064948,0.129895
11,Botany,0.0,0.224719,0.224719
13,Paddington,0.210689,0.280919,0.280919
15,Bellevue Hill,0.464468,0.278681,0.092894
16,Waverley,0.090269,0.270807,0.631883
17,Woollahra,0.278474,0.278474,0.0
18,Bondi,0.36189,0.065798,0.394789
19,Edgecliff,0.684369,0.273748,0.273748
21,Rose Bay,0.0,0.212766,0.106383
22,Vaucluse,0.14374,0.0,0.0


In [54]:
#Cluster 2
sydney_venues_sortedC.loc[sydney_venues_sortedC['Cluster Labels'] == 1, sydney_venues_sortedC.columns[[1] + list(range(4, sydney_venues_sortedC.shape[1]))]]

Unnamed: 0,Suburb,gymcap,barcap,ffoodcap
4,Pyrmont,2.668503,1.893776,2.926745
5,Surry Hills,0.944733,1.968194,1.968194
6,Kings Cross,0.997783,1.940133,2.660754
8,Redfern,1.246572,1.828305,1.495886
14,Bondi Junction,3.795066,1.12127,3.536312
20,Double Bay,1.920615,1.920615,1.493811
31,Rozelle,1.893461,1.641,1.76723
42,Camperdown,2.087994,0.447427,2.386279
45,Milsons Point,0.704101,1.584228,1.936279
70,Neutral Bay,1.624256,0.812128,2.075438


In [55]:
#Cluster 3
sydney_venues_sortedC.loc[sydney_venues_sortedC['Cluster Labels'] == 2, sydney_venues_sortedC.columns[[1] + list(range(4, sydney_venues_sortedC.shape[1]))]]

Unnamed: 0,Suburb,gymcap,barcap,ffoodcap
2,Ultimo,2.953171,2.812544,6.609478
3,Chippendale,4.129606,3.970775,6.512071


In [56]:
#Cluster 4
sydney_venues_sortedC.loc[sydney_venues_sortedC['Cluster Labels'] == 3, sydney_venues_sortedC.columns[[1] + list(range(4, sydney_venues_sortedC.shape[1]))]]

Unnamed: 0,Suburb,gymcap,barcap,ffoodcap
1,Sydney City,0.219703,1.230337,0.703049
7,Alexandria,1.067996,0.830663,1.779993
9,Waterloo,1.105812,0.345566,0.760246
12,Mascot,0.196502,0.294753,0.88426
26,Coogee,0.576158,0.680913,1.414205
30,Annandale,1.038182,0.461414,0.922828
32,Leichhardt,0.239682,0.479363,1.198409
33,Balmain,0.542263,1.084525,1.084525
34,Newtown,0.225734,0.846501,1.410835
35,Erskineville,0.437956,1.021898,1.751825


# Method 4 – Ratio

In [58]:
# set number of clusters
kclusters = 4
sydney_pc_nan = sydney_pc.dropna(0)

sydney_pc_clusteringD = sydney_pc[['gratio','bratio','fratio']]
sydney_pc_clusteringD = sydney_pc_clusteringD.dropna(0)
sydney_pc_clusteringD
                             
# run k-means clustering
kmeansD = KMeans(n_clusters=kclusters, random_state=0).fit(sydney_pc_clusteringD)

# check cluster labels generated for each row in the dataframe
kmeansD.labels_[:10] 


array([0, 0, 0, 1, 0, 0, 0, 0, 1, 3], dtype=int32)

In [59]:
##Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
sydney_venues_sortedD = sydney_pc_nan[['Suburb','Latitude','Longitude','gratio','bratio','fratio']]

# add clustering labels
sydney_venues_sortedD.insert(0, 'Cluster Labels', kmeansD.labels_)
sydney_venues_sortedD.head()

Unnamed: 0,Cluster Labels,Suburb,Latitude,Longitude,gratio,bratio,fratio
1,0,Sydney City,-33.855601,151.20822,0.102041,0.571429,0.326531
2,0,Ultimo,-33.884366,151.196502,0.238636,0.227273,0.534091
3,0,Chippendale,-33.886844,151.201715,0.282609,0.271739,0.445652
4,1,Pyrmont,-33.869709,151.19393,0.356322,0.252874,0.390805
5,0,Surry Hills,-33.879825,151.21956,0.193548,0.403226,0.403226


In [60]:
#Visualise clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sydney_venues_sortedD['Latitude'], sydney_venues_sortedD['Longitude'], sydney_venues_sortedD['Suburb'], sydney_venues_sortedD['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


In [73]:
#Cluster 1
sydney_venues_sortedD.loc[sydney_venues_sortedD['Cluster Labels'] == 0, sydney_venues_sortedD.columns[[1] + list(range(4, sydney_venues_sortedD.shape[1]))]]

Unnamed: 0,Suburb,gratio,bratio,fratio
1,Sydney City,0.102041,0.571429,0.326531
2,Ultimo,0.238636,0.227273,0.534091
3,Chippendale,0.282609,0.271739,0.445652
5,Surry Hills,0.193548,0.403226,0.403226
6,Kings Cross,0.178218,0.346535,0.475248
7,Alexandria,0.290323,0.225806,0.483871
8,Redfern,0.272727,0.4,0.327273
11,Botany,0.0,0.5,0.5
13,Paddington,0.272727,0.363636,0.363636
17,Woollahra,0.5,0.5,0.0


In [74]:
#Cluster 2
sydney_venues_sortedD.loc[sydney_venues_sortedD['Cluster Labels'] == 1, sydney_venues_sortedD.columns[[1] + list(range(4, sydney_venues_sortedD.shape[1]))]]

Unnamed: 0,Suburb,gratio,bratio,fratio
4,Pyrmont,0.356322,0.252874,0.390805
9,Waterloo,0.5,0.15625,0.34375
14,Bondi Junction,0.44898,0.132653,0.418367
15,Bellevue Hill,0.555556,0.333333,0.111111
18,Bondi,0.44,0.08,0.48
19,Edgecliff,0.555556,0.222222,0.222222
27,Pagewood,0.333333,0.166667,0.5
29,Glebe,0.428571,0.190476,0.380952
30,Annandale,0.428571,0.190476,0.380952
36,St Peters,0.6,0.133333,0.266667


In [75]:
#Cluster 3
sydney_venues_sortedD.loc[sydney_venues_sortedD['Cluster Labels'] == 2, sydney_venues_sortedD.columns[[1] + list(range(4, sydney_venues_sortedD.shape[1]))]]

Unnamed: 0,Suburb,gratio,bratio,fratio
22,Vaucluse,1.0,0.0,0.0
24,Kingsford,1.0,0.0,0.0
54,Lindfield,1.0,0.0,0.0
76,Harbord,1.0,0.0,0.0


In [76]:
#Cluster 4
sydney_venues_sortedD.loc[sydney_venues_sortedD['Cluster Labels'] == 3, sydney_venues_sortedD.columns[[1] + list(range(4, sydney_venues_sortedD.shape[1]))]]

Unnamed: 0,Suburb,gratio,bratio,fratio
10,Rosebery,0.0,0.333333,0.666667
12,Mascot,0.142857,0.214286,0.642857
16,Waverley,0.090909,0.272727,0.636364
25,Kensington,0.2,0.2,0.6
32,Leichhardt,0.125,0.25,0.625
37,Haberfield,0.071429,0.214286,0.714286
51,Chatswood,0.19403,0.074627,0.731343
56,Gordon,0.0,0.111111,0.888889
61,Hornsby,0.25,0.0,0.75
67,Frenchs Forest,0.0,0.166667,0.833333
