In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [2]:
# Set up Foursquare url
client_secret = 'V0UNEULWH0ERAKIQURTUSLPZELBOI3EDXR4MT2U1IQOMSO0M';
client_id = 'DCSVHCHVV2GZ440YWHFBBX2UXSIQCO5B1WVK4RS4ZQHY01P5';
{
    "tags": [
        "hide_input",
        "hide_output"
    ]
}

In [33]:
# create the API request URL
url = 'https://api.foursquare.com/v2/venues/explore?client_secret={}&client_id={}&v=20180605&near=Bangkok&section=food&limit=50'.format(client_secret,client_id )
results = requests.get(url).json()    

In [34]:
# Obtain a dataframe with name, latitude, longitude from json file
venues = results['response']['groups'][0]['items']
venues = json_normalize(venues) # flatten JSON
filtered_columns = ['venue.name', 'venue.location.lat', 'venue.location.lng']
venues = venues.loc[:, filtered_columns]
venues.head()

Unnamed: 0,venue.name,venue.location.lat,venue.location.lng
0,Ginzado,13.729023,100.58068
1,Hanazen (ฮานะเซน),13.779563,100.538591
2,Look Chin Sri Yan (ลูกชิ้นศรีย่าน),13.784805,100.512836
3,Baan Somtum (บ้านส้มตำ),13.767756,100.395465
4,Kenji's Lab,13.73677,100.58444


In [37]:
import math
# Inspect total result 
total = results['response']['totalResults'];
print("There are total", total, "restaurants from database.")
totaln = math.ceil(total/50)
venueall = venues
# Use for loop and off set to retrieve the rest of data
for i in range(1, totaln):
    offset = str(50 * i )
    url2 = 'https://api.foursquare.com/v2/venues/explore?client_secret={}&client_id={}&v=20180605&near=Bangkok&section=food&offset={}&limit=50'.format(client_secret,client_id ,offset)
    results2 = requests.get(url2).json() 
    venues2 = results2['response']['groups'][0]['items']
    venues2 = json_normalize(venues2) # flatten JSON
    venues2 = venues2.loc[:, filtered_columns]
    venueall = venueall.append(venues2)
    
    

There are total 232 restaurants from database.


In [54]:
# Confirm shape of resulting dataframe
df = venueall.reset_index(drop=True)
df.columns = ['Name', 'Latitude','Longitude']
df.shape

(232, 3)

In [44]:
# Get coordinate of Bangkok
address = 'Bangkok'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bangkok are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Bangkok are 13.7542529, 100.493087.


In [51]:
# create map
map1 = folium.Map(location=[latitude, longitude], zoom_start=11)


# add marker to the map for each restaurant

for lat, lon, poi,  in zip(df['Latitude'], df['Longitude'], df['Name']):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        fill=True,
        fill_opacity=0.5
        ).add_to(map1)
       
map1

In [53]:
# Find global center
df[['Latitude','Longitude']].mean()

Latitude      13.756922
Longitude    100.552256
dtype: float64

In [58]:
# Calculate distance from the center
from geopy.distance import vincenty

df['Distance from center'] = df.apply(lambda row:  vincenty((row.Latitude,row.Longitude),(13.756922,100.552256)).meters ,  axis = 1)



In [63]:
# Obtain basic statistics
df.describe()

Unnamed: 0,Latitude,Longitude,Distance from center
count,232.0,232.0,232.0
mean,13.756922,100.552256,7662.920955
std,0.051048,0.067044,5049.135087
min,13.624164,100.330559,783.413572
25%,13.726294,100.515178,4135.601633
50%,13.74353,100.548054,5562.653568
75%,13.781693,100.588881,10107.702133
max,13.948147,100.752902,24387.635034


In [145]:
kmeans3 = KMeans(n_clusters=3, random_state=0).fit(df[['Latitude','Longitude']])
df3 = df
df3['Cluster'] = kmeans3.fit_predict(df[['Latitude','Longitude']])
df3.head()

Unnamed: 0,Name,Latitude,Longitude,Distance from center,Cluster,CenterLat,CenterLon,ClusterDist
0,Ginzado,13.729023,100.58068,4356.353281,2,13.736171,100.587822,1105.54566
1,Hanazen (ฮานะเซน),13.779563,100.538591,2908.38287,2,13.76944,100.510476,3240.041319
2,Look Chin Sri Yan (ลูกชิ้นศรีย่าน),13.784805,100.512836,5262.059285,2,13.76944,100.510476,1719.045385
3,Baan Somtum (บ้านส้มตำ),13.767756,100.395465,16998.314328,0,13.774117,100.411462,1867.634186
4,Kenji's Lab,13.73677,100.58444,4133.577579,2,13.736171,100.587822,371.787634


In [146]:
# Add cluster center and distance to center to dataframe
centers3 = np.array(kmeans3.cluster_centers_)
df3['CenterLat'] = df3.apply(lambda row: centers[ row.Cluster , : ][0] , axis=1)
df3['CenterLon'] = df3.apply(lambda row: centers[ row.Cluster , : ][1] , axis=1)
df3['ClusterDist'] = df3.apply(lambda row: vincenty((row.Latitude,row.Longitude),(row.CenterLat,row.CenterLon)).meters , axis=1)
df3.head()



Unnamed: 0,Name,Latitude,Longitude,Distance from center,Cluster,CenterLat,CenterLon,ClusterDist
0,Ginzado,13.729023,100.58068,4356.353281,2,13.734454,100.542709,4150.55872
1,Hanazen (ฮานะเซน),13.779563,100.538591,2908.38287,2,13.734454,100.542709,5010.633326
2,Look Chin Sri Yan (ลูกชิ้นศรีย่าน),13.784805,100.512836,5262.059285,2,13.734454,100.542709,6439.762464
3,Baan Somtum (บ้านส้มตำ),13.767756,100.395465,16998.314328,0,13.739624,100.430467,4900.617196
4,Kenji's Lab,13.73677,100.58444,4133.577579,2,13.734454,100.542709,4520.709576


In [104]:
#Create map with clustering

map3 = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(3)
ys = [i + x + (i*x)**2 for i in range(3)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Name'], df3['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map3)

for i in range(3):
     folium.Marker(
        [centers3[i][0], centers3[i][1]]
        ).add_to(map3)

       
map3

In [147]:
df3i = pd.DataFrame(columns = ['Count','Avg_Dist'])
df3i['Count'] = df3.groupby('Cluster').count()['Name'] # Count restaurants in each cluster
df3i['Avg_Dist'] = df3.groupby('Cluster').mean()[['ClusterDist']] #find average distance to center
df3i['Max_Dist'] = df3.groupby('Cluster').max()[['ClusterDist']] # find max distance to center
df3i.loc['mean'] = df3i.mean()
df3i

Unnamed: 0_level_0,Count,Avg_Dist,Max_Dist
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,26.0,6298.982141,13901.149524
1,67.0,6712.180603,17406.899167
2,139.0,3908.029785,12658.730993
mean,77.333333,5639.730843,14655.593228


In [148]:
#Repeat k = 4,...
from IPython.display import display
for k in range(4,9):
    kmeansk = KMeans(n_clusters=k, random_state=0).fit(df[['Latitude','Longitude']])
    dfk = df
    dfk['Cluster'] = kmeansk.fit_predict(df[['Latitude','Longitude']])
    centersk = np.array(kmeansk.cluster_centers_)
    dfk['CenterLat'] = dfk.apply(lambda row: centersk[ row.Cluster , : ][0] , axis=1)
    dfk['CenterLon'] = dfk.apply(lambda row: centersk[ row.Cluster , : ][1] , axis=1)
    dfk['ClusterDist'] = dfk.apply(lambda row: vincenty((row.Latitude,row.Longitude),(row.CenterLat,row.CenterLon)).meters , axis=1)
    dfki = pd.DataFrame(columns = ['Count','Avg_Dist'])
    dfki['Count'] = dfk.groupby('Cluster').count()['Name']
    dfki['Avg_Dist'] = dfk.groupby('Cluster').mean()[['ClusterDist']]
    dfki['Max_Dist'] = dfk.groupby('Cluster').max()[['ClusterDist']]
    dfki.loc['mean'] = dfki.mean()
    mapk = folium.Map(location=[latitude, longitude], zoom_start=11)
    x = np.arange(k)
    ys = [i + x + (i*x)**2 for i in range(k)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]
    for lat, lon, poi, cluster in zip(dfk['Latitude'], dfk['Longitude'], dfk['Name'], dfk['Cluster']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(mapk)
    for i in range(k):
     folium.Marker(
        [centersk[i][0], centersk[i][1]]
        ).add_to(mapk)
    print('Stats for k = ' + str(k))
    print(dfki)
    print('Map for k = ' + str(k))
    display(mapk)



Stats for k = 4
         Count     Avg_Dist      Max_Dist
Cluster                                  
0         45.0  5645.778011  15604.923228
1        104.0  3387.534943  12618.190370
2         61.0  4031.539228  16312.829045
3         22.0  6282.039110  13235.705750
mean      58.0  4836.722823  14442.912099
Map for k = 4




Stats for k = 5
         Count     Avg_Dist      Max_Dist
Cluster                                  
0         21.0  6335.162367  13045.088365
1         66.0  2903.023666   8746.769558
2         95.0  3337.956136  12571.507917
3         14.0  5313.260570   9900.315726
4         36.0  4934.370370  11713.345296
mean      46.4  4564.754622  11195.405372
Map for k = 5




Stats for k = 6
             Count     Avg_Dist      Max_Dist
Cluster                                      
0        75.000000  2576.032752  11332.658017
1        29.000000  4686.182157  11645.158075
2        64.000000  2844.520736   8605.396724
3        22.000000  6282.039110  13235.705750
4        28.000000  3407.125208   6108.330099
5        14.000000  5313.260570   9900.315726
mean     38.666667  4184.860089  10137.927399
Map for k = 6




Stats for k = 7
             Count     Avg_Dist      Max_Dist
Cluster                                      
0        74.000000  2456.450779   5660.611212
1        35.000000  4910.288696  12505.677479
2        25.000000  3254.186241   5662.869902
3        68.000000  3051.999015   8840.394830
4        14.000000  3930.780675   9994.800677
5         9.000000  4143.897058   9282.037535
6         7.000000  4906.286055   6076.570856
mean     33.142857  3807.698360   8288.994642
Map for k = 7




Stats for k = 8
         Count     Avg_Dist      Max_Dist
Cluster                                  
0         61.0  2093.941871   5416.278387
1          8.0  5187.798681   6519.116970
2         27.0  3184.162082   7100.575239
3         12.0  3694.166929   9118.476983
4         66.0  2903.023666   8746.769558
5         34.0  2893.325533   4885.137356
6          9.0  4143.897058   9282.037535
7         15.0  4647.147412  10325.251860
mean      29.0  3593.432904   7674.205486
Map for k = 8
