In [None]:
import json, requests
import os
import geopandas
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
# json_normalize lives in the top-level pandas namespace since pandas 1.0 and
# the pandas.io.json alias is no longer importable on recent releases; try the
# current location first and fall back to the old one so either works.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import folium

# Show complete frames when a DataFrame is displayed (no row/column truncation)
pd.options.display.max_rows = None
pd.options.display.max_columns = None


## Search for coordinates using Geopy package for each neighbourhood

In [None]:
# Geocode the city name; the resulting coordinates are printed below.
address = 'Sandakan'

geolocator = Nominatim(user_agent="foursquare_agent")

# geocode() returns None when nothing is found; fail with a clear message
# instead of an AttributeError on `.latitude`.
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

In [None]:
# Foursquare venue search around Sandakan; `ll` is "latitude,longitude" with
# no whitespace inside the query string.
searchurl = 'https://api.foursquare.com/v2/venues/search?ll=5.8402,118.1179'

In [None]:
# `params` was never defined in this notebook, so on a fresh kernel this cell
# raised NameError. Build the query parameters here; the credentials are read
# from the environment (FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET).
params = {
    'client_id': os.environ.get('FOURSQUARE_CLIENT_ID', ''),
    'client_secret': os.environ.get('FOURSQUARE_CLIENT_SECRET', ''),
    'v': '20180604',
}
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
resp1 = requests.get(url=searchurl, params=params, timeout=30)

In [None]:
data1 = json.loads(resp1.text)

In [None]:
data1

### Sandakan neighbourhood data description:

**Number** = Index number

**Name of neighbourhood** = Neighbourhood Names

**Area** = Area in acres

**Residential units** = Number of residential homes

**Location** = Location of neighbourhood

**Latitude** = Latitude coordinates

**Longtitude** = Longitude coordinates

## Load data

In [None]:
df = pd.read_csv('sandakan.csv', index_col="Number")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

The dataset consists of 73 rows and 6 columns

In [None]:
df.info()

In [None]:
df.isnull().sum()  #Count NaN values

Since we need to explore and plot the neighbourhoods, I decided to drop the rows with missing Latitude or Longtitude values.

In [None]:
# Only the coordinates are needed for mapping, so drop just the rows that lack
# them; a bare dropna() also discards rows with a gap in any other column.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.dropna(subset=['Latitude', 'Longtitude'])

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.reset_index()

Drop number and location columns from dataframe

In [None]:
df = df[['Neighbourhood','Area','Residential Units','Latitude','Longtitude']]

In [None]:
# `drop` is a boolean flag (discard the old index instead of turning it into
# a column); it does not take a column name.
df = df.reset_index(drop=True)

In [None]:
# Save the cleaned data once. The "Create maps" section reloads it from
# skanclean.csv, so the file has to exist for a fresh Restart & Run All;
# an existing file is left untouched.
if not os.path.exists('skanclean.csv'):
    df.to_csv('skanclean.csv', index=False)

## Create visualizations for data exploration

In [None]:
df.head()

In [None]:
fig, ax = plt.subplots(figsize=(30, 30))
sns.barplot(x=df.Neighbourhood, y=df.Area, ax=ax)
# Title, labels and tick rotation are applied after drawing so that nothing
# set here is replaced by the labels seaborn derives from the data.
ax.set_title('Sandakan neighbourhoods by area size', fontsize=30)
ax.set_xlabel('Neighbourhood', fontsize=30)
ax.set_ylabel('Area (acres)', fontsize=30)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(30, 30))
sns.barplot(x=df.Neighbourhood, y=df['Residential Units'], ax=ax)
# Title, labels and tick rotation are applied after drawing so that nothing
# set here is replaced by the labels seaborn derives from the data.
ax.set_title('Sandakan neighbourhoods by residential units', fontsize=30)
ax.set_xlabel('Neighbourhood', fontsize=30)
ax.set_ylabel('Residential units', fontsize=30)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
sns.pairplot(df)
plt.show()

In [None]:
sns.pairplot(df, kind='reg')
plt.show()

There seems to be a weak linear relationship between Area and Residential Units.

In [None]:
plt.figure(figsize=(30,30))
plt.title('Heatmap for Correlation', fontsize=30)

# Correlation is only defined for numeric columns; select them explicitly so
# the text column 'Neighbourhood' cannot make corr() fail.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, linewidths=0.5, cmap='coolwarm')
plt.show()

## Create maps

In [None]:
#Load the cleaned csv file

df = pd.read_csv("skanclean.csv")

In [None]:
# Geocode the city name; the coordinates centre the map created below.
address = 'Sandakan'

geolocator = Nominatim(user_agent="foursquare_agent")

# geocode() returns None when nothing is found; fail with a clear message
# instead of an AttributeError on `.latitude`.
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

In [None]:
#Sandakan Map
# Base map centred on the geocoded coordinates of Sandakan.
# NOTE(review): the name `map` shadows Python's built-in map() from here on;
# the marker cells below refer to this name, so it is left as is.
map = folium.Map(location=[latitude,longitude], zoom_start=12)
map

In [None]:
#Segment suburbs coordinates

df_suburbs = df[['Latitude','Longtitude']]

In [None]:
df_suburbs.head()

In [None]:
df_suburbs.shape

In [None]:
suburbs_list = df_suburbs.values.tolist()

In [None]:
suburbs_list_size = len(suburbs_list)

In [None]:
suburbs_list_size

In [None]:
# Drop one plain marker on the map for every [latitude, longitude] pair
for coords in suburbs_list:
    folium.Marker(coords).add_to(map)

In [None]:
map

In [None]:
# One marker per [latitude, longitude] pair, labelled with the neighbourhood
# name found under the same index label in df
suburb_names = df['Neighbourhood']
for idx, coords in enumerate(suburbs_list):
    folium.Marker(coords, popup=suburb_names[idx]).add_to(map)

In [None]:
map

## Segment and focus Mile 4 to Mile 6 neighbourhoods

In [None]:
df1 = pd.read_csv("segment.csv")

In [None]:
df1

In [None]:
df1.shape

In [None]:
# Geocode the city name; the coordinates centre the segment map below.
address = 'Sandakan'

geolocator = Nominatim(user_agent="foursquare_agent")

# geocode() returns None when nothing is found; fail with a clear message
# instead of an AttributeError on `.latitude`.
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

In [None]:
#Sandakan Map
map1 = folium.Map(location=[latitude,longitude], zoom_start=12)
map1

In [None]:
#Segment Mile 4 to Mile 6 suburbs coordinates

df1_suburbs = df1[['Latitude','Longtitude']]

In [None]:
df1_suburbs

In [None]:
df1_suburbs.shape

In [None]:
suburbs1_list = df1_suburbs.values.tolist()

In [None]:
suburbs1_list

In [None]:
suburbs1_list_size = len(suburbs1_list)

In [None]:
suburbs1_list_size

In [None]:
# One marker per [latitude, longitude] pair of the segment, labelled with the
# neighbourhood name found under the same index label in df1
segment_names = df1['Neighbourhood']
for idx, coords in enumerate(suburbs1_list):
    folium.Marker(coords, popup=segment_names[idx]).add_to(map1)

In [None]:
map1

## Using Foursquare API

**Explore neighbourhoods within the focused segment**

In [None]:
# Foursquare credentials and API version.
# The client ID and secret are read from the environment so that no secret is
# stored in the notebook: export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. A key pair that was
# ever committed in a notebook should be treated as exposed and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will not be authenticated.')
VERSION = '20180604'
LIMIT = 15

In [None]:
neighborhoods_subset = df1[['Neighbourhood','Latitude','Longtitude']]

In [None]:
neighborhoods_subset

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare's "explore" endpoint returns around each point.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with zip(); one API request is made per
        (name, latitude, longitude) triple.
    radius : optional
        Value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT at
    call time.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests assemble and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A reply without any group simply contributes no rows.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of failing.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing `columns` here keeps the schema stable for an empty result;
    # assigning seven names to an empty frame afterwards would raise.
    return pd.DataFrame(rows, columns=columns)

In [None]:
target_venues = getNearbyVenues(names=neighborhoods_subset['Neighbourhood'],
                                   latitudes=neighborhoods_subset['Latitude'],
                                   longitudes=neighborhoods_subset['Longtitude']
                                  )

In [None]:
print(target_venues.shape)
target_venues

In [None]:
target_venues.groupby('Neighborhood').count()

In [None]:
# Number of distinct venue categories across all returned venues
print('There are {} unique categories.'.format(len(target_venues['Venue Category'].unique())))

## Analyze Each Neighborhood

In [None]:
# One-hot encode the venue categories: one indicator column per category
category_dummies = pd.get_dummies(target_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach each venue's neighbourhood (it lands in the last column) ...
category_dummies['Neighborhood'] = target_venues['Neighborhood']

# ... and reorder so that this last column comes first
last_column = category_dummies.columns[-1]
target_onehot = category_dummies[[last_column, *category_dummies.columns[:-1]]]

target_onehot.head()

In [None]:
target_onehot.shape

In [None]:
target_grouped = target_onehot.groupby('Neighborhood').mean().reset_index()
target_grouped

In [None]:
target_grouped.shape

In [None]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories
for hood in target_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose the neighbourhood's row into a (venue, freq) table
    profile = target_grouped.loc[target_grouped['Neighborhood'] == hood].T.reset_index()
    profile.columns = ['venue','freq']
    ranked = (
        profile.iloc[1:]  # the first row holds the neighbourhood name itself
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in `row`.

    The first element of `row` is skipped, the remaining values are sorted in
    descending order, and the labels of the first `num_top_venues` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd', every later rank takes 'th'. An explicit
    # bounds check replaces the former bare `except:`, which would also have
    # hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = target_grouped['Neighborhood']

# fill each neighbourhood's row with its top venue categories, most common first
for ind in np.arange(target_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(target_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

## Cluster Neighborhoods

In [None]:
# set number of clusters
kclusters = 5

# Features for clustering: the per-category frequencies without the name
# column. `columns=` replaces the positional axis argument
# (`drop('Neighborhood', 1)`), which current pandas releases do not accept.
target_grouped_clustering = target_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(target_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:15] 

In [None]:
# add clustering labels; a copy left by a previous run of this cell is removed
# first, because DataFrame.insert raises when the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the ranked venues onto the neighbourhood table so that every
# neighbourhood keeps its latitude/longitude next to its cluster label
target_merged = neighborhoods_subset.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood'
)

target_merged.head() # check the last columns!

In [None]:
target_merged

In [None]:
# errors='ignore' lets this cell be re-run once row 1 is already gone
# (a second plain drop would raise KeyError).
# NOTE(review): row 1 is presumably the neighbourhood that got no venues and
# therefore no cluster label -- confirm; a condition such as
# dropna(subset=['Cluster Labels']) would not depend on the row order.
target_merged = target_merged.drop(index=1, errors='ignore')

In [None]:
target_merged

In [None]:
target_merged['Cluster Labels'] = target_merged['Cluster Labels'].astype(int)

In [None]:
target_merged

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(target_merged['Latitude'], target_merged['Longtitude'], target_merged['Neighbourhood'], target_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels run from 0 to kclusters-1, so they index the palette
    # directly (the former `cluster-1` only worked for label 0 by wrapping
    # around to the last colour).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
target_merged.loc[target_merged['Cluster Labels'] == 0]

In [None]:
target_merged.loc[target_merged['Cluster Labels'] == 1]

In [None]:
target_merged.loc[target_merged['Cluster Labels'] == 2]

In [None]:
target_merged.loc[target_merged['Cluster Labels'] == 3]

In [None]:
target_merged.loc[target_merged['Cluster Labels'] == 4]

##  Results and Discussion

**Reserved Code for Search**

In [None]:
# Select Bunga Matahari:
# (taken from the row labelled 0 of neighborhoods_subset)
neighborhood_name = neighborhoods_subset.loc[0, 'Neighbourhood']
neighborhood_latitude = neighborhoods_subset.loc[0, 'Latitude'] 
neighborhood_longitude = neighborhoods_subset.loc[0, 'Longtitude']
# limit of number of venues returned by Foursquare API
# NOTE(review): this rebinds the module-level LIMIT that getNearbyVenues reads,
# so any call made after this cell asks for 5 venues instead of 15.
LIMIT = 5 
# value sent as the `radius` query parameter in the requests below
radius = 500

In [None]:
# Venue *search* request for the selected neighbourhood
search_template = (
    'https://api.foursquare.com/v2/venues/search'
    '?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
)
url = search_template.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    neighborhood_latitude, neighborhood_longitude,
    radius, LIMIT,
)

In [None]:
results = requests.get(url).json()

In [None]:
results  #search results in JSON format

In [None]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# transform venues into a dataframe (nested keys become dotted column names,
# e.g. 'location.lat', which the next cells select)
dataframe = json_normalize(venues)
dataframe.head()

In [None]:
dataframe.shape

In [None]:
databungam = dataframe[['name','location.lat','location.lng']]

In [None]:
#Search results for Tmn Bunga Matahari within 500m
databungam

-------------------------------------End of Reserved Code------------------------------------------------------

**RESERVED CODE FOR EXPLORE**

In [None]:
# Venue *explore* request for the selected neighbourhood
explore_template = (
    'https://api.foursquare.com/v2/venues/explore'
    '?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
)
url = explore_template.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    neighborhood_latitude, neighborhood_longitude,
    radius, LIMIT,
)

In [None]:
results = requests.get(url).json()

In [None]:
results

In [None]:
# assign relevant part of JSON to venues
venues = results['response']['groups'][0]['items']

# transform venues into a dataframe (nested keys become dotted column names,
# e.g. 'venue.location.lat', which the next cell selects)
dataframe = json_normalize(venues)
dataframe.head()

In [None]:
bungamatahari = dataframe[['venue.name','venue.location.lat','venue.location.lng','venue.categories']]

In [None]:
bungamatahari

**Only one Chinese restaurant result for Taman Bunga Matahari -- END of EXPLORE code**