In [1]:
import pandas as pd

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
import numpy as np # library to handle data in a vectorized manner

print('Libraries imported.')

Libraries imported.


In [2]:
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

## 1. Explore Neighborhoods

In [3]:
neighborhoods = pd.read_csv('singapore_geo.csv')

In [4]:
print('The dataframe has {} postcodes and {} neighborhoods.'.format(
        len(neighborhoods['Postcode'].unique()),
        neighborhoods.shape[0]
    )
)

In [5]:
neighborhoods.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
0,0,18906,SINGAPORE CHINESE CULTURAL CENTRE,1.275829,103.849576
1,1,18907,TEMPORARY SITE OFFICE,1.27495,103.851665
2,3,18915,TEMPORARY SITE OFFICE,1.273682,103.860075
3,4,18925,CITIBANK TRADE_BRANCH,1.276424,103.854759
4,5,18925,DBS Marina Bay MRT Station,1.276427,103.854598


##### Randomly select only 500 of neighborhoods.

In [6]:
neighborhoods_subset = neighborhoods.sample(500
                                           )
neighborhoods_subset.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
98105,156040,460158,Stepping Stones Education Centre,1.318736,103.945087
41387,67362,461186,FENGSHAN GREENVILLE,1.329914,103.940311
108462,169594,529915,KIDSKINGDOM @BEDOK RESERVOIR LLP,1.344775,103.95513
53391,83732,536357,FENGLI GARDENS,1.348414,103.879919
111389,174316,538459,BARTLEY VILLAS,1.344828,103.87773


In [7]:
neighborhoods_subset=neighborhoods_subset.loc[:, ~neighborhoods_subset.columns.str.contains('^Unnamed')]
print(neighborhoods_subset.shape)
neighborhoods_subset.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
98105,460158,Stepping Stones Education Centre,1.318736,103.945087
41387,461186,FENGSHAN GREENVILLE,1.329914,103.940311
108462,529915,KIDSKINGDOM @BEDOK RESERVOIR LLP,1.344775,103.95513
53391,536357,FENGLI GARDENS,1.348414,103.879919
111389,538459,BARTLEY VILLAS,1.344828,103.87773


In [8]:
neighborhoods_subset.shape

(500, 4)

#### Some neighborhood corresponds to the CONSERVATION AREA, which is a multiple locations, we will exclude all CONSERVATION AREAS from our analysis.

In [10]:
to_drop = ['CONSERVATION', 'AREA']
neighborhoods_subset=neighborhoods_subset[~neighborhoods_subset.Neighborhood.str.contains("CONSERVATION AREA")]
neighborhoods_subset.shape

(461, 4)

So we removed 39 neighborhoods.

In [11]:
neighborhoods_subset.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
98105,460158,Stepping Stones Education Centre,1.318736,103.945087
41387,461186,FENGSHAN GREENVILLE,1.329914,103.940311
108462,529915,KIDSKINGDOM @BEDOK RESERVOIR LLP,1.344775,103.95513
53391,536357,FENGLI GARDENS,1.348414,103.879919
111389,538459,BARTLEY VILLAS,1.344828,103.87773


### Cleaning data

In [12]:
group_by_occurence=neighborhoods_subset.groupby('Neighborhood').count().reset_index()
group_by_occurence.head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
0,3BHC,1,1,1
1,68 DUXTON,1,1,1
2,ACETECH CENTRE,1,1,1
3,AIRLINE HOUSE,1,1,1
4,ANG MO KIO INDUSTRIAL PARK 2,1,1,1


In [13]:
group_by_occurence.sort_values(by="Latitude",ascending=False).head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
267,SENNETT ESTATE,12,12,12
268,SERANGOON GARDEN ESTATE,11,11,11
103,FRANKEL ESTATE,9,9,9
70,DEFU INDUSTRIAL ESTATE,4,4,4
262,SELETAR HILLS ESTATE,4,4,4


#### Some places occured multiple times. Only one occurence will leave:

In [14]:
df_tmp = group_by_occurence[group_by_occurence['Latitude']>1]
df_tmp.head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
18,BEDOKVILLE,3,3,3
27,BRADDELL HEIGHTS ESTATE,2,2,2
29,BUKIT LOYANG ESTATE,2,2,2
34,CAPITOL PARK,2,2,2
36,CASHEW GREEN,2,2,2


In [15]:
df_tmp.shape

(58, 4)

In [16]:
df_tmp_2 = group_by_occurence[group_by_occurence['Latitude']==1]
df_tmp_2.head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
0,3BHC,1,1,1
1,68 DUXTON,1,1,1
2,ACETECH CENTRE,1,1,1
3,AIRLINE HOUSE,1,1,1
4,ANG MO KIO INDUSTRIAL PARK 2,1,1,1


#### 58 Neighborhoods are occured multiple times in our data. So we  take only 1 occurence

In [20]:

def subset_data_frame(input_df1):
    Nghbr = []
    Lat = []
    Long =[]

    for name in input_df1.Neighborhood :
        Nghbr.append(name)
        coordinates = neighborhoods_subset[neighborhoods_subset["Neighborhood"]==name][["Latitude","Longitude"]]
        Lat.append(coordinates.iloc[0,0])
        Long.append(coordinates.iloc[0,1])
    
    df_temp_coord = pd.DataFrame({"Neighborhood":Nghbr,"Latitude":Lat,"Longitude":Long})
    df_temp_coord=df_temp_coord[["Neighborhood","Latitude","Longitude"]]
    
    return df_temp_coord

In [21]:
df_part1 = subset_data_frame(df_tmp)
df_part1.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,BEDOKVILLE,1.323681,103.947566
1,BRADDELL HEIGHTS ESTATE,1.349106,103.865046
2,BUKIT LOYANG ESTATE,1.361056,103.964245
3,CAPITOL PARK,1.328713,103.817978
4,CASHEW GREEN,1.373039,103.770301


In [22]:
df_part2 = subset_data_frame(df_tmp_2)
df_part2.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,3BHC,1.356428,103.832513
1,68 DUXTON,1.277996,103.842907
2,ACETECH CENTRE,1.284936,103.808918
3,AIRLINE HOUSE,1.377163,103.997294
4,ANG MO KIO INDUSTRIAL PARK 2,1.375618,103.860945


In [23]:
df_part1=df_part1.append(df_part2)

In [24]:
df_part1.shape

(353, 3)

## 3. Create a map of Singapore with neighborhoods superimposed on top.¶

#### Use geopy library to get the latitude and longitude values of Singapore.

In [3]:
neighborhoods_subset = pd.read_csv("sg_random_samles_clean.csv")

In [4]:
address = 'Singapore'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore are {}, {}.'.format(latitude, longitude))

In [14]:
map_sg = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(neighborhoods_subset['Latitude'], neighborhoods_subset['Longitude'], neighborhoods_subset['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sg)  
    
map_sg

<img src="1map.png">

## Using Foursquare API

In [5]:
CLIENT_ID = 'MY' # your Foursquare ID
CLIENT_SECRET = 'MY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

### Let's explore the first neighborhood in our dataframe.

Get the neighborhood's name.

In [6]:
neighborhoods_subset=neighborhoods_subset.loc[:, ~neighborhoods_subset.columns.str.contains('^Unnamed')]
print(neighborhoods_subset.shape)
neighborhoods_subset.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,BEDOKVILLE,1.323681,103.947566
1,BRADDELL HEIGHTS ESTATE,1.349106,103.865046
2,BUKIT LOYANG ESTATE,1.361056,103.964245
3,CAPITOL PARK,1.328713,103.817978
4,CASHEW GREEN,1.373039,103.770301


In [7]:
neighborhoods_subset.loc[0, 'Neighborhood']

'BEDOKVILLE'

Get the neighborhood's latitude and longitude values.

In [8]:
neighborhoods_subset.loc[0, 'Neighborhood']
neighborhood_latitude = neighborhoods_subset.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods_subset.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods_subset.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Now, let's get the top 100 venues that are in BEDOKVILLE within a radius of 500 meters.

In [9]:
# type your answer here

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url



'https://api.foursquare.com/v2/venues/explore?&client_id=OQK105GVMIB5HN5VVZAASP1MJ1KX0Y3SRJ4S20CWIVO2WNNX&client_secret=ERUED4WWGGJGQMW1NMFIGAF0MTHX4OUTTQPWDMOJXAFJ2ILN&v=20180605&ll=1.32368059857,103.947565974&radius=500&limit=100'

In [22]:
results = requests.get(url).json()
results

{u'meta': {u'code': 200, u'requestId': u'5bc9755d351e3d70ba326083'},
 u'response': {u'groups': [{u'items': [{u'reasons': {u'count': 0,
       u'items': [{u'reasonName': u'globalInteractionReason',
         u'summary': u'This spot is popular',
         u'type': u'general'}]},
      u'referralId': u'e-0-4dcdfd0a183899ddfac7c6c3-0',
      u'venue': {u'categories': [{u'icon': {u'prefix': u'https://ss3.4sqi.net/img/categories_v2/food/ramen_',
          u'suffix': u'.png'},
         u'id': u'4bf58dd8d48988d1d1941735',
         u'name': u'Noodle House',
         u'pluralName': u'Noodle Houses',
         u'primary': True,
         u'shortName': u'Noodles'}],
       u'id': u'4dcdfd0a183899ddfac7c6c3',
       u'location': {u'address': u'#01-176, The Marketplace @ 58',
        u'cc': u'SG',
        u'city': u'Singapore',
        u'country': u'Singapore',
        u'crossStreet': u'58 New Upper Changi Rd.',
        u'distance': 189,
        u'formattedAddress': [u'#01-176, The Marketplace @ 58 (58 

In [10]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Amy's Laksa,Noodle House,1.323111,103.945957
1,Kimly Seafood,Coffee Shop,1.32311,103.945939
2,New Changi Eating House,Asian Restaurant,1.323117,103.945954
3,Tanah Merah MRT Interchange (EW4),Train Station,1.327309,103.946443
4,Netball Court Quadrangle Bedok South,Basketball Court,1.325625,103.950991


#### And how many venues were returned by Foursquare?

In [25]:
(nearby_venues.shape[0])

9

## Explore Neighborhoods in Singapore

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
res=getNearbyVenues(names=neighborhoods_subset.Neighborhood, latitudes=neighborhoods_subset.Latitude,
                    longitudes=neighborhoods_subset.Longitude)
res

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,BEDOKVILLE,1.323681,103.947566,Amy's Laksa,1.323111,103.945957,Noodle House
1,BEDOKVILLE,1.323681,103.947566,Kimly Seafood,1.323110,103.945939,Coffee Shop
2,BEDOKVILLE,1.323681,103.947566,New Changi Eating House,1.323117,103.945954,Asian Restaurant
3,BEDOKVILLE,1.323681,103.947566,Tanah Merah MRT Interchange (EW4),1.327309,103.946443,Train Station
4,BEDOKVILLE,1.323681,103.947566,Netball Court Quadrangle Bedok South,1.325625,103.950991,Basketball Court
5,BEDOKVILLE,1.323681,103.947566,Changi Naval Base Pick Up Point @ Tanah Merah,1.327365,103.945903,Bus Station
6,BEDOKVILLE,1.323681,103.947566,East Meadows Swimming Pool,1.327519,103.946264,Pool
7,BEDOKVILLE,1.323681,103.947566,Parit Puaka Dalam,1.327666,103.946820,Lake
8,BEDOKVILLE,1.323681,103.947566,Indoor Sports Hall @ Bedok South,1.326071,103.951059,School
9,BRADDELL HEIGHTS ESTATE,1.349106,103.865046,La Pizzaiola,1.347443,103.867599,Pizza Place


In [13]:
# save our results to a file
res.to_csv("neighborhood_samples_venues_500m.csv")

In [14]:
singapore_venues = res
(singapore_venues.shape)

(7371, 7)

In [15]:
(singapore_venues.shape)
singapore_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,BEDOKVILLE,1.323681,103.947566,Amy's Laksa,1.323111,103.945957,Noodle House
1,BEDOKVILLE,1.323681,103.947566,Kimly Seafood,1.32311,103.945939,Coffee Shop
2,BEDOKVILLE,1.323681,103.947566,New Changi Eating House,1.323117,103.945954,Asian Restaurant
3,BEDOKVILLE,1.323681,103.947566,Tanah Merah MRT Interchange (EW4),1.327309,103.946443,Train Station
4,BEDOKVILLE,1.323681,103.947566,Netball Court Quadrangle Bedok South,1.325625,103.950991,Basketball Court


In [70]:
len(singapore_venues.Venue.unique())

3815

In [71]:
len(singapore_venues["Venue Category"].unique())

320

### Let's check how many venues were returned for each neighborhood

In [16]:
sg_grouped=singapore_venues.groupby('Neighborhood').count()
sg_grouped.head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3BHC,61,61,61,61,61,61
68 DUXTON,100,100,100,100,100,100
ACETECH CENTRE,33,33,33,33,33,33
AIRLINE HOUSE,4,4,4,4,4,4
ANG MO KIO INDUSTRIAL PARK 2,6,6,6,6,6,6


In [17]:
sg_grouped.sort_values('Venue', ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MAGHAIN ABOTH SYNAGOGUE,100,100,100,100,100,100
PAGODA HOUSE,100,100,100,100,100,100
DBS Resorts World Sentosa Level 1 Visitor Centre,100,100,100,100,100,100
HEAP SENG HOUSE,100,100,100,100,100,100
WATERLOO CENTRE,100,100,100,100,100,100
HERITAGE COURT,100,100,100,100,100,100
ASIA STAR HOTEL,100,100,100,100,100,100
FORT CANNING MRT STATION (DT20),100,100,100,100,100,100
KIMSIA PARK,100,100,100,100,100,100
68 DUXTON,100,100,100,100,100,100


As we see 68 Duxton has exactly 100 venues whithin 500 m. Which is not surprising because the area is located in the heart of Singapore - Chinatown. 

We also noted that all neighborhoods have at least 1 venue.

### Let's find out how many unique categories can be curated from all the returned venues



In [18]:
len(singapore_venues['Venue Category'].unique())

320

## Analyze Each Neighborhood

In [19]:
# one hot encoding
sg_onehot = pd.get_dummies(singapore_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sg_onehot['Neighborhood'] = singapore_venues['Neighborhood'] 

# move neighborhood column to the first column
cols = sg_onehot.columns.tolist()
cols.insert(0, cols.pop(cols.index('Neighborhood')))

sg_onehot = sg_onehot.reindex(columns= cols)
sg_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Aquarium,Arcade,...,Warehouse Store,Water Park,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zhejiang Restaurant
0,BEDOKVILLE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BEDOKVILLE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,BEDOKVILLE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BEDOKVILLE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,BEDOKVILLE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category¶

In [20]:
sg_grouped = sg_onehot.groupby('Neighborhood').mean().reset_index()
sg_grouped.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Aquarium,Arcade,...,Warehouse Store,Water Park,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zhejiang Restaurant
0,3BHC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,68 DUXTON,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0
2,ACETECH CENTRE,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AIRLINE HOUSE,0.0,0.0,0.5,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ANG MO KIO INDUSTRIAL PARK 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
sg_grouped.describe()

Unnamed: 0,ATM,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,...,Warehouse Store,Water Park,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zhejiang Restaurant
count,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,...,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0
mean,0.000226,0.000282,0.001634,7.3e-05,0.000291,0.000708,0.006369,5.7e-05,0.00149,2.8e-05,...,0.000708,0.00026,5.7e-05,0.00032,0.00252,0.001196,0.000419,0.000161,0.001873,0.000218
std,0.001899,0.002113,0.026914,0.001365,0.005459,0.013306,0.02943,0.001064,0.012211,0.000532,...,0.013306,0.003129,0.000752,0.002226,0.010024,0.014138,0.006706,0.001375,0.010889,0.004094
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.018182,0.025641,0.5,0.025641,0.102564,0.25,0.333333,0.02,0.166667,0.01,...,0.25,0.043478,0.01,0.022222,0.111111,0.2,0.125,0.016393,0.105263,0.076923


In [82]:
(sg_grouped.sum(axis=0).sort_values())

Burrito Place                                                             0.01
Nail Salon                                                                0.01
Souvenir Shop                                                             0.01
Falafel Restaurant                                                        0.01
Dongbei Restaurant                                                        0.01
Peruvian Restaurant                                                       0.01
Argentinian Restaurant                                                    0.01
Exhibit                                                              0.0103093
Planetarium                                                          0.0153846
Science Museum                                                       0.0153846
Post Office                                                           0.016129
Optical Shop                                                          0.016129
Aquarium                                            

### The most common venue's  categories are Bus Station, Coffee Shop,  Food Court, Cafe, Chinese Restaurant.

## CASE 1: Explore all neighborhoods which is near to the art gallery.

In [21]:
#get a list of a neighboorhoods names
art_buildings=sg_onehot[sg_onehot["Art Gallery"]>0].Neighborhood.unique().tolist()
art_buildings

['SUNRISE VILLA',
 'ACETECH CENTRE',
 'FORT CANNING MRT STATION  (DT20)',
 'HERITAGE COURT',
 'MAGHAIN ABOTH SYNAGOGUE',
 'NANYANG ACADEMY OF FINE ARTS',
 'RIVERGATE',
 'TAMIL METHODIST CHURCH',
 'THE DAULAT',
 'THE SULTAN',
 'UE SQUARE',
 'VALLEY LODGE',
 'WATERLOO CENTRE']

In [22]:
singapore_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,BEDOKVILLE,1.323681,103.947566,Amy's Laksa,1.323111,103.945957,Noodle House
1,BEDOKVILLE,1.323681,103.947566,Kimly Seafood,1.32311,103.945939,Coffee Shop
2,BEDOKVILLE,1.323681,103.947566,New Changi Eating House,1.323117,103.945954,Asian Restaurant
3,BEDOKVILLE,1.323681,103.947566,Tanah Merah MRT Interchange (EW4),1.327309,103.946443,Train Station
4,BEDOKVILLE,1.323681,103.947566,Netball Court Quadrangle Bedok South,1.325625,103.950991,Basketball Court


In [23]:
# subset all geo locations and highlight on the map:
sg_art_neighboors = singapore_venues[(singapore_venues["Neighborhood"].isin(art_buildings))]
sg_art_neighboors=sg_art_neighboors.iloc[:,0:3]
sg_art_neighboors.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
869,SUNRISE VILLA,1.38759,103.855811
870,SUNRISE VILLA,1.38759,103.855811
871,SUNRISE VILLA,1.38759,103.855811
872,SUNRISE VILLA,1.38759,103.855811
873,SUNRISE VILLA,1.38759,103.855811


In [24]:
sg_art_neighboors.shape

(872, 3)

In [25]:
# dropping duplicates:
sg_art_neighboors=sg_art_neighboors.drop_duplicates()
sg_art_neighboors.shape

(13, 3)

In [27]:
map_sg = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(sg_art_neighboors['Neighborhood Latitude'], sg_art_neighboors['Neighborhood Longitude'], sg_art_neighboors['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sg)  
    
map_sg

As expected, most places, which are close to some art gelery, are located in the Central parts of Singapore.

<img src="2map.png">

##  Cluster Neighborhoods

In [54]:
# set number of clusters
kclusters = 8

sg_grouped_clustering = sg_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sg_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 3, 2, 2, 3, 3, 6, 3, 3], dtype=int32)

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [30]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sg_grouped['Neighborhood']

for ind in np.arange(sg_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sg_grouped.iloc[ind, :], num_top_venues)


print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3BHC,Café,Thai Restaurant,Ice Cream Shop,Chinese Restaurant,Asian Restaurant,Vegetarian / Vegan Restaurant,Spa,Bakery,Food Court,Bus Stop
1,68 DUXTON,Café,Korean Restaurant,Japanese Restaurant,Italian Restaurant,Hotel,Ramen Restaurant,Cocktail Bar,Bakery,Coffee Shop,Gym / Fitness Center
2,ACETECH CENTRE,Chinese Restaurant,Food Court,Bus Station,Asian Restaurant,Gourmet Shop,Malay Restaurant,Furniture / Home Store,Sporting Goods Shop,Miscellaneous Shop,Buffet
3,AIRLINE HOUSE,Airport,Airport Service,Food Court,Zhejiang Restaurant,Fishing Store,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant
4,ANG MO KIO INDUSTRIAL PARK 2,Food Court,Bridal Shop,Surf Spot,Arcade,Coffee Shop,Zhejiang Restaurant,Fishing Spot,Farm,Farmers Market,Fast Food Restaurant


In [31]:
o_drop = ['CONSERVATION', 'AREA']
neighborhoods_venues_sorted=neighborhoods_venues_sorted[~neighborhoods_venues_sorted.Neighborhood.str.contains("CONSERVATION AREA")]
neighborhoods_venues_sorted.shape

(353, 11)

##### Not all neighborhoods have a venues. So we will removed those from our neighborhoods_subset

In [32]:
names_k=neighborhoods_venues_sorted.Neighborhood.unique().tolist()
len(names_k)


353

In [34]:
neighborhoods_subset.shape


(353, 3)

#### Now we are fixing multiple occurences:

In [55]:
sg_merged = neighborhoods_subset

# add clustering labels
sg_merged['Cluster Labels'] = kmeans.labels_

# merge sg_grouped with toronto_data to add latitude/longitude for each neighborhood
sg_merged = sg_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

sg_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,BEDOKVILLE,1.323681,103.947566,3,Pool,Bus Station,Basketball Court,Train Station,Asian Restaurant,Lake,Noodle House,Coffee Shop,School,Fishing Spot
1,BRADDELL HEIGHTS ESTATE,1.349106,103.865046,3,Café,Tennis Court,Pizza Place,Dessert Shop,Bus Station,Zhejiang Restaurant,Fish & Chips Shop,Farmers Market,Fast Food Restaurant,Field
2,BUKIT LOYANG ESTATE,1.361056,103.964245,3,Bus Line,Bus Station,Grocery Store,Gym / Fitness Center,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Fish & Chips Shop
3,CAPITOL PARK,1.328713,103.817978,2,Japanese Restaurant,Hotel,Pet Store,Modern European Restaurant,Park,Dentist's Office,Department Store,Farm,Farmers Market,Fast Food Restaurant
4,CASHEW GREEN,1.373039,103.770301,2,Bus Station,Park,Housing Development,Karaoke Bar,Soccer Field,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field


## Visualize the resulting clusters

In [56]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
#rainbow = ["#FF33F6","#33E0FF","#7A33FF","#FF7D33","#7E0548","#2DBB1F","#05127E"]
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sg_merged['Latitude'], sg_merged['Longitude'], sg_merged['Neighborhood'], sg_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.5).add_to(map_clusters)
       
map_clusters

<img src="3map.png">

In [57]:
sg_merged.groupby("Cluster Labels").count()

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,44,44,44,44,44,44,44,44,44,44,44,44,44
1,41,41,41,41,41,41,41,41,41,41,41,41,41
2,26,26,26,26,26,26,26,26,26,26,26,26,26
3,195,195,195,195,195,195,195,195,195,195,195,195,195
4,2,2,2,2,2,2,2,2,2,2,2,2,2
5,13,13,13,13,13,13,13,13,13,13,13,13,13
6,25,25,25,25,25,25,25,25,25,25,25,25,25
7,7,7,7,7,7,7,7,7,7,7,7,7,7


####  Let's see closer CLUSTER 4 and 7, as they have a few members only.

In [65]:
cluster_4_7=sg_merged[(sg_merged["Cluster Labels"].isin([7,4]))]
cluster_4_7.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
72,BEDOK PARK,1.321654,103.950704,7,School,Ice Cream Shop,Supermarket,Sandwich Place,Spa,Dance Studio,English Restaurant,Falafel Restaurant,Farm,Farmers Market
75,BEDOK SHOPPING COMPLEX,1.331124,103.948107,7,Noodle House,Chinese Restaurant,Bus Station,Pool,Japanese Restaurant,Food Court,Soup Place,Seafood Restaurant,Malay Restaurant,BBQ Joint
83,BOON TONG SAN TEMPLE,1.350764,103.721626,7,Food Court,Asian Restaurant,Fast Food Restaurant,Coffee Shop,Video Game Store,Café,Farmers Market,Dessert Shop,Gas Station,Thai Restaurant
101,CITIBANK Jurong East,1.335056,103.739883,7,Japanese Restaurant,Department Store,Shopping Mall,Chinese Restaurant,Coffee Shop,Bakery,Café,Sandwich Place,Burger Joint,Food Court
135,EVANIA,1.350308,103.879056,4,Coffee Shop,Noodle House,Korean Restaurant,Chinese Restaurant,Convenience Store,Seafood Restaurant,Vegetarian / Vegan Restaurant,Taiwanese Restaurant,Thai Restaurant,Japanese Restaurant
172,HOLLAND GREEN,1.32576,103.78422,7,Canal,Pool,Playground,Fish & Chips Shop,Exhibit,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
223,MOSQUE,1.219285,103.849158,7,Beach,Pier,Island,Zhejiang Restaurant,Fishing Spot,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant
289,SPRING PARK ESTATE,1.315102,103.937585,4,Gym,Pool,Bus Station,Bus Stop,Asian Restaurant,Convenience Store,Korean Restaurant,Chinese Restaurant,Farm,Farmers Market
315,THE SEAWIND,1.311228,103.914626,7,Italian Restaurant,Bus Station,Bus Stop,Noodle House,Asian Restaurant,Bar,Café,Soup Place,Park,Cocktail Bar


## CASE 2 :THE PATERSON EDGE	

### The Business Problem: we have a customer who recently relocated to Singapore. The first month he was staying in the Paterson Edge Condo. Unfortunately, his monthes fees are very high in this area, but the neirborhood is very lovely. Therefore, our customer is looking for a new house in a less expencive area but with a nice community around him.

### SOLUTION:

### Using our clusters, we are going to find to which cluster The Paterson Edge is belong. Then we will highlight all members of this cluster on the SG MAP, and analise it.

In [67]:
property_name =  "THE PATERSON EDGE"

property_cluster = sg_merged[sg_merged["Neighborhood"]== property_name][["Cluster Labels"]]
property_cluster

Unnamed: 0,Cluster Labels
312,3


### We found that The Paterson Edge belongs to the 3d cluster. Lets look closer to the 3d cluster:

In [68]:
cluster_3=sg_merged[(sg_merged["Cluster Labels"].isin([3]))]
cluster_3

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,BEDOKVILLE,1.323681,103.947566,3,Pool,Bus Station,Basketball Court,Train Station,Asian Restaurant,Lake,Noodle House,Coffee Shop,School,Fishing Spot
1,BRADDELL HEIGHTS ESTATE,1.349106,103.865046,3,Café,Tennis Court,Pizza Place,Dessert Shop,Bus Station,Zhejiang Restaurant,Fish & Chips Shop,Farmers Market,Fast Food Restaurant,Field
2,BUKIT LOYANG ESTATE,1.361056,103.964245,3,Bus Line,Bus Station,Grocery Store,Gym / Fitness Center,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Fish & Chips Shop
5,CASHEW VILLAS,1.371827,103.768709,3,Bus Station,Park,Basketball Court,Housing Development,Karaoke Bar,Fishing Spot,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant
6,CHANGI HEIGHTS,1.362937,103.973532,3,Café,Coffee Shop,Chinese Restaurant,Department Store,History Museum,Zhejiang Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
8,DUNEARN ESTATE,1.323532,103.820570,3,Bus Station,Convenience Store,Gas Station,Café,Park,Track Stadium,Garden,Botanical Garden,Flower Shop,Playground
9,EAST COAST HILL,1.316934,103.935362,3,Bus Stop,Breakfast Spot,Convenience Store,Chinese Restaurant,Bus Station,Asian Restaurant,Food Court,Dessert Shop,Cosmetics Shop,Coffee Shop
14,FABER HILLS,1.320624,103.760430,3,Bus Station,Park,Food & Drink Shop,Japanese Restaurant,French Restaurant,Food,Flower Shop,Flea Market,Fishing Store,Fishing Spot
17,GOLDHILL SHOPPING CENTRE,1.318158,103.843303,3,Café,Coffee Shop,Bakery,Pharmacy,Japanese Restaurant,Asian Restaurant,Italian Restaurant,Dessert Shop,Chinese Restaurant,Thai Restaurant
18,HDB PUBLIC SHELTERS,1.358815,103.935940,3,Bus Line,Playground,Food & Drink Shop,Department Store,Supermarket,Trail,Bus Station,Zhejiang Restaurant,Fish & Chips Shop,Farmers Market


### First, this is the biggest cluster. WHich is good, because we have many options. For instance we can propose to our customer consider the SEA VIEW PARK condo in the East Coast Area. It will be much cheaper than Orchard area, but with a lovely neighborhood.

In [73]:
map_sg = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(cluster_3['Latitude'], cluster_3['Longitude'], cluster_3['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sg)  
    
map_sg

<img src="4map.png">