In [1]:
# import libraries
import numpy as np # library to handle data in a vectorized manner

!conda install -c anaconda pandas --yes
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - pandas


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    numpy-base-1.15.4          |   py36h81de0dd_0         4.2 MB  anaconda
    numpy-1.15.4               |   py36h1d66e8a_0          35 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    certifi-2019.11.28         |           py36_1         157 KB  anaconda
    mkl_fft-1.0.6              |   py36h7dd41cf_0         150 KB  anaconda
    pandas-1.0.3               |   py36h0573a6f_0        11.1 MB  anaconda
    pytz-2019.3                |             py_0         231 KB  anaconda
    mkl_random-1.0.1           |   py36h4414c95_1  

In [2]:
# run this codes to load data file if not yet 
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [5]:
# create dataframe neighborhood_df from data file
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
neighborhoods_data = newyork_data['features']
column_names = ['Neighborhood_ID', 'Neighborhood_name', 'Borough', 'Latitude', 'Longitude'] # define the dataframe columns
neighborhood_df = pd.DataFrame(columns=column_names) # instantiate the dataframe
for data in neighborhoods_data:
    borough = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhood_id = neighborhood_name + str(neighborhood_lat) + str(neighborhood_lon)
    
    nneighborhood_df = neighborhood_df.append({'Neighborhood_ID' : neighborhood_id,
                                              'Neighborhood_name': neighborhood_name,
                                              'Borough': borough,
                                              'Latitude': neighborhood_lat,
                                              'Longitude': neighborhood_lon}, ignore_index=True)

In [3]:
# review dataframe neighborhood_df
print('The dataframe has {} unique boroughs and {} unique neighborhoods.'.format(
        len(neighborhood_df['Borough'].unique()),
        len(neighborhood_df['Neighborhood_ID'].unique())
    )
)
neighborhood_df.tail()

The dataframe has 5 unique boroughs and 306 unique neighborhoods.


Unnamed: 0,Neighborhood_ID,Neighborhood_name,Borough,Latitude,Longitude
301,Hudson Yards40.75665808227519-74.00011136202637,Hudson Yards,Manhattan,40.756658,-74.000111
302,Hammels40.58733774018741-73.80553002968718,Hammels,Queens,40.587338,-73.80553
303,Bayswater40.611321691283834-73.76596781445627,Bayswater,Queens,40.611322,-73.765968
304,Queensbridge40.756091297094706-73.94563070334091,Queensbridge,Queens,40.756091,-73.945631
305,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,Staten Island,40.617311,-74.08174


In [2]:
# run this code if there is any error with dataframe neighborhood_df
# neighborhood_df = pd.read_csv('neighborhood_df.csv')

In [9]:
# get credentials to access Foursquare
CLIENT_ID = 'UGGJRJRUW4YQACAX2XDMXN4HD55N5XUC4RVLWZ525CXHIJTA' # your Foursquare ID
CLIENT_SECRET = 'BDCGCIRWKG1S1J2S2G0MJFB2RZZPYBDMNL0EAQZF1CMDIKYI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UGGJRJRUW4YQACAX2XDMXN4HD55N5XUC4RVLWZ525CXHIJTA
CLIENT_SECRET:BDCGCIRWKG1S1J2S2G0MJFB2RZZPYBDMNL0EAQZF1CMDIKYI


In [10]:
# limitation of getting vevues
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius of 1000km

In [11]:
# create a function to get all the neighborhoods in New York
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
                
        # return only relevant information for each nearby venue
        venues_list.append([(name+str(lat)+str(lng),
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood_ID',
                            'Neighborhood_name',
                            'Neighborhood_Lat', 
                            'Neighborhood_Long', 
                            'Venue', 
                            'Venue_Lat', 
                            'Venue_Long', 
                            'Venue_Category']
    
    return(nearby_venues)

In [14]:
# create dataframe NY_venues to store all venues in New York
NY_venues = getNearbyVenues(names=neighborhood_df['Neighborhood_name'],
                            latitudes=neighborhood_df['Latitude'],
                            longitudes=neighborhood_df['Longitude']
                            )

In [15]:
# review dataframe NY_venues
print(NY_venues.shape)
NY_venues.tail()

(20673, 8)


Unnamed: 0,Neighborhood_ID,Neighborhood_name,Neighborhood_Lat,Neighborhood_Long,Venue,Venue_Lat,Venue_Long,Venue_Category
20668,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,40.617311,-74.08174,Bayview Deli,40.620981,-74.072566,Deli / Bodega
20669,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,40.617311,-74.08174,Guild Patio,40.614399,-74.091551,Scenic Lookout
20670,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,40.617311,-74.08174,Al-Humza,40.611311,-74.0889,Indian Restaurant
20671,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,40.617311,-74.08174,Kum Fung Kitchen,40.621842,-74.072305,Chinese Restaurant
20672,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,40.617311,-74.08174,MTA SIR - Clifton,40.621543,-74.071498,Train Station


In [None]:
# run this code if there is any error with dataframe NY_venues
# NY_venues = pd.read_csv('NY_venues.csv')

In [16]:
# Analyze data of New York neighborhood (dataframe NY_venues)

# one hot encoding
NY_onehot = pd.get_dummies(NY_venues[['Venue_Category']], prefix="", prefix_sep="")

# add neighborhood_ID column back to dataframe
NY_onehot['Neighborhood_ID'] = NY_venues['Neighborhood_ID'] 

# move neighborhood_ID column to the first column
fixed_columns = [NY_onehot.columns[-1]] + list(NY_onehot.columns[:-1])
NY_onehot = NY_onehot[fixed_columns]

In [17]:
# review dataframe NY_onehot
print(NY_onehot.shape)
NY_onehot.tail()

(20673, 479)


Unnamed: 0,Neighborhood_ID,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
20668,Fox Hills40.61731079252983-74.08173992211962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20669,Fox Hills40.61731079252983-74.08173992211962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20670,Fox Hills40.61731079252983-74.08173992211962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20671,Fox Hills40.61731079252983-74.08173992211962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20672,Fox Hills40.61731079252983-74.08173992211962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# group rows by neighborhood_ID and by taking the mean of the frequency of occurrence of each category
NY_grouped = NY_onehot.groupby('Neighborhood_ID').mean().reset_index()

In [5]:
# review dataframe NY_grouped
print(NY_grouped.shape)
NY_grouped.tail()

(306, 481)


Unnamed: 0,Neighborhood_ID,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
301,Woodhaven40.68988687915789-73.8581104655432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302,Woodlawn40.89827261213805-73.86731496814176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
303,Woodrow40.541967622888755-74.20524582480326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304,Woodside40.74634908860222-73.90184166838284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,Yorkville40.775929849884875-73.94711784471826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.02,0.0,0.0


In [4]:
# run this code if there is any error with dataframe NY_grouped
# NY_grouped = pd.read_csv('NY_grouped.csv')

In [6]:
# find the most venue category (the venue category has max value in dataframe NY_grouped)
most_venue_category = NY_grouped.set_index('Neighborhood_ID').T.max(axis=1).idxmax()
most_venue_category

'Beach'

In [7]:
# clustering dataframe NY_grouped

# set number of clusters
kclusters = 6

# create dataframe NY_grouped_clustering from NY_grouped without column 'Neighborhood_ID'
NY_grouped_clustering = NY_grouped.drop('Neighborhood_ID', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NY_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 0, 4, 0, 1, 3, 5, 3, 3], dtype=int32)

In [8]:
# merge dataframe NY_grouped & neighborhood_df into NY_merged including full info about neighborhoods in New York
NY_grouped.insert(0, 'Cluster Labels', kmeans.labels_) # add clustering labels
NY_merged = neighborhood_df
NY_merged = NY_merged.join(NY_grouped.set_index('Neighborhood_ID'), on='Neighborhood_ID')

In [9]:
# review dataframe NY_merged
NY_merged.shape
NY_merged.tail()

Unnamed: 0,Neighborhood_ID,Neighborhood_name,Borough,Latitude,Longitude,Cluster Labels,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
301,Hudson Yards40.75665808227519-74.00011136202637,Hudson Yards,Manhattan,40.756658,-74.000111,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
302,Hammels40.58733774018741-73.80553002968718,Hammels,Queens,40.587338,-73.80553,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
303,Bayswater40.611321691283834-73.76596781445627,Bayswater,Queens,40.611322,-73.765968,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304,Queensbridge40.756091297094706-73.94563070334091,Queensbridge,Queens,40.756091,-73.945631,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,Fox Hills40.61731079252983-74.08173992211962,Fox Hills,Staten Island,40.617311,-74.08174,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# run this code if there is any error with dataframe NY_merged
# NY_merged = pd.read_csv('NY_merged.csv')

In [10]:
# check number of neighborhoods having the most venue category to find the cluster having no neighborhood including the most venue category
target_cluster_list = []
for c in range(kclusters):
    seriesObj = NY_merged.loc[NY_merged['Cluster Labels'] == c].apply(lambda x: True if x[most_venue_category] != 0 else False , axis=1)
    n = len(seriesObj[seriesObj == True].index)
    print('Cluster ',c, ' has ', n, 'neighborhoods including ', most_venue_category)
    if n == 0:
        target_cluster_list.append(c)
print('The target clusters is ',  target_cluster_list)

Cluster  0  has  3 neighborhoods including  Beach
Cluster  1  has  3 neighborhoods including  Beach
Cluster  2  has  9 neighborhoods including  Beach
Cluster  3  has  14 neighborhoods including  Beach
Cluster  4  has  0 neighborhoods including  Beach
Cluster  5  has  5 neighborhoods including  Beach
The target clusters is  [4]


In [22]:
# Create dataframe target_clusters including all neighborhoods having no the most venue category
# column_names = NY_merged.columns
target_clusters = pd.DataFrame(columns=NY_merged.columns)
target_clusters

Unnamed: 0,Neighborhood_ID,Neighborhood_name,Borough,Latitude,Longitude,Cluster Labels,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit


In [23]:
# list all neighborhoods of target_clusters
for i in target_cluster_list:
    target_clusters = NY_merged.loc[NY_merged['Cluster Labels'] == i]
    target_clusters.append(target_clusters, ignore_index = True)

In [24]:
target_clusters

Unnamed: 0,Neighborhood_ID,Neighborhood_name,Borough,Latitude,Longitude,Cluster Labels,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
207,Port Ivory40.63968297845542-74.17464532993542,Port Ivory,Staten Island,40.639683,-74.174645,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,Arlington40.63532509911492-74.16510420241124,Arlington,Staten Island,40.635325,-74.165104,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0


The target_clusters includes 2 neighborhoods Port Ivory & Arlington having no 'Beach'. Therefore, business directly related to beach are opportunities to get profit, such as shuttle between hotels or center of the neighborhood & attractive beaches, restaurants or swimming pools on top of buildings to view beautiful beaches 