# Battle of the Cities: Getting Venues from Foursquare API

In [1]:
import requests # library to handle requests
import pandas as pd 
import numpy as np
import json

### Bring in data

In [8]:
sf_hoods = pd.read_csv('../data/sf-locations.csv',delimiter=",")
nyc_hoods = pd.read_csv('../data/nyc-locations.csv')

sf_hoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Anza Vista,37.7808,-122.4432
1,Alamo Square,37.7764,-122.4346
2,Sutro Heights,37.7782,-122.5083
3,Seacliff,37.7868,-122.489
4,Lake Street,37.7856,-122.4794


## Foursquare requests

In [3]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: IW1DNBRJOUHECGXR0ST002ZPDJYOBCIRTROZZ5YJZFHCIP3Q
CLIENT_SECRET:TWMIYINYFXOLI0IGNPXXLCIRHFWLUNJ4VP2TJZXSJGUP2ZBN


### Generate url for request

In [9]:
LIMIT = 50    
def getNearbyVenues(names, latitudes, longitudes, radius=300):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Turn `nearby_venues` into its own dataframe
#### San Francisco

In [70]:
sf_venues = getNearbyVenues(names=sf_hoods['Neighborhood'],
                                   latitudes=sf_hoods['Latitude'],
                                   longitudes=sf_hoods['Longitude']
                                  )

print(sf_venues.shape)
sf_venues.head()

Anza Vista
Alamo Square
Sutro Heights
Seacliff
Lake Street
Presidio
Presidio Terrace
Outer Richmond
Outer Sunset
Lakeshore
Inner Richmond
Inner Sunset
Fishermans Wharf
Aquatic Park / Ft. Mason
Union Street
Presidio Heights
Laurel Heights / Jordan Park
Northern Waterfront
Lone Mountain
Panhandle
Haight Ashbury
Ashbury Heights
Castro
Noe Valley
Glen Park
Outer Mission
Marina
Cow Hollow
Pacific Heights
Lower Pacific Heights
Japantown
Polk Gulch
Rincon Hill
Western Addition
Lower Haight
Duboce Triangle
Mint Hill
Mission Dolores
Hayes Valley
North Beach
Russian Hill
Chinatown
Nob Hill
Lower Nob Hill
Cathedral Hill
Downtown
Tenderloin
Civic Center
South of Market
Mission
Bernal Heights
Telegraph Hill
Financial District
South Beach
Showplace Square
Protrero Hill
Mission Bay
Dogpatch
Bayview
Central Waterfront
Golden Gate Heights
Buena Vista
Corona Heights
Cole Valley
Parnassus Heights
Eureka Valley
Dolores Heights
Forest Knolls
Clarendon Heights
Upper Market
Midtown Terrace
Laguna Honda
Fores

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Anza Vista,37.7808,-122.4432,Anza Vista,37.779721,-122.442065,Neighborhood
1,Anza Vista,37.7808,-122.4432,European Wax Center San Francisco,37.781791,-122.445398,Health & Beauty Service
2,Anza Vista,37.7808,-122.4432,European Wax Center,37.781838,-122.445449,Health & Beauty Service
3,Anza Vista,37.7808,-122.4432,Tony's Cable Car Restaurant,37.782814,-122.444836,Burger Joint
4,Anza Vista,37.7808,-122.4432,Subway,37.781817,-122.445434,Sandwich Place


#### New York City

In [12]:
nyc_venues = getNearbyVenues(names=nyc_hoods['Neighborhood'],
                                   latitudes=nyc_hoods['Latitude'],
                                   longitudes=nyc_hoods['Longitude']
                                  )

print(nyc_venues.shape)
nyc_venues.head()

Marble Hill
Inwood
Washington Heights
Hamilton Heights
Manhattanville
Morningside Heights
Harlem
East Harlem
Upper West Side
Carnegie Hill
Upper East Side
Roosevelt Island
Columbus Circle
Midtown
Clinton
Sutton Place
Turtle Bay
Tudor City
Garment District
Murry Hill
Chelsea
Flatiron District
Gramercy
Stuyvesant Town
West Village
Greenwich Village
East Village
SoHo
NoHo
Little Italy
Lower East Side
Tribeca
Battery Park
Financial District
Chinatown
(1371, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.8758,-73.9105,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.8758,-73.9105,Starbucks,40.873755,-73.908613,Coffee Shop
2,Marble Hill,40.8758,-73.9105,Rite Aid,40.875467,-73.908906,Pharmacy
3,Marble Hill,40.8758,-73.9105,Subway Sandwiches,40.874667,-73.909586,Sandwich Place
4,Marble Hill,40.8758,-73.9105,GameStop,40.874267,-73.909342,Video Game Store


### Group the venues by neighborhood

In [11]:
sf_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alamo Square,17,17,17,17,17,17
Anza Vista,5,5,5,5,5,5
Apparel City,12,12,12,12,12,12
Aquatic Park / Ft. Mason,29,29,29,29,29,29
Ashbury Heights,9,9,9,9,9,9
...,...,...,...,...,...,...
West Portal,41,41,41,41,41,41
Western Addition,19,19,19,19,19,19
Westwood Highlands,1,1,1,1,1,1
Westwood Park,3,3,3,3,3,3


In [13]:
nyc_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park,50,50,50,50,50,50
Carnegie Hill,25,25,25,25,25,25
Chelsea,50,50,50,50,50,50
Chinatown,50,50,50,50,50,50
Clinton,50,50,50,50,50,50
Columbus Circle,40,40,40,40,40,40
East Harlem,35,35,35,35,35,35
East Village,50,50,50,50,50,50
Financial District,50,50,50,50,50,50
Flatiron District,50,50,50,50,50,50


## One Hot Encoding
### San Francisco

In [15]:
# one hot encoding
sf_onehot = pd.get_dummies(sf_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sf_onehot['Neighborhood'] = sf_venues['Neighborhood'] 

sf_onehot.head()

Unnamed: 0,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Alternative Healer,American Restaurant,Antique Shop,Arcade,Art Gallery,...,Vietnamese Restaurant,Vineyard,Warehouse,Warehouse Store,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# move neighborhood column to the first column
cols = sf_onehot.columns.to_list()
cols.insert(0, cols.pop(cols.index('Neighborhood')))
sf_onehot = sf_onehot.reindex(columns = cols)
sf_onehot.columns

Index(['Neighborhood', 'ATM', 'Acai House', 'Accessories Store',
       'Adult Boutique', 'African Restaurant', 'Alternative Healer',
       'American Restaurant', 'Antique Shop', 'Arcade',
       ...
       'Vietnamese Restaurant', 'Vineyard', 'Warehouse', 'Warehouse Store',
       'Wine Bar', 'Wine Shop', 'Winery', 'Wings Joint', 'Women's Store',
       'Yoga Studio'],
      dtype='object', length=317)

In [22]:
sf_onehot.head()

Unnamed: 0,Neighborhood,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Alternative Healer,American Restaurant,Antique Shop,Arcade,...,Vietnamese Restaurant,Vineyard,Warehouse,Warehouse Store,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Anza Vista,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Anza Vista,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Anza Vista,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Anza Vista,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Anza Vista,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now that I have the desired dataframe, I'll group the data by neighborhood and calculate the mean of the frequency of occurence of each category.

In [24]:
sf_grouped = sf_onehot.groupby('Neighborhood').mean().reset_index()
sf_grouped.head()

Unnamed: 0,Neighborhood,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Alternative Healer,American Restaurant,Antique Shop,Arcade,...,Vietnamese Restaurant,Vineyard,Warehouse,Warehouse Store,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Alamo Square,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Anza Vista,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apparel City,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aquatic Park / Ft. Mason,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0
4,Ashbury Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0


### New York City

In [25]:
# one hot encoding
nyc_onehot = pd.get_dummies(nyc_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood'] 

# move neighborhood column to the first column
cols = nyc_onehot.columns.to_list()
cols.insert(0, cols.pop(cols.index('Neighborhood')))
nyc_onehot = nyc_onehot.reindex(columns = cols)
nyc_onehot.columns

nyc_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [26]:
nyc_grouped = nyc_onehot.groupby('Neighborhood').mean().reset_index()
nyc_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Battery Park,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Carnegie Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
2,Chelsea,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0
3,Chinatown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0
4,Clinton,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.06,0.0,0.0


## Get most common venues in each neighborhood

In [27]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### San Francisco

In [28]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
sf_hoods_venues_sorted = pd.DataFrame(columns=columns)
sf_hoods_venues_sorted['Neighborhood'] = sf_grouped['Neighborhood']

for ind in np.arange(sf_grouped.shape[0]):
    sf_hoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sf_grouped.iloc[ind, :], num_top_venues)

sf_hoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alamo Square,Park,Bus Station,Seafood Restaurant,Rock Club,Dog Run,Nightclub,Café,Historic Site,Liquor Store,Bakery
1,Anza Vista,Health & Beauty Service,Sandwich Place,Burger Joint,Yoga Studio,Farmers Market,Ethiopian Restaurant,Event Space,Exhibit,Eye Doctor,Falafel Restaurant
2,Apparel City,Nightclub,Miscellaneous Shop,Rental Car Location,Pet Service,Hardware Store,Automotive Shop,Garden Center,Outdoor Supply Store,Food Truck,Convenience Store
3,Aquatic Park / Ft. Mason,Chocolate Shop,Park,Gift Shop,Playground,Cantonese Restaurant,Shopping Plaza,Beer Garden,Garden,Mini Golf,Trail
4,Ashbury Heights,Italian Restaurant,Bakery,Sushi Restaurant,Organic Grocery,Gym,Bar,Breakfast Spot,Playground,Wine Bar,Department Store


### New York City

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
nyc_hoods_venues_sorted = pd.DataFrame(columns=columns)
nyc_hoods_venues_sorted['Neighborhood'] = nyc_grouped['Neighborhood']

for ind in np.arange(nyc_grouped.shape[0]):
    nyc_hoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyc_grouped.iloc[ind, :], num_top_venues)

nyc_hoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park,Park,Memorial Site,Coffee Shop,Cupcake Shop,Sandwich Place,Food Court,Shopping Mall,Boat or Ferry,Steakhouse,Gastropub
1,Carnegie Hill,Italian Restaurant,French Restaurant,Spa,Yoga Studio,Kosher Restaurant,Breakfast Spot,Sculpture Garden,Museum,Bookstore,Exhibit
2,Chelsea,Art Gallery,French Restaurant,Tapas Restaurant,Coffee Shop,Italian Restaurant,Park,Chinese Restaurant,Sandwich Place,Cupcake Shop,Bookstore
3,Chinatown,Chinese Restaurant,Bakery,Spa,Dessert Shop,Optical Shop,Dim Sum Restaurant,Cocktail Bar,Salon / Barbershop,Supermarket,Ice Cream Shop
4,Clinton,Theater,Wine Shop,New American Restaurant,American Restaurant,Bar,Lounge,Gym / Fitness Center,Hotel,Hotel Bar,Sporting Goods Shop


## K-Means Clustering

In [37]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
from folium import plugins

import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


### San Francisco

In [113]:
# set number of clusters
kclusters = 10

sf_grouped_clustering = sf_grouped.drop('Neighborhood', 1)

# run k-means clustering
sf_kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sf_grouped_clustering)

# check cluster labels generated for each row in the dataframe
sf_kmeans.labels_[0:10] 

array([5, 5, 5, 5, 5, 1, 9, 5, 5, 1])

In [126]:
# add clustering labels
# sf_hoods_venues_sorted.insert(0, 'Cluster Label', sf_kmeans.labels_)

sf_merged = sf_hoods

# merge sf_grouped with sf_data to add latitude/longitude for each neighborhood
sf_merged = sf_merged.join(sf_hoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

sf_merged.head(2)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anza Vista,37.7808,-122.4432,5.0,Health & Beauty Service,Sandwich Place,Burger Joint,Yoga Studio,Farmers Market,Ethiopian Restaurant,Event Space,Exhibit,Eye Doctor,Falafel Restaurant
1,Alamo Square,37.7764,-122.4346,5.0,Park,Bus Station,Seafood Restaurant,Rock Club,Dog Run,Nightclub,Café,Historic Site,Liquor Store,Bakery


In [127]:
#drop any missing data
sf_merged.dropna(inplace=True)
sf_merged.isnull().sum()

Neighborhood              0
Latitude                  0
Longitude                 0
Cluster Label             0
1st Most Common Venue     0
2nd Most Common Venue     0
3rd Most Common Venue     0
4th Most Common Venue     0
5th Most Common Venue     0
6th Most Common Venue     0
7th Most Common Venue     0
8th Most Common Venue     0
9th Most Common Venue     0
10th Most Common Venue    0
dtype: int64

In [129]:
sf_merged["Cluster Label"]= sf_merged["Cluster Label"].astype(int)
sf_merged.dtypes

Neighborhood               object
Latitude                  float64
Longitude                 float64
Cluster Label               int32
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

In [130]:
#location of San Francisco
sf_latitude = 37.7749 
sf_longitude = -122.4194

# create map
sf_map_clusters = folium.Map(location=[sf_latitude, sf_longitude], 
                             zoom_start=12.5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# type()


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sf_merged['Latitude'], 
                                  sf_merged['Longitude'], 
                                  sf_merged['Neighborhood'], 
                                  sf_merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(sf_map_clusters)
       
sf_map_clusters

In [131]:
sf_merged["Cluster Label"].value_counts()

5    83
1    13
0     7
9     3
3     2
8     1
7     1
6     1
4     1
2     1
Name: Cluster Label, dtype: int64

### New York City

In [132]:
# set number of clusters
kclusters = 10

nyc_grouped_clustering = nyc_grouped.drop('Neighborhood', 1)

# run k-means clustering
nyc_kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nyc_grouped_clustering)

# check cluster labels generated for each row in the dataframe
nyc_kmeans.labels_[0:10] 

array([1, 2, 7, 6, 1, 4, 4, 4, 1, 1])

In [134]:
# add clustering labels
# nyc_hoods_venues_sorted.insert(0, 'Cluster Label', nyc_kmeans.labels_)

nyc_merged = nyc_hoods

# merge sf_grouped with sf_data to add latitude/longitude for each neighborhood
nyc_merged = nyc_merged.join(nyc_hoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

nyc_merged.head(2)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,40.8758,-73.9105,4,Pharmacy,Train Station,Mattress Store,Storage Facility,Supermarket,Sandwich Place,Gym / Fitness Center,Bakery,Coffee Shop,Bank
1,Inwood,40.8688,-73.922,4,Café,Frozen Yogurt Shop,Wine Bar,Bank,American Restaurant,Park,Deli / Bodega,Diner,Ice Cream Shop,Sandwich Place


In [135]:
#check/drop any missing data
# nyc_merged.dropna(inplace=True)
nyc_merged.isnull().sum()

Neighborhood              0
Latitude                  0
Longitude                 0
Cluster Label             0
1st Most Common Venue     0
2nd Most Common Venue     0
3rd Most Common Venue     0
4th Most Common Venue     0
5th Most Common Venue     0
6th Most Common Venue     0
7th Most Common Venue     0
8th Most Common Venue     0
9th Most Common Venue     0
10th Most Common Venue    0
dtype: int64

In [136]:
nyc_merged.dtypes

Neighborhood               object
Latitude                  float64
Longitude                 float64
Cluster Label               int32
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

In [137]:
#manhattan location data
m_latitude = 40.7831 
m_longitude = -73.9712

# create map
nyc_map_clusters = folium.Map(location=[m_latitude, m_longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(nyc_merged['Latitude'], nyc_merged['Longitude'], nyc_merged['Neighborhood'], nyc_merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(nyc_map_clusters)
       
nyc_map_clusters

In [138]:
nyc_merged["Cluster Label"].value_counts()

2    10
4     9
1     8
6     2
9     1
8     1
7     1
5     1
3     1
0     1
Name: Cluster Label, dtype: int64