## Final Project for IBM Data Science Capstone ##
## Comparison of neighborhoods in Buenos Aires, Argentina and Montevideo, Uruguay ##
## Buenos Aires

Import all the libraries required for the project

In [6]:
import pandas as pd
import numpy as np
# 
import requests
import geocoder
from geopy.geocoders import Nominatim 
#
import matplotlib.cm as cm
import matplotlib.colors as colors
# 
from sklearn.cluster import KMeans
#
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
#
import json 
from pandas.io.json import json_normalize
#
pd.set_option('display.max_rows', 500)

Import BA Neighborhood data

In [7]:
df_ba = pd.read_csv("/Users/davidknight/Downloads/DS-ba.csv", sep=',', encoding='maccentraleurope')
df_ba.shape

(48, 4)

In [8]:
print('The Buenos Aires dataframe has {} unique neighborhoods.'.format(
        len(df_ba['Neighborhood'].unique())))

The Buenos Aires dataframe has 48 unique neighborhoods.


Register logon information with Foursquare (Data has been masked).

In [11]:
CLIENT_ID = 'hello' # your Foursquare ID
CLIENT_SECRET = 'how are you?' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
# print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:


getNearbyVenues takes a dataframe and returns the venues for each row in the dataframe.

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
neighborhood_latitude = df_ba.loc[0, 'Lat'] # neighborhood latitude value
neighborhood_longitude = df_ba.loc[0, 'Long'] # neighborhood longitude value
neighborhood_name = df_ba.loc[0, 'Neighborhood'] # neighborhood name

Build a URL for each call to Foursquare

In [14]:
LIMIT = 100
#
radius = 1000 # define radius
#
url_ba = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url_ba

'https://api.foursquare.com/v2/venues/explore?&client_id=4O4EX12OQOIHEDWFXUJALPEJ1GUNW14U3UWR0KOXRHBLGICK&client_secret=TRVJAMEIG3SJY41TLPSPPPSUTLMWFNPCFVCMO23BVYKGVRHA&v=20180605&ll=-34.6,-58.4833&radius=1000&limit=100'

In [15]:
results_ba = requests.get(url_ba).json()

In [18]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Build a list of venues by neighborhood.

In [20]:
ba_venues = getNearbyVenues(names=df_ba['Neighborhood'],
                                   latitudes=df_ba['Lat'],
                                   longitudes=df_ba['Long']
                                  )

Agronom°a
Almagro
Balvanera
Barracas
Belgrano
Boedo
Caballito
Chacarita
Coghlan
Colegiales
ConstituciĘn
Flores
Floresta
La Boca
La Paternal
Liniers
Mataderos
Monte Castro
Monserrat
Nueva Pompeya
N£§ez
Palermo
Parque Avellaneda
Parque Chacabuco
Parque Chas
Parque Patricios
Puerto Madero
Recoleta
Retiro
Saavedra
San CristĘbal
San Nicol†s
San Telmo
Vālez S†rsfield
Versalles
Villa Crespo
Villa del Parque
Villa Devoto
Villa General Mitre
Villa Lugano
Villa Luro
Villa Ort£zar
Villa PueyrredĘn
Villa Real
Villa Riachuelo
Villa Santa Rita
Villa Soldati
Villa Urquiza


In [21]:
print('The Buenos Aires dataframe has {} unique neighborhoods.'.format(
        len(ba_venues['Neighborhood'].unique())))

The Buenos Aires dataframe has 48 unique neighborhoods.


Count venues by neighborhood.

In [24]:
ba_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agronom°a,8,8,8,8,8,8
Almagro,59,59,59,59,59,59
Balvanera,31,31,31,31,31,31
Barracas,4,4,4,4,4,4
Belgrano,44,44,44,44,44,44
Boedo,8,8,8,8,8,8
Caballito,20,20,20,20,20,20
Chacarita,22,22,22,22,22,22
Coghlan,14,14,14,14,14,14
Colegiales,31,31,31,31,31,31


Count unique categories.  

In [25]:
print('There are {} uniques categories.'.format(len(ba_venues['Venue Category'].unique())))

There are 185 uniques categories.


Add one hot encoding

In [26]:
# one hot encoding
ba_onehot = pd.get_dummies(ba_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ba_onehot['Neighborhood'] = ba_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ba_onehot.columns[-1]] + list(ba_onehot.columns[:-1])
ba_onehot = ba_onehot[fixed_columns]

ba_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Amphitheater,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Athletics & Sports,Auto Garage,Auto Workshop,...,Used Bookstore,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agronom°a,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Agronom°a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Agronom°a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Agronom°a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agronom°a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
ba_onehot.shape

(1201, 186)

In [29]:
ba_grouped = ba_onehot.groupby('Neighborhood').mean().reset_index()
ba_grouped

Unnamed: 0,Neighborhood,Accessories Store,Amphitheater,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Athletics & Sports,Auto Garage,Auto Workshop,...,Used Bookstore,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agronom°a,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Almagro,0.0,0.0,0.033898,0.0,0.0,0.0,0.0,0.016949,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Balvanera,0.0,0.0,0.096774,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032258
3,Barracas,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Belgrano,0.0,0.0,0.045455,0.0,0.022727,0.0,0.0,0.0,0.0,...,0.0,0.045455,0.0,0.0,0.0,0.0,0.022727,0.0,0.022727,0.022727
5,Boedo,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Caballito,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Chacarita,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Coghlan,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Colegiales,0.0,0.0,0.032258,0.0,0.0,0.0,0.032258,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
ba_grouped.shape

(47, 174)

Show top 5 venues by neighborhood

In [30]:
num_top_venues = 5

for hood in ba_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ba_grouped[ba_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agronom°a----
                venue  freq
0         Snack Place  0.12
1       Design Studio  0.12
2   Jewish Restaurant  0.12
3            Bus Stop  0.12
4  Athletics & Sports  0.12


----Almagro----
           venue  freq
0        Theater  0.07
1           Café  0.07
2         Bakery  0.05
3    Pizza Place  0.05
4  Deli / Bodega  0.03


----Balvanera----
                    venue  freq
0  Argentinian Restaurant  0.10
1             Pizza Place  0.10
2     Japanese Restaurant  0.10
3      Spanish Restaurant  0.10
4                   Hotel  0.06


----Barracas----
                    venue  freq
0  Argentinian Restaurant  0.25
1                    Park  0.25
2       Electronics Store  0.25
3           Grocery Store  0.25
4       Accessories Store  0.00


----Belgrano----
                    venue  freq
0             Coffee Shop  0.11
1             Pizza Place  0.07
2  Argentinian Restaurant  0.05
3                Tea Room  0.05
4      Italian Restaurant  0.05


----Boedo----
        

Generic function return_most_common_venues

In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Retrieve top 10 venues by neighborhood.

In [32]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ba_grouped['Neighborhood']

for ind in np.arange(ba_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ba_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(49)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agronom°a,Design Studio,Bus Stop,Jewish Restaurant,Athletics & Sports,Snack Place,Photography Studio,Coffee Shop,Grocery Store,Falafel Restaurant,Food Service
1,Almagro,Café,Theater,Bakery,Pizza Place,Pub,Martial Arts Dojo,Empanada Restaurant,Deli / Bodega,Coffee Shop,Gym / Fitness Center
2,Balvanera,Pizza Place,Argentinian Restaurant,Japanese Restaurant,Spanish Restaurant,Hotel,Café,Bar,Sandwich Place,Coffee Shop,Fast Food Restaurant
3,Barracas,Argentinian Restaurant,Park,Electronics Store,Grocery Store,Yoga Studio,Falafel Restaurant,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant
4,Belgrano,Coffee Shop,Pizza Place,Bookstore,Ice Cream Shop,Argentinian Restaurant,Italian Restaurant,Vegetarian / Vegan Restaurant,Tea Room,Burger Joint,Smoke Shop
5,Boedo,Argentinian Restaurant,Intersection,Grocery Store,Ice Cream Shop,Pizza Place,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant,Flea Market
6,Caballito,Soccer Stadium,Grocery Store,Bistro,Pizza Place,Bus Station,Stadium,Sports Club,Coffee Shop,Soccer Field,Basketball Court
7,Chacarita,Pizza Place,Argentinian Restaurant,Bar,Bakery,Brewery,Spanish Restaurant,Dessert Shop,Office,Farmers Market,Coffee Shop
8,Coghlan,Argentinian Restaurant,Café,Pizza Place,Brewery,BBQ Joint,Gym / Fitness Center,Bakery,Food Service,History Museum,Bus Stop
9,Colegiales,Bakery,Pizza Place,Restaurant,Coffee Shop,Café,Hotel,Furniture / Home Store,Supermarket,Lounge,Brewery


## Clustering ##

In [34]:
# set number of clusters
kclusters = 3

ba_grouped_clustering = ba_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ba_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 1, 2, 1, 2, 2, 2, 2], dtype=int32)

In [37]:
# add clustering labels
#
# ** UNcomment This!!  **
#
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ba_merged = df_ba

# merge ba_grouped with ba_data to add latitude/longitude for each neighborhood
ba_merged = ba_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how="inner")


Create a Map using Folium

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ba_merged['Lat'], ba_merged['Long'], ba_merged['Neighborhood'], ba_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
map_clusters.save("/Users/davidknight/Downloads/buenosaires_clusters.html")

Generate reports by cluster showing most common venues.

In [42]:
ba_merged.loc[ba_merged['Cluster Labels'] == 0, ba_merged.columns[[0] + list(range(5, ba_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
39,Villa Lugano,Grocery Store,Supermarket,Light Rail Station,Bus Stop,Yoga Studio,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant,Flea Market
46,Villa Soldati,Light Rail Station,Soccer Stadium,Beer Bar,Yoga Studio,Falafel Restaurant,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant,Flea Market


In [44]:
ba_merged.loc[ba_merged['Cluster Labels'] == 1, ba_merged.columns[[0] + list(range(5, ba_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Barracas,Argentinian Restaurant,Park,Electronics Store,Grocery Store,Yoga Studio,Falafel Restaurant,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant
5,Boedo,Argentinian Restaurant,Intersection,Grocery Store,Ice Cream Shop,Pizza Place,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant,Flea Market
15,Liniers,Argentinian Restaurant,Burger Joint,Bike Shop,Pizza Place,Yoga Studio,Farmers Market,Furniture / Home Store,French Restaurant,Food Service,Food & Drink Shop
16,Mataderos,Argentinian Restaurant,Soccer Stadium,Café,History Museum,Yoga Studio,Falafel Restaurant,French Restaurant,Food Service,Food & Drink Shop,Fondue Restaurant
23,Parque Chacabuco,Athletics & Sports,Ice Cream Shop,Grocery Store,Mediterranean Restaurant,Gym,Pizza Place,Latin American Restaurant,Beer Bar,Restaurant,Argentinian Restaurant
42,Villa PueyrredĘn,Ice Cream Shop,Argentinian Restaurant,Train Station,Bus Station,Soccer Field,Nightclub,Fast Food Restaurant,Tennis Court,Electronics Store,Grocery Store
45,Villa Santa Rita,Ice Cream Shop,Argentinian Restaurant,Pizza Place,Pharmacy,Soccer Field,Sports Club,Café,Yoga Studio,Falafel Restaurant,Food Service


In [45]:
ba_merged.loc[ba_merged['Cluster Labels'] == 2, ba_merged.columns[[0] + list(range(5, ba_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agronom°a,Design Studio,Bus Stop,Jewish Restaurant,Athletics & Sports,Snack Place,Photography Studio,Coffee Shop,Grocery Store,Falafel Restaurant,Food Service
1,Almagro,Café,Theater,Bakery,Pizza Place,Pub,Martial Arts Dojo,Empanada Restaurant,Deli / Bodega,Coffee Shop,Gym / Fitness Center
2,Balvanera,Pizza Place,Argentinian Restaurant,Japanese Restaurant,Spanish Restaurant,Hotel,Café,Bar,Sandwich Place,Coffee Shop,Fast Food Restaurant
4,Belgrano,Coffee Shop,Pizza Place,Bookstore,Ice Cream Shop,Argentinian Restaurant,Italian Restaurant,Vegetarian / Vegan Restaurant,Tea Room,Burger Joint,Smoke Shop
6,Caballito,Soccer Stadium,Grocery Store,Bistro,Pizza Place,Bus Station,Stadium,Sports Club,Coffee Shop,Soccer Field,Basketball Court
7,Chacarita,Pizza Place,Argentinian Restaurant,Bar,Bakery,Brewery,Spanish Restaurant,Dessert Shop,Office,Farmers Market,Coffee Shop
8,Coghlan,Argentinian Restaurant,Café,Pizza Place,Brewery,BBQ Joint,Gym / Fitness Center,Bakery,Food Service,History Museum,Bus Stop
9,Colegiales,Bakery,Pizza Place,Restaurant,Coffee Shop,Café,Hotel,Furniture / Home Store,Supermarket,Lounge,Brewery
10,ConstituciĘn,Restaurant,Café,Cultural Center,Skate Park,Bus Station,Plaza,Performing Arts Venue,Fondue Restaurant,Flea Market,Fast Food Restaurant
11,Flores,Pharmacy,Pizza Place,Intersection,Café,Bakery,Sports Club,Lounge,Metro Station,Nightclub,Fast Food Restaurant
