# Capstone Project: Language diversity and neighborhood characteristics in DC

This notebook contains the code accompanying the report and presentation for my capstone project.

In [120]:
import pandas as pd
import numpy as np

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [121]:
# Load the cleaned ACS language table (one row per census tract) from the
# working directory and preview the first rows.
acs_csv_path = "Clean_ACS5000plus_DC.csv"
lgdata = pd.read_csv(acs_csv_path)
lgdata.head()

Unnamed: 0,intptlon,intptlat,statefp,african_la,arabic,tagalog,otherasian,vietnamese,korean,chinese,otherindic,hindi,persian,russian,french,spanish,english,total
0,-77.215291,38.829664,51.0,51.0,112.0,244.0,0.0,294.0,259.0,0.0,8.0,24.0,109.0,36.0,6.0,866.0,1283.0,3348.0
1,-77.128695,38.9154,51.0,47.0,17.0,11.0,42.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,32.0,107.0,5741.0,6131.0
2,-76.928277,38.997738,24.0,63.0,52.0,23.0,38.0,0.0,36.0,100.0,243.0,36.0,33.0,42.0,19.0,581.0,4257.0,5774.0
3,-76.998274,38.887757,11.0,0.0,11.0,0.0,0.0,12.0,0.0,98.0,0.0,0.0,0.0,0.0,72.0,38.0,1569.0,1821.0
4,-77.038232,38.984972,11.0,203.0,22.0,0.0,0.0,0.0,0.0,12.0,1.0,12.0,11.0,0.0,73.0,82.0,3417.0,3895.0


In [122]:
# Keep only usable DC tracts and give the columns readable names:
#   1. drop rows with a null latitude,
#   2. keep rows whose state FIPS code is 11 (DC proper),
#   3. drop the now-constant state column,
#   4. rename the coordinate and abbreviated language columns.
lgdata = (
    lgdata
    .loc[lambda df: pd.notnull(df['intptlat'])]
    .loc[lambda df: df['statefp'] == 11.0]
    .drop(['statefp'], axis=1)
    .rename(columns={
        "intptlon": "Longitude",
        "intptlat": "Latitude",
        "african_la": "african lgs",
        "otherasian": "other asian",
        "otherindic": "other indic",
    })
)

In [123]:
# Preview the filtered, renamed table.
lgdata.head()

Unnamed: 0,Longitude,Latitude,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english,total
3,-76.998274,38.887757,0.0,11.0,0.0,0.0,12.0,0.0,98.0,0.0,0.0,0.0,0.0,72.0,38.0,1569.0,1821.0
4,-77.038232,38.984972,203.0,22.0,0.0,0.0,0.0,0.0,12.0,1.0,12.0,11.0,0.0,73.0,82.0,3417.0,3895.0
9,-77.018245,38.868179,38.0,30.0,28.0,0.0,0.0,0.0,35.0,31.0,0.0,0.0,12.0,100.0,70.0,2791.0,3255.0
13,-77.038846,38.916208,17.0,12.0,40.0,0.0,0.0,14.0,27.0,0.0,0.0,19.0,0.0,61.0,148.0,2958.0,3554.0
17,-76.996558,38.830731,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1345.0,1358.0


First, map the complete set of points from the ACS data.

In [124]:
# Look up the coordinates used to centre every map in this notebook.
address = 'Washington, DC'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

In [125]:
# create map of DC, centred on the geocoded city coordinates
map_dc = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one marker per census tract
for lat, lng in zip(lgdata['Latitude'], lgdata['Longitude']):
    # Label each marker with its own coordinates (lat, lng). The previous
    # version formatted the city-centre latitude/longitude, so every popup
    # showed the same text.
    label = '{}, {}'.format(lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dc)
map_dc

Since the population size differs between census tracts, it is preferable to convert each raw number of speakers to a percentage of speakers in the census tract (by dividing by the row total). This will help us to compare the makeup of neighborhoods which may have different numbers of inhabitants.

In [126]:
# Express each language column as a percentage of the tract's total, then
# drop any rows that contain NaN after the division.
language_counts = lgdata.loc[:, "african lgs":"english"]
tract_totals = lgdata["total"]
lg_pct = (language_counts.div(tract_totals, axis=0) * 100).dropna()
lg_pct.head()

Unnamed: 0,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english
3,0.0,0.604064,0.0,0.0,0.658979,0.0,5.381658,0.0,0.0,0.0,0.0,3.953871,2.086766,86.16145
4,5.21181,0.564827,0.0,0.0,0.0,0.0,0.308087,0.025674,0.308087,0.282413,0.0,1.874198,2.105263,87.727856
9,1.167435,0.921659,0.860215,0.0,0.0,0.0,1.075269,0.952381,0.0,0.0,0.368664,3.072197,2.150538,85.745008
13,0.478334,0.337648,1.125492,0.0,0.0,0.393922,0.759707,0.0,0.0,0.534609,0.0,1.716376,4.164322,83.230163
17,0.95729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.04271


Now run KMeans on the data to cluster the neighborhoods based on their linguistic characteristics.

In [127]:
# Number of linguistic clusters to look for.
kclusters = 4

# Fit k-means on the language percentages; the fixed random_state seeds the
# centroid initialisation.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(lg_pct)

# Peek at the labels assigned to the first ten rows.
kmeans.labels_[:10]

array([2, 2, 2, 2, 1, 1, 1, 1, 0, 0], dtype=int32)

In [128]:
# Work on a copy: a plain assignment would make lgdata_merged an alias of
# lg_pct, so the columns added below would also appear in lg_pct and a re-run
# of the k-means cell would cluster on labels and coordinates as well.
lgdata_merged = lg_pct.copy()
# add clustering labels and re-insert Lat-Long (coordinates align on the
# original row index shared with lgdata)
lgdata_merged['Cluster Labels'] = kmeans.labels_
lgdata_merged['Latitude'] = lgdata['Latitude']
lgdata_merged['Longitude'] = lgdata['Longitude']
lgdata_merged.head(10)

Unnamed: 0,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english,Cluster Labels,Latitude,Longitude
3,0.0,0.604064,0.0,0.0,0.658979,0.0,5.381658,0.0,0.0,0.0,0.0,3.953871,2.086766,86.16145,2,38.887757,-76.998274
4,5.21181,0.564827,0.0,0.0,0.0,0.0,0.308087,0.025674,0.308087,0.282413,0.0,1.874198,2.105263,87.727856,2,38.984972,-77.038232
9,1.167435,0.921659,0.860215,0.0,0.0,0.0,1.075269,0.952381,0.0,0.0,0.368664,3.072197,2.150538,85.745008,2,38.868179,-77.018245
13,0.478334,0.337648,1.125492,0.0,0.0,0.393922,0.759707,0.0,0.0,0.534609,0.0,1.716376,4.164322,83.230163,2,38.916208,-77.038846
17,0.95729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.04271,1,38.830731,-76.996558
18,0.300752,0.0,0.0,0.0,0.300752,0.300752,0.0,0.0,0.0,0.250627,1.002506,1.052632,2.35589,93.984962,1,38.887741,-76.980109
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.572289,94.427711,1,38.893157,-76.958504
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.224469,97.067745,1,38.848477,-76.974367
21,1.217765,0.0,0.358166,0.0,0.0,0.0,2.148997,0.931232,0.0,0.0,0.573066,2.148997,8.524355,74.247851,0,38.911278,-77.08712
22,0.078003,2.76131,0.0,0.202808,0.0,0.280811,2.215289,0.0,0.0,1.279251,0.546022,4.056162,8.221529,74.196568,0,38.923767,-77.091734


In [129]:
# Map every tract, coloured by its linguistic cluster.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, cluster in zip(lgdata_merged['Latitude'], lgdata_merged['Longitude'], lgdata_merged['Cluster Labels']):
    # Show the tract's own coordinates in the popup; the previous version
    # formatted the city-centre latitude/longitude for every marker.
    label = '{} : {}, {}'.format(cluster, lat, lon)
    label = folium.Popup(label, parse_html=True)
    # cluster - 1 is kept from the original colour assignment: cluster 0 maps
    # to index -1, i.e. the last palette entry, so each cluster still gets a
    # distinct colour.
    folium.CircleMarker(
        [lat, lon],
        popup=label,
        radius=5,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Let's also calculate the mean language distribution in each of the four clusters to help us see what is driving the clustering.

In [130]:
# Average every column within each cluster. The language columns give the
# cluster's mean language mix; Latitude/Longitude become the mean position.
cluster_groups = lgdata_merged.groupby('Cluster Labels')
lgdata_grp = cluster_groups.mean()
lgdata_grp

Unnamed: 0_level_0,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english,Latitude,Longitude
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.160114,0.484673,0.415369,0.431374,0.163749,0.404393,1.853571,0.289995,0.2589,0.268743,0.72048,2.58085,10.658988,74.981873,38.924866,-77.048342
1,0.683017,0.053124,0.109593,0.021668,0.033595,0.059169,0.164254,0.028865,0.029342,0.019815,0.068326,0.482082,2.6094,94.996498,38.879342,-76.975013
2,1.484266,0.342874,0.290056,0.181131,0.095081,0.232748,1.117816,0.126494,0.167997,0.144016,0.349811,1.50959,6.456933,85.029644,38.918039,-77.017798
3,6.912892,0.248685,0.480918,0.042322,0.489226,0.031807,0.389248,0.310074,0.111818,0.1575,0.107678,1.642581,24.997862,62.783958,38.949983,-77.028736


In [131]:
# Replace the original row labels with a fresh 0..n-1 index (modifies the frame in place).
lgdata_merged.reset_index(drop=True, inplace=True)
lgdata_merged["Neighborhood ID"] = lgdata_merged.index + 1  # Since we have no neighborhood names, assign each lat-long pair a 1-based ID number
lgdata_merged.head()


Unnamed: 0,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english,Cluster Labels,Latitude,Longitude,Neighborhood ID
0,0.0,0.604064,0.0,0.0,0.658979,0.0,5.381658,0.0,0.0,0.0,0.0,3.953871,2.086766,86.16145,2,38.887757,-76.998274,1
1,5.21181,0.564827,0.0,0.0,0.0,0.0,0.308087,0.025674,0.308087,0.282413,0.0,1.874198,2.105263,87.727856,2,38.984972,-77.038232,2
2,1.167435,0.921659,0.860215,0.0,0.0,0.0,1.075269,0.952381,0.0,0.0,0.368664,3.072197,2.150538,85.745008,2,38.868179,-77.018245,3
3,0.478334,0.337648,1.125492,0.0,0.0,0.393922,0.759707,0.0,0.0,0.534609,0.0,1.716376,4.164322,83.230163,2,38.916208,-77.038846,4
4,0.95729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.04271,1,38.830731,-76.996558,5


In [132]:
# Reformat the dataframe so the neighborhood ID, longitude and latitude come
# first. Columns are selected by name rather than by position (-1, -2, -3),
# so re-running this cell leaves the frame unchanged instead of moving
# whichever columns happen to be last.
id_columns = ['Neighborhood ID', 'Longitude', 'Latitude']
fixed_columns = id_columns + [col for col in lgdata_merged.columns if col not in id_columns]
lgdata_merged = lgdata_merged[fixed_columns]
lgdata_merged.head()

Unnamed: 0,Neighborhood ID,Longitude,Latitude,african lgs,arabic,tagalog,other asian,vietnamese,korean,chinese,other indic,hindi,persian,russian,french,spanish,english,Cluster Labels
0,1,-76.998274,38.887757,0.0,0.604064,0.0,0.0,0.658979,0.0,5.381658,0.0,0.0,0.0,0.0,3.953871,2.086766,86.16145,2
1,2,-77.038232,38.984972,5.21181,0.564827,0.0,0.0,0.0,0.0,0.308087,0.025674,0.308087,0.282413,0.0,1.874198,2.105263,87.727856,2
2,3,-77.018245,38.868179,1.167435,0.921659,0.860215,0.0,0.0,0.0,1.075269,0.952381,0.0,0.0,0.368664,3.072197,2.150538,85.745008,2
3,4,-77.038846,38.916208,0.478334,0.337648,1.125492,0.0,0.0,0.393922,0.759707,0.0,0.0,0.534609,0.0,1.716376,4.164322,83.230163,2
4,5,-76.996558,38.830731,0.95729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.04271,1


<h2>Is there a relationship between linguistic groups and venue distribution in DC neighborhoods?</h2>

The number of geospatial points provided in the ACS data is larger than desirable for gathering data about neighborhoods from Foursquare. Therefore, the first step is to group and reduce the number of points to be queried.

In [133]:
# Take an explicit copy of the coordinate columns so that columns added to
# `tracts` later do not trigger pandas' SettingWithCopyWarning.
tracts = lgdata_merged[['Latitude', 'Longitude']].copy()

In [134]:
# Number of spatial groups ("neighborhoods") to reduce the tracts to.
kclusters = 30

# Fit k-means on the tract coordinates only.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(tracts)

# Peek at the labels assigned to the first ten tracts.
kmeans.labels_[:10]

array([ 5, 28, 14, 20,  3, 13, 24, 25,  9, 15], dtype=int32)

In [135]:
# add clustering labels
# `tracts` was created as a slice of another frame; assigning a new column to
# it directly raises SettingWithCopyWarning. Rebinding to an explicit copy
# first makes the assignment unambiguous.
tracts = tracts.copy()
tracts['Cluster Labels'] = kmeans.labels_
tracts.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Latitude,Longitude,Cluster Labels
0,38.887757,-76.998274,5
1,38.984972,-77.038232,28
2,38.868179,-77.018245,14
3,38.916208,-77.038846,20
4,38.830731,-76.996558,3
5,38.887741,-76.980109,13
6,38.893157,-76.958504,24
7,38.848477,-76.974367,25
8,38.911278,-77.08712,9
9,38.923767,-77.091734,15


In [196]:
# Collapse each spatial cluster to its mean latitude/longitude and use the
# cluster label as the neighborhood identifier.
neighborhoods = (
    tracts
    .groupby('Cluster Labels')
    .mean()
    .reset_index()
    .rename(columns={'Cluster Labels': 'Neighborhood ID'})
)
neighborhoods.head()

Unnamed: 0,Neighborhood ID,Latitude,Longitude
0,0,38.882135,-76.942816
1,1,38.906113,-77.025208
2,2,38.963332,-77.012785
3,3,38.832466,-76.997726
4,4,38.939968,-77.07626


Map the neighborhoods to see whether they're well distributed.

In [167]:
# create map of DC, centred on the geocoded city coordinates
map_dcneigh = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one marker per neighborhood
for cluster, lat, lng in zip(neighborhoods['Neighborhood ID'], neighborhoods['Latitude'], neighborhoods['Longitude']):
    # Show the neighborhood's own coordinates in the popup; the previous
    # version formatted the city-centre latitude/longitude for every marker.
    label = '{}: {}, {}'.format(cluster, lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dcneigh)
map_dcneigh

In [138]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair previously committed in this cell should be
# regarded as exposed and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [197]:
def getNearbyVenues(neighbs, latitudes, longitudes, radius=500, LIMIT=50):
    """Query the Foursquare ``venues/explore`` endpoint around each point.

    Parameters
    ----------
    neighbs, latitudes, longitudes : iterables of equal length
        Identifier, latitude and longitude of each point to query; they are
        consumed in parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    LIMIT : int, default 50
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood ID',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venues are returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET and VERSION values.
    """
    columns = ['Neighborhood ID',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for neighb, lat, lng in zip(neighbs, latitudes, longitudes):

        # Pass the query as parameters so requests handles URL encoding.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status reports API errors directly instead of as a
        # KeyError on the missing payload below.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=params,
            timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                neighb,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Venues without a category get None instead of raising
                # IndexError.
                categories[0]['name'] if categories else None))

    # Supplying the column names at construction time also works when `rows`
    # is empty; assigning seven names to an empty frame would raise.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [198]:
# Fetch nearby venues for every neighborhood point (one API request per row).
dc_venues = getNearbyVenues(
    neighbs=neighborhoods['Neighborhood ID'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
)

In [199]:
# Preview up to the first 50 rows of the venue table.
dc_venues.head(50)

Unnamed: 0,Neighborhood ID,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,38.882135,-76.942816,Fort DuPont Park,38.880704,-76.943986,Park
1,0,38.882135,-76.942816,Texas Convenience Store,38.883341,-76.942113,Convenience Store
2,0,38.882135,-76.942816,Williams Sewer and Drain Service LLC,38.883823,-76.940742,Home Service
3,0,38.882135,-76.942816,Mearry Berger,38.882222,-76.938812,Burger Joint
4,0,38.882135,-76.942816,Tha Money Club Empire,38.880531,-76.937916,Record Shop
5,1,38.906113,-77.025208,La Colombe Coffee Roasters,38.906584,-77.024952,Coffee Shop
6,1,38.906113,-77.025208,The Columbia Room,38.906395,-77.024534,Cocktail Bar
7,1,38.906113,-77.025208,El Rinconcito Cafe,38.90545,-77.02689,Latin American Restaurant
8,1,38.906113,-77.025208,SUNdeVICH,38.907726,-77.024465,Sandwich Place
9,1,38.906113,-77.025208,Seylou,38.907203,-77.025085,Bakery


In [200]:
# Count the non-null entries per column for each neighborhood, i.e. how many
# venues were returned for it.
venues_by_neighborhood = dc_venues.groupby('Neighborhood ID')
venues_by_neighborhood.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5,5,5,5,5,5
1,50,50,50,50,50,50
2,9,9,9,9,9,9
3,4,4,4,4,4,4
4,14,14,14,14,14,14
5,34,34,34,34,34,34
6,29,29,29,29,29,29
7,35,35,35,35,35,35
8,6,6,6,6,6,6
9,2,2,2,2,2,2


In [201]:
# One indicator column per venue category (no prefix on the column names).
dc_onehot = pd.get_dummies(dc_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighborhood identifier; it is appended as the last column.
dc_onehot['Neighborhood ID'] = dc_venues['Neighborhood ID']

# Rotate the column order so that the last column (the identifier) comes first.
column_order = list(dc_onehot.columns)
fixed_columns = column_order[-1:] + column_order[:-1]
dc_onehot = dc_onehot[fixed_columns]

dc_onehot.head()

Unnamed: 0,Neighborhood ID,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,Bagel Shop,...,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [202]:
# The mean of each indicator column within a neighborhood is the share of that
# neighborhood's venues that fall in the category.
venue_groups = dc_onehot.groupby('Neighborhood ID')
dc_grouped = venue_groups.mean().reset_index()
dc_grouped

Unnamed: 0,Neighborhood ID,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,Bagel Shop,...,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.08,0.02,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,...,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0
6,6,0.0,0.034483,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0
7,7,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.485714
8,8,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [203]:
# set number of clusters
kclusters = 5

# Cluster on the venue-category frequencies only.
# * `columns=` replaces the positional axis argument (`drop(label, 1)`), which
#   pandas 2.0 no longer accepts.
# * 'Cluster', 'Latitude' and 'Longitude' are added to dc_grouped by a later
#   cell; dropping them here (ignored when absent) keeps a re-run of this cell
#   from clustering on its own previous labels and on coordinates.
non_feature_columns = ['Neighborhood ID', 'Cluster', 'Latitude', 'Longitude']
dc_grouped_clustering = dc_grouped.drop(columns=non_feature_columns, errors='ignore')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dc_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 3, 0, 0, 0, 0, 0, 0], dtype=int32)

In [204]:
# Re-inspect the per-neighborhood category frequencies.
dc_grouped.head()

Unnamed: 0,Neighborhood ID,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,Bagel Shop,...,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.08,0.02,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
# add clustering labels
dc_grouped['Cluster'] = kmeans.labels_

# Look the coordinates up by Neighborhood ID rather than by row position.
# dc_grouped only contains neighborhoods for which venues were returned, so
# its row index can differ from that of `neighborhoods`; a plain column
# assignment aligns on the index and would attach the wrong coordinates.
neighborhood_coords = neighborhoods.set_index('Neighborhood ID')
dc_grouped['Latitude'] = dc_grouped['Neighborhood ID'].map(neighborhood_coords['Latitude'])
dc_grouped['Longitude'] = dc_grouped['Neighborhood ID'].map(neighborhood_coords['Longitude'])
dc_grouped.head(30)

Unnamed: 0,Neighborhood ID,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,Bagel Shop,...,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit,Cluster,Latitude,Longitude
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,38.882135,-76.942816
1,1,0.0,0.08,0.02,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0,38.906113,-77.025208
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,38.963332,-77.012785
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,38.832466,-76.997726
4,4,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,38.939968,-77.07626
5,5,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,...,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0,38.887557,-76.993991
6,6,0.0,0.034483,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0,38.911498,-77.008854
7,7,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.485714,0,38.92888,-77.047752
8,8,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,38.865666,-76.980366
9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,38.91529,-77.07507


In [207]:
# Map every neighborhood, coloured by its venue cluster.
map_Vclusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings. The unused
# helper variables (x, ys, markers_colors) were removed; `ys` only ever
# supplied its length, which equals kclusters.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, cluster, neighborhood in zip(dc_grouped['Latitude'], dc_grouped['Longitude'], dc_grouped['Cluster'], dc_grouped['Neighborhood ID']):
    label = '{} : cluster {}'.format(neighborhood, cluster)
    label = folium.Popup(label, parse_html=True)
    # cluster - 1 is kept from the original colour assignment: cluster 0 maps
    # to index -1, i.e. the last palette entry, so each cluster still gets a
    # distinct colour.
    folium.CircleMarker(
        [lat, lon],
        popup=label,
        radius=5,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_Vclusters)

map_Vclusters

This doesn't look terribly informative since most neighborhoods belong to the same group, but let's look at the top venues to see if there is anything we can use.

In [208]:
# Strip the map/cluster bookkeeping columns so that only the neighborhood
# identifier and the venue-category frequencies remain.
bookkeeping_columns = ['Latitude', 'Longitude', 'Cluster']
dc_analyze = dc_grouped.drop(columns=bookkeeping_columns)
dc_analyze.head()

Unnamed: 0,Neighborhood ID,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,Bagel Shop,...,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.08,0.02,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [209]:
# How many of the most frequent venue categories to print per neighborhood.
num_top_venues = 5

for hood in dc_analyze['Neighborhood ID']:
    print(hood)
    # Turn this neighborhood's single row into a two-column (venue, freq) table.
    is_hood = dc_analyze['Neighborhood ID'] == hood
    temp = dc_analyze[is_hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    # The first row of the transposed table is the 'Neighborhood ID' entry
    # itself, not a venue category.
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

0
               venue  freq
0       Home Service   0.2
1  Convenience Store   0.2
2        Record Shop   0.2
3               Park   0.2
4       Burger Joint   0.2


1
                     venue  freq
0      American Restaurant  0.08
1  New American Restaurant  0.08
2                      Bar  0.06
3              Coffee Shop  0.06
4                   Bakery  0.04


2
                 venue  freq
0  Rental Car Location  0.22
1              Brewery  0.11
2                 Park  0.11
3   Athletics & Sports  0.11
4      Automotive Shop  0.11


3
            venue  freq
0            Park  0.50
1    Costume Shop  0.25
2   Grocery Store  0.25
3  Pilates Studio  0.00
4       Nightclub  0.00


4
                venue  freq
0                 Gym  0.14
1  African Restaurant  0.07
2             Theater  0.07
3      Shipping Store  0.07
4         Coffee Shop  0.07


5
                 venue  freq
0          Coffee Shop  0.09
1          Pizza Place  0.09
2                 Park  0.06
3           Food

Cluster 0 is too heterogeneous to be informative. For example, it includes both neighborhood 18, adjacent to Bolling Air Force Base in the south of the city, whose top five venue types are hotel, baseball field, stationery store, sandwich place, and playground, and neighborhood 1, in Chinatown in the heart of downtown, whose top venues are American Restaurant, New American Restaurant, Bar, Coffee Shop, and Bakery. What is special about the neighborhoods that are NOT assigned to cluster 0?




Point 24, cluster 2, in SE, has only two venues: a tennis court and a pool. Point 28, cluster 4 in the north of the city, close to Silver Spring, also has only two venues: a trail and a restaurant. This suggests that the main characteristic driving the clustering is a shortage of venues in some parts of the city. However, this shortage of venues is in fact a shortcoming of the Foursquare data: not enough venues have been added to the data for some parts of DC for us to effectively conduct the analysis intended.