 #  The Battle of Neighborhoods Capstone - San Francisco

* Install/import the necessary python libraries for clustering, map plotting and data cleaninh

In [3]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
!conda install -c conda-forge folium=0.5.0
import folium #map plotting
#importing matplotlib and necessary modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

* Get initial look at zip code data for San Francisco

In [4]:
response = requests.get("http://www.healthysf.org/bdi/outcomes/zipmap.htm")
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")
df = pd.read_html(str(table))
df = pd.DataFrame(df[4])
df.columns = df.iloc[0]
# Sort neighborhoods by population
df.sort_values(by=['Population (Census 2000)'], inplace=True)
# We see that Chinatown is the smallest neighborhood and Inner Mission/Bernal Heights  is the largest neighborhood
df = df.iloc[1:-1, :-1]
df_sf = df
df_sf.head()

Unnamed: 0,Zip Code,Neighborhood
3,94107,Potrero Hill
17,94127,St. Francis Wood/Miraloma/West Portal
15,94123,Marina
2,94103,South of Market
19,94132,Lake Merced


In [5]:
!pip install uszipcode
from uszipcode import SearchEngine

search = SearchEngine(simple_zipcode=True)

latitude = []
longitude = []

for index, row in df_sf.iterrows():
    zipcode = search.by_zipcode(row["Zip Code"]).to_dict()
    latitude.append(zipcode.get("lat"))
    longitude.append(zipcode.get("lng"))

df_sf["Latitude"] = latitude
df_sf["Longitude"] = longitude

# Drop any null values
df_sf.dropna(inplace=True)
df_sf.head()

Collecting uszipcode
[?25l  Downloading https://files.pythonhosted.org/packages/bc/94/1b908c6fe2008f0e913b0b2d97951aa76e00ec1044883c012afb2e477b4a/uszipcode-0.2.4-py2.py3-none-any.whl (378kB)
[K     |████████████████████████████████| 378kB 7.4MB/s eta 0:00:01
[?25hCollecting pathlib-mate (from uszipcode)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/f2/a1e6044fe90784e7bbc05286f2e8616aa2ff167f7275f5a6f2df479092c0/pathlib_mate-0.0.15-py2.py3-none-any.whl (195kB)
[K     |████████████████████████████████| 204kB 17.0MB/s eta 0:00:01
Collecting autopep8 (from pathlib-mate->uszipcode)
[?25l  Downloading https://files.pythonhosted.org/packages/12/55/7b07585ca0c30e5b216e4d627f82f96f1a7e82d2dd727b1f926cb3f3d58b/autopep8-1.5.tar.gz (116kB)
[K     |████████████████████████████████| 122kB 12.1MB/s eta 0:00:01
Building wheels for collected packages: autopep8
  Building wheel for autopep8 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/2b/

Unnamed: 0,Zip Code,Neighborhood,Latitude,Longitude
3,94107,Potrero Hill,37.77,-122.39
17,94127,St. Francis Wood/Miraloma/West Portal,37.73,-122.46
15,94123,Marina,37.8,-122.44
2,94103,South of Market,37.78,-122.41
19,94132,Lake Merced,37.72,-122.48


* An initial glance at San Francisco

In [6]:
latitude = 37.7792808
longitude = -122.4192363
sf_map = folium.Map(location = [latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(df_sf['Latitude'], df_sf['Longitude'], df_sf['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(sf_map)  
    
sf_map

### Clustering Neighborhoods in SF

* Get FourSquare API credentials

In [7]:
CLIENT_ID = '1ETHRMMH2RJA3VQRGIXOEMF5TRYVL55AI1I432GTOAWWXGKP' 
CLIENT_SECRET = 'IYX1A0RDTDJV2RNAUYED2G4MCOWEIHLYGYS1RRBM3BPEUGNT' 
VERSION = '20180604'
LIMIT = 50

In [141]:
# Abstract code for grabbing venues for the different neighborhoods in SF using the API

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=600):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        # GET request for venues
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [16]:
# Comparing the smallest neighborhoods with other possibly similar neighborhoods and the largest neighborhood
df_ch = df_sf[df_sf['Neighborhood'].str.contains('Chinatown') | df_sf['Neighborhood'].str.contains('North Beach/Chinatown') | 
              df_sf['Neighborhood'].str.contains('Western Addition/Japantown') | df_sf['Neighborhood'].str.contains('Heights') 
              | df_sf['Neighborhood'].str.contains('Hill') | df_sf['Neighborhood'].str.contains('Market')]

sf_venues = getNearbyVenues(names = df_ch['Neighborhood'],
                                   latitudes = df_sf['Latitude'],
                                   longitudes = df_sf['Longitude'])
                                  
df_ch.head()
sf_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Potrero Hill,37.77,-122.39,SPARK Social SF,37.770762,-122.391689,Street Food Gathering
1,Potrero Hill,37.77,-122.39,Stagecoach Greens,37.770867,-122.390261,Mini Golf
2,Potrero Hill,37.77,-122.39,Spro Coffeelab,37.770835,-122.3914,Coffee Shop
3,Potrero Hill,37.77,-122.39,SFFSoccer Mission Bay Field,37.770886,-122.392197,Soccer Field
4,Potrero Hill,37.77,-122.39,Señor Sisig,37.770809,-122.391437,Food Truck


In [17]:
sf_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hayes Valley/Tenderloin/North of Market,50,50,50,50,50,50
Inner Mission/Bernal Heights,50,50,50,50,50,50
North Beach/Chinatown,50,50,50,50,50,50
Parkside/Forest Hill,50,50,50,50,50,50
Polk/Russian Hill (Nob Hill),36,36,36,36,36,36
Potrero Hill,50,50,50,50,50,50
South of Market,28,28,28,28,28,28
Western Addition/Japantown,19,19,19,19,19,19


In [18]:
# use one hot encoding for categorical variables (Venue Category)
sf_onehot = pd.get_dummies(sf_venues[['Venue Category']], prefix = "", prefix_sep = "")

# add neighborhood column back to dataframe
sf_onehot['Neighborhood'] = sf_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sf_onehot.columns[-1]] + list(sf_onehot.columns[:-1])
sf_onehot = sf_onehot[fixed_columns]

sf_onehot.head()

Unnamed: 0,Neighborhood,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Theater,Tiki Bar,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Potrero Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Potrero Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Potrero Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Potrero Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Potrero Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
sf_grouped = sf_onehot.groupby('Neighborhood').mean().reset_index()
sf_grouped.head(10)

Unnamed: 0,Neighborhood,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Theater,Tiki Bar,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Hayes Valley/Tenderloin/North of Market,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.04,0.0,...,0.06,0.0,0.0,0.0,0.0,0.0,0.06,0.02,0.0,0.0
1,Inner Mission/Bernal Heights,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.04,0.0,...,0.04,0.02,0.0,0.0,0.04,0.0,0.02,0.04,0.02,0.0
2,North Beach/Chinatown,0.02,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0
3,Parkside/Forest Hill,0.02,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0
4,Polk/Russian Hill (Nob Hill),0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,...,0.0,0.0,0.027778,0.027778,0.0,0.027778,0.0,0.0,0.0,0.027778
5,Potrero Hill,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
6,South of Market,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429
7,Western Addition/Japantown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        # append 'st', 'nd', 'rd' to the top 3 venues
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = sf_grouped['Neighborhood']

for ind in np.arange(sf_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sf_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Hayes Valley/Tenderloin/North of Market,Coffee Shop,Vietnamese Restaurant,Theater,Pizza Place,Food Truck,Café,Bar,Bakery,Sandwich Place,Marijuana Dispensary
1,Inner Mission/Bernal Heights,French Restaurant,Performing Arts Venue,Sushi Restaurant,Concert Hall,Wine Bar,Vegetarian / Vegan Restaurant,Indian Restaurant,Theater,Cocktail Bar,Bakery
2,North Beach/Chinatown,Wine Bar,Gym / Fitness Center,French Restaurant,Deli / Bodega,Salad Place,Sandwich Place,Thai Restaurant,Italian Restaurant,Alternative Healer,Chinese Restaurant
3,Parkside/Forest Hill,Wine Bar,Gym / Fitness Center,French Restaurant,Deli / Bodega,Salad Place,Sandwich Place,Thai Restaurant,Italian Restaurant,Alternative Healer,Chinese Restaurant
4,Polk/Russian Hill (Nob Hill),Bookstore,American Restaurant,Gift Shop,Italian Restaurant,Park,Coffee Shop,Sandwich Place,Restaurant,Burger Joint,Bus Station


In [23]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 3
sf_grouped_clustering = sf_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(sf_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 2, 2, 2, 0, 0, 1], dtype=int32)

In [24]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

sf_merged = df_sf
sf_merged = sf_merged.merge(neighborhoods_venues_sorted, on = 'Neighborhood')

sf_merged.head()

Unnamed: 0,Zip Code,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,94107,Potrero Hill,37.77,-122.39,0,Food Truck,Coffee Shop,Pharmacy,Park,Café,New American Restaurant,Street Food Gathering,Pizza Place,Mediterranean Restaurant,Mini Golf
1,94103,South of Market,37.78,-122.41,0,Chinese Restaurant,Yoga Studio,Café,Pizza Place,Shipping Store,Bus Line,Pool Hall,Coffee Shop,Playground,Pharmacy
2,94133,North Beach/Chinatown,37.8,-122.44,2,Wine Bar,Gym / Fitness Center,French Restaurant,Deli / Bodega,Salad Place,Sandwich Place,Thai Restaurant,Italian Restaurant,Alternative Healer,Chinese Restaurant
3,94102,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,0,Coffee Shop,Vietnamese Restaurant,Theater,Pizza Place,Food Truck,Café,Bar,Bakery,Sandwich Place,Marijuana Dispensary
4,94115,Western Addition/Japantown,37.79,-122.44,1,Pizza Place,Café,Mexican Restaurant,Sandwich Place,Juice Bar,Burger Joint,Dog Run,Rental Car Location,Coffee Shop,Gym


In [25]:

# create map
latitude = 37.7792808
longitude = -122.4192363
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sf_merged['Latitude'], sf_merged['Longitude'], sf_merged['Neighborhood'], sf_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_clusters)
       
map_clusters

In [26]:
# Exploring the clusters formed to verify our initial assumption

In [27]:
sf_merged.loc[sf_merged['Cluster Labels'] == 1, sf_merged.columns[[1] + list(range(5, sf_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Western Addition/Japantown,Pizza Place,Café,Mexican Restaurant,Sandwich Place,Juice Bar,Burger Joint,Dog Run,Rental Car Location,Coffee Shop,Gym


In [28]:
sf_merged.loc[sf_merged['Cluster Labels'] == 2, sf_merged.columns[[1] + list(range(5, sf_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North Beach/Chinatown,Wine Bar,Gym / Fitness Center,French Restaurant,Deli / Bodega,Salad Place,Sandwich Place,Thai Restaurant,Italian Restaurant,Alternative Healer,Chinese Restaurant
5,Parkside/Forest Hill,Wine Bar,Gym / Fitness Center,French Restaurant,Deli / Bodega,Salad Place,Sandwich Place,Thai Restaurant,Italian Restaurant,Alternative Healer,Chinese Restaurant
6,Polk/Russian Hill (Nob Hill),Bookstore,American Restaurant,Gift Shop,Italian Restaurant,Park,Coffee Shop,Sandwich Place,Restaurant,Burger Joint,Bus Station
7,Inner Mission/Bernal Heights,French Restaurant,Performing Arts Venue,Sushi Restaurant,Concert Hall,Wine Bar,Vegetarian / Vegan Restaurant,Indian Restaurant,Theater,Cocktail Bar,Bakery


In [172]:
# We ran our KMeans algorithm with k=3 for the number of clusters but we find that our 3rd cluster is empty


In [30]:
sf_merged.loc[sf_merged['Cluster Labels'] == 0, sf_merged.columns[[1] + list(range(5, sf_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Potrero Hill,Food Truck,Coffee Shop,Pharmacy,Park,Café,New American Restaurant,Street Food Gathering,Pizza Place,Mediterranean Restaurant,Mini Golf
1,South of Market,Chinese Restaurant,Yoga Studio,Café,Pizza Place,Shipping Store,Bus Line,Pool Hall,Coffee Shop,Playground,Pharmacy
3,Hayes Valley/Tenderloin/North of Market,Coffee Shop,Vietnamese Restaurant,Theater,Pizza Place,Food Truck,Café,Bar,Bakery,Sandwich Place,Marijuana Dispensary


In [174]:
# Since San Francisco only has 21 Neighborhoods and we further limited the number of neighborhoods in our analysis, 
#it is understandable that our 3rd cluster would be reasonably empty