# Comparing Neighborhoods in Seattle, WA and Portland, OR

Downloading and importing the required libraries

In [1]:
import numpy as np #handle vectorized data
import pandas as pd #data analysis tools

!conda install -c conda-forge folium=0.5.0 --yes
import folium #map rendering

import matplotlib.cm as cm
import matplotlib.colors as colors #graphical tools

import json #library to handle JSON files
import requests #library to handle requests
from pandas.io.json import json_normalize #tranform JSON file into a pandas dataframe

!conda install -c conda-forge geocoder --yes
import geocoder #location data api

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim #location data api

!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup #web scraping

import re #regex

from sklearn.cluster import KMeans #clustering
print("All libraries imported")

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

All libraries imported


### Building Neighborhood DataFrames

We can easily scrape the Seattle neighborhoods from a Wikipedia page using Pandas.

In [2]:
wiki_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Seattle')[0]
wiki_data.head()

Unnamed: 0.1,Unnamed: 0,Neighborhood name,Within larger district,Annexed[41],Locator map,Street map,Image,Notes
0,1,North Seattle,Seattle,Various,,,,North of the Lake Washington Ship Canal[42]
1,2,Broadview,North Seattle[42],1954[43],,,,[44]
2,3,Bitter Lake,North Seattle[42],1954[43],,,,[45]
3,4,North Beach / Blue Ridge,North Seattle[42],"1940,[43] 1954[43]",,,,[46]
4,5,Crown Hill,North Seattle[42],"1907,[47] 1952,[43] 1954[43]",,,,[48]


We don't need most of the information in this table, so we create a new table with columns for location data.

In [3]:
columns = ['Neighborhood', 'Latitude', 'Longitude']
seattle_neighborhoods = pd.DataFrame(columns = columns)

In [4]:
neighborhoods = wiki_data['Neighborhood name']

We can use the information from Wikipedia and Nominatim (a location data API) to get the latitude and longitude for each neighborhood.

In [5]:
for neighborhood in neighborhoods:
    address = ('{}, Seattle, WA'.format(neighborhood))
    geolocator = Nominatim(user_agent="explorer")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
        seattle_neighborhoods = seattle_neighborhoods.append({'Neighborhood': neighborhood,
                                      'Latitude': latitude,
                                      'Longitude': longitude}, ignore_index = True)
    except:
        print('No coordinates found for {}'.format(neighborhood))
seattle_neighborhoods

No coordinates found for North College Park (Licton Springs)
No coordinates found for Portage Bay[95] / Roanoke
No coordinates found for Pike-Pine Corridor / Pike/Pine[97][98][99]
No coordinates found for International District ("ID")
No coordinates found for West Edge[118][119]
No coordinates found for Central Area[120] / Central District ("CD")
No coordinates found for Cherry Hill & Squire Park
No coordinates found for Harrison[120] / Denny-Blaine[120]
No coordinates found for Leschi[120]
No coordinates found for Dunlap / Othello
No coordinates found for Rainier Beach / Atlantic City Beach
No coordinates found for Rainier View / Lakeridge
No coordinates found for Mid Beacon Hill (Maplewood)
No coordinates found for Holly Park[144] / NewHolly[145]
No coordinates found for South Beacon Hill[146] / Van Asselt
No coordinates found for Industrial District
No coordinates found for North Admiral[153] / Admiral District
No coordinates found for Junction[154] / West Seattle Junction / Alaska 

Unnamed: 0,Neighborhood,Latitude,Longitude
0,North Seattle,47.691037,-122.305549
1,Broadview,47.722320,-122.360407
2,Bitter Lake,47.726236,-122.348764
3,North Beach / Blue Ridge,47.696210,-122.392362
4,Crown Hill,47.694715,-122.371459
...,...,...,...
103,Riverview,47.539383,-122.349189
104,Highland Park,47.528432,-122.352626
105,South Delridge,47.552123,-122.363874
106,Roxhill,47.519866,-122.367813


In [6]:
seattle_neighborhoods['Neighborhood'] = seattle_neighborhoods['Neighborhood'] + 'S'

We cannot find latitudes and longitudes for some neighborhoods. This is ok because we are concerned with clusters rather than individual neighborhoods.

We can use Folium to label the neighborhoods on a map of Seattle.

In [8]:
address = 'Seattle, WA'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
seattle_map = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, neighborhood in zip(seattle_neighborhoods['Latitude'], seattle_neighborhoods['Longitude'], seattle_neighborhoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(seattle_map)

seattle_map

Now we can go through the same process for Portland.

Again, we can scrape the neighborhoods from a Wikipedia page, but unlike the Seattle neighborhoods, the information is not available in a table.

In [9]:
url = 'https://en.wikipedia.org/wiki/Neighborhoods_of_Portland,_Oregon'
page = requests.get(url)

We create a DataFrame to hold the neighborhood and location data.

In [10]:
columns = ['Neighborhood', 'Latitude', 'Longitude']
portland_neighborhoods = pd.DataFrame(columns = columns)

We can use a different scraping tool, BeautifulSoup, to read the neighborhoods from the page html.

In [11]:
text = ''
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id = 'mw-content-text')
neighborhoods = results.find_all('div', class_ = 'div-col')
for n in neighborhoods:
    places = n.find('ul')
    text = text + places.text + '\n'

Remove the extra information that exists for some neighborhoods, making it possible to locate them

In [12]:
text = re.sub(r"\((.*?)\)", '', text)
text = text[:-1]

We convert the string into a list that can be iterated over to fill the DataFrame with location data.

In [13]:
neighborhoods = text.split('\n')

In [14]:
for neighborhood in neighborhoods:
    address = ('{}, Portland, OR'.format(neighborhood))
    geolocator = Nominatim(user_agent="explorer")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
        portland_neighborhoods = portland_neighborhoods.append({'Neighborhood': neighborhood,
                                      'Latitude': latitude,
                                      'Longitude': longitude}, ignore_index = True)
    except:
        print('No coordinates found for {}'.format(neighborhood))
portland_neighborhoods

No coordinates found for Northwest Industrial


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Arlington Heights,45.519496,-122.710667
1,Forest Park,45.561376,-122.758458
2,Goose Hollow,45.517749,-122.692819
3,Hillside,45.527439,-122.713120
4,Linnton,45.600330,-122.786779
...,...,...,...
88,Richmond,45.504675,-122.622700
89,Sellwood-Moreland,45.471488,-122.651430
90,South Tabor,45.501381,-122.593759
91,Sunnyside,45.515774,-122.624528


In [15]:
portland_neighborhoods['Neighborhood'] = portland_neighborhoods['Neighborhood'] + 'P'

Again, missing neighborhoods won't impact the results in small quantities.

We can use Folium to map the neighborhoods in Portland.

In [17]:
address = 'Portland, OR'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
portland_map = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, neighborhood in zip(portland_neighborhoods['Latitude'], portland_neighborhoods['Longitude'], portland_neighborhoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(portland_map)

portland_map

### Creating Venue Data

We can combine the neighborhoods from both cities to grab all the venue data at once.

In [18]:
all_neighborhoods = seattle_neighborhoods.append(portland_neighborhoods)

In [19]:
print(seattle_neighborhoods.shape)
print(portland_neighborhoods.shape)
print(all_neighborhoods.shape)

(108, 3)
(93, 3)
(201, 3)


We define a function to get venues in each neighbourhood using the Foursquare API.

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print('All venues retrieved!')
    return(nearby_venues)

In [22]:
all_venues = getNearbyVenues(names = all_neighborhoods["Neighborhood"],
                                latitudes = all_neighborhoods["Latitude"],
                                longitudes = all_neighborhoods["Longitude"])

All venues retrieved!


In [23]:
print(all_venues.shape)
all_venues.head()

(4527, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North SeattleS,47.691037,-122.305549,The Growler Guys,47.690667,-122.307405,Beer Bar
1,North SeattleS,47.691037,-122.305549,Phayathai Cuisine,47.693528,-122.305673,Thai Restaurant
2,North SeattleS,47.691037,-122.305549,7-Eleven,47.688964,-122.30962,Convenience Store
3,North SeattleS,47.691037,-122.305549,lighthouse diving centers,47.688979,-122.310413,Sporting Goods Shop
4,BroadviewS,47.72232,-122.360407,Chada Thai,47.719663,-122.355114,Thai Restaurant


In [24]:
all_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AdamsS,9,9,9,9,9,9
AlamedaS,7,7,7,7,7,7
Alki PointS,5,5,5,5,5,5
Arbor HeightsS,1,1,1,1,1,1
Arbor LodgeS,10,10,10,10,10,10
...,...,...,...,...,...,...
WindermereS,3,3,3,3,3,3
Woodland ParkS,27,27,27,27,27,27
WoodlawnS,12,12,12,12,12,12
WoodstockS,41,41,41,41,41,41


We create a data frame with dummy variables for venue category.

In [25]:
onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")
onehot['Neighborhood'] = all_venues['Neighborhood']
onehot.head()

Unnamed: 0,ATM,Accessories Store,African Restaurant,American Restaurant,Amphitheater,Antique Shop,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We normalize the counts of each venue category.

In [26]:
grouped = onehot.groupby('Neighborhood').mean().reset_index()
print(grouped.shape)
grouped.head()

(197, 362)


Unnamed: 0,Neighborhood,ATM,Accessories Store,African Restaurant,American Restaurant,Amphitheater,Antique Shop,Arcade,Argentinian Restaurant,Art Gallery,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,AdamsS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AlamedaS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alki PointS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arbor HeightsS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arbor LodgeS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We create our clustering model.

In [27]:
kclusters = 3

clusters = grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clusters)

kmeans.labels_[0:10] 

array([0, 0, 0, 1, 0, 2, 0, 2, 0, 0])

We record the cluster labels and add them to the DataFrame containing all the neighborhoods.

In [28]:
grouped.insert(0, "Cluster Labels", kmeans.labels_)

In [29]:
all_neighborhoods = all_neighborhoods.merge(grouped[['Cluster Labels', 'Neighborhood']], on = 'Neighborhood')
all_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels
0,North SeattleS,47.691037,-122.305549,0
1,BroadviewS,47.72232,-122.360407,0
2,Bitter LakeS,47.726236,-122.348764,0
3,North Beach / Blue RidgeS,47.69621,-122.392362,2
4,Crown HillS,47.694715,-122.371459,0


We display the clustered neighborhoods in Portland and Seattle.

In [30]:
address = 'Portland, OR'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(all_neighborhoods['Latitude'], all_neighborhoods['Longitude'], all_neighborhoods['Neighborhood'], all_neighborhoods['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [31]:
address = 'Seattle, WA'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(all_neighborhoods['Latitude'], all_neighborhoods['Longitude'], all_neighborhoods['Neighborhood'], all_neighborhoods['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

We can see that there are three broad clusters of neighborhoods which would feel similar based on the nearby venues.