# Coursera IBM Data Science Capstone Project.

### Comparing neighborhoods of Seattle using geo data and foursquare api.

Importing most of the libraries that I will need.

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np

import requests # library to handle requests
import random # library for random number generation

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

import json # library to handle JSON files
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

#! pip install folium==0.5.0
import folium # plotting library

print('Libraries imported.')

Libraries imported.


In [3]:
#Foursquare API credentials
CLIENT_ID = 'NERYLZJIO3LDMUIX2OVRYHTRNEPOYW34VBLMN1WCKPCCK0XP' # your Foursquare ID
CLIENT_SECRET = 'GILN0XA0CVK420FBZ0H2Q1EYAQKXDXDGMGDQ5LT2KAQPS3IV' # your Foursquare Secret
ACCESS_TOKEN = 'J0TME0XJXOAGSAJ5YQVV1JOKCH530CVDJIN5L5IU1OOYFQPX' # your FourSquare Access Token
VERSION = '20180604'

Using folium to build and view a map of Seattle.

In [5]:
sea_lat = 47.608013
sea_lon = -122.335167
map_seattle = folium.Map(location=[sea_lat, sea_lon], zoom_start=12)
    
map_seattle

I found a wiki page with a table containing Seattle neighborhoods.
I am using BeautifulSoup to scrape the page and load a dataframe.

In [6]:
from bs4 import BeautifulSoup

In [48]:
req = requests.get("https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Seattle")

soup = BeautifulSoup(req.content,'lxml')

table = soup.find_all('table')[0]

df1 = pd.read_html(str(table))

hoods = pd.DataFrame(df1[0])
hoods.head()

Unnamed: 0.1,Unnamed: 0,Neighborhood name,Within larger district,Annexed[41],Locator map,Street map,Image,Notes
0,1,North Seattle,Seattle,Various,,,,North of the Lake Washington Ship Canal[42]
1,2,Broadview,North Seattle[42],1954[43],,,,[44]
2,3,Bitter Lake,North Seattle[42],1954[43],,,,[45]
3,4,North Beach / Blue Ridge,North Seattle[42],"1940,[43] 1954[43]",,,,[46]
4,5,Crown Hill,North Seattle[42],"1907,[47] 1952,[43] 1954[43]",,,,[48]


### Cleaning the data
Unfortunately I could not find a source that already included geo coordinates for the neighborhoods.  
I just need the Neighborhood name from this data set, and will add coordinates later.  
There are also a couple of greater neighborhoods that are redundant so I am removing those.

In [49]:
#remove unwanted rows
hoods.drop( hoods[ (hoods['Neighborhood name'] == "North Seattle") | (hoods['Neighborhood name'] == "Central Seattle") | (hoods['Neighborhood name'] == "West Seattle") | (hoods['Neighborhood name'] == "Queen Anne")].index , inplace=True)

#remove unwanted columns
hoods.drop(['Unnamed: 0','Within larger district','Annexed[41]','Locator map','Street map','Image','Notes'], axis=1, inplace=True)

#rename column for less typing
hoods.rename(columns={'Neighborhood name':'Neighborhood'}, inplace=True)

#reset index after dropped rows
hoods = hoods.reset_index(drop=True)

hoods.head()

Unnamed: 0,Neighborhood
0,Broadview
1,Bitter Lake
2,North Beach / Blue Ridge
3,Crown Hill
4,Greenwood


In [50]:
hoods.shape

(123, 1)

Now to add the coordinates using geocoder and arcgis.

In [28]:
#!pip install geocoder
import geocoder

In [44]:
latitude=[]
longitude=[]
for row in hoods['Neighborhood']:
    g = geocoder.arcgis('{}, Seattle, Washington'.format(row))
    print(row, g.latlng)
    while (g.latlng is None):
        g = geocoder.arcgis('{}, Seattle, Washington'.format(row))
        print(row, g.latlng)
    latlng = g.latlng
    latitude.append(latlng[0])
    longitude.append(latlng[1])

Broadview [47.722380000000044, -122.36497999999995]
Bitter Lake [47.71868000000006, -122.35029999999995]
North Beach / Blue Ridge [47.70044000000007, -122.38417999999996]
Crown Hill [47.69520000000006, -122.37409999999994]
Greenwood [47.69082000000003, -122.35528999999997]
Northgate [47.713100000000054, -122.31929999999994]
Haller Lake [47.72320000000008, -122.33869999999996]
Pinehurst [47.71894000000003, -122.31399999999996]
North College Park (Licton Springs) [47.69914000000006, -122.33967999999999]
Maple Leaf [47.70013000000006, -122.31764999999996]
Lake City [47.71780000000007, -122.28317999999996]
Cedar Park [47.72645000000006, -122.28800999999999]
Matthews Beach [47.69934000000006, -122.27834999999999]
Meadowbrook [47.70841000000007, -122.29585999999995]
Olympic Hills [47.72656000000006, -122.30259999999998]
Victory Heights [47.710300000000075, -122.30719999999997]
Wedgwood [47.68701000000004, -122.29493999999994]
View Ridge [47.680330000000026, -122.27289999999999]
Sand Point [4

In [51]:
#append the coordinates to the dataframe
hoods['Latitude'] = latitude
hoods['Longitude'] = longitude

In [52]:
hoods.shape

(123, 3)

Now I am redrawing the Seattle map with neighborhood markers.

In [39]:
sea_lat = 47.608013
sea_lon = -122.335167
map_seattle = folium.Map(location=[sea_lat, sea_lon], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(hoods['Latitude'], hoods['Longitude'], hoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_seattle)  
    
map_seattle

Now we will get venue data via the foursquare api, and create a larger dataframe merging the hoods dataframe.

In [60]:
#define a function to gather venues for all of our hoods using a 1000 meter radius from each hood coordinates.

#amount of results per hood
LIMIT = 30
#looking for burgers
query = "burger"
#I would like to get ratings as well, but that is a premium call on foursquare's api

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&query={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            query, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [61]:
venues = getNearbyVenues(names=hoods['Neighborhood'],
                                   latitudes=hoods['Latitude'],
                                   longitudes=hoods['Longitude']
                                  )

Broadview
Bitter Lake
North Beach / Blue Ridge
Crown Hill
Greenwood
Northgate
Haller Lake
Pinehurst
North College Park (Licton Springs)
Maple Leaf
Lake City
Cedar Park
Matthews Beach
Meadowbrook
Olympic Hills
Victory Heights
Wedgwood
View Ridge
Sand Point
Roosevelt
Ravenna
Bryant
Windermere
Hawthorne Hills
Laurelhurst
University District (U District)
University Village
Wallingford
Northlake
Green Lake
Fremont
Phinney Ridge
Ballard
West Woodland
Whittier Heights
Adams
Sunset Hill
Loyal Heights
Magnolia
Lawton Park
Briarcliff
Southeast Magnolia
Interbay
North Queen Anne
East Queen Anne
Lower Queen Anne
West Queen Anne
Capitol Hill
Portage Bay[95] / Roanoke
Broadway
Pike-Pine Corridor / Pike/Pine[97][98][99]
Montlake
Stevens
Interlaken
Madison Valley
Renton Hill
Madison Park
Broadmoor
Lake Union
South Lake Union, Seattle
Cascade, Seattle
Westlake
Eastlake
Downtown
Denny Triangle
Belltown
Pike-Market
Central Business District
First Hill
Pioneer Square
International District ("ID")
Yesler T

In [62]:
print(venues.shape)
venues.head()

(731, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bitter Lake,47.71868,-122.3503,Rain Cafe,47.724808,-122.343854,Café
1,Bitter Lake,47.71868,-122.3503,Jack in the Box,47.720318,-122.345448,Burger Joint
2,Bitter Lake,47.71868,-122.3503,Burger King,47.724633,-122.345448,Fast Food Restaurant
3,Crown Hill,47.6952,-122.3741,Dick's Drive-In,47.6965,-122.371748,Burger Joint
4,Crown Hill,47.6952,-122.3741,Wild Mountain Cafe,47.690779,-122.374559,American Restaurant


I want to remove the category of Fast Food Restaurant.

In [63]:
#remove unwanted rows
venues.drop( venues[ (venues['Venue Category'] == "Fast Food Restaurant")].index , inplace=True)

In [66]:
print(venues.shape)
venues.head()

(724, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bitter Lake,47.71868,-122.3503,Rain Cafe,47.724808,-122.343854,Café
1,Bitter Lake,47.71868,-122.3503,Jack in the Box,47.720318,-122.345448,Burger Joint
3,Crown Hill,47.6952,-122.3741,Dick's Drive-In,47.6965,-122.371748,Burger Joint
4,Crown Hill,47.6952,-122.3741,Wild Mountain Cafe,47.690779,-122.374559,American Restaurant
5,Crown Hill,47.6952,-122.3741,Crown Hill Broiler,47.687647,-122.377145,Restaurant


In [67]:
#jack in the box is trying to sneak in there as a Burger Joint instead of a fast food
venues.drop( venues[ (venues['Venue'] == "Jack in the Box")].index , inplace=True)

In [68]:
print(venues.shape)
venues.head()

(711, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bitter Lake,47.71868,-122.3503,Rain Cafe,47.724808,-122.343854,Café
3,Crown Hill,47.6952,-122.3741,Dick's Drive-In,47.6965,-122.371748,Burger Joint
4,Crown Hill,47.6952,-122.3741,Wild Mountain Cafe,47.690779,-122.374559,American Restaurant
5,Crown Hill,47.6952,-122.3741,Crown Hill Broiler,47.687647,-122.377145,Restaurant
7,Northgate,47.7131,-122.3193,Boud's Pinehurst Pub,47.715641,-122.312633,Bar


Good! I got rid of 13 Jack in the Boxes

Explore the data a little.