# IBM Applied Data Science Capstone Project

## The Battle of Neighborhoods

### Preprocessing steps

#### import libraries

In [64]:
import os

import folium # map rendering library
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans # import k-means from clustering stage



#### read US zipcodes & geocoordinates csv file

In [35]:
# Location of the US zipcode / geocoordinate table used by this notebook.
zipcodes_csv_url = "https://raw.githubusercontent.com/cholovesdata/Coursera_Capstone/master/us_zipcode_geocoordinates.csv"

# Columns needed by the later cells; everything else in the file is dropped.
relevant_columns = ['zip', 'lat', 'lng', 'city', 'state_name']

df_us_zipcodes = pd.read_csv(zipcodes_csv_url).loc[:, relevant_columns]
df_us_zipcodes

Unnamed: 0,zip,lat,lng,city,state_name
0,601,18.18004,-66.75218,Adjuntas,Puerto Rico
1,602,18.36073,-67.17517,Aguada,Puerto Rico
2,603,18.45439,-67.12202,Aguadilla,Puerto Rico
3,606,18.16724,-66.93828,Maricao,Puerto Rico
4,610,18.29032,-67.12243,Anasco,Puerto Rico
...,...,...,...,...,...
33094,99923,55.97796,-130.03671,Hyder,Alaska
33095,99925,55.55767,-132.97627,Klawock,Alaska
33096,99926,55.12617,-131.48928,Metlakatla,Alaska
33097,99927,56.25100,-133.37571,Point Baker,Alaska


#### rename dataframe columns

In [36]:
# Map the raw CSV column names to the descriptive names used in the rest of the notebook.
column_names = {
    "zip": "PostalCode",
    "lat": "Latitude",
    "lng": "Longitude",
    "city": "City",
    "state_name": "State",
}
df_us_zipcodes = df_us_zipcodes.rename(columns=column_names)
df_us_zipcodes

Unnamed: 0,PostalCode,Latitude,Longitude,City,State
0,601,18.18004,-66.75218,Adjuntas,Puerto Rico
1,602,18.36073,-67.17517,Aguada,Puerto Rico
2,603,18.45439,-67.12202,Aguadilla,Puerto Rico
3,606,18.16724,-66.93828,Maricao,Puerto Rico
4,610,18.29032,-67.12243,Anasco,Puerto Rico
...,...,...,...,...,...
33094,99923,55.97796,-130.03671,Hyder,Alaska
33095,99925,55.55767,-132.97627,Klawock,Alaska
33096,99926,55.12617,-131.48928,Metlakatla,Alaska
33097,99927,56.25100,-133.37571,Point Baker,Alaska


#### select zipcodes in Washington State

In [44]:
# Boolean mask marking the rows whose State column is exactly "Washington".
is_washington = df_us_zipcodes["State"].eq("Washington")

df_wa_zipcodes = df_us_zipcodes.loc[is_washington]
df_wa_zipcodes

Unnamed: 0,PostalCode,Latitude,Longitude,City,State
32265,98001,47.30998,-122.26521,Auburn,Washington
32266,98002,47.30836,-122.21639,Auburn,Washington
32267,98003,47.30513,-122.31508,Federal Way,Washington
32268,98004,47.61884,-122.20595,Bellevue,Washington
32269,98005,47.61478,-122.16862,Bellevue,Washington
...,...,...,...,...,...
32856,99363,46.06652,-118.88846,Wallula,Washington
32857,99371,46.80678,-118.31679,Washtucna,Washington
32858,99401,46.08744,-117.25143,Anatone,Washington
32859,99402,46.19394,-117.14740,Asotin,Washington


#### Select zipcodes in Seattle city in Washington State

In [39]:
# Keep the Washington rows whose City is "Seattle" and renumber them from 0,
# discarding the index values inherited from the nationwide table.
df_seattle_zipcodes = (
    df_wa_zipcodes
    .loc[df_wa_zipcodes["City"].eq("Seattle")]
    .reset_index(drop=True)
)
df_seattle_zipcodes

Unnamed: 0,PostalCode,Latitude,Longitude,City,State
0,98101,47.61129,-122.33454,Seattle,Washington
1,98102,47.63632,-122.32213,Seattle,Washington
2,98103,47.67332,-122.34254,Seattle,Washington
3,98104,47.60169,-122.32849,Seattle,Washington
4,98105,47.66068,-122.28403,Seattle,Washington
5,98106,47.54349,-122.35434,Seattle,Washington
6,98107,47.66764,-122.378,Seattle,Washington
7,98108,47.54126,-122.31295,Seattle,Washington
8,98109,47.63159,-122.34417,Seattle,Washington
9,98112,47.63394,-122.28885,Seattle,Washington


#### Use geopy library to get the latitude and longitude values of Seattle, Washington.

In [46]:
address = 'Seattle, Washington'

geolocator = Nominatim(user_agent="seattle_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the address cannot be resolved; stop here with a
# clear message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# City coordinates; the map cell uses them as the map centre.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Seattle, Washington are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Seattle, Washington are 47.6038321, -122.3300624.


#### Create a map of Seattle, Washington with the zipcode locations superimposed on top.

In [49]:
# create map of Seattle using latitude and longitude values
map_seattle = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per zipcode, with the postal code as its popup label.
# The loop variables are intentionally not named latitude/longitude: reusing
# those names would overwrite the city coordinates used for the map centre
# above, so re-running this cell would centre the map on the last zipcode.
for zip_lat, zip_lng, postal_code in zip(df_seattle_zipcodes['Latitude'], df_seattle_zipcodes['Longitude'], df_seattle_zipcodes['PostalCode']):
    label = folium.Popup('{}'.format(postal_code), parse_html=True)
    folium.CircleMarker(
        [zip_lat, zip_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_seattle)

map_seattle

#### Set up Foursquare ID and secret

In [50]:
# Foursquare credentials are read from the environment so that real keys never
# have to be typed into (and saved with) the notebook. The placeholder strings
# are kept as fallbacks when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'removed for submission') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'removed for submission') # your Foursquare Secret
VERSION = '20180604' # sent as the `v` query parameter by getNearbyVenues
LIMIT = 30 # sent as the `limit` query parameter by getNearbyVenues

#### Set up a function to get nearby venues given latitude and longitude pairs

In [52]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per location; they are iterated
        together with ``zip``.
    radius : int, optional
        Sent as the ``radius`` query parameter (default 500).
    limit : int, optional
        Sent as the ``limit`` query parameter. Defaults to the module-level
        ``LIMIT`` when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the notebook

    if limit is None:
        limit = LIMIT

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=request_timeout)
        # Surface HTTP errors here instead of a KeyError on the parsed payload below.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []
        for item in items:
            venue = item['venue']
            # A venue without any category gets None instead of raising IndexError.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

#### Retrieve nearby venues for each Seattle postal code

In [54]:
# Fetch venues around every Seattle zipcode's coordinates; each result row is
# labelled with the postal code it was requested for.
venues_seattle = getNearbyVenues(
    names=df_seattle_zipcodes['PostalCode'],
    latitudes=df_seattle_zipcodes['Latitude'],
    longitudes=df_seattle_zipcodes['Longitude'],
    radius=500,
)
venues_seattle

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,98101,47.61129,-122.33454,Din Tai Fung Dumpling House,47.612671,-122.335073,Dumpling Restaurant
1,98101,47.61129,-122.33454,ACT Theatre,47.610763,-122.332905,Theater
2,98101,47.61129,-122.33454,Monorail Espresso,47.610828,-122.335048,Coffee Shop
3,98101,47.61129,-122.33454,Timbuk2,47.612561,-122.334223,Accessories Store
4,98101,47.61129,-122.33454,Grand Hyatt Seattle,47.612583,-122.333618,Hotel
...,...,...,...,...,...,...,...
532,98199,47.65142,-122.40270,Walkin' The Streets Of Magnolia,47.651458,-122.405470,Athletics & Sports
533,98199,47.65142,-122.40270,Rudy's Place,47.653865,-122.400653,Breakfast Spot
534,98199,47.65142,-122.40270,Kaspar's Special events and catering,47.654216,-122.401052,Restaurant
535,98199,47.65142,-122.40270,Espresso Bar,47.654860,-122.401153,Coffee Shop


#### Count the Seattle venues returned for each zipcode

In [56]:
# Non-null entries per column for each zipcode label.
venues_seattle.groupby(by="Neighborhood").count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
98101,30,30,30,30,30,30
98102,30,30,30,30,30,30
98103,8,8,8,8,8,8
98104,30,30,30,30,30,30
98105,15,15,15,15,15,15
98106,5,5,5,5,5,5
98107,30,30,30,30,30,30
98108,3,3,3,3,3,3
98109,25,25,25,25,25,25
98112,4,4,4,4,4,4


#### find out how many unique categories can be curated from all the returned venues

In [57]:
# Distinct venue categories across all returned venues.
unique_categories = venues_seattle['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))


There are 171 uniques categories.


#### Categories of venues in Seattle

In [58]:
# List every distinct venue category, in order of first appearance.
venue_categories = venues_seattle['Venue Category'].unique()
print(venue_categories)

['Dumpling Restaurant' 'Theater' 'Coffee Shop' 'Accessories Store' 'Hotel'
 "Women's Store" 'Clothing Store' 'French Restaurant'
 'New American Restaurant' 'Performing Arts Venue' 'Pizza Place'
 'Discount Store' 'Cosmetics Shop' 'American Restaurant' 'Yoga Studio'
 'Dessert Shop' 'Cocktail Bar' 'Creperie' 'Bridal Shop' 'Wine Bar'
 'Concert Hall' 'Seafood Restaurant' 'Gym' 'Trail' 'Garden'
 'Bubble Tea Shop' 'Italian Restaurant' 'Bakery' 'Dog Run' 'Restaurant'
 'Boat or Ferry' 'Sandwich Place' 'Thai Restaurant' 'Massage Studio' 'Spa'
 'Bus Stop' 'Martial Arts Dojo' 'Food & Drink Shop' 'Korean Restaurant'
 'Cemetery' 'Furniture / Home Store' 'Golf Course' 'Caribbean Restaurant'
 'Tennis Court' 'Lake' 'Harbor / Marina' 'Park' 'Japanese Restaurant'
 'Mini Golf' 'Café' 'Breakfast Spot' 'Building' 'Social Club'
 'Sushi Restaurant' 'BBQ Joint' 'Boutique' 'Video Game Store' 'Hotel Bar'
 'Nightclub' 'Poke Place' 'Deli / Bodega' 'Chinese Restaurant' 'Brewery'
 'Steakhouse' 'Food Truck' 'Burger J

#### Analyze each zipcode in Seattle city area

In [59]:
# one hot encoding: one indicator column per venue category
seattle_onehot = pd.get_dummies(venues_seattle[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself named "Neighborhood" would collide with the
# zipcode column added below and be silently overwritten; give that indicator
# a distinct name first (a no-op when no such category exists).
seattle_onehot = seattle_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood (zipcode) column as the first column
seattle_onehot.insert(0, 'Neighborhood', venues_seattle['Neighborhood'])

seattle_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Airport,Airport Lounge,Airport Terminal,American Restaurant,Animal Shelter,Art Museum,Arts & Entertainment,...,Tour Provider,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,98101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,98101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,98101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,98101,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,98101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Average each category indicator per zipcode, i.e. the share of that zipcode's
# venues falling in each category. as_index=False keeps 'Neighborhood' as a column.
seattle_grouped = seattle_onehot.groupby('Neighborhood', as_index=False).mean()
seattle_grouped

Unnamed: 0,Neighborhood,ATM,Accessories Store,Airport,Airport Lounge,Airport Terminal,American Restaurant,Animal Shelter,Art Museum,Arts & Entertainment,...,Tour Provider,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,98101,0.0,0.033333,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.033333,0.033333
1,98102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,98104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
4,98105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,98106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,98107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0
7,98108,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,98109,0.04,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.04
9,98112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Show the frequency of Ice Cream Shop venues in each zipcode neighborhood

In [63]:
# Keep only the zipcode label and the Ice Cream Shop frequency column.
ice_cream_columns = ["Neighborhood", "Ice Cream Shop"]
seattle_ice_cream = seattle_grouped.loc[:, ice_cream_columns]
seattle_ice_cream

Unnamed: 0,Neighborhood,Ice Cream Shop
0,98101,0.0
1,98102,0.0
2,98103,0.0
3,98104,0.0
4,98105,0.0
5,98106,0.0
6,98107,0.0
7,98108,0.0
8,98109,0.0
9,98112,0.0
