In [1]:
!pip install requests beautifulsoup4 
!pip install folium
!pip install geocoder



In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import geocoder # import geocoder

from geopy.geocoders import Nominatim 
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
url = "https://en.wikipedia.org/wiki/Neighborhoods_of_Portland,_Oregon"
data = requests.get(url).text
soup = BeautifulSoup(data, 'html.parser')

In [4]:
data = []
for table in soup.find_all("div", class_="div-col columns column-width")[:3]:
  for row in table.findAll("li"):
    data.append(row.text)

In [5]:
df =pd.DataFrame({"Neighborhood": data})
df  

Unnamed: 0,Neighborhood
0,Arlington Heights
1,Forest Park
2,Goose Hollow
3,Hillside
4,Linnton
5,"Northwest District (includes Uptown, Nob Hill,..."
6,Northwest Heights
7,Northwest Industrial
8,Old Town Chinatown
9,Pearl District


In [6]:
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Portland, Oregon'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [7]:
coords = [ get_latlng(neighborhood) for neighborhood in df["Neighborhood"].tolist() ]


In [8]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [9]:
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Arlington Heights,45.52083,-122.7126
1,Forest Park,45.56632,-122.75337
2,Goose Hollow,45.51816,-122.69347
3,Hillside,45.52708,-122.70763
4,Linnton,45.60032,-122.78694


In [10]:
df.to_csv("df.csv", index=False)

In [11]:
address = 'Portland, Oregon'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Portland, Oregon {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Portland, Oregon 45.5202471, -122.6741949.


In [12]:
map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map)  
    
map

In [13]:
map.save('map.html')

In [14]:
CLIENT_ID = 'NY1WMPNFASJFXS1XILA1HJCKLWD3KFFREFWZNXRY5MGCAMVZ' # your Foursquare ID
CLIENT_SECRET = '2YFFLJAHO23RCBDP3LC2Y1ABKEHXNL5EPDHZRRN0BEF3HDMB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NY1WMPNFASJFXS1XILA1HJCKLWD3KFFREFWZNXRY5MGCAMVZ
CLIENT_SECRET:2YFFLJAHO23RCBDP3LC2Y1ABKEHXNL5EPDHZRRN0BEF3HDMB


In [15]:
radius = 1000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [16]:
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1408, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Arlington Heights,45.52083,-122.7126,Portland Japanese Garden,45.519457,-122.706937,Garden
1,Arlington Heights,45.52083,-122.7126,Hoyt Arboretum,45.517493,-122.717714,Park
2,Arlington Heights,45.52083,-122.7126,International Rose Test Garden,45.519075,-122.705616,Botanical Garden
3,Arlington Heights,45.52083,-122.7126,Washington Park,45.517835,-122.705784,Park
4,Arlington Heights,45.52083,-122.7126,Pittock Mansion,45.525262,-122.716684,Museum


In [17]:
venues_df.loc[venues_df['VenueCategory'].str.contains('Restaurant'), 'VenueCategory'] = 'Restaurant'

In [18]:
venues_df.groupby(["Neighborhood"]).count()


Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arbor Lodge,57,57,57,57,57,57
Arlington Heights,42,42,42,42,42,42
Arnold Creek,13,13,13,13,13,13
Ashcreek,6,6,6,6,6,6
Bridgeton,31,31,31,31,31,31
Bridlemile (includes Glencullen),14,14,14,14,14,14
Cathedral Park,47,47,47,47,47,47
Collins View,11,11,11,11,11,11
Crestwood,23,23,23,23,23,23
East Columbia,4,4,4,4,4,4


In [19]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))


There are 206 uniques categories.


In [20]:
venues_df['VenueCategory'].unique()[:50]


array(['Garden', 'Park', 'Botanical Garden', 'Museum', 'Trail',
       'Amphitheater', 'Playground', 'Café', 'Bus Station', 'Gift Shop',
       'Tennis Court', 'Scenic Lookout', 'Train Station', 'Fountain',
       'Field', 'Music Venue', 'Historic Site', 'Gym', 'Forest',
       'Cooking School', 'Pizza Place', 'Restaurant', 'Soccer Stadium',
       'Gym / Fitness Center', 'Coffee Shop', 'Pub', 'Hotel',
       'Cocktail Bar', 'Beer Store', 'Deli / Bodega', 'Bakery',
       'Steakhouse', 'Shipping Store', 'Athletics & Sports', 'Taco Place',
       'Theater', 'Grocery Store', 'Diner', 'Art Museum', 'Dessert Shop',
       'Farmers Market', 'Sandwich Place', 'Concert Hall',
       'Furniture / Home Store', 'Food Truck', 'Event Space', 'Brewery',
       'Clothing Store', 'Bookstore', 'Gastropub'], dtype=object)

In [21]:
"Restaurant" in venues_df['VenueCategory'].unique()


True

In [22]:
# one hot encoding
df_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_onehot['Neighborhood'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

print(df_onehot.shape)
df_onehot.head()

(1408, 207)


Unnamed: 0,Neighborhood,ATM,Adult Boutique,Airport,Amphitheater,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Athletics & Sports,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Beach,Beer Bar,Beer Store,Big Box Store,Bike Shop,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Breakfast Spot,Brewery,Bridge,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,...,Shipping Store,Shoe Store,Shop & Service,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Soccer Field,Soccer Stadium,Spa,Sporting Goods Shop,Sports Bar,Stadium,Steakhouse,Storage Facility,Street Food Gathering,Strip Club,Supermarket,Supplement Shop,Taco Place,Tea Room,Tennis Court,Theater,Theme Park,Theme Park Ride / Attraction,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Train Station,Tram Station,Tunnel,Video Store,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Arlington Heights,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Arlington Heights,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Arlington Heights,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Arlington Heights,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Arlington Heights,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
portland_grouped = df_onehot.groupby(["Neighborhood"]).mean().reset_index()

print(portland_grouped.shape)
portland_grouped['Restaurant']

(40, 207)


0     0.175439
1     0.000000
2     0.000000
3     0.166667
4     0.161290
5     0.142857
6     0.148936
7     0.090909
8     0.173913
9     0.000000
10    0.125000
11    0.000000
12    0.190000
13    0.097561
14    0.050000
15    0.205882
16    0.164557
17    0.153846
18    0.145833
19    0.000000
20    0.066667
21    0.071429
22    0.125000
23    0.134615
24    0.175676
25    0.142857
26    0.111111
27    0.180000
28    0.200000
29    0.190000
30    0.163265
31    0.194805
32    0.096774
33    0.100000
34    0.069767
35    0.218750
36    0.000000
37    0.083333
38    0.096774
39    0.258065
Name: Restaurant, dtype: float64

In [24]:
len(portland_grouped[portland_grouped["Restaurant"] > 0])


34

In [25]:
portland_restaurant = portland_grouped[["Neighborhood","Restaurant"]]
portland_restaurant.head()


Unnamed: 0,Neighborhood,Restaurant
0,Arbor Lodge,0.175439
1,Arlington Heights,0.0
2,Arnold Creek,0.0
3,Ashcreek,0.166667
4,Bridgeton,0.16129


In [26]:

# set number of clusters
kclusters = 3

portland_clustering = portland_restaurant.drop(["Neighborhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(portland_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 2, 2, 1, 1, 1, 1, 0, 1, 2], dtype=int32)

In [27]:
portland_merged = portland_restaurant.copy()

# add clustering labels
portland_merged["Cluster Labels"] = kmeans.labels_
portland_merged

Unnamed: 0,Neighborhood,Restaurant,Cluster Labels
0,Arbor Lodge,0.175439,1
1,Arlington Heights,0.0,2
2,Arnold Creek,0.0,2
3,Ashcreek,0.166667,1
4,Bridgeton,0.16129,1
5,Bridlemile (includes Glencullen),0.142857,1
6,Cathedral Park,0.148936,1
7,Collins View,0.090909,0
8,Crestwood,0.173913,1
9,East Columbia,0.0,2


In [28]:
portland_merged = portland_merged.join(df.set_index("Neighborhood"), on="Neighborhood")

print(portland_merged.shape)
portland_merged.head()

(40, 5)


Unnamed: 0,Neighborhood,Restaurant,Cluster Labels,Latitude,Longitude
0,Arbor Lodge,0.175439,1,45.57171,-122.69094
1,Arlington Heights,0.0,2,45.52083,-122.7126
2,Arnold Creek,0.0,2,45.44014,-122.70202
3,Ashcreek,0.166667,1,45.45483,-122.73761
4,Bridgeton,0.16129,1,45.60307,-122.66939


In [29]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
# x = np.arange(kclusters)
# ys = [i+x+(i*x)**2 for i in range(kclusters)]
rainbow = ['#ff0000','#0000ff','#00ff00','#ffff00','#00ffff','#ff00ff']
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(portland_merged['Latitude'], portland_merged['Longitude'], portland_merged['Neighborhood'], portland_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [30]:
map_clusters.save('map_clusters.html')


In [43]:
print("Total frequency of restaurant:",portland_merged.loc[portland_merged['Cluster Labels'] == 0]['Restaurant'].sum())
portland_merged.loc[portland_merged['Cluster Labels'] == 0]


Total frequency of restaurant: 1.3189409626311532


Unnamed: 0,Neighborhood,Restaurant,Cluster Labels,Latitude,Longitude
7,Collins View,0.090909,0,45.45832,-122.68081
10,Far Southwest,0.125,0,45.44063,-122.7339
13,Hayden Island,0.097561,0,45.61564,-122.68998
14,Hayhurst (includes Vermont Hills),0.05,0,45.47814,-122.72705
20,Maplewood,0.066667,0,45.47013,-122.72874
21,Markham,0.071429,0,45.46182,-122.69732
22,Marshall Park,0.125,0,45.45306,-122.69955
23,Multnomah (includes Multnomah Village),0.134615,0,45.46975,-122.70562
26,Northwest Industrial,0.111111,0,45.541221,-122.712308
32,Portsmouth,0.096774,0,45.58867,-122.71949


In [44]:
print("Total frequency of restaurant:",portland_merged.loc[portland_merged['Cluster Labels'] == 1]['Restaurant'].sum())
portland_merged.loc[portland_merged['Cluster Labels'] == 1]


Total frequency of restaurant: 3.552638580022184


Unnamed: 0,Neighborhood,Restaurant,Cluster Labels,Latitude,Longitude
0,Arbor Lodge,0.175439,1,45.57171,-122.69094
3,Ashcreek,0.166667,1,45.45483,-122.73761
4,Bridgeton,0.16129,1,45.60307,-122.66939
5,Bridlemile (includes Glencullen),0.142857,1,45.49252,-122.72786
6,Cathedral Park,0.148936,1,45.58845,-122.76056
8,Crestwood,0.173913,1,45.45473,-122.72725
12,Goose Hollow,0.19,1,45.51816,-122.69347
15,Hillsdale,0.205882,1,45.48193,-122.69369
16,Hillside,0.164557,1,45.52708,-122.70763
17,Homestead,0.153846,1,45.49691,-122.68986


In [45]:
print("Total frequency of restaurant:",portland_merged.loc[portland_merged['Cluster Labels'] == 2]['Restaurant'].sum())
portland_merged.loc[portland_merged['Cluster Labels'] == 2]

Total frequency of restaurant: 0.0


Unnamed: 0,Neighborhood,Restaurant,Cluster Labels,Latitude,Longitude
1,Arlington Heights,0.0,2,45.52083,-122.7126
2,Arnold Creek,0.0,2,45.44014,-122.70202
9,East Columbia,0.0,2,45.59169,-122.65191
11,Forest Park,0.0,2,45.56632,-122.75337
19,Linnton,0.0,2,45.60032,-122.78694
36,St. Johns,0.0,2,45.60847,-122.75907
