In [1]:
# Imports and installs
import requests
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Part One: Scrape the Coordinates for All Standard Zipcodes in Tallahassee

In [2]:
# Collect list of Tallahassee zip codes with coordinate mappings

# Get and parse page 
root = "https://www.zip-codes.com"
r = requests.get(root + "/city/fl-tallahassee.asp")
soup = BeautifulSoup(r.text, "html.parser")

# Make a list of zip code links from the relevant table
table = soup.select("#tblZIP")[0]
urls = [a["href"] for a in table.find_all("a", href=True)]
urls = [root + url for url in urls if "zip-code-" in url]
urls[0]

'https://www.zip-codes.com/zip-code/32301/zip-code-32301.asp'

In [3]:
# Visit each link to get the lat and long for each zip code in town
coords = {}
for url in urls:
    next = False
    zipcode = url.split("/zip-code-")[1].strip(".asp")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.select(".statTable")[0]
    tr = [tr.text for tr in table.find_all("tr")]
    for t in tr:
        if "Classification" in t:
            if "[Non-Unique]" not in t:
                next = True
    if next:
        continue  # absolutely convoluted
    for t in tr:
        if "Latitude" in t:
            latitude = t.strip("Latitude:")
        elif "Longitude" in t:
            longitude = t.strip("Longitude:")
    coords[zipcode] = (latitude, longitude)
# Hallelujah
coords

{'32301': ('30.42613', '-84.251652'),
 '32303': ('30.523084', '-84.332434'),
 '32304': ('30.456156', '-84.354432'),
 '32305': ('30.343062', '-84.292137'),
 '32308': ('30.472782', '-84.220316'),
 '32309': ('30.579313', '-84.1019'),
 '32310': ('30.384952', '-84.500468'),
 '32311': ('30.363739', '-84.163013'),
 '32312': ('30.579714', '-84.2056'),
 '32317': ('30.467774', '-84.126652')}

# Part Two: FourSquare API and Data Analysis

In [4]:
# @hidden_cell
CLIENT_ID = 'VWZMMMOLA50YBUWRTGPQQCHWL2VSYTWMW3JQ1WUPR42EBDPT' # your Foursquare ID
CLIENT_SECRET = 'YFP4MEWMOX2WHCISRKA0SRE1NY0PF4MAS0U0NPZJ5C0AHTB2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VWZMMMOLA50YBUWRTGPQQCHWL2VSYTWMW3JQ1WUPR42EBDPT
CLIENT_SECRET:YFP4MEWMOX2WHCISRKA0SRE1NY0PF4MAS0U0NPZJ5C0AHTB2


In [5]:
def get_nearby_venues(coords):
    RADIUS = 1000
    LIMIT = 100
    URL = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
    venues_list=[]
    for zipcode, coordinates in coords.items():
        print(zipcode, end=", ")
        lat, lng = coordinates[0], coordinates[1]
            
        # create the API request URL
        url = URL.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            zipcode, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Zipcode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return nearby_venues

In [6]:
tally_venues = get_nearby_venues(coords)
tally_venues.shape

32301, 32303, 32304, 32305, 32308, 32309, 32310, 32311, 32312, 32317, 

(73, 7)

In [7]:
tally_venues.groupby("Zipcode").count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32301,35,35,35,35,35,35
32303,4,4,4,4,4,4
32304,24,24,24,24,24,24
32308,5,5,5,5,5,5
32312,4,4,4,4,4,4
32317,1,1,1,1,1,1


In [8]:
print('There are {} uniques categories.'.format(len(tally_venues['Venue Category'].unique())))

There are 48 uniques categories.


In [9]:
# one hot encoding
tally_onehot = pd.get_dummies(tally_venues[['Venue Category']], prefix="", prefix_sep="")

# add zipcode to dataframe as first column
tally_onehot.insert(0, "Zipcode", tally_venues['Zipcode'])

tally_onehot.head()

Unnamed: 0,Zipcode,American Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,Bank,Brewery,Business Service,Café,Chinese Restaurant,...,Skating Rink,Soccer Field,Sporting Goods Shop,Stadium,Tennis Court,Thrift / Vintage Store,Track,Video Game Store,Video Store,Women's Store
0,32301,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,32301,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,32301,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,32301,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,32301,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
tally_onehot.shape

(73, 49)

In [11]:
tally_grouped = tally_onehot.groupby('Zipcode').mean().reset_index()
tally_grouped

Unnamed: 0,Zipcode,American Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,Bank,Brewery,Business Service,Café,Chinese Restaurant,...,Skating Rink,Soccer Field,Sporting Goods Shop,Stadium,Tennis Court,Thrift / Vintage Store,Track,Video Game Store,Video Store,Women's Store
0,32301,0.057143,0.0,0.0,0.028571,0.028571,0.028571,0.028571,0.028571,0.028571,...,0.0,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0
1,32303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32304,0.083333,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.041667,...,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.041667,0.041667
3,32308,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
4,32312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.25,0.25,0.0,0.25,0.0,0.0,0.0
5,32317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
tally_grouped.shape

(6, 49)

In [13]:
num_top_venues = 5

for hood in tally_grouped['Zipcode']:
    print("----"+hood+"----")
    temp = tally_grouped[tally_grouped['Zipcode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----32301----
                  venue  freq
0        Sandwich Place  0.11
1           Pizza Place  0.09
2           Golf Course  0.09
3   American Restaurant  0.06
4  Fast Food Restaurant  0.06


----32303----
                        venue  freq
0             Harbor / Marina  0.25
1                    Gun Shop  0.25
2  Construction & Landscaping  0.25
3          Country Dance Club  0.25
4         American Restaurant  0.00


----32304----
                  venue  freq
0  Fast Food Restaurant  0.12
1   American Restaurant  0.08
2        Discount Store  0.08
3        Sandwich Place  0.08
4   Rental Car Location  0.08


----32308----
                venue  freq
0  Salon / Barbershop   0.2
1         Video Store   0.2
2           Locksmith   0.2
3        Optical Shop   0.2
4  Athletics & Sports   0.2


----32312----
                 venue  freq
0                Track  0.25
1         Tennis Court  0.25
2              Stadium  0.25
3         Soccer Field  0.25
4  American Restaurant  0.00


--

In [14]:
# Function borrowed to sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [15]:
# More code borrowed to show the 10 most common venue types per neighborhood in a dataframe
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Zipcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Zipcode'] = tally_grouped['Zipcode']

for ind in np.arange(tally_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tally_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,32301,Sandwich Place,Golf Course,Pizza Place,American Restaurant,Fast Food Restaurant,Hotel,Mexican Restaurant,Park,Gym,Diner
1,32303,Harbor / Marina,Construction & Landscaping,Gun Shop,Country Dance Club,Automotive Shop,Bank,Athletics & Sports,Gun Range,Golf Course,Gas Station
2,32304,Fast Food Restaurant,American Restaurant,Discount Store,Fried Chicken Joint,Sandwich Place,Rental Car Location,Pharmacy,Auto Dealership,Chinese Restaurant,Cosmetics Shop
3,32308,Athletics & Sports,Video Store,Locksmith,Optical Shop,Salon / Barbershop,Women's Store,Department Store,Golf Course,Gas Station,Furniture / Home Store
4,32312,Track,Tennis Court,Stadium,Soccer Field,Women's Store,Cosmetics Shop,Golf Course,Gas Station,Furniture / Home Store,Fried Chicken Joint
5,32317,Intersection,Women's Store,Gym,Gun Range,Golf Course,Gas Station,Furniture / Home Store,Fried Chicken Joint,Fast Food Restaurant,Discount Store


# Part Three: K-Means Clustering

In [16]:
# set number of clusters
kclusters = 4

tally_grouped_clustering = tally_grouped.drop('Zipcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tally_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([1, 3, 1, 1, 0, 2])

In [17]:
coords = pd.DataFrame(coords).T
coords.columns = ["Latitude", "Longitude"]
coords.index.name = "Zipcode"
coords

Unnamed: 0_level_0,Latitude,Longitude
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1
32301,30.42613,-84.251652
32303,30.523084,-84.332434
32304,30.456156,-84.354432
32305,30.343062,-84.292137
32308,30.472782,-84.220316
32309,30.579313,-84.1019
32310,30.384952,-84.500468
32311,30.363739,-84.163013
32312,30.579714,-84.2056
32317,30.467774,-84.126652


In [18]:
# add clustering labels
try:   # this causes an error if run more than once
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
except ValueError:
    pass
    
tally_merged = coords
tally_merged = tally_merged.join(neighborhoods_venues_sorted.set_index('Zipcode'), on='Zipcode')

tally_merged.head(10) # check the last columns!

Unnamed: 0_level_0,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
32301,30.42613,-84.251652,1.0,Sandwich Place,Golf Course,Pizza Place,American Restaurant,Fast Food Restaurant,Hotel,Mexican Restaurant,Park,Gym,Diner
32303,30.523084,-84.332434,3.0,Harbor / Marina,Construction & Landscaping,Gun Shop,Country Dance Club,Automotive Shop,Bank,Athletics & Sports,Gun Range,Golf Course,Gas Station
32304,30.456156,-84.354432,1.0,Fast Food Restaurant,American Restaurant,Discount Store,Fried Chicken Joint,Sandwich Place,Rental Car Location,Pharmacy,Auto Dealership,Chinese Restaurant,Cosmetics Shop
32305,30.343062,-84.292137,,,,,,,,,,,
32308,30.472782,-84.220316,1.0,Athletics & Sports,Video Store,Locksmith,Optical Shop,Salon / Barbershop,Women's Store,Department Store,Golf Course,Gas Station,Furniture / Home Store
32309,30.579313,-84.1019,,,,,,,,,,,
32310,30.384952,-84.500468,,,,,,,,,,,
32311,30.363739,-84.163013,,,,,,,,,,,
32312,30.579714,-84.2056,0.0,Track,Tennis Court,Stadium,Soccer Field,Women's Store,Cosmetics Shop,Golf Course,Gas Station,Furniture / Home Store,Fried Chicken Joint
32317,30.467774,-84.126652,2.0,Intersection,Women's Store,Gym,Gun Range,Golf Course,Gas Station,Furniture / Home Store,Fried Chicken Joint,Fast Food Restaurant,Discount Store


In [20]:
# Define latitude and longtiude of Tallahassee
latitude = "30.4383"
longitude = "-84.2807"

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# drop nan values
tally_merged.dropna(inplace=True)


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tally_merged['Latitude'], tally_merged['Longitude'], tally_merged.index, tally_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=11,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters