### Collect Venue data from FourSquare:
- See the FourSquare developer documentation for more information: https://developer.foursquare.com/docs/api

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt

### 1. start here with a list of US counties

In [3]:
# Load the list of US counties; the lat/lng columns are used later as the
# centre points of the venue searches.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
dfa = pd.read_csv(
    filepath_or_buffer="C:\\Users\\fumei_000\\Documents\\DSprojects\\DM_VenueProfiles\\county_120.csv"
)
dfa.head(5)

Unnamed: 0,countyid,rank,st,county,highrate,address,lat,lng
0,0500000US16055,44,ID,Kootenai,0,"Kootenai County, ID",47.654857,-116.716292
1,0500000US01073,47,AL,Jefferson,0,"Jefferson County, AL",33.522818,-86.916451
2,0500000US44003,4,RI,Kent,0,"Kent County, RI",41.672291,-71.602762
3,0500000US27037,55,MN,Dakota,0,"Dakota County, MN",44.666655,-93.044911
4,0500000US41047,26,OR,Marion,0,"Marion County, OR",44.896442,-122.720325


### 2. retrieve nearby venue data for each County - calling FourSquare API  

In [8]:
import requests

from IPython.display import Image 
from IPython.core.display import HTML 
    
from pandas.io.json import json_normalize

In [9]:
import os

# FourSquare credentials are read from the environment so that real keys never
# have to be saved inside the notebook. When a variable is not set, the
# placeholder 'xxxxx' is used and must be replaced before calling the API.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'xxxxx') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'xxxxx') # your Foursquare Secret
VERSION = '20180604' # value sent as the API "v" parameter

In [10]:
# define a function to repeat same process to all loc_id
# not all listings have all items (eg. some missing postalCode, categories) 
def getNearbyHealthyVenues(loc_ids, latitudes, longitudes, search_query, radius=50000, limit=500):
    """Search FourSquare for venues matching a keyword around each location.

    One request is sent to the v2 ``venues/search`` endpoint per location, and
    every venue in the reply is flattened into one output row.

    Parameters
    ----------
    loc_ids, latitudes, longitudes : iterables
        Identifier and coordinates of each search centre. They are iterated in
        parallel with ``zip``, so iteration stops at the shortest one.
    search_query : str
        Keyword sent as the ``query`` parameter.
    radius : int, optional
        Search radius in metres (the default 50000 m is about 30 mi).
    limit : int, optional
        Maximum number of venues requested per call.
        NOTE(review): the API may cap this well below 500 - confirm against
        the FourSquare documentation.

    Returns
    -------
    pandas.DataFrame
        Columns ``loc_id, venueid, venue, latitude, longitude, state,
        category``; empty (but with these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If FourSquare answers with an HTTP error status, instead of failing
        later with an unexplained ``KeyError``.
    requests.Timeout
        If a request gets no answer within 30 seconds.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/search'
    vcolumns = ['loc_id','venueid','venue','latitude','longitude','state','category']

    venues_list = []
    for loc_id, lat, lng in zip(loc_ids, latitudes, longitudes):
        # Passing a params dict lets requests URL-encode every value, so search
        # terms containing spaces or '&' cannot corrupt the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(lat, lng),
            'v': VERSION,
            'query': search_query,
            'radius': radius,
            'limit': limit,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        ven = response.json()['response']['venues']

        for v in ven:
            # not all listings have all items, so optional fields are read defensively
            location = v.get('location', {})
            venues_list.append([
                loc_id,
                v['id'],
                v['name'],
                location.get('lat'),
                location.get('lng'),
                get_alt_state(v),
                get_alt_category(v),
            ])

    return pd.DataFrame(data=venues_list, columns=vcolumns)

In [11]:
# define a function that extracts the category of the venue
def get_alt_category(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : dict
        A venue record, read as ``row['categories'][0]['name']``.

    Returns
    -------
    str
        The category name, or the string ``'None'`` when the record has no
        usable category entry.
    """
    try:
        return row['categories'][0]['name']
    except (KeyError, IndexError, TypeError):
        # missing key, empty category list, or a value that cannot be indexed;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate
        return 'None'

In [12]:
# define a function that extracts the state of the venue
def get_alt_state(row):
    """Return the state recorded in a venue's location.

    Parameters
    ----------
    row : dict
        A venue record, read as ``row['location']['state']``.

    Returns
    -------
    str
        The state value, or an empty string when the record has no usable
        location/state entry.
    """
    try:
        return row['location']['state']
    except (KeyError, IndexError, TypeError):
        # missing key or a value that cannot be indexed; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate
        return ''

In [None]:
# 10 candidate key words; only the first 7 are searched in this cell.
# 950 free regular API calls per day, and each key word costs one call per
# county in dfa - presumably the reason for stopping at 7 (confirm).
searches = ['health','Athletics','yoga','park','market','burger','BBQ','mcdonald','donuts','fitness']
for i, search_term in enumerate(searches[:7]):
    venues = getNearbyHealthyVenues(
        loc_ids=dfa['countyid'],
        latitudes=dfa['lat'],
        longitudes=dfa['lng'],
        search_query=search_term,
    )
    # the first key word starts the combined table, later ones are appended to it
    us_venues = venues if i == 0 else pd.concat([us_venues, venues], axis=0)

    print(search_term)
    print(venues.shape)

### 3. process venue data 

In [21]:
# drop rows that repeat an earlier row in every column (first occurrence is kept)
us_venues = us_venues.drop_duplicates()

This is all the venue data found within about 30 mi of each of the 120 counties.

In [29]:
# The same venue can be listed under more than one county (search areas
# overlap), and that's ok: compare the table shape with the distinct venue ids.
ds = us_venues
print(ds.shape, ds['venueid'].nunique(), sep='\n')

(28354,)
24469


In [22]:
# Rebind ds to a renamed copy instead of renaming in place: ds was assigned
# directly from us_venues, so an in-place rename would also change the column
# name on us_venues and break a re-run of the collection cell.
ds = ds.rename(columns={'loc_id':'countyid'})
ds.to_csv("xxxxx", index=False) # save results as file

In [4]:
# report how many distinct categories occur in the venue table
print('There are {} uniques categories.'.format(len(ds['category'].unique())))

# number of venue rows per category, as a two-column table (category, v_count)
cat = (
    ds[['venueid', 'category']]
    .groupby(['category'], as_index=False)
    .count()
    .rename(columns={'venueid': 'v_count'})
)

There are 519 uniques categories.
