In [1]:
import pandas as pd
import requests, json
from datetime import datetime as dt
from keys import foursquare_oauth_token

In [2]:
# Local file that receives the raw API responses.
# NOTE(review): 'foursqure' looks like a misspelling of 'foursquare'; the path is
# kept as-is because other code may already read this exact file name.
json_file = 'data/foursqure-checkins.json'

# Check-ins endpoint for the authenticated user. The two `{}` placeholders are
# filled, in order, with the OAuth token and the paging offset.
url_template = (
    'https://api.foursquare.com/v2/users/self/checkins'
    '?limit=250'
    '&oauth_token={}'
    '&v=20160104'
    '&offset={}'
)

In [3]:
data = []
offset = 0
while True:
    response = requests.get(url_template.format(foursquare_oauth_token, offset))
    if len(response.json()['response']['checkins']['items']) == 0:
        break
    data.append(response.json())
    print offset,
    offset += 250

0 250 500 750 1000 1250 1500 1750 2000 2250 2500 2750 3000 3250


In [4]:
# Persist the raw API responses as JSON indented by two spaces.
with open(json_file, 'w') as out_file:
    json.dump(data, out_file, indent=2)

## Now parse the responses and load into a dataframe

In [5]:
# Flatten every check-in of every stored response into one dict per row.
# Each row carries: venue_name, created_at, category (only when the venue lists
# at least one category) and one entry per name in location_components
# (None when the venue's location lacks that field).
location_components = ['city', 'state', 'country', 'lat', 'lng']
rows = []
for response in data:
    for item in response['response']['checkins']['items']:
        try:
            venue = item['venue']

            checkin = {}
            checkin['venue_name'] = venue['name']
            checkin['created_at'] = item['createdAt']

            # only the first listed category is recorded
            categories = venue['categories']
            if len(categories) > 0:
                checkin['category'] = categories[0]['name']

            location = venue['location']
            for component in location_components:
                checkin[component] = location[component] if component in location else None
        except (KeyError, TypeError):
            # Best-effort parsing: an item that lacks an expected field (or holds
            # a null where a container is expected) is skipped. Only these
            # data-shape errors are swallowed, so interrupts and genuine bugs
            # are no longer hidden the way a bare `except` hid them.
            continue
        else:
            rows.append(checkin)

In [6]:
# one row per parsed check-in
df_full = pd.DataFrame(rows)

# make each column that could contain non-ascii characters unicode
for text_column in ['category', 'city', 'state']:
    df_full[text_column] = df_full[text_column].astype(unicode)


def format_timestamp(epoch_seconds):
    """Render a unix timestamp as 'YYYY-MM-DD HH:MM' via dt.fromtimestamp (local time)."""
    return dt.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M')


# convert unix timestamp to date and time; the new column is appended last
df_full['datetime'] = df_full['created_at'].map(format_timestamp)

# drop the raw timestamp column and rename lng column to lon
df_full = df_full.drop('created_at', axis=1).rename(columns={'lng': 'lon'})

## Clean up the data set then save to CSV

In [7]:
# remove health care and private homes for privacy purposes
terms = ['doctor', 'emergency', 'urgent', 'hospital', 'medical', 'private']

# Lower-case the categories once, then flag every row whose category contains
# any of the terms by matching a single 'a|b|c' alternation pattern.
category_lower = df_full['category'].str.lower()
mask = category_lower.str.contains('|'.join(terms))

# keep only the rows that matched none of the terms
df_filtered = df_full[~mask]

In [8]:
# remove multiple check-ins at the same location
# (same venue name and same coordinates); of each group of duplicates only the
# last row in frame order is kept
same_place = ['venue_name', 'lat', 'lon']
df_unique = df_filtered.drop_duplicates(subset=same_place, keep='last')
# independent copy, so df_unique is not tied to df_filtered
df_unique = df_unique.copy()
df_unique.head()

Unnamed: 0,category,city,country,lat,lon,state,venue_name,datetime
0,Burger Joint,Fremont,United States,37.504152,-121.974019,CA,In-N-Out Burger,2016-01-03 19:20
1,Beach,Big Sur,United States,36.228096,-121.764583,CA,Partington Cove,2016-01-03 16:49
2,Trail,Big Sur,United States,36.250675,-121.800705,CA,Ewoldsen Trail,2016-01-03 16:49
3,Scenic Lookout,Big Sur,United States,36.159273,-121.671771,CA,McWay Falls,2016-01-03 16:46
4,Pub,Big Sur,United States,36.236673,-121.769847,CA,Big Sur Taphouse,2016-01-03 16:06


In [9]:
# write the de-duplicated check-ins to disk as UTF-8 text, without the index column
csv_file = 'data/foursquare-location-history.csv'
df_unique.to_csv(csv_file, encoding='utf-8', index=False)