# Download Your Untappd History Data

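This notebook depends on the `pandas`, `selenium`, `pytz`, and `python-dateutil` packages, plus a local `keys.py` module that defines `untappd_username` and `untappd_password`.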

In [1]:
import os
import time, random, pandas as pd, pytz
from dateutil import parser as date_parser
from datetime import datetime as dt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

from keys import untappd_username, untappd_password

In [2]:
# untappd pages used below: the login form and the profile page of the configured user
login_url = 'https://untappd.com/login'
profile_url = 'https://untappd.com/user/{}'.format(untappd_username)

# cap on how many check-ins to collect; zero means "no cap, collect them all"
limit_checkin_count = 0

In [3]:
# ids of the username and password input boxes on the login form
username_box_id, password_box_id = 'username', 'password'

# xpath templates for the page elements of interest, written one path step per line;
# each '{}' placeholder is filled in later with .format()
show_more_button_query = (
    '//a[@class="yellow button more_checkins more_checkins_logged track-click"]'
)
checkin_item_query = (
    '//div[@id="main-stream"]'
    '/div[@class="item"]'
)
beer_name_query = (
    '//div[@id="{}"]'
    '/div[@class="checkin"]'
    '/div[@class="top"]'
    '/p[@class="text"]'
    '/a'
)
count_query = (
    '//div[@class="stats-bar"]'
    '/div[@class="stats"]'
    '/a[@href="/user/{}"]'
    '/span[@class="stat"]'
)
rating_query = (
    '//div[@id="{}"]'
    '/div[@class="checkin"]'
    '/div[@class="top"]'
    '/p[@class="checkin-comment"]'
    '/span[contains(@class, "rating")]'
)
date_query = (
    '//div[@id="{}"]'
    '/div[@class="checkin"]'
    '/div[@class="feedback"]'
    '/div[@class="bottom"]'
    '/a[@class="time timezoner track-click"]'
)

In [4]:
# lower and upper bounds, in seconds, for each of the three pause lengths
short_pause_min, short_pause_max = 1, 2
medium_pause_min, medium_pause_max = 3, 4
long_pause_min, long_pause_max = 5, 6

In [5]:
# define the options for launching chrome
chrome_options = Options()
chrome_options.add_argument('--disable-extensions')

# the chrome binary and chromedriver locations are machine-specific, so each can be
# overridden with an environment variable; when the variable is not set, the original
# hard-coded value is used
chrome_options.binary_location = os.environ.get(
    'CHROME_BINARY',
    'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe')
executable_path = os.environ.get('CHROMEDRIVER_PATH', 'chromedriver.exe')

# whether to maximize the browser window right after launch
maximize_window = False

In [6]:
# pause helpers: sleep for a randomized amount of time
def pause(duration_min, duration_max):
    """Sleep for a random number of seconds between duration_min and duration_max."""
    time.sleep(random.uniform(duration_min, duration_max))

def pause_short():
    """Pause for a random duration between short_pause_min and short_pause_max."""
    pause(duration_min=short_pause_min, duration_max=short_pause_max)

def pause_medium():
    """Pause for a random duration between medium_pause_min and medium_pause_max."""
    pause(duration_min=medium_pause_min, duration_max=medium_pause_max)
    
def pause_long():
    """Pause for a random duration between long_pause_min and long_pause_max."""
    pause(duration_min=long_pause_min, duration_max=long_pause_max)

In [7]:
def get_details(names_urls):
    """Sort (name, url) pairs into beer, brewery, and venue details.

    A pair is classified by the marker found in its url: '/b/' for a beer,
    '/w/' for a brewery, '/v/' for a venue. The markers are tested in that
    order and the first match wins. If several pairs share a category, the
    last one is kept. Pairs without a url (None or empty, e.g. an anchor
    that has no href) and pairs matching no marker are ignored.

    Parameters
    ----------
    names_urls : iterable of (name, url) pairs

    Returns
    -------
    tuple
        (beer_name, beer_url, brewery_name, brewery_url, venue_name, venue_url),
        with empty strings for any category that was not found.
    """
    markers = ('/b/', '/w/', '/v/')

    # every category starts out as an empty name and an empty url
    found = dict((marker, ('', '')) for marker in markers)

    for name, url in names_urls:
        # a missing href would make the substring test below raise a TypeError
        if not url:
            continue
        for marker in markers:
            if marker in url:
                found[marker] = (name, url)
                break

    return found['/b/'] + found['/w/'] + found['/v/']

In [8]:
# determine the time the script started at (seconds since the epoch) and show it as local wall-clock time
start_time = time.time()
# print(...) with one parenthesized argument behaves the same on python 2 and also parses on python 3
print('start time {}'.format(dt.fromtimestamp(start_time).strftime('%H:%M:%S')))

start time 08:36:29


In [9]:
# start chrome through the chromedriver executable, using the options configured earlier
driver = webdriver.Chrome(executable_path=executable_path,
                          chrome_options=chrome_options)

# begin with no cookies in the session (only cookies are removed here, not the cache)
driver.delete_all_cookies()

# enlarge the browser window only when that was requested in the configuration
if maximize_window:
    driver.maximize_window()

In [10]:
# log into untappd
driver.get(login_url)
pause_short()

# fill in the username, then tab out of the box; find_element(By.ID, ...) is used so that
# element lookups follow the same By-based style as the rest of the notebook
username_box = driver.find_element(By.ID, username_box_id)
username_box.clear()
username_box.send_keys(untappd_username)
username_box.send_keys(Keys.TAB)
pause_short()

# fill in the password
password_box = driver.find_element(By.ID, password_box_id)
password_box.clear()
password_box.send_keys(untappd_password)
pause_short()

# pressing enter inside the password box submits the login form
password_box.send_keys(Keys.ENTER)

In [11]:
# go to the user's profile web page
# (the long pause runs first -- presumably to give the login submitted just before this time to complete)
pause_long()
driver.get(profile_url)

In [12]:
# get the count of total check-ins
pause_medium()
count_elements = driver.find_elements(By.XPATH, count_query.format(untappd_username))
if not count_elements:
    # explain the failure instead of letting an empty result raise a bare IndexError
    raise RuntimeError('could not find the check-in count on {} (did the login succeed?)'.format(driver.current_url))
checkin_count_item = count_elements[0]

# the displayed count may contain thousands separators, so strip commas before converting
checkin_count = int(checkin_count_item.text.replace(',', ''))
print('{:,}'.format(checkin_count))

1,430


In [13]:
# scroll down and click 'show more' until we find the total number of check-ins
pause_short()
count_found = 0
scroll_count = 0

# give up after this many consecutive rounds in which no new check-ins appeared, so that
# a mismatch between the profile's count and the loadable items cannot loop forever
max_stalled_rounds = 3
stalled_rounds = 0

def new_actions():
    """Return a fresh, empty ActionChains for the driver.

    A reused ActionChains object can keep its previously queued actions and
    replay them on later perform() calls, so every step builds its own chain.
    """
    return ActionChains(driver)

checkin_count = limit_checkin_count if limit_checkin_count > 0 else checkin_count
while count_found < checkin_count:

    # jump to the bottom of the page, where the 'show more' button is
    new_actions().key_down(Keys.END).key_up(Keys.END).perform()
    pause_short()

    # stop cleanly when the button no longer exists instead of raising an IndexError
    show_more_buttons = driver.find_elements(By.XPATH, show_more_button_query)
    if not show_more_buttons:
        print('no "show more" button found, stopping at {:,} check-ins'.format(count_found))
        break
    show_more_button = show_more_buttons[0]

    new_actions().move_to_element(show_more_button).perform()
    pause_short()

    new_actions().click(show_more_button).perform()
    pause_medium()

    scroll_count += 1
    previous_count = count_found
    count_found = len(driver.find_elements(By.XPATH, checkin_item_query))
    print('{} {:,}'.format(scroll_count, count_found))

    # track rounds that loaded nothing new and stop once there have been too many in a row
    if count_found > previous_count:
        stalled_rounds = 0
    else:
        stalled_rounds += 1
        if stalled_rounds >= max_stalled_rounds:
            print('no new check-ins after {} attempts, stopping'.format(stalled_rounds))
            break

# scroll to the bottom one last time after the final batch has loaded
pause_medium()
new_actions().key_down(Keys.END).key_up(Keys.END).perform()
pause_short()

1 40
2 90
3 165
4 240
5 315
6 440
7 740
8 1,090
9 1,430


In [14]:
# report how many total check-in items were found in the end, and the current time
checkin_items = driver.find_elements(By.XPATH, checkin_item_query)
current_time = time.time()

# print(...) with one parenthesized argument behaves the same on python 2 and also parses on python 3
print('found {:,} checkins'.format(len(checkin_items)))
print('current time {}'.format(dt.fromtimestamp(current_time).strftime('%H:%M:%S')))
print('elapsed time so far: {:,.1f} secs'.format(current_time-start_time))

found 1,430 checkins
current time 08:48:46
elapsed time so far: 736.7 secs


In [15]:
# loop through each check-in item and get the beer, brewery, and venue details
checkins = []
for checkin_item in checkin_items:

    # get the check-in id then the names and links for the beer, brewery, and venue
    checkin_item_id = checkin_item.get_attribute('id')
    text_items = driver.find_elements(By.XPATH, beer_name_query.format(checkin_item_id))
    names_urls = [(item.text, item.get_attribute('href')) for item in text_items]

    # get the beer, brewery, and venue details
    beer_name, beer_url, brewery_name, brewery_url, venue_name, venue_url = get_details(names_urls)

    # get the rating: it is read from the number that follows ' r' in the element's class
    # attribute, divided by 100. only the two expected failures are handled -- no rating
    # element or no ' r' part (IndexError) and a non-numeric value (ValueError) -- so that
    # a keyboard interrupt or an unrelated error is no longer silently swallowed
    try:
        rating_item = driver.find_elements(By.XPATH, rating_query.format(checkin_item_id))[0]
        rating = int(rating_item.get_attribute('class').split(' r')[1]) / 100.
    except (IndexError, ValueError):
        rating = None

    # then get the date, stored as a string in the element's data-gregtime attribute
    date_item = driver.find_elements(By.XPATH, date_query.format(checkin_item_id))[0]
    date = date_item.get_attribute('data-gregtime')

    # to get the style, public rating, public check-ins etc, you must visit the individual beer's page

    # now save the details to an object and append to the list
    # (the check-in id is the part of the element id after the first underscore)
    checkins.append({'checkin_id' : checkin_item_id.split('_')[1],
                     'beer_name' : beer_name,
                     'beer_url' : beer_url,
                     'brewery_name' : brewery_name,
                     'brewery_url' : brewery_url,
                     'venue_name' : venue_name,
                     'venue_url' : venue_url,
                     'rating' : rating,
                     'date' : date})

In [16]:
# see my 10th check-in, as an example
# (index -10 selects the tenth item counting from the end of the checkins list)
checkins[-10]

{'beer_link': u'https://untappd.com/b/greene-king-morland-old-speckled-hen/3121',
 'beer_name': u'Morland Old Speckled Hen',
 'brewery_link': u'https://untappd.com/w/greene-king/612',
 'brewery_name': u'Greene King',
 'checkin_id': u'17989067',
 'date': u'Sat, 15 Dec 2012 23:32:01 +0000',
 'rating': 3.5,
 'venue_link': u'https://untappd.com/v/schmidts-pub/191667',
 'venue_name': u"Schmidt's Pub"}

In [17]:
# calculate the end time and the elapsed time
end_time = time.time()

# print(...) with one parenthesized argument behaves the same on python 2 and also parses on python 3
print('end time {}'.format(dt.fromtimestamp(end_time).strftime('%H:%M:%S')))
print('elapsed time: {:,.1f} secs'.format(end_time-start_time))

end time 08:57:26
elapsed time: 1,257.3 secs


In [96]:
# turn the list of check-in dicts into a dataframe (one row per check-in)
df = pd.DataFrame(checkins)

# print(...) with one parenthesized argument behaves the same on python 2 and also parses on python 3
print('created {:,} rows'.format(len(df)))

created 1,430 rows


In [97]:
# convert each timestamp to pacific time
def parse_convert_date(date_string):
    """Parse a timestamp string and return it as a datetime in US/Pacific time.

    Parameters
    ----------
    date_string : str
        A timestamp that dateutil can parse, e.g. 'Sat, 15 Dec 2012 23:32:01 +0000'.

    Returns
    -------
    datetime.datetime
        A timezone-aware datetime in the 'US/Pacific' zone.
    """
    date_time = date_parser.parse(date_string)

    # a timestamp without an offset parses to a naive datetime, which cannot be converted
    # reliably; treat it as UTC
    # NOTE(review): UTC is an assumption based on the '+0000' offset in the sample data -- confirm
    if date_time.tzinfo is None:
        date_time = date_time.replace(tzinfo=pytz.utc)

    return date_time.astimezone(pytz.timezone('US/Pacific'))
    
# add the converted column and drop the raw one; the guard lets this cell be re-run
# without a KeyError once the 'date' column has already been dropped
if 'date' in df.columns:
    df['date_pacific_tz'] = df['date'].map(parse_convert_date)
    df = df.drop('date', axis=1)

You could potentially get the local timezone for each check-in venue's lat/long by using this Google API: https://developers.google.com/maps/documentation/timezone/intro

In [100]:
# preview the first rows of the dataframe
df.head()

Unnamed: 0,beer_url,beer_name,brewery_url,brewery_name,checkin_id,rating,venue_url,venue_name,date_pacific_tz
0,https://untappd.com/b/angel-city-brewery-angel...,Angel City IPA,https://untappd.com/w/angel-city-brewery/4406,Angel City Brewery,321628572,3.5,https://untappd.com/v/boeing-manor/509841,Boeing Manor,2016-06-09 20:43:09-07:00
1,https://untappd.com/b/cascade-brewery-co-austr...,Cascade Premium Light,https://untappd.com/w/cascade-brewery-co-austr...,Cascade Brewery Co. (Australia),321044810,2.0,https://untappd.com/v/golden-monkey/1201269,Golden Monkey,2016-06-08 01:45:36-07:00
2,https://untappd.com/b/naked-for-satan-naked-la...,Naked Lager,https://untappd.com/w/naked-for-satan/83569,Naked For Satan,321038174,3.75,https://untappd.com/v/naked-for-satan/86227,Naked For Satan,2016-06-07 22:39:59-07:00
3,https://untappd.com/b/venom-brewing-golden-ale...,Golden Ale,https://untappd.com/w/venom-brewing/219040,Venom Brewing,321031246,3.25,https://untappd.com/v/town-hall-hotel/52668,Town Hall Hotel,2016-06-07 21:25:57-07:00
4,https://untappd.com/b/matilda-bay-brewing-comp...,Fat Yak,https://untappd.com/w/matilda-bay-brewing-comp...,Matilda Bay Brewing Company,321022902,4.0,https://untappd.com/v/bowl-bowl/4803853,Bowl Bowl,2016-06-07 20:37:02-07:00


In [101]:
# save the dataset to csv, creating the output folder first if it does not exist yet
# (to_csv does not create missing folders)
output_path = 'data/untappd.csv'
output_folder = os.path.dirname(output_path)
if output_folder and not os.path.isdir(output_folder):
    os.makedirs(output_folder)
df.to_csv(output_path, index=False, encoding='utf-8')