# Part 1: Download Your Untappd History Data



In [1]:
import os
import random
import time
from datetime import datetime as dt

import pandas as pd
import pytz
from dateutil import parser as date_parser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from keys import untappd_username, untappd_password

In [2]:
# maximum number of check-ins to collect; zero (or less) means collect all of them
limit_checkin_count = 0

# the page that holds the login form
login_url = 'https://untappd.com/login'
# the profile page of the account named in keys.py; the check-in feed is read from it
profile_url = 'https://untappd.com/user/{}'.format(untappd_username)

In [3]:
# html element ids of the username and password input boxes looked up after opening login_url
username_box_id = 'username'
password_box_id = 'password'

# xpath queries to find the html elements of interest
# (a '{}' placeholder is filled in with str.format() before the query is run)
# the 'show more' button that loads further check-ins
show_more_button_query = '//a[@class="yellow button more_checkins more_checkins_logged track-click"]'
# every check-in item inside the main stream
checkin_item_query = '//div[@id="main-stream"]/div[@class="item"]'
# the links in a check-in's text line; {} = the html id of the check-in item
beer_name_query = '//div[@id="{}"]/div[@class="checkin"]/div[@class="top"]/p[@class="text"]/a'
# the total check-in count in the stats bar; {} = the untappd username
count_query = '//div[@class="stats-bar"]/div[@class="stats"]/a[@href="/user/{}"]/span[@class="stat"]'
# the rating element of a check-in; {} = the html id of the check-in item
rating_query = '//div[@id="{}"]/div[@class="checkin"]/div[@class="top"]/p[@class="checkin-comment"]/span[contains(@class, "rating")]'
# the timestamp link of a check-in; {} = the html id of the check-in item
date_query = '//div[@id="{}"]/div[@class="checkin"]/div[@class="feedback"]/div[@class="bottom"]/a[@class="time timezoner track-click"]'

In [4]:
# pause durations in seconds, one (minimum, maximum) pair per pause length
short_pause_min, short_pause_max = 1, 2
medium_pause_min, medium_pause_max = 3, 4
long_pause_min, long_pause_max = 5, 6

In [5]:
# define the options for launching chrome
chrome_options = Options()
# start chrome with its extensions disabled
chrome_options.add_argument('--disable-extensions')
# NOTE(review): machine-specific absolute path to the chrome binary -- adjust it to your own install
chrome_options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
# the chromedriver executable that is handed to webdriver.Chrome
executable_path = 'chromedriver.exe'
# when True, the browser window is maximized right after launch
maximize_window = False

In [6]:
# define pause functions
def pause(duration_min, duration_max):
    """Sleep for a random number of seconds between duration_min and duration_max.

    The length of the sleep is drawn with random.random(), so it changes from
    call to call. Returns nothing.
    """
    spread = duration_max - duration_min
    time.sleep(duration_min + random.random() * spread)

def pause_short():
    """Sleep for a random time between short_pause_min and short_pause_max seconds."""
    pause(duration_min=short_pause_min, duration_max=short_pause_max)

def pause_medium():
    """Sleep for a random time between medium_pause_min and medium_pause_max seconds."""
    pause(duration_min=medium_pause_min, duration_max=medium_pause_max)
    
def pause_long():
    """Sleep for a random time between long_pause_min and long_pause_max seconds."""
    pause(duration_min=long_pause_min, duration_max=long_pause_max)

In [7]:
def get_details(names_urls):
    """Sort the (name, url) pairs of one check-in into beer, brewery, and venue.

    names_urls is an iterable of (name, url) pairs. A url containing '/b/' is
    taken to be the beer, otherwise '/w/' the brewery, otherwise '/v/' the
    venue. When several pairs match the same kind, the last one wins. Pairs
    whose url is missing (None or empty) or matches no marker are ignored.

    Returns the 6-tuple (beer_name, beer_url, brewery_name, brewery_url,
    venue_name, venue_url), with an empty string for anything not found.
    """
    # url markers in the order they are tested, paired with the slot they fill
    markers = (('/b/', 'beer'), ('/w/', 'brewery'), ('/v/', 'venue'))

    # every slot starts out as an empty name and an empty url
    found = {'beer': ('', ''), 'brewery': ('', ''), 'venue': ('', '')}

    for name, url in names_urls:
        # a link without an href comes through as None; skip it instead of
        # letting the substring test below raise a TypeError
        if not url:
            continue
        for marker, kind in markers:
            if marker in url:
                found[kind] = (name, url)
                # only the first matching marker counts for this pair
                break

    return found['beer'] + found['brewery'] + found['venue']

In [8]:
# remember when the run began, so later cells can report the elapsed time
start_time = time.time()
start_clock = dt.fromtimestamp(start_time).strftime('%H:%M:%S')
print('start time {}'.format(start_clock))

start time 11:46:47


In [9]:
# launch the chrome driver, then clear its cookies (nothing here clears the cache)
# NOTE(review): newer selenium releases deprecate the executable_path and chrome_options
# keywords in favour of service= and options= -- confirm against the installed version
driver = webdriver.Chrome(executable_path=executable_path, chrome_options=chrome_options)
driver.delete_all_cookies()
# maximizing is optional and controlled by the maximize_window setting
if maximize_window:
    driver.maximize_window()

In [10]:
# log into untappd
driver.get(login_url)
pause_short()

# type the username, then tab out of the box
# (find_element(By.ID, ...) matches the By-based lookups used in the rest of this
# notebook and, unlike find_element_by_id, is still available in selenium 4)
username_box = driver.find_element(By.ID, username_box_id)
username_box.clear()
username_box.send_keys(untappd_username)
username_box.send_keys(Keys.TAB)
pause_short()

# type the password
password_box = driver.find_element(By.ID, password_box_id)
password_box.clear()
password_box.send_keys(untappd_password)
pause_short()

# press enter in the password box to send the login form
password_box.send_keys(Keys.ENTER)

In [11]:
# go to the user's profile web page
# (the long pause comes first, presumably to let the login from the previous cell finish)
pause_long()
driver.get(profile_url)

In [12]:
# read the total number of check-ins shown in the profile's stats bar
pause_medium()
count_xpath = count_query.format(untappd_username)
checkin_count_item = driver.find_elements(By.XPATH, count_xpath)[0]
# the count is displayed with thousands separators, which int() cannot parse
count_text = checkin_count_item.text
checkin_count = int(count_text.replace(',', ''))
print('{:,}'.format(checkin_count))

1,480


In [13]:
# scroll down and click 'show more' until we find the total number of check-ins
pause_short()
count_found = 0
scroll_count = 0
# NOTE(review): this one ActionChains object is reused for every perform() below;
# depending on the selenium version, previously queued actions may be replayed on
# each perform() -- confirm, and call actions.reset_actions() if that is not wanted
actions = ActionChains(driver)

# never aim for more check-ins than the profile reports, or the loop could not finish
if limit_checkin_count > 0:
    checkin_count = min(limit_checkin_count, checkin_count)

# stop after this many clicks in a row that load nothing new, so a profile count
# that disagrees with the feed cannot keep the loop running forever
max_stalled_scrolls = 3
stalled_scrolls = 0

# until you've found all the check-ins you expect, scroll down, click 'show more' button, repeat
while count_found < checkin_count:

    actions.key_down(Keys.END).key_up(Keys.END).perform()
    pause_short()

    # find_elements returns an empty list when the button is not on the page;
    # without it there is nothing more to load, so stop instead of raising IndexError
    show_more_buttons = driver.find_elements(By.XPATH, show_more_button_query)
    if not show_more_buttons:
        count_found = len(driver.find_elements(By.XPATH, checkin_item_query))
        print('no "show more" button found, stopping at {:,} check-ins'.format(count_found))
        break
    show_more_button = show_more_buttons[0]

    actions.move_to_element(show_more_button).perform()
    pause_short()

    actions.click(show_more_button).perform()
    pause_medium()

    previous_count = count_found
    scroll_count += 1
    count_found = len(driver.find_elements(By.XPATH, checkin_item_query))
    print('{} {:,}'.format(scroll_count, count_found))

    # keep track of clicks that did not bring in any new check-ins
    if count_found > previous_count:
        stalled_scrolls = 0
    else:
        stalled_scrolls += 1
        if stalled_scrolls >= max_stalled_scrolls:
            print('no new check-ins after {} clicks, stopping'.format(stalled_scrolls))
            break

# scroll to the bottom one last time
pause_medium()
actions.key_down(Keys.END).key_up(Keys.END).perform()
pause_short()

1 40
2 90
3 165
4 240
5 365
6 615
7 915
8 1,265
9 1,480


In [14]:
# report how many total check-in items were found in the end, and the current time
checkin_items = driver.find_elements(By.XPATH, checkin_item_query)

# 'show more' loads check-ins in batches, so the page can hold more of them than
# the limit asked for; keep only the first limit_checkin_count items in page order
if limit_checkin_count > 0:
    checkin_items = checkin_items[:limit_checkin_count]

current_time = time.time()
print('found {:,} checkins'.format(len(checkin_items)))
print('current time {}'.format(dt.fromtimestamp(current_time).strftime('%H:%M:%S')))
print('elapsed time so far: {:,.1f} secs'.format(current_time-start_time))

found 1,480 checkins
current time 12:07:03
elapsed time so far: 1,216.6 secs


In [15]:
# loop through each check-in item and get the beer, brewery, and venue details
checkins = []
for checkin_item in checkin_items:

    # get the check-in id then the names and links for the beer, brewery, and venue
    checkin_item_id = checkin_item.get_attribute('id')
    text_items = driver.find_elements(By.XPATH, beer_name_query.format(checkin_item_id))
    names_urls = [(item.text, item.get_attribute('href')) for item in text_items]

    # get the beer, brewery, and venue details
    beer_name, beer_url, brewery_name, brewery_url, venue_name, venue_url = get_details(names_urls)

    # get the rating, if the check-in has one: it is read from the rating element's
    # class attribute, where ' r' is followed by the rating multiplied by 100
    try:
        rating_item = driver.find_elements(By.XPATH, rating_query.format(checkin_item_id))[0]
        rating = int(rating_item.get_attribute('class').split(' r')[1]) / 100.
    except (IndexError, ValueError):
        # IndexError: there is no rating element, or its class has no ' r' part
        # ValueError: what follows ' r' is not a whole number
        # anything else (a keyboard interrupt, a driver failure) is a real problem
        # and is left to propagate rather than being recorded as "no rating"
        rating = None

    # then get the date
    date_item = driver.find_elements(By.XPATH, date_query.format(checkin_item_id))[0]
    date = date_item.get_attribute('data-gregtime')

    # to get the style, public rating, public check-ins etc, you must visit the individual beer's page

    # now save the details to an object and append to the list
    # (the check-in id is the part of the html id after the first underscore)
    checkins.append({'checkin_id' : checkin_item_id.split('_')[1],
                     'beer_name' : beer_name,
                     'beer_url' : beer_url,
                     'brewery_name' : brewery_name,
                     'brewery_url' : brewery_url,
                     'venue_name' : venue_name,
                     'venue_url' : venue_url,
                     'rating' : rating,
                     'date' : date})

In [16]:
# all done, shut the webdriver down
# (quit() closes every window and ends the chromedriver process as well; close()
# only closes the current window and would leave the driver process running)
driver.quit()

In [17]:
# note when the run finished and how long it took from start to end
end_time = time.time()
elapsed_secs = end_time - start_time
end_clock = dt.fromtimestamp(end_time).strftime('%H:%M:%S')
print('end time {}'.format(end_clock))
print('elapsed time: {:,.1f} secs'.format(elapsed_secs))

end time 12:20:06
elapsed time: 1,999.2 secs


In [18]:
# see my 10th check-in, as an example
# (the dates in the output suggest the list runs newest first, so the 10th item
# from the end is the 10th earliest; this raises IndexError with fewer than 10 check-ins)
checkins[-10]

{'beer_name': 'Morland Old Speckled Hen',
 'beer_url': 'https://untappd.com/b/greene-king-morland-old-speckled-hen/3121',
 'brewery_name': 'Greene King',
 'brewery_url': 'https://untappd.com/w/greene-king/612',
 'checkin_id': '17989067',
 'date': 'Sat, 15 Dec 2012 23:32:01 +0000',
 'rating': 3.5,
 'venue_name': "Schmidt's Pub",
 'venue_url': 'https://untappd.com/v/schmidts-pub/191667'}

In [19]:
# build a dataframe with one row per check-in dict and one column per key
df = pd.DataFrame(checkins)
row_count = len(df)
print('created {:,} rows'.format(row_count))

created 1,480 rows


In [20]:
# convert each timestamp to pacific time
def parse_convert_date(date_string):
    """Parse a timestamp string and return it as a US/Pacific datetime.

    date_string is parsed with dateutil, e.g. 'Sat, 15 Dec 2012 23:32:01 +0000'.
    Returns a timezone-aware datetime in the US/Pacific zone. Whatever
    dateutil raises for a string it cannot parse is passed on to the caller.
    """
    date_time = date_parser.parse(date_string)

    # a string without a utc offset parses to a naive datetime, which astimezone()
    # would either reject or interpret in this machine's local zone; treat it as
    # utc instead so the result does not depend on where the notebook is run
    # NOTE(review): utc is assumed because the sample timestamps carry +0000 -- confirm
    if date_time.tzinfo is None:
        date_time = date_time.replace(tzinfo=pytz.utc)

    return date_time.astimezone(pytz.timezone('US/Pacific'))
    
# add the converted timestamps, then drop the raw strings they came from
# (the check keeps this cell safe to re-run: once 'date' has been converted and
# dropped there is nothing left to do, instead of failing with a KeyError)
if 'date' in df.columns:
    df['date_pacific_tz'] = df['date'].map(parse_convert_date)
    df = df.drop('date', axis=1)

In [21]:
# preview the first rows, including the new date_pacific_tz column
df.head()

Unnamed: 0,beer_name,beer_url,brewery_name,brewery_url,checkin_id,rating,venue_name,venue_url,date_pacific_tz
0,McRed,https://untappd.com/b/humboldt-regeneration-mc...,Humboldt Regeneration,https://untappd.com/w/humboldt-regeneration/49462,332182456,3.75,,,2016-07-04 18:59:34-07:00
1,Earth Thirst,https://untappd.com/b/eel-river-brewing-co-ear...,Eel River Brewing Co.,https://untappd.com/w/eel-river-brewing-co/481,331728849,2.5,Eel River Brewing Company,https://untappd.com/v/eel-river-brewing-compan...,2016-07-03 19:00:03-07:00
2,Organic Amber Ale,https://untappd.com/b/eel-river-brewing-co-org...,Eel River Brewing Co.,https://untappd.com/w/eel-river-brewing-co/481,331728103,2.5,Eel River Brewing Company,https://untappd.com/v/eel-river-brewing-compan...,2016-07-03 18:58:48-07:00
3,Emerald Triangle IPA,https://untappd.com/b/eel-river-brewing-co-eme...,Eel River Brewing Co.,https://untappd.com/w/eel-river-brewing-co/481,331692205,4.0,Eel River Brewing Company,https://untappd.com/v/eel-river-brewing-compan...,2016-07-03 18:04:39-07:00
4,Blonde Ale,https://untappd.com/b/eelriver-brewing-company...,Eelriver Brewing Company,https://untappd.com/w/eelriver-brewing-company...,331691575,3.5,Eel River Brewing Company,https://untappd.com/v/eel-river-brewing-compan...,2016-07-03 18:03:50-07:00


In [22]:
# save the dataset to csv
# (to_csv does not create missing folders, so make sure the output folder exists
# first rather than losing the scraped data to an error at the very last step)
if not os.path.isdir('data'):
    os.makedirs('data')
df.to_csv('data/untappd.csv', index=False, encoding='utf-8')