In [1]:
from api_keys import untappd_URL, untappd_ID, untappd_SECRET   # private, local constants

import pickle
import requests
import time

In [2]:
# make a call to untappd's API to get User Feed data for a specific User ID
# This method is used in the main data getter, below (userFeedBatch())
def searchUserFeeds(userFeed_params, uid):
    """Make one GET request to untappd's User Feed endpoint for a single user.

    Parameters
    ----------
    userFeed_params : dict
        Query-string parameters passed to ``requests.get`` (limit, max_id,
        min_id and the API credentials, as built by ``userFeedBatch``).
    uid : str or int
        The user identifier appended to the ``/user/checkins/`` endpoint.

    Returns
    -------
    tuple
        ``(remaining_calls, response)`` on success, where ``remaining_calls``
        is the value of the ``X-Ratelimit-Remaining`` header (``None`` if the
        header is absent) and ``response`` is the ``requests.Response``.
        ``(0, 0)`` on any failure, so callers can test ``if not response``.
    """
    # build the query string
    query = untappd_URL + '/user/checkins/' + str(uid)

    # Never echo the API credentials into notebook output: saved outputs get shared.
    printable_params = {
        key: ('<redacted>' if key in ('client_id', 'client_secret') else value)
        for key, value in userFeed_params.items()
    }

    try:
        # a timeout keeps a stalled connection from hanging a multi-hour run
        response = requests.get(query, params=userFeed_params, timeout=30)
    except requests.RequestException as err:
        # Only the exception type is printed: its message can contain the full
        # URL, including the credentials in the query string.
        print(f"That GET request for user {uid} with params={printable_params} "
              f"failed before a response was received: {type(err).__name__}")
        return 0, 0

    # parse the response
    if response:  # a Response is truthy when its status code is below 400
        remaining_calls = response.headers.get('X-Ratelimit-Remaining')
        return remaining_calls, response

    print(f"That GET request for user {uid} with params={printable_params} "
          f"failed, with code: {response.status_code}")
    try:
        print(response.json())
    except ValueError:
        # the error body is not always JSON (e.g. an HTML page from a gateway)
        print(response.text)
    return 0, 0

In [3]:
def userFeedBatch(uid, builderDict, olderthan=None, newerthan=None, numCalls=5):
    '''
    Make up to @numCalls queries to untappd's User Feed for username @uid and
    accumulate the results in @builderDict[uid].

    @olderthan is sent as untappd's "max_id" param (page back in time from
    that checkin); @newerthan is sent as "min_id" (only checkins newer than
    that one). After every successful call "max_id" is advanced to the value
    returned in the response's pagination block, so consecutive calls walk
    further back through the feed.

    @builderDict[uid] is expected to hold the keys 'newestID', 'oldestID' and
    'datalist'. It is updated in place:
      * 'datalist' gets each raw JSON response appended,
      * 'oldestID' is set to the latest pagination max_id,
      * 'newestID' is set once, to the first checkin_id of the first
        non-empty page seen.

    The loop stops early when a response is missing an expected key or when
    a page holds fewer checkins than the requested limit. If a request fails
    outright, 'NO RESPONSE' is printed and the function returns immediately.

    Returns None; progress is reported with print().
    '''
    # the max 'limit' accepted by untappd is 50
    params = {'limit': 50, 'max_id': olderthan, 'min_id': newerthan,
              'client_id': untappd_ID, 'client_secret': untappd_SECRET}

    # Initialised up front so the summary line also works when numCalls <= 0.
    calls_made = 0
    calls_left = None

    for _ in range(numCalls):
        calls_left, response = searchUserFeeds(params, uid)
        if not response:
            print('NO RESPONSE')
            return
        calls_made += 1
        resp = response.json()

        # start pagination and rate limit bookkeeping
        try:
            # Read everything that is needed BEFORE touching builderDict, so a
            # malformed response never leaves the user's record half-updated.
            payload = resp['response']
            new_oldest = payload['pagination']['max_id']
            checkins = payload['checkins']
            count = checkins['count']
            user_entry = builderDict[uid]
            newest_id = user_entry['newestID']
            # this happens only first time; mark in this user's dict the youngest checkin seen
            if newest_id is None:
                items = checkins['items']
                if items:  # an empty feed has no newest checkin to record
                    newest_id = items[0]['checkin_id']

            params['max_id'] = new_oldest
            user_entry['oldestID'] = new_oldest
            user_entry['datalist'].append(resp)
            user_entry['newestID'] = newest_id
        except KeyError as kerr:
            print(f'No {kerr} was included in the response.')
            print(f'There are {calls_left} calls left.')
            print(f'The max_id passed in was {olderthan}.')
            print(f'The userID was {uid}')
            break

        if count < params['limit']:  # ran out of items, so break
            print(f"Last call response had a count of {count}")
            break

    print(f'Completed {calls_made} calls. Hourly calls remaining: {calls_left}')
    
    
      

#### Pickling utilities for generators

In [4]:
def unpickle_gen(pickled_filename):
    """Load a pickled sequence from disk and hand it back as a generator.

    The file is read completely (and closed) before this function returns;
    the returned generator then yields the stored elements in their saved
    order.
    """
    with open(pickled_filename, 'rb') as handle:
        stored_items = pickle.load(handle)
    return (element for element in stored_items)

def pickle_gen(gen, to_pkl_filename):
    """Write the remaining elements of a generator to disk, in order.

    The generator is consumed in the process: its elements are collected into
    a list, and that list is what gets pickled to @to_pkl_filename.
    """
    with open(to_pkl_filename, 'wb') as handle:
        remaining = list(gen)
        pickle.dump(remaining, handle)

In [5]:
# Get an ordered generator (ordered by users with highest total IPA checkins on first 607 IPAs in this case)
# This is only accurate if the pickled generator reflects the accurate current state of queried Users.
#  Otherwise you have to pare it down while keeping its order, using the cell below this.
# NOTE: the filename names the last completed part; the final cell of this notebook saves
#  'ipa_gen_after_part_13.pkl', so this path has to be bumped by hand before the next session.
ipa_user_gen = unpickle_gen('capstone_1/ipa_gen_after_part_12.pkl')

### Creating a new User generator when you've lost track of who's been generated and queried already

In [None]:
#######  THIS TAKES SIGNIFICANT TIME TO RUN (over a half hour, with 4-5 GB of userFeedDicts in 9 files) ######

from glob import glob

# Reload the most recently saved user ordering; it is filtered at the end of this cell.
ipa_user_gen = unpickle_gen('capstone_1/ipa_user_gen.pkl')

# Users that already have feed data in any of the saved userFeedDicts files.
queried = set()

# Scan every saved part and remember each user whose 'datalist' is non-empty,
# so those users are not handed out (and queried) a second time.
for file in glob('capstone_1/userFeeds/userFeedDicts*'):
    with open(file, 'rb') as f:
        feeds = pickle.load(f)
    queried.update(user for user, feed in feeds.items() if feed['datalist'])

# Same ordering as before, minus the users that were already queried.
# The filter is lazy: `queried` is consulted as each user is drawn.
ipa_user_gen = (user for user in ipa_user_gen if user not in queried)

In [25]:
# Number of distinct users found with a non-empty 'datalist' in the saved userFeedDicts files
len(queried)

5993

Well, that took over half an hour for the 4-5 GB of userFeedDicts through part_9. The inefficiency is partly because each part holds all 87K userDicts, even though only ~700 of them are queried and filled. At some point these files will be trimmed down to just the dicts that are queried in that part. For now, just save the most recent ipa_user_gen, which is through a (very short) part_9, so that this work does not have to be repeated later.

In [32]:
# Save the pared-down user ordering so the slow rebuild above does not have to be repeated
pickle_gen(ipa_user_gen, 'capstone_1/ipa_user_gen_thru_part_9.pkl')
# since the generator gets drained while pickling it, rebuild it right away
# (reloading the file just written restores the same users in the same order)
ipa_user_gen = unpickle_gen('capstone_1/ipa_user_gen_thru_part_9.pkl')

### Main loop for building User Feed dataset

In [6]:
# load a partially filled UserFeedDict
# Its keys (the user IDs) are what the initialization cell below iterates over
# to build a fresh, empty storage dict.
# NOTE: pickle.load can execute code embedded in the file; only load files you wrote yourself.
with open('capstone_1/userFeeds/userFeedDicts_part_9.pkl', 'rb') as f:
    userFeedDicts_part_9 = pickle.load(f)

Use the next cell to initialize a new, empty dict once the current one reaches about 500 MB and the notebook starts to slow down.

In [7]:
# Build an empty storage dict with one blank record per user ID found in userFeedDicts_part_9.
#  Each record starts with no pagination markers and an empty list of responses;
#  the main GET routine fills these records in as User Feeds are fetched.
userFeedDicts_part_13 = {
    uID: dict(newestID=None, oldestID=None, datalist=[])
    for uID in userFeedDicts_part_9
}

In [8]:
# Update this as files get too large, to use (fill) in the main GET routine, below
# This is an alias, not a copy: whatever the main loop writes through current_dict
# lands in userFeedDicts_part_13, which is the object pickled after the loop.
current_dict = userFeedDicts_part_13

In [11]:
#  ======= Can get about 500 users/day with 200 reviews per user = 100K checkins/day ============

#time.sleep(600)  # in case the call rate is too fast for the API, use this when restarting process
for batch in range(150):  # The argument here divided by about 23 is how many hours this cell will take
    try:
        uID = next(ipa_user_gen)
    except StopIteration:
        # Every remaining user has been handed out: end the run cleanly instead of
        # aborting the cell with a traceback hours into the loop.
        print(f'User generator exhausted after {batch} batches.')
        break
    # NOTE: a user drawn here is consumed from the generator even if the batch below
    # fails, so a failed user is not retried unless the generator is rebuilt.
    #                     (newerthan=current_dict[uID]['newestID'])   to fill in newer
    userFeedBatch(uID, current_dict, olderthan=current_dict[uID]['oldestID'], numCalls=4)
    print(f"Batch {batch} finished at {time.strftime('%H:%M')}")
    time.sleep(150)  # 4 calls per 150 secs = 96 calls per hour, with 100 being the API limit

Completed 4 calls. Hourly calls remaining: 8
Batch 0 finished at 23:53
Completed 4 calls. Hourly calls remaining: 8
Batch 1 finished at 23:56
Completed 4 calls. Hourly calls remaining: 8
Batch 2 finished at 23:58
Completed 4 calls. Hourly calls remaining: 8
Batch 3 finished at 00:01
Completed 4 calls. Hourly calls remaining: 8
Batch 4 finished at 00:03
Completed 4 calls. Hourly calls remaining: 8
Batch 5 finished at 00:06
Completed 4 calls. Hourly calls remaining: 8
Batch 6 finished at 00:09
Last call response had a count of 29
Completed 1 calls. Hourly calls remaining: 10
Batch 7 finished at 00:11
Completed 4 calls. Hourly calls remaining: 10
Batch 8 finished at 00:14
Completed 4 calls. Hourly calls remaining: 10
Batch 9 finished at 00:16
Completed 4 calls. Hourly calls remaining: 10
Batch 10 finished at 00:19
Completed 4 calls. Hourly calls remaining: 10
Batch 11 finished at 00:21
Completed 4 calls. Hourly calls remaining: 10
Batch 12 finished at 00:24
Completed 4 calls. Hourly calls

In [12]:
# pickle a UserFeedDict
# Writes the whole dict, including the users whose 'datalist' is still empty,
# and overwrites any existing file at this path.
with open('capstone_1/userFeeds/userFeedDicts_part_13.pkl', 'wb') as f:
    pickle.dump(userFeedDicts_part_13, f)

In [13]:
# How many users in this part have at least one response stored in their 'datalist'
users_13 = [
    user
    for user, feed in userFeedDicts_part_13.items()
    if feed['datalist']
]
len(users_13)

779

In [14]:
# List the saved userFeedDicts files with their sizes.
# The explicit % prefix keeps this working when automagic is switched off
# or when a Python variable named `ls` exists in the kernel.
%ls -l capstone_1/userFeeds/

total 10518512
-rw-r--r--  1 ethanhaley  staff  542487840 Oct 24 13:16 userFeedDicts.pkl
-rw-r--r--  1 ethanhaley  staff  396639770 Nov  6 18:07 userFeedDicts_part_10.pkl
-rw-r--r--  1 ethanhaley  staff  410871781 Nov  8 05:02 userFeedDicts_part_11.pkl
-rw-r--r--  1 ethanhaley  staff  414498152 Nov  9 11:21 userFeedDicts_part_12.pkl
-rw-r--r--  1 ethanhaley  staff  469531327 Nov 11 06:51 userFeedDicts_part_13.pkl
-rw-r--r--  1 ethanhaley  staff  471741274 Oct 26 12:28 userFeedDicts_part_2.pkl
-rw-r--r--  1 ethanhaley  staff  501215060 Oct 28 09:46 userFeedDicts_part_3.pkl
-rw-r--r--  1 ethanhaley  staff  413816034 Oct 29 21:21 userFeedDicts_part_4.pkl
-rw-r--r--  1 ethanhaley  staff  430760177 Oct 31 10:35 userFeedDicts_part_5.pkl
-rw-r--r--  1 ethanhaley  staff  397829802 Nov  1 22:22 userFeedDicts_part_6.pkl
-rw-r--r--  1 ethanhaley  staff  381633389 Nov  3 10:21 userFeedDicts_part_7.pkl
-rw-r--r--  1 ethanhaley  staff  400218504 Nov  4 16:11 userFeedDicts_part_8.pkl
-rw

In [15]:
# closing notebook, saving generator in current state
# NOTE: pickle_gen consumes the generator, so ipa_user_gen is empty after this line;
#  reload it with unpickle_gen on this file before drawing more users.
pickle_gen(ipa_user_gen, 'capstone_1/ipa_gen_after_part_13.pkl')

# TODO: part 13 is the first part not included in the parsed/glued DataFrames.
#  Use the first 12 parts for analysis, and parts 13 onward for validation/testing.