# This getter accumulates responses from calls to untappd's User Beer endpoint.
### The goal of these calls is to get as many global IPA ratings as possible, and possibly some extra beer descriptions, in as short a time as possible, for feature analysis (User Checkins only provided user ratings).

In [1]:
import pandas as pd
import pickle
import requests
import time

from api_keys import untappd_URL, untappd_ID, untappd_SECRET   # private, local constants


In [2]:
# make a call to untappd's API to get User Beer data for a specific User ID
# This method is used in the main data getter, below (userBeerBatch())
def searchUserBeers(userBeer_params, uid):
    '''
    Send one GET request to untappd's User Beers endpoint for @uid.

    @userBeer_params is passed to requests.get() as the query-string
    parameters (the caller in this file supplies 'limit', 'offset',
    'client_id' and 'client_secret').

    Returns (remaining_calls, response) on success, where remaining_calls
    is the raw 'X-Ratelimit-Remaining' header value (None if the header
    is absent) and response is the requests Response object.
    Returns (0, 0) on failure, so the caller can test the second element
    for truthiness.
    '''
    # build the query string
    method_endpoint = '/user/beers/' + str(uid)
    query = untappd_URL + method_endpoint
    response = requests.get(query, userBeer_params)

    # parse the response
    if response:  # a Response is truthy when its status code is below 400
        # .get() so a missing header cannot abort a long-running collection
        remaining_calls = response.headers.get('X-Ratelimit-Remaining')
        return remaining_calls, response

    # never echo the API credentials into the notebook output
    safe_params = {key: ('***' if key in ('client_id', 'client_secret') else value)
                   for key, value in userBeer_params.items()}
    print(f"That GET request for user {uid} with params={safe_params.items()} "
          f"failed, with code: {response.status_code}")
    try:
        print(response.json())
    except ValueError:
        # error bodies are not always JSON (e.g. an HTML gateway page)
        print(response.text)
    return 0, 0

In [3]:
def userBeerBatch(uid, builderDict, start_offset=0, numCalls=5):
    '''
    This method makes up to @numCalls queries to untappd's User Beers
    endpoint for username @uid, starting at @start_offset and advancing
    the offset by the number of beers each call returned, to let untappd
    know where to start the next call.

    Each parsed JSON response is appended to the list stored under @uid
    in the @builderDict passed into this method (the list is created if
    @uid is not yet a key).

    The loop stops early when a request fails, when a response carries
    no beer count, or when a response holds fewer beers than the
    requested limit (the user's list is exhausted).

    Returns None; progress is reported with print().
    '''
    # the max 'limit' accepted by untappd is 50
    params = {'limit': 50, 'offset': start_offset,
              'client_id': untappd_ID, 'client_secret': untappd_SECRET}

    # tracked explicitly so the summary line is valid even when numCalls <= 0
    calls_made = 0
    calls_left = None

    for _ in range(numCalls):
        calls_left, response = searchUserBeers(params, uid)
        if not response:
            print('NO RESPONSE')
            return
        calls_made += 1
        resp = response.json()
        builderDict.setdefault(uid, []).append(resp)
        try:
            count = resp['response']['beers']['count']
        except KeyError as kerr:
            print(f'No {kerr} was included in the response.')
            print(f'There are {calls_left} calls left.')
            print(f'The last offset passed in was {params["offset"]}.')
            print(f'The userID was {uid}')
            break

        params['offset'] += count
        if count < params['limit']:  # ran out of items, so break
            print(f"Last call response had a count of {count}")
            break

    print(f'Completed {calls_made} calls. Hourly calls remaining: {calls_left}')
    


In [37]:
# one-shot iterator over the distinct usernames in checkin df_0, which had the most IPA drinkers
userbeer_gen = iter(pd.read_csv('capstone_1/checkins/df_0')['user.user_name'].unique())

In [4]:
# one-shot iterator over the distinct usernames in checkin df_2, which had the second most IPA drinkers
userbeer_gen = iter(pd.read_csv('capstone_1/checkins/df_2')['user.user_name'].unique())

In [5]:
# drain the iterator into a dict (one empty response list per user) so the users
# can be counted; dict insertion order keeps the sequence for regenerating later
curr_dic = {name: [] for name in userbeer_gen}
len(curr_dic)

756

In [6]:
# the iterator above was consumed by the count, so rebuild it from the dict keys
userbeer_gen = iter(curr_dic)

In [9]:
#  ======= Can get about 500 beers/user * 10 users/hour so 100K beers/day with some overlap ============

#time.sleep(600)  # in case the call rate is too fast for the API, use this when restarting process
# The range() argument divided by about 10 is how many hours this cell will take.
# zip() ends the loop quietly once userbeer_gen has no users left, instead of
# next() raising StopIteration part-way through a run. range() comes first so
# that no extra user is pulled from the iterator when the batch limit is hit.
for batch, uID in zip(range(80), userbeer_gen):
    userBeerBatch(uID, curr_dic, numCalls=10)
    print(f'Batch {batch} finished at {time.asctime()[11:16]}')
    time.sleep(360)  # 10 calls per 360 secs = 100 calls per hour, the API limit

Completed 10 calls. Hourly calls remaining: 81
Batch 0 finished at 22:25
Completed 10 calls. Hourly calls remaining: 81
Batch 1 finished at 22:31
Completed 10 calls. Hourly calls remaining: 71
Batch 2 finished at 22:37
Completed 10 calls. Hourly calls remaining: 61
Batch 3 finished at 22:43
Completed 10 calls. Hourly calls remaining: 51
Batch 4 finished at 22:49
Completed 10 calls. Hourly calls remaining: 41
Batch 5 finished at 22:55
Completed 10 calls. Hourly calls remaining: 31
Batch 6 finished at 23:01
Last call response had a count of 11
Completed 6 calls. Hourly calls remaining: 25
Batch 7 finished at 23:07
Completed 10 calls. Hourly calls remaining: 15
Batch 8 finished at 23:14
Completed 10 calls. Hourly calls remaining: 5
Batch 9 finished at 23:20
Completed 10 calls. Hourly calls remaining: 5
Batch 10 finished at 23:26
Completed 10 calls. Hourly calls remaining: 5
Batch 11 finished at 23:32
Completed 10 calls. Hourly calls remaining: 5
Batch 12 finished at 23:38
Completed 10 cal

### Time to inspect the results

In [10]:
# keep only the users that have at least one stored response
data = [responses for responses in curr_dic.values() if responses]
len(data)   # 836 for all of batch_0

160

In [48]:
# flatten the beer items of the first stored response (first user with data)
# into a DataFrame, just to list which columns are available
norm = pd.io.json.json_normalize(data[0][0]['response']['beers']['items'])
norm.columns

Index(['beer.beer_abv', 'beer.beer_description', 'beer.beer_ibu',
       'beer.beer_label', 'beer.beer_name', 'beer.beer_slug',
       'beer.beer_style', 'beer.bid', 'beer.created_at', 'beer.has_had',
       'beer.rating_count', 'beer.rating_score', 'brewery.brewery_active',
       'brewery.brewery_id', 'brewery.brewery_label', 'brewery.brewery_name',
       'brewery.brewery_page_url', 'brewery.brewery_slug',
       'brewery.brewery_type', 'brewery.contact.facebook',
       'brewery.contact.instagram', 'brewery.contact.twitter',
       'brewery.contact.url', 'brewery.country_name',
       'brewery.location.brewery_city', 'brewery.location.brewery_state',
       'brewery.location.lat', 'brewery.location.lng', 'count',
       'first_checkin_id', 'first_created_at', 'first_had', 'rating_score',
       'recent_checkin_id', 'recent_created_at', 'recent_created_at_timezone',
       'user_auth_rating_score'],
      dtype='object')

In [159]:
# There's no userID in there, so need to insert it in the pre-concat data dicts,
##  while the UID is still attached (as the dict key)
#####  TODO , maybe. Or not.  #######


In [11]:
# flatten every stored response of every user into one frame with a fresh 0..n-1 index
check = pd.concat(
    [
        pd.io.json.json_normalize(page['response']['beers']['items'])
        for pages in data
        for page in pages
    ],
    axis=0,
    ignore_index=True,
)

In [12]:
check.shape

(75866, 37)

### My main question is how many global ratings (a beer's avg. rating for all users) you get per hour.
#### This should be highest in the first hour, since there are only 9 users overlapping their beers thus far.
Will monitor how this rate changes as several hours go by.

In [169]:
check['beer.rating_score'].nunique(dropna=False) # 3600 1hr, 9260 3hrs, 17595 6hrs, 35047 18hrs

83336

Rate of unique ratings slows down more between 12-18 hrs, so probably only worth doing for about 2 days

In [170]:
check['beer.bid'].nunique(dropna=False) # 3735 1hr, 10045 3hrs, 20725 6hrs 49236 18 hrs

155148

The rate of new unique beers slows down less than the rate of new unique ratings, so new beers keep coming in but they bring fewer new ratings.

### Save

In [13]:
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# a check.head() output further down shows an 'Unnamed: 0' column, which looks like
# this file read back in. Consider index=False - confirm no reader relies on that column.
check.to_csv('capstone_1/userbeers2.csv')  # 68 MB for 80K rows

In [172]:
# also pickle curr_dic itself: the per-user lists of raw responses, still keyed by user
with open('capstone_1/UBdicts2.pkl', 'wb') as f:    # 103 MB for 80K rows
    pickle.dump(curr_dic, f)

In [173]:
check.shape

(378504, 37)

In [101]:
check.head()

Unnamed: 0,beer.beer_abv,beer.beer_description,beer.beer_ibu,beer.beer_label,beer.beer_name,beer.beer_slug,beer.beer_style,beer.bid,beer.created_at,beer.has_had,...,brewery.location.lng,count,first_checkin_id,first_created_at,first_had,rating_score,recent_checkin_id,recent_created_at,recent_created_at_timezone,user_auth_rating_score
0,4.8,Session ale with a subtle malt bill and fresh ...,25,https://untappd.akamaized.net/site/beer_logos/...,My Own Private Idaho,buoy-beer-company-my-own-private-idaho,IPA - Session / India Session Ale,3467088,"Sat, 05 Oct 2019 17:18:24 +0000",False,...,-123.835,1,828069072,"Sat, 09 Nov 2019 21:41:08 -0800","Sat, 09 Nov 2019 21:41:08 -0800",3.75,828069072,"Sat, 09 Nov 2019 21:41:08 -0800",-8,0
1,6.5,"West coast IPA with light malt character, brig...",65,https://untappd.akamaized.net/site/beer_logos/...,Handup IPA,deschutes-brewery-handup-ipa,IPA - American,3311276,"Tue, 02 Jul 2019 19:02:29 +0000",False,...,-121.322,1,827954397,"Sat, 09 Nov 2019 17:37:27 -0800","Sat, 09 Nov 2019 17:37:27 -0800",3.75,827954397,"Sat, 09 Nov 2019 17:37:27 -0800",-8,0
2,6.7,,55,https://untappd.akamaized.net/site/beer_logos/...,Mad Respect,backwoods-brewing-company-mad-respect,IPA - American,1445432,"Sat, 27 Feb 2016 21:41:53 +0000",False,...,-121.82,1,827883521,"Sat, 09 Nov 2019 16:15:55 -0800","Sat, 09 Nov 2019 16:15:55 -0800",4.0,827883521,"Sat, 09 Nov 2019 16:15:55 -0800",-8,0
3,7.2,Return to Space was inspired by two of our fav...,0,https://untappd.akamaized.net/site/beer_logos/...,Return to Space,great-notion-brewing-return-to-space,IPA - New England,2655469,"Fri, 18 May 2018 21:54:30 +0000",False,...,-122.643,1,827668186,"Sat, 09 Nov 2019 12:58:42 -0800","Sat, 09 Nov 2019 12:58:42 -0800",4.25,827668186,"Sat, 09 Nov 2019 12:58:42 -0800",-8,0
4,7.8,This west coast style IPA utilizes 5 pounds of...,0,https://untappd.akamaized.net/site/assets/imag...,Denim Vest-pionage West Coast IPA,baerlic-brewing-company-denim-vest-pionage-wes...,IPA - American,3257257,"Fri, 31 May 2019 21:14:02 +0000",False,...,-122.655,1,827585996,"Sat, 09 Nov 2019 11:51:17 -0800","Sat, 09 Nov 2019 11:51:17 -0800",4.0,827585996,"Sat, 09 Nov 2019 11:51:17 -0800",-8,0


In [174]:
sum(check['beer.beer_style'].apply(lambda x: x.startswith('IPA')))    # 34530 non-unique out of 79331 non-unique

161111

In [175]:
sum(check['beer.beer_style'].str.startswith('IPA'))  # 161K out of 368K overall are IPAs

161111

In [176]:
check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378504 entries, 0 to 378503
Data columns (total 37 columns):
beer.beer_abv                     378504 non-null float64
beer.beer_description             378504 non-null object
beer.beer_ibu                     378504 non-null int64
beer.beer_label                   378504 non-null object
beer.beer_name                    378504 non-null object
beer.beer_slug                    378504 non-null object
beer.beer_style                   378504 non-null object
beer.bid                          378504 non-null int64
beer.created_at                   378504 non-null object
beer.has_had                      378504 non-null bool
beer.rating_count                 378504 non-null int64
beer.rating_score                 378504 non-null float64
brewery.brewery_active            378504 non-null int64
brewery.brewery_id                378504 non-null int64
brewery.brewery_label             378504 non-null object
brewery.brewery_name              37850

In [177]:
int((check['beer.beer_description'] == '').sum())   # rows with no description: about 18% of the 378504 rows in the run shown here

67196

In [178]:
check.loc[222222,'beer.rating_score']

3.7856000000000001