## Turn pickled Untappd user feed data into pandas DataFrames

In [1]:
!ls capstone_1/userFeeds/

userFeedDicts.pkl         userFeedDicts_part_2.pkl  userFeedDicts_part_6.pkl
userFeedDicts_part_10.pkl userFeedDicts_part_3.pkl  userFeedDicts_part_7.pkl
userFeedDicts_part_11.pkl userFeedDicts_part_4.pkl  userFeedDicts_part_8.pkl
userFeedDicts_part_12.pkl userFeedDicts_part_5.pkl  userFeedDicts_part_9.pkl


In [2]:
import pandas as pd
import pickle

In [33]:
# Selects which userFeedDicts_part_{part}.pkl file the following cells process.
# This is run manually: bump the value and re-run the cells below for each part.
part = 5

In [34]:
# Load the pickled feed dict for the currently selected part.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
feed_path = f'capstone_1/userFeeds/userFeedDicts_part_{part}.pkl'
with open(feed_path, 'rb') as pkl_file:
    feed = pickle.load(pkl_file)

print(type(feed), len(feed))

<class 'dict'> 87167


Most of those 87K dict entries have an empty `datalist`, so filter down to the users that actually have data.

In [35]:
# Keep only the users whose 'datalist' is non-empty (an empty datalist is falsy).
feed = {user: entry for user, entry in feed.items() if entry['datalist']}
len(feed)

694

In [36]:
# Drill down to the level where the check-in records live:
# one entry per element of every user's 'datalist'.
feed = [
    page['response']['checkins']['items']
    for user_feed in feed.values()
    for page in user_feed['datalist']
]

In [37]:
# There are a lot of data here, so let's use a generator to control the flow:
# each item of `feed` is normalized into its own DataFrame only when requested.
# `pd.io.json.json_normalize` was deprecated in pandas 1.0 in favour of the
# top-level `pd.json_normalize`; prefer the new name and fall back on older pandas.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
checkins = (json_normalize(checkin) for checkin in feed)

In [54]:
# inspect structure
#checkin_df = next(checkins)
#checkin_df.shape  # So 50 checkins per batch,

In [7]:
# Going to keep about a third of those 76 columns and reorder them for easier viewing.
# This has to be a real assignment (it was previously quoted out as a string):
# the batching cells select these columns from every batch via `batch[keepcols]`,
# so a fresh kernel would otherwise fail with NameError.
keepcols = [
    'checkin_id', 'beer.bid', 'user.uid', 'rating_score', 'beer.beer_abv',
    'brewery.brewery_name', 'beer.beer_name', 'beer.beer_style',
    'brewery.brewery_id', 'brewery.brewery_type', 'brewery.country_name',
    'brewery.location.brewery_city', 'brewery.location.brewery_state',
    'brewery.location.lat', 'brewery.location.lng', 'user.user_name',
    'venue.categories.items', 'venue.location.lat', 'venue.location.lng',
    'venue.location.venue_city', 'venue.location.venue_country',
    'venue.location.venue_state', 'venue.primary_category',
    'venue.venue_id', 'checkin_comment', 'created_at',
]

In [38]:
# Concatenate the per-item frames from `checkins` in groups of `batch_size`,
# trimming each group to `keepcols` right away to save memory.
built = 0  # number of full batches built so far (progress counter)
batch_concat = []  # trimmed, concatenated batches
batch_size = 40
batch = []  # frames collected for the batch currently being filled
for frame in checkins:
    batch.append(frame)
    if len(batch) < batch_size:
        continue
    # Full batch: concat vertically, then keep only the columns of interest
    # (save some size by trimming 2/3 of the frame).
    batch_concat.append(pd.concat(batch, axis=0, ignore_index=True)[keepcols])
    batch = []
    built += 1
    if built % 10 == 0:
        print(f'{built * batch_size} segments built')

# Flush the leftover partial batch. The guard matters: when the generator ends
# exactly on a batch boundary nothing is left, and pd.concat([]) raises ValueError.
if batch:
    print(f'appending {len(batch)} final batches')
    batch_concat.append(pd.concat(batch, axis=0, ignore_index=True)[keepcols])

400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 12 final batches


In [39]:
len(batch_concat)

69

In [40]:
# Stack all trimmed batches vertically into one frame for this part;
# ignore_index=True discards the per-batch indexes and renumbers rows from 0.
df = pd.concat(batch_concat, axis=0, ignore_index=True)

In [41]:
df.shape

(136019, 26)

In [42]:
# Persist this part's frame as CSV (note: the file name carries no .csv extension).
# to_csv keeps its default index=True, so the row index is written as the first column.
df.to_csv(f'capstone_1/checkins/df_{part}')

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177458 entries, 0 to 177457
Data columns (total 26 columns):
checkin_id                        177458 non-null int64
beer.bid                          177458 non-null int64
user.uid                          177458 non-null int64
rating_score                      177458 non-null float64
beer.beer_abv                     177458 non-null float64
brewery.brewery_name              177458 non-null object
beer.beer_name                    177458 non-null object
beer.beer_style                   177458 non-null object
brewery.brewery_id                177458 non-null int64
brewery.brewery_type              177458 non-null object
brewery.country_name              177458 non-null object
brewery.location.brewery_city     177458 non-null object
brewery.location.brewery_state    177458 non-null object
brewery.location.lat              177458 non-null float64
brewery.location.lng              177458 non-null float64
user.user_name                    

## Patch the routine above together into a single loop to automate the remaining parts. This will take a while to run.

In [43]:
def _concat_trimmed_batches(frames, columns, batch_size=40):
    """Concatenate `frames` in groups of `batch_size`, keeping only `columns`.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame
        Consumed lazily, one frame at a time.
    columns : list of str
        Columns selected from every concatenated batch.
    batch_size : int, default 40
        Number of frames concatenated per batch.

    Returns
    -------
    list of pandas.DataFrame
        One trimmed frame per batch; empty if `frames` yields nothing.
    """
    trimmed = []
    pending = []
    built = 0  # counter to monitor progress
    for frame in frames:
        pending.append(frame)
        if len(pending) < batch_size:
            continue
        # save some size by trimming 2/3 of the frame
        trimmed.append(pd.concat(pending, axis=0, ignore_index=True)[columns])
        pending = []
        built += 1
        if built % 10 == 0:
            print(f'{built * batch_size} segments built')
    # Only flush a non-empty remainder: pd.concat([]) raises ValueError, which
    # happens whenever the frame count is an exact multiple of batch_size.
    if pending:
        print(f'appending {len(pending)} final batches')
        trimmed.append(pd.concat(pending, axis=0, ignore_index=True)[columns])
    return trimmed


# `pd.io.json.json_normalize` was deprecated in pandas 1.0 in favour of the
# top-level `pd.json_normalize`; prefer the new name and fall back on older pandas.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

batch_size = 40
for part in range(6,13):  # already did parts 1-5 of 12
    # pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(f'capstone_1/userFeeds/userFeedDicts_part_{part}.pkl', 'rb') as f:
        feed = pickle.load(f)
    # empty datalists will eval to False
    feed = {user: entry for user, entry in feed.items() if entry['datalist']}
    feed = [data['response']['checkins']['items'] for val in feed.values() for data in val['datalist']]
    # generator, so only one batch of normalized frames is held at a time
    checkins = (_json_normalize(checkin) for checkin in feed)
    batch_concat = _concat_trimmed_batches(checkins, keepcols, batch_size=batch_size)
    if not batch_concat:
        # Nothing to write for this part; pd.concat on an empty list would raise.
        print(f'part {part}: no checkins found, skipping')
        continue
    df = pd.concat(batch_concat, axis=0, ignore_index=True)
    df.to_csv(f'capstone_1/checkins/df_{part}')

400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 38 final batches
400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 38 final batches
400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 27 final batches
400 segments built
800 segments built
appending 4 final batches
400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 27 final batches
400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 2 final batches
400 segments built
800 segments built
1200 segments built
1600 segments built
2000 segments built
2400 segments built
appending 25 final batches
