In [1]:
import pandas as pd

# read just the columns the analysis uses, then keep a single row per
# (beer, user) pair so each user contributes one rating per beer
# (drop_duplicates keeps the first occurrence in file order)
checkins = pd.read_csv(
    'comboframe.csv',
    usecols=['beer_id', 'rating_user', 'rating_global', 'user_id', 'abv',
             'brewery_name', 'beer_style', 'beer_name', 'checkin_id'],
).drop_duplicates(subset=['beer_id', 'user_id'])
print(checkins.shape)

(1894852, 9)


In [2]:
# keep only users with at least 4 check-ins
checkins = checkins.loc[
    lambda frame: frame['user_id'].map(frame.groupby('user_id').size() > 3)
]
checkins.shape

(1529580, 9)

In [3]:
# keep only beers with at least 4 ratings among the remaining rows
checkins = checkins.loc[
    lambda frame: frame['beer_id'].map(frame.groupby('beer_id').size() > 3)
]
checkins.shape

(1257869, 9)

In [4]:
# drop rows that have no global rating at all ...
checkins = checkins.dropna(subset=['rating_global'], axis=0)
# ... and keep only strictly positive global ratings
checkins = checkins[checkins['rating_global'] > 0]
checkins.shape

(1193682, 9)

In [5]:
# alcohol content is the sanity check for "is this really beer":
# keep rows with 0 < abv < 20 (both bounds exclusive)
checkins = checkins[checkins['abv'].gt(0) & checkins['abv'].lt(20)]
checkins.shape

(1184752, 9)

In [6]:
# peek at the first two surviving rows
checkins.iloc[:2]

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,brewery_name,beer_name,beer_style,rating_global,abv
0,821797539,2095023,3340203,3.75,Stone Brewing,Stone Scorpion Bowl IPA,IPA - American,3.73789,7.5
1,818949121,1709568,3340203,3.5,Ritual Brewing Company,Pale Ale,Pale Ale - American,3.43165,5.2


In [6]:
# number of ratings per user (Series indexed by user_id); looked up again
# when deciding which users are held out for testing
usercounts = checkins.groupby(by='user_id').size()

In [7]:
def split_last_X(frame, countDict, X):
    '''
    Split the input frame into training and testing,
    using the last X rows of each user as testers.

    frame     -- DataFrame indexed by user; each user's rows must be
                 contiguous and already in the order that defines "last"
    countDict -- mapping (dict or Series) from user to that user's
                 number of rows in frame
    X         -- number of trailing rows per user to hold out

    A user with fewer than X rows sends all of their rows to the test
    split (previously this produced a mask longer than the frame).
    An empty frame yields two empty splits.
    Returns the train split and test split.
    '''
    mask_parts = []
    for u in frame.index.unique():
        n_rows = int(countDict[u])
        n_test = min(X, n_rows)  # never hold out more rows than the user has
        mask_parts.append(np.ones(n_rows - n_test, dtype=bool))
        mask_parts.append(np.zeros(n_test, dtype=bool))

    # explicit bool dtype: with no users, a plain np.array([]) would be a float
    # array, which `~` rejects and which does not act as a row mask
    if mask_parts:
        is_train = np.concatenate(mask_parts)
    else:
        is_train = np.zeros(0, dtype=bool)

    return frame[is_train], frame[~is_train]

In [8]:
# make a func to deal with ties in rankings
def untied_rank(arr, vals):
    '''
    Score how well the picks in vals (list or np.array) capture the
    highest values of arr (np.array). Every pick must occur in arr.
    1.0 is the best possible score, 0.0 the worst, and 0.5 is returned
    when all of arr is equal (the picks cannot be told apart).

    Picks are scored in the order given: each pick is charged one fail for
    every not-yet-removed value that beats it, then one copy of the pick is
    removed. The total is normalised by the fails that picking the
    len(vals) smallest values, smallest first, would collect.
    '''
    ascending = np.sort(arr)
    if max(ascending) == min(ascending):
        return 0.5  # like guessing, if all equal

    remaining = arr
    fails = 0
    poss_fails = 0
    for i, pick in enumerate(vals):
        # values still available that would have been a better choice
        fails += (remaining > pick).sum()
        # remove a single occurrence so tied values are consumed one at a time
        first_match = np.flatnonzero(remaining == pick)[0]
        remaining = np.delete(remaining, first_match)
        # worst case for this slot: having picked the i-th smallest value
        poss_fails += (ascending > ascending[i]).sum()

    return 1 - fails / poss_fails

In [9]:
import numpy as np

In [10]:
# order every user's check-ins by checkin_id so that "the last rows" of a user
# stand in for their most recent ratings, then index the frame by user
checkins = checkins.sort_values(by=['user_id', 'checkin_id']).set_index('user_id')
# users with at least 50 ratings give up their last 10 rows for testing;
# everyone else goes to training in full
bigs = checkins.index.map(usercounts) > 49
tr, test = split_last_X(checkins[bigs], usercounts, 10)
smalls = checkins[~bigs]
train = pd.concat([smalls, tr])
print(f'{train.shape[0]} training rows, {test.shape[0]} testing rows')

1112372 training rows, 72380 testing rows


In [12]:
# need the space: rebinding the name to None lets the full frame be reclaimed
# (assuming nothing else still references it); train/test carry on from here
checkins = None

Get the baseline score of recommending top-rated beers, regardless of user.

In [13]:
# baseline (top picks) results/target: treat each test user's held-out rows as
# a "menu" and score the 3 beers with the highest global rating against the
# ratings that user actually gave
top_rank_scores = []
for _, menu in test.groupby(level=0, sort=False):
    user_ratings = menu['rating_user'].values
    # positions of the three highest global ratings, highest first
    top3_global = np.argsort(menu['rating_global'].values)[:-4:-1]
    top_rank_scores.append(untied_rank(user_ratings, user_ratings[top3_global]))
print(f'The average score picking the top 3 globally rated for {len(top_rank_scores)} "menus":  {np.mean(top_rank_scores)}')

The average score picking the top 3 globally rated for 7238 "menus":  0.7752973184428179


Now see where collaborative filtering can get us.

In [11]:
from tqdm import tqdm

In [12]:
from collections import defaultdict

In [13]:
# udiff: how far each rating sits from the beer's global rating
train['udiff'] = train['rating_user'] - train['rating_global']
# udev: udiff with the user's own mean udiff removed (train is indexed by user here);
# .values keeps the subtraction positional on the non-unique user index
train['udev'] = train['udiff'] - train.groupby(level=0)['udiff'].transform('mean').values

In [14]:
# user_id goes back to being an ordinary column
train = train.reset_index()
# two views of the same udev values:
#   udict[user][beer] -> udev      bdict[beer][user] -> udev
udict = {uid: {} for uid in train['user_id'].unique()}
bdict = {bid: {} for bid in train['beer_id'].unique()}
for user, beer, dev in zip(train['user_id'], train['beer_id'], train['udev']):
    udict[user][beer] = dev
    bdict[beer][user] = dev

In [None]:
# in order to pickle the default dict, use this instead of a lambda:
# pickle stores the factory by name, which a lambda does not have
def dd():
    '''Return a fresh defaultdict(float): one u-v statistics record whose fields all start at 0.0.'''
    return defaultdict(float)

# calculate user/user correlation/similarity
# shared[u][v] is a record of running statistics over the beers both u and v rated:
#   'count'               number of beers in common
#   'u_bar', 'v_bar'      running means of u's and v's udev over those beers
#   'numer'               running sum of (u deviation) * (v deviation)
#   'denom_1', 'denom_2'  running sums of squared u and v deviations
# Every record field starts at 0.0 because records are defaultdict(float).
shared = {u: defaultdict(dd) for u in udict}
# this may take a minute or maybe an hour, depending on memory, with the nested loops, but seems messy otherwise
# Each pair is visited from both sides, so shared[u][v] and shared[v][u] are both built.
for u in tqdm(udict):
    # loop thru all checkins by u
    for b in udict[u]:
        # update the similarity factors for u-v for every checkin, as would happen with new checkins
        # (bdict[b] includes u itself; that self-pair is deleted after the loop)
        for v in bdict[b]:
            suv = shared[u][v]
            suv['count'] += 1  # increment the common u-v ratings
            # The deviations below are taken against the means as they stood BEFORE this
            # beer is folded in (the stats.stackexchange recipe updates the means first);
            # for the first shared beer the means are still 0.0, so the deviation is the
            # raw udev value.
            # NOTE(review): the usual online (Welford-style) co-moment update pairs one
            # pre-update deviation with one post-update deviation; with both taken
            # pre-update, numer / sqrt(denom_1 * denom_2) is not exactly the batch Pearson
            # coefficient -- confirm this approximation is intended.
            u_dev = udict[u][b] - suv['u_bar']
            v_dev = udict[v][b] - suv['v_bar']
            suv['numer'] += u_dev * v_dev
            suv['denom_1'] += u_dev ** 2
            suv['denom_2'] += v_dev ** 2
            # only now fold this beer into the running means (count already includes it)
            suv['u_bar'] = ((suv['count']-1) * suv['u_bar'] + udict[u][b]) / suv['count']
            suv['v_bar'] = ((suv['count']-1) * suv['v_bar'] + udict[v][b]) / suv['count']
            
    # remove self-edges
    del shared[u][u]

100%|██████████| 43641/43641 [1:21:28<00:00,  8.93it/s]    


In [None]:
import pickle
# persist the similarity table: the memory needed to rebuild it is not always available.
# The nested defaultdicts reference dd by name, so dd must be defined wherever this is loaded.
with open('suvdict.pkl', mode='wb') as pickle_file:
    pickle.dump(shared, pickle_file)

In [None]:
# plain dict of user_id -> that user's mean udiff over the training rows,
# kept around for quick lookups when predicting
userbiasdict = dict(train.groupby(by='user_id')['udiff'].mean())

In [None]:

preds = []
actual = []
lam_4 = 5  # shrinkage: a pair with n shared ratings has its similarity scaled by n / (n + lam_4)
k = 2      # number of neighbours (largest |similarity|) used per prediction
eps = 1    # added to the total weight so the neighbour tweak is damped

# NOTE(review): `testers` and `beer_mu` are not defined in the cells visible here.
# Presumably `testers` is the test split with user_id as a column and `beer_mu`
# maps beer_id to a per-beer baseline rating -- confirm they exist on a fresh kernel.
for u, x, rating in zip(testers.user_id[:1000], testers.beer_id[:1000], testers.rating_user[:1000]):
    try:
        baseline = beer_mu[x] + userbiasdict[u]
        # only looking for users v who rated the beer x in question
        uv = [v for v in bdict[x] if v in shared[u]]
    except KeyError:
        # the user or the beer is missing from the training lookups: nothing to predict from
        continue
    if not uv:
        continue

    # first get u-v pearsons
    ## https://stats.stackexchange.com/questions/410468/online-update-of-pearson-coefficient
    # then shrink in relation to number of common u-v ratings
    sims = []
    for v in uv:
        s = shared[u][v]
        denom = np.sqrt(s['denom_1'] * s['denom_2'])
        # Dividing by a numpy zero does not raise ZeroDivisionError; it yields nan/inf
        # with a warning, which would then flow into the RMSE. A pair with no spread in
        # its shared ratings has no defined correlation, so it gets similarity 0.0.
        pearson = s['numer'] / denom if denom > 0 else 0.0
        sims.append(pearson * (s['count'] / (s['count'] + lam_4)))

    # take the k neighbours with the strongest (positive or negative) similarity,
    # or all of them for the many short lists
    most_sim = sorted(zip(uv, sims), key=lambda pair: abs(pair[1]), reverse=True)[:k]
    # similarity-weighted average of the neighbours' udev for beer x
    tweak = (sum(udict[v][x] * sim for v, sim in most_sim)
             / (sum(abs(sim) for _, sim in most_sim) + eps))
    preds.append(baseline + tweak)
    actual.append(rating)

diffs = np.array(preds) - np.array(actual)
rmse = np.sqrt(np.dot(diffs, diffs) / len(diffs))
rmse