In [19]:
import gzip
import random
from collections import defaultdict
import csv
import urllib.request
import tarfile
import scipy.sparse
from implicit import bpr

In [20]:
# extract gz files

url = 'http://cseweb.ucsd.edu/classes/fa21/cse258-b/files/assignment1.tar.gz'

ftpstream = urllib.request.urlopen(url)
thetarfile = tarfile.open(fileobj=ftpstream, mode="r|gz")
thetarfile.extractall()

In [21]:
def read_gz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def read_csv(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    print(header)
    for l in c:
        yield l

In [22]:
data = list(read_csv("assignment1/trainInteractions.csv.gz"))
data[:2]

['user_id', 'recipe_id', 'date', 'rating']


[['88348277', '03969194', '2004-12-23', '5'],
 ['86699739', '27096427', '2002-01-12', '4']]

In [23]:
# shuffle
random.shuffle(data)

In [24]:
userIDs,itemIDs = {},{}
for d in data:
    u,i = d[0],d[1]
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)

indexToUserMap = {v:k for k,v in userIDs.items()}
indexToItemMap = {v:k for k,v in itemIDs.items()}

nUsers,nItems = len(userIDs),len(itemIDs)

In [25]:
data = [[d[0], d[1], 1] for d in data]

train_size = 400000
data_train = data[:train_size]
data_valid = data[train_size:]
print('data_train size = %d\tdata_valid size = %d' % (len(data_train), len(data_valid)))

data_train size = 400000	data_valid size = 100000


In [26]:
# compute user-recipes dict and all recipes set
user_recipes = defaultdict(set)
recipe_users = defaultdict(set)
all_recipes = set()
all_users = set()
for d in data_train:
    usr = d[0]
    r = d[1]
    all_recipes.add(r)
    all_users.add(usr)
    user_recipes[usr].add(r)
    recipe_users[r].add(usr)

In [27]:
# get a negative sample per each entry in the validation set
def random_sample(from_list, exclusions):
    s = random.choice(from_list)
    while s in exclusions:
        s = random.choice(from_list)
    return s

def supplement_with_negatives(data):
    all_recipes_list = list(all_recipes)
    negatives = []
    for d in data:
        usr = d[0]
        r_cooked = d[1]
        r_uncooked = random_sample(all_recipes_list, user_recipes[usr].union({r_cooked}))
        negatives.append([usr, r_uncooked, 0])
    return data + negatives

data_valid_sup = supplement_with_negatives(data_valid)

In [28]:
userItemsMap = defaultdict(list)
for d in data_valid_sup:
    userItemsMap[d[0]].append(d[1])

In [29]:
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for d in data_train:
    Xiu[itemIDs[d[1]],userIDs[d[0]]] = 1

Xui = scipy.sparse.csr_matrix(Xiu.T)

In [30]:
model = bpr.BayesianPersonalizedRanking(factors = 5, learning_rate=0.01, regularization=0.001, iterations=800)

In [31]:
model.fit(Xiu)

  0%|          | 0/800 [00:00<?, ?it/s]

In [32]:
predMap = defaultdict(int)
for u, items in userItemsMap.items():
    i_indices = [itemIDs[i] for i in items]
    ranked = model.rank_items(userIDs[u], Xui, i_indices)
    ranked_pos = ranked[:len(ranked)//2]
    ranked_neg = ranked[len(ranked)//2:]
    for p in ranked_pos:
        predMap[(u, indexToItemMap[p[0]])] = 1
    for p in ranked_neg:
        predMap[(u, indexToItemMap[p[0]])] = 0

In [33]:
# the baseline model
recipe_count = defaultdict(int)
total_cooked = 0

for d in data_train:
    r = d[1]
    recipe_count[r] += 1
    total_cooked += 1

most_popular = [(recipe_count[x], x) for x in recipe_count]
most_popular.sort()
most_popular.reverse()

def fit(most_popular, total_cooked, threshold=0.5):
    popular_set = set()
    count = 0
    for ic, i in most_popular:
      count += ic
      popular_set.add(i)
      if count > total_cooked * threshold: break
    return popular_set

return1 = fit(most_popular, total_cooked, threshold=0.69)

In [34]:
correct_count = 0
for d in data_valid_sup:
    if d[2] == 1:
        if predMap[(d[0], d[1])] == 1 and (d[1] in return1):
            correct_count += 1
    else:
        if predMap[(d[0], d[1])] == 0 or (d[1] not in return1):
            correct_count += 1
acc = correct_count / (1.0 * len(data_valid_sup))
acc

0.669335