In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import gzip
import os
import multiprocessing as mp
import pickle
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
import math
import numpy as np
import time
from collections import defaultdict
import scipy.optimize
from sklearn import svm
from sklearn import linear_model
from tensorflow.keras import Model
import tensorflow as tf
from tqdm import tqdm as progress_bar

2022-11-30 01:13:31.000751: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
data_folder = '../data'
data_name = 'Clothing_Shoes_and_Jewelry_5'
data_file = os.path.join(data_folder,'%s.json.gz' % data_name)
batches_folder = os.path.join(data_folder, '%s' % data_name)
num_threads = mp.cpu_count()
batch_size = 8192

In [4]:
all_files = os.listdir(batches_folder)
all_files = [ os.path.join(batches_folder, name) for name in all_files ]
pkl_files = [ name for name in all_files if '.pkl' in name ]
pkl_files.sort()
print(len(pkl_files))

1377


In [5]:
def count_entries(pkl_file):
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
    return len(items)

In [6]:
with mp.Pool(num_threads) as p:
    batch_lens = p.map(count_entries, pkl_files)
dataset_len = sum(batch_lens)
print(dataset_len)

11280384


In [7]:
with open(pkl_files[0], 'rb') as file_read:
    items = pickle.load(file_read)
    item = items[0]
    print(item['reviewerID'], item['asin'], item['overall'])

A2IC3NZN488KWK 0871167042 5.0


In [8]:
def collect_ratings_data(pkl_file):
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
        data = []
        for item in items:
            u,i,r = item['reviewerID'], item['asin'], item['overall']
            data.append((u,i,r))
        return data

In [9]:
start_time = time.time()
with mp.Pool(num_threads) as p:
    datasets = p.map(collect_ratings_data, pkl_files)
print("--- %s seconds ---" % (time.time() - start_time))
print(len(datasets))

--- 5.484969615936279 seconds ---
1377


In [91]:
dataset_all = []
for dataset in datasets:
    dataset_all.extend(dataset)
len(dataset_all)

11280384

In [92]:
test_size = math.floor(len(dataset_all) / 100)

In [93]:
dataset_train = dataset_all[:-test_size]
dataset_test = dataset_all[-test_size:]
assert len(dataset_train)+len(dataset_test) == len(dataset_all)

In [94]:
trainRatings = [r[2] for r in dataset_train]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)
print(globalAverage)

4.2783192707534425


In [95]:
len(dataset_train), len(dataset_test)

(11167581, 112803)

In [96]:
dataset_train[0]

('A2IC3NZN488KWK', '0871167042', 5.0)

In [97]:
for idx, item in enumerate(dataset_train):
    dataset_train[idx] = item[0], item[1], item[2] - globalAverage

In [98]:
dataset_train[0]

('A2IC3NZN488KWK', '0871167042', 0.7216807292465575)

In [99]:
for idx, item in enumerate(dataset_test):
    dataset_test[idx] = item[0], item[1], item[2] - globalAverage

In [100]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
userIDs = {}
itemIDs = {}
for u,i,r in dataset_train:
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
with open('cache.pkl', 'wb') as cache_file:
    pickle.dump((ratingsPerUser, ratingsPerItem, userIDs, itemIDs), cache_file)

In [19]:
with open('cache.pkl', 'rb') as cache_file:
    ratingsPerUser, ratingsPerItem, userIDs, itemIDs = pickle.load(cache_file)

In [101]:
mu = globalAverage

In [103]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * tf.reduce_sum(self.betaU**2) +\
                           tf.reduce_sum(self.betaI**2) +\
                           tf.reduce_sum(self.gammaU**2) +\
                           tf.reduce_sum(self.gammaI**2)
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [105]:
def trainingStep(interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [121]:
u_test = []
i_test = []
r_actual = []
for u,i,r in progress_bar(dataset_test, total=len(dataset_test)):
    if u not in userIDs or i not in itemIDs:
            continue
    else:
        u_test.append(userIDs[u])
        i_test.append(itemIDs[i])
        r_actual.append(r)
u_test = np.array(u_test)
i_test = np.array(i_test)
r_actual = np.array(r_actual)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 112803/112803 [00:00<00:00, 748855.78it/s]


In [146]:
# Gradient descent optimizer, experiment with learning rate
optimizer = tf.keras.optimizers.Adam(0.1)

In [147]:
# Experiment with number of factors and regularization rate
model = LatentFactorModel(mu, 5, 0.0001)

In [None]:
# 10 iterations of gradient descent
iterations = 80
pbar = progress_bar(range(iterations), total=iterations)
for i in pbar:
    obj = trainingStep(dataset_train)
    
    r_pred = model.predictSample(u_test, i_test).numpy()
    mse = np.mean(np.square(r_pred-r_actual))
    
    pbar.set_postfix({'obj': obj, 'test:': mse})
    
#     print("iteration " + str(i) + ", objective = " + str(obj))
print("objective = " + str(obj))

 91%|████████████████████████████████████████████████████████████████████████████▍       | 91/100 [01:16<00:06,  1.42it/s, obj=5.75, test:=1.53]

In [143]:
r_pred = model.predictSample(u_test, i_test).numpy()
mse = np.mean(np.square(r_pred-r_actual))
print(mse)

3.3660921940560753


In [144]:
baseline = np.mean(np.square( r_actual - globalAverage ))
print(baseline)

20.92197617152311
