In [80]:
import gzip
import os
import multiprocessing as mp
import pickle
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
import math
import numpy as np
import time
from collections import defaultdict
import scipy.optimize
from sklearn import svm
from sklearn import linear_model
from tensorflow.keras import Model
import tensorflow as tf
from tqdm import tqdm as progress_bar

In [6]:
data_name = 'Clothing_Shoes_and_Jewelry_5'
data_file = 'data/%s.json.gz' % data_name
batches_folder = 'data/%s' % data_name
num_threads = mp.cpu_count()
batch_size = 8192

In [7]:
all_files = os.listdir(batches_folder)
all_files = [ os.path.join(batches_folder, name) for name in all_files ]
pkl_files = [ name for name in all_files if '.pkl' in name ]
pkl_files.sort()
print(len(pkl_files))

1377


In [8]:
def count_entries(pkl_file):
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
    return len(items)

In [9]:
with mp.Pool(num_threads) as p:
    batch_lens = p.map(count_entries, pkl_files)
dataset_len = sum(batch_lens)
print(dataset_len)

11280384


In [19]:
with open(pkl_files[0], 'rb') as file_read:
    items = pickle.load(file_read)
    print(item['reviewerID'], item['asin'], item['overall'])

A2IC3NZN488KWK 0871167042 5.0


In [26]:
def collect_ratings_data(pkl_file):
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
        data = []
        for item in items:
            u,i,r = item['reviewerID'], item['asin'], item['overall']
            data.append((u,i,r))
        return data

In [27]:
start_time = time.time()
with mp.Pool(num_threads) as p:
    datasets = p.map(collect_ratings_data, pkl_files)
print("--- %s seconds ---" % (time.time() - start_time))
print(len(datasets))

--- 10.99126148223877 seconds ---
1377


In [31]:
dataset_all = []
for dataset in datasets:
    dataset_all.extend(dataset)
len(dataset_all)

11280384

In [36]:
test_size = math.floor(len(dataset_all) / 10)

In [41]:
dataset_train = dataset_all[:-test_size]
dataset_test = dataset_all[-test_size:]
assert len(dataset_train)+len(dataset_test) == len(dataset_all)

In [53]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
userIDs = {}
itemIDs = {}
for u,i,r in dataset_train:
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)

In [46]:
trainRatings = [r[2] for r in dataset_train]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)
print(globalAverage)

4.28446036019655


In [54]:
mu = globalAverage

In [59]:
# Gradient descent optimizer, experiment with learning rate
optimizer = tf.keras.optimizers.Adam(0.001)

In [55]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * tf.reduce_sum(self.betaU**2) +\
                           tf.reduce_sum(self.betaI**2) +\
                           tf.reduce_sum(self.gammaU**2) +\
                           tf.reduce_sum(self.gammaI**2)
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [56]:
# Experiment with number of factors and regularization rate
model = LatentFactorModel(mu, 5, 0.00001)

In [57]:
def trainingStep(interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [83]:
# 10 iterations of gradient descent
iterations = 100
for i in progress_bar(range(iterations), total=iterations):
    obj = trainingStep(dataset_train)
#     print("iteration " + str(i) + ", objective = " + str(obj))
print("objective = " + str(obj))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:35<00:00,  1.05it/s]

objective = 0.6267273





In [74]:
dataset_test[0]

('A732TT5ECI6KS', 'B00CXFL96Y', 4.0)

In [75]:
dataset_test[0][:2]

('A732TT5ECI6KS', 'B00CXFL96Y')

In [76]:
u_encoded = userIDs[dataset_test[0][0]]
i_encoded = itemIDs[dataset_test[0][1]]
model.predict(u_encoded, i_encoded).numpy()

4.313864

In [88]:
len(dataset_test)

1128038

In [94]:
u_test = []
i_test = []
for u,i,r in progress_bar(dataset_test[:10000], total=len(dataset_test[:10000])):
    try:
        u_test.append(userIDs[u])
        i_test.append(itemIDs[i])
    except KeyError:
        continue
u_test = np.array(u_test)
i_test = np.array(i_test)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 688222.63it/s]


In [101]:
r_pred = []
r_actual = []
for u,i,r in progress_bar(dataset_test[:10000], total=len(dataset_test[:10000])):
    try:
        u_encoded = userIDs[u]
        i_encoded = itemIDs[i]
    except KeyError:
        continue
    r_pred.append(model.predict(u_encoded, i_encoded).numpy())
    r_actual.append(r)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:40<00:00, 244.14it/s]


In [102]:
r_pred = np.array(r_pred) 
r_actual = np.array(r_actual)

In [103]:
mse = np.mean(np.square(r_pred-r_actual))
print(mse)

1.3988269350660398
