In [1]:
import random
import tensorflow as tf
from collections import defaultdict

In [87]:
# Build dense integer indices for users and games, and collect every
# (userID, gameID, hours_transformed) triple found in train.json.
userIDs = {}
itemIDs = {}
interactions = []
# A context manager releases the file handle even if parsing a line fails.
with open("train.json") as f:
    # NOTE(review): the first line is read and discarded. Confirm it is a
    # header and not a data record, otherwise one interaction is dropped.
    f.readline()
    for l in f:
        # SECURITY: eval() executes whatever expression the line contains.
        # Only run this on trusted data; if the records are plain literals,
        # ast.literal_eval would be the safer parser.
        d = eval(l)
        u = d['userID']
        i = d['gameID']
        r = d['hours_transformed']
        # Assign the next free index the first time an ID is seen.
        if u not in userIDs: userIDs[u] = len(userIDs)
        if i not in itemIDs: itemIDs[i] = len(itemIDs)

        interactions.append((u,i,r))

In [88]:
# Shuffle with a dedicated fixed-seed generator so that, starting from a fresh
# kernel, the train/test split derived from this order is the same every run.
# (An unseeded shuffle yields a different split, and different MSEs, each time.)
# The global `random` state is left untouched.
random.Random(0).shuffle(interactions)
len(interactions)

174999

In [89]:
# 90/10 split by position: the leading 90% of `interactions` is used for
# training and the remaining tail is held out for testing.
nTrain = int(0.9 * len(interactions))
interactionsTrain, interactionsTest = interactions[:nTrain], interactions[nTrain:]
nTest = len(interactionsTest)

In [90]:
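# Optional filter, currently disabled: keep only the training interactions
# whose hours_transformed value (third tuple field) is below 11.
#interactionsTrain = [i for i in interactionsTrain if i[2]<11]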

In [91]:
# Display the number of training interactions as a quick sanity check.
len(interactionsTrain)

157499

In [77]:
# Adjacency lists over the training interactions:
#   itemsPerUser[userID] -> list of gameIDs that user interacted with
#   usersPerItem[gameID] -> list of userIDs that interacted with that game
itemsPerUser, usersPerItem = defaultdict(list), defaultdict(list)
for user, game, _hours in interactionsTrain:
    itemsPerUser[user].append(game)
    usersPerItem[game].append(user)

In [78]:
# Mean of the target value (third tuple field) over the training interactions.
mu = sum(r for _, _, r in interactionsTrain) / len(interactionsTrain)

In [79]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: iterable of predicted numeric values.
        labels: iterable of true numeric values, one per prediction.

    Returns:
        The average of (prediction - label)**2 over all pairs.

    Raises:
        ValueError: if the two inputs have different lengths. Previously the
            longer input was silently truncated by zip, which hides
            misaligned evaluation data.
        ZeroDivisionError: if both inputs are empty.
    """
    # Materialize so lengths can be compared even when iterators are passed.
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length (got %d and %d)"
            % (len(predictions), len(labels)))
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [80]:
class LatentFactorModelBiasOnly(tf.keras.Model):
    """Bias-only rating model: prediction = alpha + betaU[user] + betaI[item].

    The bias vectors are sized from the module-level `userIDs` and `itemIDs`
    dictionaries, so those must be populated before an instance is created.
    """

    def __init__(self, mu, lamb):
        """Create the trainable offset and bias vectors.

        Args:
            mu: initial value for the global offset `alpha`.
            lamb: regularization strength used by `reg`.
        """
        super().__init__()
        # Global offset starts at the supplied mean.
        self.alpha = tf.Variable(mu)
        # One bias per user and per item, drawn close to zero.
        nUsers, nItems = len(userIDs), len(itemIDs)
        self.betaU = tf.Variable(tf.random.normal([nUsers], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([nItems], stddev=0.001))
        self.lamb = lamb

    def predict(self, u, i):
        """Predict for one (user index, item index) pair.

        NOTE(review): this shadows `tf.keras.Model.predict` with a different
        signature; kept because callers in this notebook rely on it.
        """
        return self.alpha + self.betaU[u] + self.betaI[i]

    def reg(self):
        """Return lamb times the summed squares of all user and item biases."""
        userPenalty = tf.reduce_sum(self.betaU**2)
        itemPenalty = tf.reduce_sum(self.betaI**2)
        return self.lamb * (userPenalty + itemPenalty)

    def predictSample(self, sampleU, sampleI):
        """Predict for parallel sequences of user indices and item indices."""
        userIdx = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        itemIdx = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        userBias = tf.nn.embedding_lookup(self.betaU, userIdx)
        itemBias = tf.nn.embedding_lookup(self.betaI, itemIdx)
        return self.alpha + userBias + itemBias

    def call(self, sampleU, sampleI, sampleR):
        """Return the unregularized loss on a sample.

        The loss is `tf.nn.l2_loss` of the residuals divided by the sample
        size; since l2_loss is sum(t**2) / 2, this is half the mean squared
        error.
        """
        estimates = self.predictSample(sampleU, sampleI)
        targets = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(estimates - targets) / len(sampleR)

In [81]:
def trainingStepBiasOnly(model, interactionsTrain, Nsamples=100000):
    """Run one optimizer step on a random sample of training interactions.

    Args:
        model: object callable as model(sampleU, sampleI, sampleR) returning
            the data loss, and exposing reg() and trainable_variables.
        interactionsTrain: sequence of (userID, itemID, value) triples.
        Nsamples: number of interactions drawn, with replacement, for this
            step. Defaults to 100000, the previously hard-coded value.

    Returns:
        The regularized objective for this step, as returned by loss.numpy().

    Raises:
        ValueError: if Nsamples is not positive.

    Uses the module-level `optimizer`, `userIDs` and `itemIDs`.
    """
    if Nsamples <= 0:
        raise ValueError("Nsamples must be positive")

    # Sampling is plain Python with nothing to differentiate, so it is done
    # before the tape is opened.
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u,i,r = random.choice(interactionsTrain)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not contribute to the loss have a None gradient and
    # are skipped.
    optimizer.apply_gradients((grad, var) for
        (grad, var) in zip(gradients, model.trainable_variables)
        if grad is not None)
    return loss.numpy()

In [82]:
# Adam with learning rate 0.1; the model starts at the training mean `mu` and
# uses regularization strength 0.000025.
optimizer = tf.keras.optimizers.legacy.Adam(0.1)
modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.000025)

# The held-out set does not change during training, so its index lists and
# labels are built once instead of on every evaluation.
testU = [userIDs[u] for u,_,_ in interactionsTest]
testI = [itemIDs[g] for _,g,_ in interactionsTest]
labels = [r for _,_,r in interactionsTest]

for i in range(200):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    # Report the training objective every 10 iterations.
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))
    # Report held-out MSE at iterations 10, 30, 50, ...
    if (i % 20 == 9):
        # One batched lookup for the whole test set rather than one
        # TensorFlow op chain per (user, item) pair.
        biasOnlyPredictions = list(modelBiasOnly.predictSample(testU, testI).numpy())

        print(MSE(biasOnlyPredictions, labels))

iteration 10, objective = 1.8315191
3.649967212786028
iteration 20, objective = 1.6335082
iteration 30, objective = 1.5461633
3.1031634056994033
iteration 40, objective = 1.5339885
iteration 50, objective = 1.5177698
3.0522965390434167
iteration 60, objective = 1.517798
iteration 70, objective = 1.5172275
3.0494469699587246
iteration 80, objective = 1.5209827
iteration 90, objective = 1.5211657
3.050670965092708
iteration 100, objective = 1.5221378
iteration 110, objective = 1.5219678
3.047038857103788
iteration 120, objective = 1.5268795
iteration 130, objective = 1.5115079
3.0431120107061567
iteration 140, objective = 1.5128993
iteration 150, objective = 1.5036949
3.0454241461396294
iteration 160, objective = 1.5091177
iteration 170, objective = 1.5108358
3.044040568716573
iteration 180, objective = 1.5086373
iteration 190, objective = 1.5102401
3.0492721725185836
iteration 200, objective = 1.5199573


In [14]:
# Write one prediction per (userID, gameID) pair listed in pairs_Hours.csv.
# The input is opened first so that a missing pairs file does not truncate an
# existing predictions file; both handles are closed by the context managers
# even if an error occurs part-way through.
with open("pairs_Hours.csv") as pairs:
    with open("predictions_Hours.csv", 'w') as predictions:
        for l in pairs:
            # Copy the header row through unchanged.
            if l.startswith("userID"):
                predictions.write(l)
                continue
            # Skip blank lines (e.g. a trailing newline at end of file).
            if not l.strip():
                continue
            u,g = l.strip().split(',')

            if u in userIDs and g in itemIDs:
                pred = modelBiasOnly.predict(userIDs[u],itemIDs[g]).numpy()
            else:
                # User or game never seen while loading: fall back to the
                # training mean.
                pred = mu

            predictions.write(u + ',' + g + ',' + str(pred) + '\n')