In [161]:
import pandas as pd
import numpy as np
import torch
import time
from datetime import date
import sys
import pickle

import data_preparation
import matrices_functions
import cost_and_prediction_functions

In [162]:
def create_sub_matrices(userIds, movieIds):
    sUFeats = matrices_functions.create_sub_matrix(UFeats, userIds)
    sUBias  = matrices_functions.create_sub_matrix(UBias, userIds)
    sIFeats = matrices_functions.create_sub_matrix(IFeats, movieIds)
    sOnes   = matrices_functions.create_sub_matrix(Ones, movieIds)

    ufeats = torch.cat((sUFeats, sUBias), 1)
    ifeats = torch.cat((sIFeats, sOnes), 1)
    
    return (ufeats, ifeats)

In [163]:
def compute_predictions_and_costs(t, ufeats, ifeats, lamb): 
    
    pred = cost_and_prediction_functions.compute_prediction(ufeats, ifeats)
    
    mse = cost_and_prediction_functions.compute_mse(pred, t)
    
    regl2_u = cost_and_prediction_functions.compute_regl2(ufeats)
    
    regl2_i = cost_and_prediction_functions.compute_regl2(ifeats)
    
    loss = mse + (lamb/2) * regl2_u + (lamb/2) * regl2_i
    
    return loss

In [164]:
def create_expanded_target_and_mask_vectors(numberOfItems, ratings): 
    
    M = torch.autograd.Variable(torch.zeros(numberOfItems+1, 1), requires_grad=False).t()
    T = torch.autograd.Variable(torch.zeros(numberOfItems+1, 1), requires_grad=False).t()
    
    for r in ratings.iterrows(): 
        r = r[1]
        T.data[0, r.movieId] = r.rating
        M.data[0, r.movieId] = 1
        
    return (T.t(), M.t())

In [165]:
def fill_target_and_mask_vectors(ratings, T, M): 
    T.data[0, :] = M.data[0, :] = 0
    for r in ratings.iterrows(): 
        r = r[1]
        T.data[0, r.movieId] = r.rating
        M.data[0, r.movieId] = 1

In [166]:
def calculate_user_cost(lamb, userId, dataset): 
    
    userRatings = dataset[dataset.userId == userId]
    
    movieIds = userRatings.movieId.values
    
    (ufeats, ifeats) = create_sub_matrices([userId], movieIds)
    
    pred = torch.mm(ufeats, ifeats.t())

    R = userRatings.rating.values

    t = torch.autograd.Variable(torch.FloatTensor([R]), requires_grad=False)

    mse = cost_and_prediction_functions.compute_mse(pred, t)

    regl2_u = cost_and_prediction_functions.compute_regl2(ufeats)

    regl2_i = cost_and_prediction_functions.compute_regl2(ifeats)

    cost = mse + (lamb/2) * regl2_u + (lamb/2) * regl2_i
    
    return cost

In [167]:
def calculate_batch_cost(lamb, batch, userIds): 
    batchCost = 0.0
    
    for uId in userIds:
        
        userRatings = batch[batch.userId == uId]

        # Extract the list of movies ids for the user, in the current batch.
        movieIds = userRatings.movieId.values

        (ufeats, ifeats) = create_sub_matrices([uId], movieIds)

        # Extract the list of ratings for the user, in the current batch.
        R = userRatings.rating.values

        t = torch.autograd.Variable(torch.FloatTensor([R]), requires_grad=False)

        userCost = compute_predictions_and_costs(t, ufeats, ifeats, lamb)

        batchCost += userCost
        
    return batchCost

### Loading the ratings dataset.

In [6]:
ratings = pd.read_csv('ratings_sorted_by_timestamp.csv')

In [7]:
ratings.head(5)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,18713577,198518,1176,4,789652004
1,13721561,145654,21,3,789652009
2,13721567,145654,47,5,789652009
3,13721610,145654,1079,3,789652009
4,6406144,66985,31,5,822873600


In [8]:
ratings.sample()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
13995971,2838486,30177,1073,3,1177633473


### Counting how many ratings, users and items do we have in the ratings data frame.

In [9]:
numberOfItems = np.max(ratings.movieId.unique())

In [10]:
numberOfUsers = np.max(ratings.userId.unique())

In [11]:
numberOfRatings = ratings.shape[0]

In [12]:
(numberOfRatings, numberOfItems, numberOfUsers)

(24404096, 165201, 259137)

### Build training and test datasets.

TODO: create a validation dataset!

In [13]:
userRatings = ratings
numberOfUserRatings = userRatings.shape[0]

In [14]:
trainVolume = 0.7
testVolume  = 0.3

In [15]:
trainDataset = userRatings.head(int(trainVolume * numberOfUserRatings))

In [16]:
testDataset  = userRatings.tail(int(testVolume * numberOfUserRatings))

In [17]:
(trainDataset.shape[0], testDataset.shape[0], trainDataset.shape[0] + testDataset.shape[0])

(17082867, 7321228, 24404095)

In [18]:
maxBatchSize = 2000

In [19]:
numberOfBatches = int(np.ceil(float(trainDataset.shape[0]) / float(maxBatchSize)))
numberOfBatches

8542

In [20]:
trainDataset[0:2000].shape

(2000, 5)

In [21]:
trainDataset[2001:4001].shape

(2000, 5)

### Create user and item "embeddings" with an arbitrary number of dimensions.

In [22]:
numberOfLatentFeatures = 10

In [23]:
# The first line won't be used
UFeats, UBias = matrices_functions.create_user_features(numberOfLatentFeatures, numberOfUsers+1)

In [24]:
# The first line won't be used
IFeats = matrices_functions.create_item_features(numberOfLatentFeatures, numberOfItems+1)

In [25]:
Ones = torch.autograd.Variable(torch.ones(numberOfItems+1, 1), requires_grad=False)

### Training (v2.0)

In [174]:
# Create an error accumulator variable.
# For each training batch:
#   Extract the list of unique user ids for the batch.
#   For each (unique) userId in the batch:
#     Extract the list of ratings for the user, in the current batch.
#     Extract the list of movies ids for the user, in the current batch.
#     Create submatrices for the user and movies.
#     Create target vector.
#     Calculate predictions
#     Calculate error
#     Accumulate error
#   Backpropagate
#   Adjust parameters

def training_v2(epochs=1, lamb=0.4): 
    #optimizer1 = torch.optim.Adagrad([UFeats, UBias, IFeats], lr=0.1)
    optimizer1 = torch.optim.Adam([UFeats, UBias, IFeats])
    optimizer1.zero_grad()
    
    # Training procedure Version 2.0

    for e in range(epochs): 

        optimizer1.zero_grad()

        # Create an error accumulator variable.
        batchCost = 0.0

        # For each training batch:
        for i in range(1, numberOfBatches): 

            print('----------------------------------------------')

            print("Batch %d/%d" % (i,numberOfBatches))

            # Extract training batch
            batchStart = (i-1) * maxBatchSize
            batchEnd   = i * maxBatchSize
            batch = trainDataset[batchStart:batchEnd]

            # Extract the list of unique user ids for the batch.
            userIds = batch.userId.unique()

            # Print some stats
            nUIds = userIds.shape[0]
            nIIds = batch.movieId.unique().shape[0]
            nRats = batch.shape[0]
            sparsity = float(nRats) / float((nUIds * nIIds))

            print('  number of users: %d' % (nUIds))
            print('  number of items: %d' % (nIIds))
            print('  number of ratings: %d' % (nRats))
            print('  non-zero elements percentage: %.3f' % (sparsity))

            t0 = time.time()

            batchCost = calculate_batch_cost(lamb, batch, userIds)

            t1 = time.time()

            print('  seconds spent on forward: %.5f' % (t1-t0))

            t0 = time.time()

            batchCost.backward(retain_variables=False)

            t1 = time.time()

            print('  seconds spent on backward: %.5f' % (t1-t0))

            t0 = time.time()

            optimizer1.step()

            optimizer1.zero_grad()

            t1 = time.time()

            print('  seconds spent on optimizer step: %.5f' % (t1-t0))

            print('----------------------------------------------')

In [None]:
training_v2()

----------------------------------------------
Batch 1/8542
  number of users: 79
  number of items: 388
  number of ratings: 2000
  non-zero elements percentage: 0.065
  seconds spent on forward: 0.18940
  seconds spent on backward: 7.99823
  seconds spent on optimizer step: 0.06890
----------------------------------------------
----------------------------------------------
Batch 2/8542
  number of users: 86
  number of items: 261
  number of ratings: 2000
  non-zero elements percentage: 0.089
  seconds spent on forward: 0.21897
  seconds spent on backward: 6.88235
  seconds spent on optimizer step: 0.06568
----------------------------------------------
----------------------------------------------
Batch 3/8542
  number of users: 68
  number of items: 300
  number of ratings: 2000
  non-zero elements percentage: 0.098
  seconds spent on forward: 0.25429
  seconds spent on backward: 6.79909
  seconds spent on optimizer step: 0.06324
----------------------------------------------
----