In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse.linalg import svds, eigs
import utils,models,time,sys
import os
import itertools

import eval_utils

In [2]:
class MatrixFactorization(torch.nn.Module):

    def __init__(self,
                 n_users,
                 n_items,
                 n_factors=40,
                 dropout_p=0,
                 sparse=False):
        """
        Parameters
        ----------
        n_users : int
            Number of users
        n_items : int
            Number of items
        n_factors : int
            Number of latent factors (or embeddings or whatever you want to
            call it).
        dropout_p : float
            p in nn.Dropout module. Probability of dropout.
        sparse : bool
            Whether or not to treat embeddings as sparse. NOTE: cannot use
            weight decay on the optimizer if sparse=True. Also, can only use
            Adagrad.
        """
        super(MatrixFactorization, self).__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.user_biases = torch.nn.Embedding(n_users, 1, sparse=sparse)
        self.item_biases = torch.nn.Embedding(n_items, 1, sparse=sparse)
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors, sparse=sparse)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors, sparse=sparse)
        
        self.dropout_p = dropout_p
        self.dropout = torch.nn.Dropout(p=self.dropout_p)

        self.sparse = sparse
        
    def forward(self, users, items):
        """
        Forward pass through the model. For a single user and item, this
        looks like:
        user_bias + item_bias + user_embeddings.dot(item_embeddings)
        Parameters
        ----------
        users : np.ndarray
            Array of user indices
        items : np.ndarray
            Array of item indices
        Returns
        -------
        preds : np.ndarray
            Predicted ratings.
        """
        ues = self.user_embeddings(users)
        uis = self.item_embeddings(items)

        #preds = self.user_biases(users)
        #preds += self.item_biases(items)
        preds2 = (self.dropout(ues) * self.dropout(uis)).sum(1)
        return preds2
    
    def __call__(self, *args):
        return self.forward(*args)

    def predict(self, users, items):
        return self.forward(users, items)

In [2]:
dataset_name='msd-exp-40-160'
train_dir = "../train/"
print 'reading training and testing file.'

n_users, n_items, train_data = utils.load_train_data(os.path.join(train_dir, '%s.train.rating' % dataset_name))
testRatings =  utils.load_rating_file_as_list("../train/%s.test.rating" % dataset_name)
testNegatives = utils.load_negative_file("../train/%s.test.negative" % dataset_name)

reading training and testing file.


In [19]:
train_data[0].nonzero()[1].tolist()

[148, 299, 476, 688, 1617, 1664]

In [29]:
hit = 100
a = [1,10,9]
a.pop
print a

ValueError: list.remove(x): x not in list

In [47]:
def make_training_tuple(train_data, users, n_items, neg_num=10):
    u_list = []
    i_list = []
    r_list = []
    for u in users:
        neg_sample = np.random.choice(n_items, 100)
        pos_items = train_data[u].nonzero()[1].tolist()
        for pos_item in pos_items:
            neg_sample = np.random.choice(n_items, 10).tolist()
            if pos_item in neg_sample:
                neg_sample.remove(pos_item)
            u_list += [u]*(len(neg_sample)+1)
            i_list.append(pos_item)
            r_list.append(1.0)
            i_list += neg_sample
            r_list += [0.0] * len(neg_sample)
    return u_list, i_list, r_list

In [53]:
model = MatrixFactorization(n_users, n_items, n_factors=16)
loss_fn = torch.nn.MSELoss(size_average=False) 
#loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                            lr=5e-3)

num_epochs = 100
N = train_data.shape[0]
idxlist = range(N)
batch_size = 256
for epoch in range(num_epochs):
    np.random.shuffle(idxlist)
    total_loss = torch.Tensor([0])
    for bnum, st_idx in enumerate(range(0, N, batch_size)):
            end_idx = min(st_idx + batch_size, N)
            u,i,r = make_training_tuple(train_data, idxlist[st_idx:end_idx], n_items, 20)
            #X = train_data[idxlist[st_idx:end_idx]].tocoo()
            users = Variable(torch.LongTensor(u))
            items = Variable(torch.LongTensor(i))
            ratings = Variable(torch.FloatTensor(r))
            prediction = model(users, items)
            optimizer.zero_grad()
            loss = loss_fn(prediction, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    
    topk = 10
    limit = 10000
    evaluation = eval_utils.mf_evaluation(model, testRatings, testNegatives, topk)
    (hits, ndcgs) = evaluation.evaluate(1, limit_size=limit)
    hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print('epoch = %d, test_size = %d, HR = %.4f, NDCG = %.4f, loss = %.4f'  % (epoch, len(hits), hr, ndcg, total_loss))

epoch = 0, test_size = 10000, HR = 0.0962, NDCG = 0.0430, loss = 15507809.0000
epoch = 1, test_size = 10000, HR = 0.0952, NDCG = 0.0426, loss = 10190454.0000
epoch = 2, test_size = 10000, HR = 0.0954, NDCG = 0.0426, loss = 6650565.5000
epoch = 3, test_size = 10000, HR = 0.0957, NDCG = 0.0430, loss = 4336228.5000
epoch = 4, test_size = 10000, HR = 0.0945, NDCG = 0.0426, loss = 2857521.2500
epoch = 5, test_size = 10000, HR = 0.0945, NDCG = 0.0428, loss = 1900734.3750
epoch = 6, test_size = 10000, HR = 0.0926, NDCG = 0.0421, loss = 1268879.1250
epoch = 7, test_size = 10000, HR = 0.0931, NDCG = 0.0423, loss = 859844.5000
epoch = 8, test_size = 10000, HR = 0.0936, NDCG = 0.0425, loss = 590780.3750
epoch = 9, test_size = 10000, HR = 0.0931, NDCG = 0.0425, loss = 417914.8125
epoch = 10, test_size = 10000, HR = 0.0963, NDCG = 0.0434, loss = 303211.9688
epoch = 11, test_size = 10000, HR = 0.0986, NDCG = 0.0445, loss = 231273.1875
epoch = 12, test_size = 10000, HR = 0.0985, NDCG = 0.0446, loss =

KeyboardInterrupt: 

In [56]:
A = sparse.csc_matrix([[1, 0, 0], [5, 0, 2], [0, -1, 0], [0, 0, 3]], dtype=float)
u, s, vt = svds(A, k=2)

In [86]:
np.matmul(u[1],(s*vt.T).T)

array([ 5.00000000e+00, -6.43251766e-18,  2.00000000e+00])

In [105]:
u, s, vt = svds(train_data, k=64)

In [106]:
pred = np.matmul(u,(s*vt.T).T)

In [108]:
evaluator = eval_utils.vaecf_evaluation(pred, testRatings, testNegatives, topk)
(hits, ndcgs) = evaluator.evaluate(1, limit_size=100000)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
print('test_size = %d, HR = %.4f, NDCG = %.4f'  % ( len(hits), hr, ndcg))

test_size = 44967, HR = 0.3433, NDCG = 0.1959


In [6]:
def nonzeros(m, row):
    """ returns the non zeroes of a row in csr_matrix """
    for index in range(m.indptr[row], m.indptr[row+1]):
        yield m.indices[index], m.data[index]

def alternating_least_squares(Cui, factors, regularization, iterations=20):
    users, items = Cui.shape

    X = np.random.rand(users, factors) * 0.01
    Y = np.random.rand(items, factors) * 0.01

    Ciu = Cui.T.tocsr()
    for iteration in range(iterations):
        print iteration
        least_squares(Cui, X, Y, regularization)
        least_squares(Ciu, Y, X, regularization)

    return X, Y

def least_squares(Cui, X, Y, regularization):
    users, factors = X.shape
    YtY = Y.T.dot(Y)

    for u in range(users):
        # accumulate YtCuY + regularization * I in A
        A = YtY + regularization * np.eye(factors)

        # accumulate YtCuPu in b
        b = np.zeros(factors)

        for i, confidence in nonzeros(Cui, u):
            factor = Y[i]
            A += (confidence - 1) * np.outer(factor, factor)
            b += confidence * factor

        # Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
        X[u] = np.linalg.solve(A, b)

In [12]:
X, Y = alternating_least_squares(train_data, 32, 0.1, iterations=30)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [13]:
pred = np.dot(X,Y.T)

In [14]:
topk=10
evaluator = eval_utils.vaecf_evaluation(pred, testRatings, testNegatives, topk)
(hits, ndcgs) = evaluator.evaluate(1, limit_size=100000)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
print('ALS, test_size = %d, HR = %.4f, NDCG = %.4f'  % ( len(hits), hr, ndcg))

ALS, test_size = 100000, HR = 0.4807, NDCG = 0.3022


In [3]:
def test_item_popularity(train_data, testRatings, testNegatives, topk=10, limit=100000):
    item_pop = {}
    for i in range(n_items):
        item_pop[i] = train_data[:,i].count_nonzero()
    item_pop_evaluation = eval_utils.item_popularity_evaluation(item_pop, testRatings, testNegatives, topk)
    (hits, ndcgs) = item_pop_evaluation.evaluate(num_thread=1, limit_size=limit)
    item_pop_hr, item_pop_ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print 'Item Popularity, testSize=%d, HitRate=%.4f, NDCG=%.4f' % (len(hits), item_pop_hr, item_pop_ndcg)
    return item_pop_hr, item_pop_ndcg

In [4]:
hr1, ndcg1 = test_item_popularity(train_data, testRatings, testNegatives)

Item Popularity, testSize=100000, HitRate=0.3690, NDCG=0.2194
