In [1]:
import numpy as np
import itertools
import pandas as pd
import random

In [2]:


class NMF:
    
    """
    Fit a matrix factorization model for a matrix X with missing values.
    such that
        X = W H.T + E 
    where
        X is of shape (m, n)    - data matrix
        W is of shape (m, rank) - approximated row space
        H is of shape (n, rank) - approximated column space
        E is of shape (m, n)    - residual (error) matrix
    """
    
    def __init__(self, rank=10, max_iter=100, eta=0.01):
        """
        :param rank: Rank of the matrices of the model.
        :param max_iter: Maximum nuber of SGD iterations.
        :param eta: SGD learning rate.
        """
        self.rank = rank
        self.max_iter = max_iter
        self.eta = eta
    
    
    def fit(self, X):
        """
        Fit model parameters W, H.
        :param X: 
            Non-negative data matrix of shape (m, n)
            Unknown values are assumed to take the value of zero (0).
        """
        m, n = X.shape
        
        W = np.random.rand(m, self.rank)
        H = np.random.rand(n, self.rank)
        
        # Indices to model variables
        w_vars = list(itertools.product(range(m), range(self.rank)))
        h_vars = list(itertools.product(range(n), range(self.rank)))
        
        # Indices to nonzero rows/columns
        nzcols = dict([(j, X[:, j].nonzero()[0]) for j in range(n)])
        nzrows = dict([(i, X[i, :].nonzero()[0]) for i in range(m)])
        #print(nzcols, nzrows)
        # Errors
        self.error = np.zeros((self.max_iter,))

        for t in range(self.max_iter):
            np.random.shuffle(w_vars)
            np.random.shuffle(h_vars)

            for i, k in w_vars:
                W[i, k] = W[i, k] + self.eta * np.sum(
                    [X[i, j] - 
                    np.sum(
                            [W[i, l] * H[j, l]for l in range(0, self.rank)]
                            
                        )
                    #W[i:].dot(H[:j]) 
                    for j in nzcols if X[i, j] != 0]) * W[i, k]

            for j, k in h_vars:
                H[j, k] = H[j, k] + self.eta * np.sum(
                    [X[i, j] - 
                    np.sum(
                            [W[i, l] * H[j, l]for l in range(0, self.rank)]
                        )
                    for i in nzrows if X[i, j] != 0]) * H[j, k]

 
            self.error[t] = np.linalg.norm((X - W.dot(H.T))[X > 0])**2
            print(t, self.error[t])
            #self.eta = self.eta*0.97
        
        self.W = W
        self.H = H
        return (W, H)
    
    
    def predict(self, i, j):
        """
        Predict score for row i and column j
        :param i: Row index.
        :param j: Column index.
        """
        return self.W[i, :].dot(self.H[j, :])
    

    def predict_all(self):
        """
        Return approximated matrix for all
        columns and rows.
        """
        return self.W.dot(self.H.T)

In [12]:
def removeTopK(data, k=5):
    movieRatingsDF = data
    updates=[]
    for userId, group in movieRatingsDF.groupby('userId'):
        g = group.sort_values(by = 'rating', ascending=False)
        for i in range(k):
            updates.append((g.iloc[i]["movieId"], userId, g.iloc[i]["rating"]))



    updatedMovieRatingsDF = movieRatingsDF.copy()
    for movieId, userId, rating in updates:
        updatedMovieRatingsDF.loc[
            (updatedMovieRatingsDF['movieId'] == movieId) & (updatedMovieRatingsDF['userId'] == userId), "rating"] = 0
    return (updates, updatedMovieRatingsDF)

moviesDF = pd.read_csv('movies.csv')
ratingsDF = pd.read_csv('ratings.csv')
movieRatingsDF = pd.merge(moviesDF, ratingsDF, on='movieId', how='inner')

#movies with more than 20 views
x = movieRatingsDF.groupby('movieId').agg('count')
x = x[x["title"]>=20]
x.reset_index(inplace = True)
x = x["movieId"].tolist()

#user with more than 20 views
y = movieRatingsDF[movieRatingsDF["movieId"].isin(x)]
y = y.groupby('userId').agg('count')
y = y[y["title"]>=20]
y.reset_index(inplace = True)
y = y["userId"].tolist()
movieRatingsDF = movieRatingsDF[movieRatingsDF["userId"].isin(y) & movieRatingsDF["movieId"].isin(x)]
%xdel x
%xdel y


K = 5
updates, learn = removeTopK(movieRatingsDF, K)
#learn

In [13]:
p = learn.pivot(index='userId', columns='movieId', values='rating')
p = p.reset_index()

p = p.fillna(0)
learn = p.as_matrix(p.columns.tolist()[1:])

rank =2
itera = 10
model = NMF(rank=rank, max_iter=itera, eta=0.0001)
#model.fit(X)
W, H = model.fit(learn)

0 674344.347962
1 651236.132584
2 626889.464261
3 601742.068548
4 576333.670346
5 551245.083379
6 527018.26125
7 504095.91539
8 482767.824735
9 463176.714853


In [14]:
pred = W.dot(H.T)
pred.shape

test = movieRatingsDF.pivot(index='userId', columns='movieId', values='rating')
collList = test.columns.tolist()
test = test.fillna(0)
test = test.as_matrix(p.columns.tolist()[1:])
changes = test!=learn

print("MSE: ", np.mean((pred[changes] - test[changes])**2))




predPerUser = []

#learn = učna
#test = vsi podatki
#pred = naučen model

changes = test != learn

allExp = []
for item in range(len(test)):
    uReal = test[item]
    uChange = changes[item]
    uPred = pred[item]

    l = list(zip(uPred, uChange, uReal))
    l.sort(reverse=True)
    movieIDCounter = 0
    expected = []
    for r, b, a in l:
        if(b==True):
            expected.append(movieIDCounter)
            #print(r, b, a, movieIDCounter)
        movieIDCounter+=1
    #print("EXP: ", np.mean(np.array(expected)))
    allExp.append(np.mean(np.array(expected)))
print("EXP total: ", np.mean(np.array(allExp)))
            

MSE:  15.9971952835
EXP total:  527.57605178
