In [53]:
import pandas as pd
import numpy as np

In [54]:
# Column names for the three ml-1m tables; the files are read without a
# header row, so the names are supplied explicitly.
ratingsNames = ["userId", "movieId", "rating", "timestamp"]
usersNames = ["userId", "gender", "age", "occupation", "zipCode"]
moviesNames = ["movieId", "title", "genres"]

# Fields in the *.t files are separated by '#'.
# NOTE(review): the paths are absolute and machine-specific; the notebook
# only runs where this exact directory exists.
ratings = pd.read_table(
    "/Users/daniel/Documents/Blog/ml-1m/ratings.t",
    sep="#",
    header=None,
    names=ratingsNames,
)
users = pd.read_table(
    "/Users/daniel/Documents/Blog/ml-1m/users.t",
    sep="#",
    header=None,
    names=usersNames,
)
movies = pd.read_table(
    "/Users/daniel/Documents/Blog/ml-1m/movies.t",
    sep="#",
    header=None,
    names=moviesNames,
)

In [55]:
# Preview the first rows of the movies table to check that it parsed as expected.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [82]:
# Model settings.
f = 20          # number of columns (latent factors) in the X and Y factor matrices
regLamba = 0.1  # scale of the identity matrix added as regularisation in the solves
iters = 20      # iteration setting for the alternating update loop

# Matrix dimensions are taken from the largest ids present in the tables.
# NOTE(review): this assumes ids start at 1 and can be used as positions;
# confirm, since gaps in the ids would leave unused rows/columns.
n = max(movies["movieId"])
m = max(users["userId"])

In [57]:
def normaliseRow(x):
    """Scale a 1-D vector so that its entries sum to 1.

    Parameters
    ----------
    x : array-like
        Values to normalise (lists are accepted and converted to an array).

    Returns
    -------
    numpy.ndarray
        ``x`` divided by the sum of its entries. A vector whose entries sum
        to zero yields inf/nan, following NumPy's division rules.
    """
    x = np.asarray(x)
    return x / x.sum()


def initialiseMatrix(n, f):
    """Create a random non-negative ``n x f`` matrix whose rows each sum to 1.

    Entries are absolute values of standard-normal draws from NumPy's global
    random state, then every row is divided by its own sum.

    Parameters
    ----------
    n : int
        Number of rows.
    f : int
        Number of columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, f)``; an empty ``(0, f)`` array when ``n`` is 0.
    """
    A = np.abs(np.random.randn(n, f))
    # keepdims=True keeps the row sums as an (n, 1) column so the division
    # broadcasts across each row; this replaces a Python-level call per row.
    return A / A.sum(axis=1, keepdims=True)
    

In [58]:
# Seed NumPy's global random state so the starting factors (and therefore
# every MSE printed later) are the same on each run of the notebook.
np.random.seed(0)
# Initialise Y matrix, n x f
Y = initialiseMatrix(n, f)
# Initialise X matrix, m x f
X = initialiseMatrix(m, f)

In [59]:
# Create zero-rated dummy entries, presumably so that movie ids without any
# real rating still appear as columns once the ratings are pivoted.
# Rows of `temp` (identical to what the previous element-wise loop built):
#   row 0        -> [0, 0, 0, 0]        (userId 0, movieId 0)
#   rows 1..n-1  -> [m + 1, i, 0, 0]    (dummy userId m + 1, movieId i)
# NOTE(review): movieId n gets no dummy row and row 0 adds a (0, 0) entry.
# The layout is kept as-is because the later [0:m, 0:n] slice of the pivoted
# matrix depends on it; confirm whether this is intended.
temp = np.zeros((n, 4))
temp[1:, 0] = m + 1
temp[1:, 1] = np.arange(1, n)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# Re-running this cell adds the dummy rows again.
ratings = pd.concat([ratings, pd.DataFrame(temp, columns=ratingsNames)])

In [70]:
# Reshape the long ratings table into a userId x movieId grid of ratings.
# Cells with no rating are NaN; dropna=False keeps columns that are entirely NaN.
ratingsMatrix = ratings.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
    dropna=False,
)

In [71]:
# Missing ratings become 0, then the frame is converted to a plain NumPy array.
# `.values` is used because DataFrame.as_matrix() was removed in pandas 1.0.
ratingsMatrix = ratingsMatrix.fillna(0).values

In [72]:
# Keep only the first m rows and first n columns, so the array has at most
# m x n entries to line up with X (m x f) and Y (n x f).
# NOTE(review): which user/movie ids this removes depends on the row/column
# order of the pivot; confirm that only dummy entries are dropped.
ratingsMatrix = ratingsMatrix[:m, :n]

In [74]:
def ratingsPred(X, Y):
    """Return the matrix product of ``X`` with the transpose of ``Y``.

    Entry ``(i, j)`` of the result is the dot product of row ``i`` of ``X``
    and row ``j`` of ``Y``.
    """
    yTransposed = np.transpose(Y)
    return np.dot(X, yTransposed)

def MSE(ratingsPred, ratingsMatrix):
    """Mean squared error of the predictions over the observed ratings.

    Only cells where ``ratingsMatrix > 0`` are treated as observed; all
    other cells are ignored in both the numerator and the denominator.

    Parameters
    ----------
    ratingsPred : numpy.ndarray
        Predicted ratings, same shape as ``ratingsMatrix``.
    ratingsMatrix : numpy.ndarray
        Actual ratings, with non-positive values marking "no rating".

    Returns
    -------
    float
        Average squared difference over the observed cells, or ``nan`` when
        there are no observed cells (instead of dividing by zero).
    """
    idx = ratingsMatrix > 0
    # Count the same cells that are summed over, so the average stays
    # consistent with the mask.
    nObserved = np.count_nonzero(idx)
    if nObserved == 0:
        return np.nan
    residuals = ratingsPred[idx] - ratingsMatrix[idx]
    # np.sum runs in compiled code; the builtin sum() iterates one element
    # at a time in Python, which is slow on large masked arrays.
    return np.sum(residuals ** 2) / nObserved
    
# Error of the randomly initialised factors, before any update has been run.
print(MSE(ratingsPred(X, Y), ratingsMatrix))   


13.7194679337


In [75]:
# Boolean mask with the shape of ratingsMatrix: True where a rating > 0 is stored.
nonZero = np.greater(ratingsMatrix, 0)

In [76]:
# f x f regularisation term: the identity matrix scaled by regLamba.
reg = np.eye(f) * regLamba

In [None]:
# Alternating least squares: hold one factor matrix fixed and refit each row
# of the other by solving a regularised least-squares system, then swap.
# range(iters) runs exactly `iters` sweeps; range(1, iters) ran one fewer.
for k in range(iters):
    # User step: refit every row of X against the current Y.
    # Index 0 is included (it was skipped before). A row with no observed
    # ratings selects an empty sub-matrix, so the system is reg @ x = 0 and
    # the row becomes all zeros.
    for i in range(m):
        idx = nonZero[i, :]
        a = Y[idx, :]  # factors of the movies this user has rated
        # Reuse `a` rather than indexing Y[idx] a second time; boolean
        # indexing copies the selected rows.
        b = np.dot(a.T, ratingsMatrix[i, idx])
        updateX = np.linalg.solve(np.dot(a.T, a) + reg, b)
        X[i, :] = updateX

    # Movie step: refit every row of Y against the freshly updated X.
    for j in range(n):
        idx = nonZero[:, j]
        a = X[idx, :]  # factors of the users who rated this movie
        b = np.dot(a.T, ratingsMatrix[idx, j])
        updateY = np.linalg.solve(np.dot(a.T, a) + reg, b)
        Y[j, :] = updateY

    # Report the error on the observed ratings after each full sweep.
    ratingsP = ratingsPred(X, Y)
    mse = MSE(ratingsP, ratingsMatrix)
    print("MSE: " + str(mse))

print("Done")