# Matrix completion

In [14]:

import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:

# read in data
train = np.loadtxt("ratings-train.csv", delimiter=',', skiprows=1).astype(np.int64)
test = np.loadtxt("ratings-test.csv", delimiter=',', skiprows=1).astype(np.int64)

m_train, n_train = np.shape(train)
m_test, n_test = np.shape(test)


# create list of all unique movie ids
train_movie_ids = set(train[:,1]) # set() eliminates duplicates
test_movie_ids = set(test[:,1])
all_movie_ids = list(train_movie_ids.union(test_movie_ids))

num_users = len(set(train[:,0]).union(set(test[:,0]))) # counts number of unique user IDs
num_movies = len(all_movie_ids)


#create unique movie_ids
new_id = {movie_id : all_movie_ids.index(movie_id) for movie_id in all_movie_ids}

# create movie rating matrix, fill with training data
ratings = np.zeros((num_users, num_movies))
for row in train[:5]:
    user_id = row[0] - 1
    movie_id = new_id[row[1]]
    ratings[user_id, movie_id] = row[2] #inserts zero if person has not rated

features = 10
max_iters = 100
regularization = 100
users = np.random.rand(num_users, features)
movies = np.random.rand(num_movies, features)

#create (lamda)(I) matrix
lam = regularization*np.eye(features)

#start iterating
for k in range(max_iters):
    for n in range(num_users):
        movie_ids_rated = ratings[k, :] > 0 #find all movies that having ratings
        movies_known = movies[movie_ids_rated]
        ratings_known = ratings[k, movie_ids_rated]

        if movies_known.size > 0:
            mat = movies_known.T @ movies_known + lam
            rhs = movies_known.T @ ratings_known
            users[n, :] = np.linalg.solve(mat, rhs)

    for m in range(num_movies):
        user_ids_known = ratings[ : , m] > 0
        users_known = users[user_ids_known]
        ratings_known = ratings[user_ids_known, m]

        if users_known.size > 0:
            mat = users_known.T @ users_known + lam
            rhs = users_known.T @ ratings_known
            movies[m, :] = np.linalg.solve(mat, rhs)

    predictions = []

#start testing/predicting
for row in test:
    user_id = row[0] -1
    movie_id = new_id[row[1]]

    rating_pred = users[user_id, :] @ movies[movie_id, :].T
    rating_true = ratings[user_id, movie_id]
    predictions.append((rating_pred, rating_true))


rmse = np.sqrt(np.mean([(true - pred) **2 for true, pred in predictions]))
print(f"The MSE is {rmse}")
print(f"Amount of iterations: {max_iters}")





The MSE is 0.4752699825623252
Amount of iterations: 100
