In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the movie catalogue. Only the id and the title are kept, and rows with
# a repeated title are dropped (the first occurrence wins) so that titles can
# be used as unique row labels of the ratings matrix.
movies = (
    pd.read_csv('data/movies.csv')
    .loc[:, ['movieId', 'title']]
    .drop_duplicates(subset='title')
)

In [3]:
# Load the ratings and keep only those whose movie survived the title
# de-duplication of `movies`.
ratings = pd.read_csv('data/ratings.csv')
ratings = ratings.loc[ratings['movieId'].isin(set(movies['movieId']))]

In [4]:
# Keep only movies with more than 25 ratings, i.e. discard every movie that
# has 25 or fewer. The per-row group count is computed in one vectorised pass
# instead of calling a Python lambda once per movie; the rows kept and their
# order are the same.
ratings = ratings[ratings.groupby('movieId')['movieId'].transform('count') > 25]

In [5]:
# Re-index the movie ids: keep only movies that still have ratings and replace
# every original movieId by the row position (0..n-1) of that movie in `movies`,
# so the ids can be used directly as row indices of the ratings matrix.
movies = movies[movies.movieId.isin(set(ratings.movieId))].reset_index(drop=True)

# Build the old-id -> position mapping once, before either frame is rewritten.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
movie_id_to_position = {movie_id: position for position, movie_id in movies.movieId.items()}

# `assign` returns new frames, which avoids writing into the filtered slice
# that `ratings` currently is.
ratings = ratings.assign(movieId=ratings.movieId.map(movie_id_to_position))
movies = movies.assign(movieId=movies.movieId.map(movie_id_to_position))

In [6]:
# Inspect the ratings after the movieId remapping.
# NOTE(review): the saved output lists a `title` column and userId values
# running from 0 to 670, neither of which is produced by a visible cell --
# presumably both already exist in data/ratings.csv; confirm with
# Restart Kernel -> Run All.
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,0,841,2.5,1260759144,Dangerous Minds
1,0,550,3.0,1260759182,Sleepers
2,0,554,2.0,1260759185,Escape from New York
3,0,598,3.0,1260759117,Blazing Saddles
4,0,835,1.0,1260759200,Time Bandits
...,...,...,...,...,...
99805,670,591,2.0,1064245565,High Fidelity
99806,670,414,4.0,1065149296,Dances with Wolves
99807,670,704,4.0,1063502737,Animal House
99808,670,595,4.0,1064245557,About a Boy


In [7]:
# Build the dense ratings matrix: one row per movie, one column per user.
# The shape is taken from the number of distinct ids, so both id columns are
# used as 0-based positions. movieId is remapped that way in an earlier cell;
# NOTE(review): userId is assumed to be 0..n_users-1 as well -- confirm.
rat = coo_matrix(
    (ratings['rating'], (ratings['movieId'], ratings['userId'])),
    shape=(ratings['movieId'].nunique(), ratings['userId'].nunique()),
)
ratings_matrix = pd.DataFrame(
    rat.toarray(),
    index=movies.title,
    columns=range(rat.shape[1]),
)
# The sparse matrix stores "no rating" as 0; mark those cells as missing.
ratings_matrix = ratings_matrix.mask(ratings_matrix == 0)
ratings_matrix.shape

(994, 671)

In [8]:
def rmse(A, B):
    """Root-mean-square error between ``A`` and ``B``, ignoring NaN entries.

    The element-wise difference ``A - B`` is squared and averaged with
    ``np.nanmean``, so positions whose difference is NaN (for example a
    missing rating on either side) do not contribute to the mean.
    """
    squared_residuals = np.power(A - B, 2)
    mean_squared_error = np.nanmean(squared_residuals)
    return np.sqrt(mean_squared_error)

def minimize_U(M, U, V, r, s):
    """Return a copy of ``U`` in which element ``(r, s)`` is re-optimised.

    With everything else held fixed, the new value minimises the squared
    error between row ``r`` of ``M`` and row ``r`` of ``U @ V`` over the
    columns where ``M[r, j]`` is not NaN:

        U[r, s] = sum_j V[s, j] * (M[r, j] - sum_{k != s} U[r, k] * V[k, j])
                  / sum_j V[s, j] ** 2

    The sums are evaluated with NumPy vector operations instead of nested
    Python loops; the formula is unchanged.

    Parameters
    ----------
    M : 2-D float ndarray, NaN marks a missing entry.
    U : 2-D ndarray with ``U.shape[0] == M.shape[0]``.
    V : 2-D ndarray with ``V.shape == (U.shape[1], M.shape[1])``.
    r, s : row and column index of the element of ``U`` to update.

    Returns
    -------
    ndarray
        A new array; the ``U`` passed in is not modified. If the denominator
        is 0 (e.g. row ``r`` of ``M`` has no observed entry) the copy is
        returned unchanged.
    """
    U = U.copy()
    observed = ~np.isnan(M[r])                 # columns j with a known M[r, j]
    others = np.arange(U.shape[1]) != s        # every factor k except s
    v_s = V[s, observed]
    # Prediction for the observed columns without the contribution of factor s.
    partial = U[r, others] @ V[np.ix_(others, observed)]
    a = v_s @ (M[r, observed] - partial)
    b = v_s @ v_s
    if b != 0:
        U[r, s] = a / b
    return U

def minimize_V(M, U, V, r, s):
    """Return a copy of ``V`` in which element ``(r, s)`` is re-optimised.

    With everything else held fixed, the new value minimises the squared
    error between column ``s`` of ``M`` and column ``s`` of ``U @ V`` over the
    rows where ``M[i, s]`` is not NaN:

        V[r, s] = sum_i U[i, r] * (M[i, s] - sum_{k != r} U[i, k] * V[k, s])
                  / sum_i U[i, r] ** 2

    The sums are evaluated with NumPy vector operations instead of nested
    Python loops; the formula is unchanged.

    Parameters
    ----------
    M : 2-D float ndarray, NaN marks a missing entry.
    U : 2-D ndarray with ``U.shape == (M.shape[0], V.shape[0])``.
    V : 2-D ndarray with ``V.shape[1] == M.shape[1]``.
    r, s : row and column index of the element of ``V`` to update.

    Returns
    -------
    ndarray
        A new array; the ``V`` passed in is not modified. If the denominator
        is 0 (e.g. column ``s`` of ``M`` has no observed entry) the copy is
        returned unchanged.
    """
    V = V.copy()
    observed = ~np.isnan(M[:, s])              # rows i with a known M[i, s]
    others = np.arange(V.shape[0]) != r        # every factor k except r
    u_r = U[observed, r]
    # Prediction for the observed rows without the contribution of factor r.
    partial = U[np.ix_(observed, others)] @ V[others, s]
    a = u_r @ (M[observed, s] - partial)
    b = u_r @ u_r
    if b != 0:
        V[r, s] = a / b
    return V

def UV_decomposition(M, k, delta_treshold=0.01, max_iter=5):
    """Factorise ``M`` into ``U @ V`` by element-wise coordinate descent.

    ``U`` (``M.shape[0]`` x ``k``) and ``V`` (``k`` x ``M.shape[1]``) both start
    filled with ``sqrt(nanmean(M) / k)``, so every entry of the initial product
    equals the mean of the non-NaN entries of ``M``. Each sweep re-optimises
    every element of ``U`` with ``minimize_U`` and then every element of ``V``
    with ``minimize_V``. After a sweep the RMSE against ``M`` is computed; the
    loop stops once the improvement over the previous sweep is no larger than
    ``delta_treshold`` or after ``max_iter`` sweeps.

    Progress is printed while running: a status line redrawn in place during a
    sweep and one summary line per finished sweep.

    Parameters
    ----------
    M : 2-D float ndarray, NaN marks a missing entry.
    k : number of latent factors (inner dimension of ``U @ V``).
    delta_treshold : minimum RMSE improvement required to continue
        (the parameter name keeps its original spelling for compatibility).
    max_iter : maximum number of sweeps.

    Returns
    -------
    (U, V) : tuple of ndarrays
    """
    init_value = np.sqrt(np.nanmean(M) / k)
    U = np.ones((M.shape[0], k)) * init_value
    V = np.ones((k, M.shape[1])) * init_value

    # Status lines are redrawn in place with "\r". They are padded to a fixed
    # width so that a shorter line fully overwrites a longer previous one
    # (unpadded, "49/50" drawn over "993/994" showed up as "49/5094").
    line_width = 48

    delta = np.inf
    last_error = np.inf
    i = 0
    while delta > delta_treshold and i < max_iter:
        for r in range(U.shape[0]):
            for s in range(U.shape[1]):
                U = minimize_U(M, U, V, r, s)
            status = f'{i:>2}: {rmse(M,U @ V):.7} - {r}/{U.shape[0]}'
            print(status.ljust(line_width), end="\r")

        for r in range(V.shape[0]):
            for s in range(V.shape[1]):
                V = minimize_V(M, U, V, r, s)
            status = f'{i:>2}: {rmse(M,U @ V):.7} - {r}/{V.shape[0]}'
            print(status.ljust(line_width), end="\r")

        P = U @ V
        error = rmse(M, P)
        # Positive while the fit is still improving; inf on the first sweep.
        delta = last_error - error
        last_error = error
        print(f'{i:>2}: {error:.7}'.ljust(line_width))
        i += 1
    return U, V

In [9]:
# Plain NumPy view of the ratings (rows = movies, columns = users, NaN = no
# rating), which is the form the decomposition functions index into.
M = ratings_matrix.to_numpy()
M.shape

(994, 671)

In [10]:
# Split off the test set: the bottom-right block covering the last 10% of the
# rows and the last 10% of the columns of M is held out, and blanked (NaN) in
# the training copy.
# NOTE(review): rows of M are movies and columns are users, so
# `user_test_size` actually counts movies and `item_test_size` counts users.
# The names are kept because a later cell slices the predictions with them.
user_test_size, item_test_size = (int(dim * 0.1) for dim in M.shape)
test_block = (slice(-user_test_size, None), slice(-item_test_size, None))
M_test = M[test_block]
M_train = M.copy()
M_train[test_block] = np.nan

In [11]:
# Fit the factorisation on the training matrix with 50 latent factors
# (default stopping threshold and iteration limit).
U, V = UV_decomposition(M_train, k=50)

 0: 0.8253362 - 49/5094
 1: 0.7938519 - 49/5094
 2: 0.7630398 - 49/5094
 3: 0.7316421 - 49/5094
 4: 0.7038188 - 49/5094


In [12]:
# Predicted ratings for every (row, column) pair of the matrix: the product of
# the two learned factor matrices.
M_pred = U @ V

In [13]:
# RMSE on the held-out block: compare the true ratings in M_test with the
# predictions for the same bottom-right block of the matrix (NaN entries of
# M_test are ignored by rmse).
rmse(M_test, M_pred[-user_test_size:,-item_test_size:])

0.9843042415286726