In [1]:
import torch
import numpy as np
import pandas as pd

films = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

movies = films.copy()
genres = films['genres'].str.split('|')

movies.loc[:, 'genres'] = genres

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [2]:
df = ratings.merge(movies, on='movieId')
df = df.drop(columns=['timestamp'])

df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"


In [3]:
from torchvision.transforms.functional import to_tensor

movie_matrix = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
useritem_matrix = movie_matrix.to_numpy()
useritem_tensor = to_tensor(useritem_matrix)
useritem_tensor.shape

torch.Size([1, 610, 9719])

In [4]:
from torch.linalg import svd

useritem_tensor_svd = svd(useritem_tensor, full_matrices=True)
useritem_tensor_svd

torch.return_types.linalg_svd(
U=tensor([[[ 5.5554e-02,  6.1675e-02, -1.0898e-02,  ...,  3.0200e-03,
          -2.9025e-04,  4.3090e-04],
         [ 5.8663e-03, -1.7738e-02, -4.4238e-03,  ..., -3.2674e-03,
          -8.8687e-03,  8.8603e-04],
         [ 1.3558e-03,  2.0737e-03,  1.7134e-03,  ..., -2.3453e-03,
          -2.0808e-03, -1.5509e-03],
         ...,
         [ 1.1611e-01,  1.1845e-02, -9.7664e-03,  ...,  8.8508e-04,
          -7.6575e-04,  1.3795e-03],
         [ 7.5794e-03,  1.3785e-02, -3.9741e-02,  ...,  1.2189e-02,
          -3.2129e-03, -1.7574e-02],
         [ 1.3886e-01, -2.0219e-01,  9.2669e-02,  ..., -7.6931e-05,
          -3.3862e-04,  7.0527e-04]]], dtype=torch.float64),
S=tensor([[534.4209, 231.2361, 191.1513, 170.4226, 154.5538, 147.3359, 135.6559,
         122.6614, 121.4392, 113.1229, 109.6005, 107.9337, 105.9723, 102.0401,
          99.8580,  99.3077,  97.1087,  93.4096,  92.3162,  90.9771,  90.4270,
          88.8285,  87.2876,  86.0539,  85.1525,  83.0449,  

In [5]:
U, S, Vh = useritem_tensor_svd
print(U.shape, S.shape, Vh.shape)

torch.Size([1, 610, 610]) torch.Size([1, 610]) torch.Size([1, 9719, 9719])


In [6]:
energy = torch.cumsum(S, dim=-1)/torch.sum(S, dim=-1, keepdim=True)
k = (energy >= 0.90).nonzero(as_tuple=True)[1][0].item()

U_tk = U[:, :, :k]
S_tk = torch.diag_embed(S[:, :k])
Vh_tk = Vh[:, :k, :]

svd_matrix = U_tk @ S_tk @ Vh_tk
svd_matrix

tensor([[[ 1.0014e-04, -4.7855e-04,  1.9363e-03,  ..., -2.0768e-04,
           3.9728e+00, -5.0104e-04],
         [-7.9501e-03, -8.8968e-03, -1.4419e-02,  ...,  1.4081e-02,
          -1.1897e-01,  3.9915e-03],
         [-2.1376e-03, -1.8693e-03, -1.0417e-02,  ...,  1.5225e-03,
          -1.9290e-02,  1.6746e-03],
         ...,
         [-4.9335e-04,  1.1554e-03,  1.7208e-03,  ...,  4.6091e-04,
          -8.1877e-03,  1.0804e-04],
         [-5.0988e-03,  5.7097e-03, -3.7368e-03,  ...,  1.3514e-02,
          -7.3031e-02, -8.0606e-03],
         [ 3.9972e+00, -1.7863e-03, -2.8948e-03,  ...,  1.5000e+00,
          -2.1534e-03,  9.2187e-05]]], dtype=torch.float64)

In [7]:
torch.allclose(useritem_tensor, svd_matrix, atol=1e-6)

False

In [8]:
filled_tensor = torch.where(useritem_tensor == 0, svd_matrix, useritem_tensor)
filled_tensor

tensor([[[ 1.0014e-04, -4.7855e-04,  1.9363e-03,  ..., -2.0768e-04,
           4.0000e+00, -5.0104e-04],
         [-7.9501e-03, -8.8968e-03, -1.4419e-02,  ...,  1.4081e-02,
          -1.1897e-01,  3.9915e-03],
         [-2.1376e-03, -1.8693e-03, -1.0417e-02,  ...,  1.5225e-03,
          -1.9290e-02,  1.6746e-03],
         ...,
         [-4.9335e-04,  1.1554e-03,  1.7208e-03,  ...,  4.6091e-04,
          -8.1877e-03,  1.0804e-04],
         [-5.0988e-03,  5.7097e-03, -3.7368e-03,  ...,  1.3514e-02,
          -7.3031e-02, -8.0606e-03],
         [ 4.0000e+00, -1.7863e-03, -2.8948e-03,  ...,  1.5000e+00,
          -2.1534e-03,  9.2187e-05]]], dtype=torch.float64)

In [9]:
useritem_tensor_2d = useritem_tensor.squeeze(0)
useritem_tensor_2d.shape

torch.Size([610, 9719])

In [10]:
user_movies = useritem_tensor_2d[3]

svd_matrix_2d = svd_matrix.squeeze(0)
user_prediction = svd_matrix_2d[1]

mask = user_movies == 0

predictions = user_prediction[mask]
prediction_indices = (torch.sort(predictions, descending=True)).indices
prediction_indices

tensor([9332, 4226, 4262,  ..., 5588, 1026, 4010])

In [11]:
title_list = list(movie_matrix.columns)
predicted_titles = []

for i in prediction_indices:
    movie_name = title_list[i.type(torch.int64)]
    predicted_titles.append(movie_name)
print(predicted_titles[:6])

['Weird Science (1985)', 'Ice Age 4: Continental Drift (2012)', 'Ilsa, She Wolf of the SS (1974)', 'Da geht noch was! (2013)', 'Get Real (1998)', 'Death Wish 5: The Face of Death (1994)']


In [12]:
mask = useritem_tensor != 0
l1_error = torch.abs(useritem_tensor[mask] - svd_matrix[mask]).mean()
l2_error = torch.sqrt(torch.mean((useritem_tensor[mask] - svd_matrix[mask]) ** 2))
l1_error, l2_error

(tensor(0.1001, dtype=torch.float64), tensor(0.3026, dtype=torch.float64))