In [1]:
import pandas as pd

films = pd.read_csv('ml-latest-small/movies.csv')

print(films.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [2]:
genres = films['genres'].str.split('|')

genres

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [3]:
movies = films.copy()
movies.loc[:, 'genres'] = genres

print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                     [Adventure, Children, Fantasy]  
2                                  [Comedy, Romance]  
3                           [Comedy, Drama, Romance]  
4                                           [Comedy]  


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

df = movies.copy()

df_binarized = mlb.fit_transform(df['genres'])

df_binarized

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(9742, 20))

In [5]:
list(mlb.classes_)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [6]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

cosine_similarities = 1-pairwise_distances(df_binarized, metric='cosine')

print(cosine_similarities[:5, :5])

[[1.         0.77459667 0.31622777 0.25819889 0.4472136 ]
 [0.77459667 1.         0.         0.         0.        ]
 [0.31622777 0.         1.         0.81649658 0.70710678]
 [0.25819889 0.         0.81649658 1.         0.57735027]
 [0.4472136  0.         0.70710678 0.57735027 1.        ]]


In [7]:
movie = df[df['title'].str.contains('age of ultron', case=False, na=False)].index[0]
movie

cosine_similarities[8686, 9714]

np.float64(0.0)

In [8]:
def movie_recommender(movie_name):
    df.reset_index()
    movie = df[df['title'].str.contains(movie_name.title(), case=False, na=False)].index[0]
    similar_movies = cosine_similarities[movie].argsort()[::-1][1:6]
    recommendation = df.iloc[similar_movies]['title']
    return recommendation

movie_recommender(input())

6727         Street Kings (2008)
2475          Boiler Room (2000)
9374       The Accountant (2016)
9375             Imperium (2016)
6659    Cassandra's Dream (2007)
Name: title, dtype: object

In [9]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
df = ratings.merge(movies, on='movieId')
df = df.drop(columns=['timestamp'])

print(df.head())

   userId  movieId  rating                        title  \
0       1        1     4.0             Toy Story (1995)   
1       1        3     4.0      Grumpier Old Men (1995)   
2       1        6     4.0                  Heat (1995)   
3       1       47     5.0  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0   Usual Suspects, The (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                                  [Comedy, Romance]  
2                          [Action, Crime, Thriller]  
3                                [Mystery, Thriller]  
4                         [Crime, Mystery, Thriller]  


In [10]:
from torchvision.transforms.functional import to_tensor

useritem_matrix = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
useritem_matrix = useritem_matrix.to_numpy()
useritem_tensor = to_tensor(useritem_matrix)
useritem_tensor.shape

torch.Size([1, 610, 9719])

In [11]:
from torch.linalg import svd

useritem_matrix_svd = svd(useritem_tensor, full_matrices=True)
useritem_matrix_svd

torch.return_types.linalg_svd(
U=tensor([[[ 5.5554e-02,  6.1675e-02, -1.0898e-02,  ...,  3.0200e-03,
          -2.9025e-04,  4.3090e-04],
         [ 5.8663e-03, -1.7738e-02, -4.4238e-03,  ..., -3.2674e-03,
          -8.8687e-03,  8.8603e-04],
         [ 1.3558e-03,  2.0737e-03,  1.7134e-03,  ..., -2.3453e-03,
          -2.0808e-03, -1.5509e-03],
         ...,
         [ 1.1611e-01,  1.1845e-02, -9.7664e-03,  ...,  8.8508e-04,
          -7.6575e-04,  1.3795e-03],
         [ 7.5794e-03,  1.3785e-02, -3.9741e-02,  ...,  1.2189e-02,
          -3.2129e-03, -1.7574e-02],
         [ 1.3886e-01, -2.0219e-01,  9.2669e-02,  ..., -7.6931e-05,
          -3.3862e-04,  7.0527e-04]]], dtype=torch.float64),
S=tensor([[534.4209, 231.2361, 191.1513, 170.4226, 154.5538, 147.3359, 135.6559,
         122.6614, 121.4392, 113.1229, 109.6005, 107.9337, 105.9723, 102.0401,
          99.8580,  99.3077,  97.1087,  93.4096,  92.3162,  90.9771,  90.4270,
          88.8285,  87.2876,  86.0539,  85.1525,  83.0449,  

In [12]:
import torch

svd_U = useritem_matrix_svd[0]
svd_S = useritem_matrix_svd[1]
svd_Vh = useritem_matrix_svd[2]

In [13]:
print(svd_U.shape, svd_S.shape, svd_Vh.shape)


torch.Size([1, 610, 610]) torch.Size([1, 610]) torch.Size([1, 9719, 9719])


In [14]:
svd_U_mat = svd_U
svd_S_mat = torch.diag_embed(svd_S)
svd_Vh_mat = svd_Vh[:, :610, :]

svd_matrix = svd_U_mat @ svd_S_mat @ svd_Vh_mat
svd_matrix

tensor([[[-3.8566e-15, -6.0771e-14, -3.9918e-14,  ...,  2.0110e-15,
           4.0000e+00, -4.2826e-16],
         [ 3.0014e-14,  7.9206e-14, -1.2008e-14,  ..., -1.9993e-16,
          -2.3419e-15, -6.3811e-16],
         [ 2.4124e-15,  6.5219e-15, -1.5732e-13,  ..., -4.4127e-16,
           1.2993e-15,  2.3018e-16],
         ...,
         [ 3.6822e-14, -3.2483e-14, -2.1023e-14,  ..., -3.6322e-15,
           7.5951e-15,  8.3456e-17],
         [-4.3877e-15, -9.7752e-16, -3.4755e-16,  ..., -4.1113e-16,
          -8.3267e-16,  2.7756e-17],
         [ 4.0000e+00, -8.3127e-14, -3.1817e-14,  ...,  1.5000e+00,
           7.4940e-15, -4.0939e-16]]], dtype=torch.float64)

In [15]:
torch.allclose(useritem_tensor, svd_matrix, atol=1e-6)  # Adjust tolerance if needed

True

In [16]:
k = 100  # Choose a fixed number of singular values

U_k = svd_U[:, :, :k]  # Keep top-k columns of U
S_k_mat = torch.diag_embed(svd_S[:, :k])  # Keep top-k singular values and make diagonal
Vh_k = svd_Vh[:, :k, :]  # Keep top-k rows of Vh

A_reduced = U_k @ S_k_mat @ Vh_k  # Reconstruct the low-rank matrix


In [17]:
from torch import topk

energy = torch.cumsum(svd_S,  dim=-1)/torch.sum(svd_S, dim=-1, keepdim=True)
k = (energy >= 0.95).nonzero(as_tuple=True)[1][0].item()

U_tk = svd_U[:, :, :k]
S_tk = torch.diag_embed(svd_S[:, :k])
Vh_tk = svd_Vh[:, :k, :]

svd_matrix_tk = U_tk @ S_tk @ Vh_tk
svd_matrix_tk

tensor([[[-1.4644e-04,  1.9855e-03,  2.6089e-03,  ..., -1.6980e-03,
           3.9902e+00,  3.9167e-04],
         [-3.5057e-03, -4.9381e-03, -2.0176e-03,  ...,  4.5122e-03,
           2.9462e-02, -5.3825e-04],
         [-1.4908e-04,  5.5784e-03,  2.9729e-03,  ...,  7.4938e-04,
          -1.6697e-02,  8.0324e-05],
         ...,
         [-1.7909e-04, -2.2292e-03, -1.0488e-03,  ...,  7.6652e-04,
          -4.6936e-03,  5.8369e-05],
         [ 1.0216e-03,  2.4268e-02,  2.9653e-02,  ..., -7.0136e-03,
           3.9066e-02,  2.2483e-03],
         [ 3.9989e+00, -2.9725e-04, -4.1607e-04,  ...,  1.4998e+00,
           1.6774e-03,  1.2441e-04]]], dtype=torch.float64)

In [18]:
mask = useritem_tensor != 0
l1_error = torch.abs(useritem_tensor[mask] - svd_matrix_tk[mask]).mean()
l2_error = torch.sqrt(torch.mean((useritem_tensor[mask] - svd_matrix_tk[mask]) ** 2))
l1_error, l2_error

(tensor(0.0482, dtype=torch.float64), tensor(0.1680, dtype=torch.float64))