In [1]:
import pandas as pd

films = pd.read_csv('ml-latest-small/movies.csv')

print(films.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [2]:
genres = films['genres'].str.split('|')

genres

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [3]:
movies = films.copy()
movies.loc[:, 'genres'] = genres

print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                     [Adventure, Children, Fantasy]  
2                                  [Comedy, Romance]  
3                           [Comedy, Drama, Romance]  
4                                           [Comedy]  


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

df = movies.copy()

df_binarized = mlb.fit_transform(df['genres'])

df_binarized

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(9742, 20))

In [5]:
list(mlb.classes_)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [6]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

cosine_similarities = 1-pairwise_distances(df_binarized, metric='cosine')

print(cosine_similarities[:5, :5])

[[1.         0.77459667 0.31622777 0.25819889 0.4472136 ]
 [0.77459667 1.         0.         0.         0.        ]
 [0.31622777 0.         1.         0.81649658 0.70710678]
 [0.25819889 0.         0.81649658 1.         0.57735027]
 [0.4472136  0.         0.70710678 0.57735027 1.        ]]


In [202]:
movie = df[df['title'].str.contains('age of ultron', case=False, na=False)].index[0]
movie

cosine_similarities[9332, 2111]

np.float64(0.0)

In [None]:
def movie_recommender(movie_name):
    df.reset_index()
    movie = df[df['title'].str.contains(movie_name.title(), case=False, na=False)].index[0]
    similar_movies = cosine_similarities[movie].argsort()[::-1][1:6]
    recommendation = df.iloc[similar_movies]['title']
    return recommendation

movie_recommender('weird science')

67      Monty Python and the Holy Grail (1975)
9734                    Thelma & Louise (1991)
1949                      Wayne's World (1992)
74                           Goodfellas (1990)
6661                           Rain Man (1988)
Name: title, dtype: object

: 

In [9]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
df = ratings.merge(movies, on='movieId')
df = df.drop(columns=['timestamp'])

print(df.head())

   userId  movieId  rating                        title  \
0       1        1     4.0             Toy Story (1995)   
1       1        3     4.0      Grumpier Old Men (1995)   
2       1        6     4.0                  Heat (1995)   
3       1       47     5.0  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0   Usual Suspects, The (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                                  [Comedy, Romance]  
2                          [Action, Crime, Thriller]  
3                                [Mystery, Thriller]  
4                         [Crime, Mystery, Thriller]  


In [187]:
from torchvision.transforms.functional import to_tensor

useritem_matrix = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
users_movies = useritem_matrix.copy()
useritem_matrix = useritem_matrix.to_numpy()
useritem_tensor = to_tensor(useritem_matrix)
useritem_tensor.shape

torch.Size([1, 610, 9719])

In [188]:
from torch.linalg import svd

useritem_matrix_svd = svd(useritem_tensor, full_matrices=True)
useritem_matrix_svd

torch.return_types.linalg_svd(
U=tensor([[[ 5.5554e-02,  6.1675e-02, -1.0898e-02,  ...,  3.0200e-03,
          -2.9025e-04,  4.3090e-04],
         [ 5.8663e-03, -1.7738e-02, -4.4238e-03,  ..., -3.2674e-03,
          -8.8687e-03,  8.8603e-04],
         [ 1.3558e-03,  2.0737e-03,  1.7134e-03,  ..., -2.3453e-03,
          -2.0808e-03, -1.5509e-03],
         ...,
         [ 1.1611e-01,  1.1845e-02, -9.7664e-03,  ...,  8.8508e-04,
          -7.6575e-04,  1.3795e-03],
         [ 7.5794e-03,  1.3785e-02, -3.9741e-02,  ...,  1.2189e-02,
          -3.2129e-03, -1.7574e-02],
         [ 1.3886e-01, -2.0219e-01,  9.2669e-02,  ..., -7.6931e-05,
          -3.3862e-04,  7.0527e-04]]], dtype=torch.float64),
S=tensor([[534.4209, 231.2361, 191.1513, 170.4226, 154.5538, 147.3359, 135.6559,
         122.6614, 121.4392, 113.1229, 109.6005, 107.9337, 105.9723, 102.0401,
          99.8580,  99.3077,  97.1087,  93.4096,  92.3162,  90.9771,  90.4270,
          88.8285,  87.2876,  86.0539,  85.1525,  83.0449,  

In [189]:
import torch

svd_U = useritem_matrix_svd[0]
svd_S = useritem_matrix_svd[1]
svd_Vh = useritem_matrix_svd[2]

In [190]:
print(svd_U.shape, svd_S.shape, svd_Vh.shape)


torch.Size([1, 610, 610]) torch.Size([1, 610]) torch.Size([1, 9719, 9719])


In [191]:
from torch import topk

energy = torch.cumsum(svd_S,  dim=-1)/torch.sum(svd_S, dim=-1, keepdim=True)
k = (energy >= 0.99999).nonzero(as_tuple=True)[1][0].item()

U_tk = svd_U[:, :, :k]
S_tk = torch.diag_embed(svd_S[:, :k])
Vh_tk = svd_Vh[:, :k, :]

svd_matrix_tk = U_tk @ S_tk @ Vh_tk
#svd_matrix_tk

Vh_tk.shape

torch.Size([1, 609, 9719])

In [192]:
torch.allclose(useritem_tensor, svd_matrix_tk, atol=1e-6)  # Adjust tolerance if needed

False

In [193]:
mask = useritem_tensor != 0
l1_error = torch.abs(useritem_tensor[mask] - svd_matrix_tk[mask]).mean()
l2_error = torch.sqrt(torch.mean((useritem_tensor[mask] - svd_matrix_tk[mask]) ** 2))
l1_error, l2_error

(tensor(0.0004, dtype=torch.float64), tensor(0.0036, dtype=torch.float64))

In [194]:
print((useritem_tensor == 0).sum())

tensor(5827758)


In [195]:
filled_ui_tensor = torch.where(useritem_tensor == 0, svd_matrix_tk, useritem_tensor)
filled_ui_tensor

tensor([[[-1.2156e-06,  9.5219e-07,  1.8470e-06,  ...,  2.4439e-06,
           4.0000e+00,  7.8019e-07],
         [-2.4996e-06,  1.9579e-06,  3.7979e-06,  ...,  5.0251e-06,
           2.1678e-05,  1.6042e-06],
         [ 4.3753e-06, -3.4272e-06, -6.6480e-06,  ..., -8.7962e-06,
          -3.7945e-05, -2.8081e-06],
         ...,
         [-3.8917e-06,  3.0484e-06,  5.9132e-06,  ...,  7.8239e-06,
           3.3751e-05,  2.4977e-06],
         [ 4.9578e-05, -3.8834e-05, -7.5330e-05,  ..., -9.9671e-05,
          -4.2996e-04, -3.1819e-05],
         [ 4.0000e+00,  1.5585e-06,  3.0231e-06,  ...,  1.5000e+00,
           1.7255e-05,  1.2770e-06]]], dtype=torch.float64)

In [196]:
print((filled_ui_tensor == 0).sum()) 


tensor(0)


In [197]:
us_uitensor = useritem_tensor.squeeze(0)
us_uitensor.shape

user_id = 5
user_index = user_id - 1
user_vector = us_uitensor[user_index, :]
user_vector

df_grouped = df.groupby('userId')
df_grouped.get_group(1)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"
...,...,...,...,...,...
227,1,3744,4.0,Shaft (2000),"[Action, Crime, Thriller]"
228,1,3793,5.0,X-Men (2000),"[Action, Adventure, Sci-Fi]"
229,1,3809,4.0,What About Bob? (1991),[Comedy]
230,1,4006,4.0,Transformers: The Movie (1986),"[Adventure, Animation, Children, Sci-Fi]"


In [198]:
user_dict = {user_id: group for user_id, group in df_grouped}
user_dict[1]

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"
...,...,...,...,...,...
227,1,3744,4.0,Shaft (2000),"[Action, Crime, Thriller]"
228,1,3793,5.0,X-Men (2000),"[Action, Adventure, Sci-Fi]"
229,1,3809,4.0,What About Bob? (1991),[Comedy]
230,1,4006,4.0,Transformers: The Movie (1986),"[Adventure, Animation, Children, Sci-Fi]"


In [199]:
df_grouped = df.groupby('userId')['title'].agg(list).reset_index()

user_rated_movies = df_grouped[df_grouped['userId'] == 1]['title'].to_list()

user_rated_movies 

[['Toy Story (1995)',
  'Grumpier Old Men (1995)',
  'Heat (1995)',
  'Seven (a.k.a. Se7en) (1995)',
  'Usual Suspects, The (1995)',
  'From Dusk Till Dawn (1996)',
  'Bottle Rocket (1996)',
  'Braveheart (1995)',
  'Rob Roy (1995)',
  'Canadian Bacon (1995)',
  'Desperado (1995)',
  'Billy Madison (1995)',
  'Clerks (1994)',
  'Dumb & Dumber (Dumb and Dumber) (1994)',
  'Ed Wood (1994)',
  'Star Wars: Episode IV - A New Hope (1977)',
  'Pulp Fiction (1994)',
  'Stargate (1994)',
  'Tommy Boy (1995)',
  'Clear and Present Danger (1994)',
  'Forrest Gump (1994)',
  'Jungle Book, The (1994)',
  'Mask, The (1994)',
  'Blown Away (1994)',
  'Dazed and Confused (1993)',
  'Fugitive, The (1993)',
  'Jurassic Park (1993)',
  'Mrs. Doubtfire (1993)',
  "Schindler's List (1993)",
  'So I Married an Axe Murderer (1993)',
  'Three Musketeers, The (1993)',
  'Tombstone (1993)',
  'Dances with Wolves (1990)',
  'Batman (1989)',
  'Silence of the Lambs, The (1991)',
  'Pinocchio (1940)',
  'Fargo (1

In [201]:
import numpy as np

user_movies = [i for sublist in user_rated_movies for i in sublist]

users_movies

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
svd_matrix_2d = svd_matrix_tk.squeeze(0)
all_movie_indices = np.arange(svd_matrix_2d.shape[1])
not_rated = np.isin(all_movie_indices, user_rated_indices)
not_rated

array([False, False, False, ..., False, False, False], shape=(9719,))

In [180]:
print(user_rated_indices[:10])       # Should show numeric indices
print(all_movie_indices[:10])        # Should match the same range
print(user_rated_indices.dtype)      # Should be int32 or int64


[99534 98668 99535 99538 99539 99540 96125 99542 97067 97392]
[0 1 2 3 4 5 6 7 8 9]
int64
