In [1]:
!pip install numpy pandas scikit-learn scipy




In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
import zipfile
import urllib.request
import os

# MovieLens "latest small" archive and the two CSVs read from it.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movies_csv = "ml/ml-latest-small/movies.csv"
ratings_csv = "ml/ml-latest-small/ratings.csv"

# Only download and extract when the CSVs are not already on disk, so a
# Restart & Run All does not hit the network again on every run.
if not (os.path.exists(movies_csv) and os.path.exists(ratings_csv)):
    urllib.request.urlretrieve(url, "ml.zip")

    with zipfile.ZipFile("ml.zip", 'r') as zip_ref:
        zip_ref.extractall("ml")

movies = pd.read_csv(movies_csv)
ratings = pd.read_csv(ratings_csv)

print(movies.head())
print(ratings.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [4]:
# Attach title/genres to every rating row (inner join on movieId).
data = ratings.merge(movies, on="movieId")
data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [5]:
# Dense user x title rating table. pivot_table applies its default
# aggregation (mean) to ratings that fall into the same (userId, title)
# cell; titles a user has not rated are filled with 0.
user_movie_matrix = (
    pd.pivot_table(data, values="rating", index="userId", columns="title")
    .fillna(0)
    .astype("float32")
)

user_movie_matrix.shape


(610, 9719)

In [6]:
# Reduce each user's rating row to 50 latent components; random_state is
# fixed so repeated runs produce the same factorization.
svd = TruncatedSVD(n_components=50, random_state=42)
latent_matrix = svd.fit_transform(user_movie_matrix)


In [7]:
# Pairwise cosine similarity between the users' latent vectors
# (row i / column j compares user row i with user row j).
similarity_matrix = cosine_similarity(X=latent_matrix)


In [8]:
def recommend_movies(user_id, num_recommendations=5, num_neighbors=10,
                     ratings=None, similarities=None):
    """Recommend unseen titles for a user from their most similar users.

    The user's row is located by label in the rating matrix index, the
    ``num_neighbors`` most similar other users are selected, and titles are
    ranked by the mean rating those neighbours gave (unrated counts as 0).
    Titles the user has already rated (non-zero entry) are removed.

    Parameters
    ----------
    user_id : label
        A value present in the index of the rating matrix (a ``userId``).
    num_recommendations : int, default 5
        Maximum number of titles to return.
    num_neighbors : int, default 10
        How many of the most similar users to average over.
    ratings : pandas.DataFrame, optional
        User x title rating matrix with 0 for "not rated". Defaults to the
        notebook-level ``user_movie_matrix``.
    similarities : array-like, optional
        Square user x user similarity matrix whose rows/columns follow the
        row order of ``ratings``. Defaults to the notebook-level
        ``similarity_matrix``.

    Returns
    -------
    pandas.Series
        Mean neighbour rating per title, sorted descending, at most
        ``num_recommendations`` entries.

    Raises
    ------
    KeyError
        If ``user_id`` is not in the rating matrix index.
    """
    ratings = user_movie_matrix if ratings is None else ratings
    similarities = similarity_matrix if similarities is None else similarities

    # Look the user up by label instead of assuming ids are exactly 1..N:
    # `user_id - 1` silently wraps around for 0 and picks the wrong row
    # whenever the ids are not contiguous.
    if user_id not in ratings.index:
        raise KeyError(f"userId {user_id!r} not found in the rating matrix")
    user_index = ratings.index.get_loc(user_id)

    similarity_scores = np.asarray(similarities[user_index])

    # Rank everyone by similarity (descending) and drop the user's own row
    # explicitly rather than assuming it is always ranked first.
    ranked_users = np.argsort(similarity_scores)[::-1]
    similar_users = ranked_users[ranked_users != user_index][:num_neighbors]

    recommended_movies = (
        ratings.iloc[similar_users]
        .mean(axis=0)
        .sort_values(ascending=False)
    )

    # Keep only titles the user has not rated; the mask is re-ordered to the
    # sorted index so the boolean selection lines up position by position.
    not_yet_rated = (ratings.iloc[user_index] == 0).reindex(recommended_movies.index)
    recommended_movies = recommended_movies[not_yet_rated]

    return recommended_movies.head(num_recommendations)


In [9]:
# Top 10 suggestions for userId 1.
recommend_movies(1, num_recommendations=10)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Aliens (1986),4.1
Die Hard (1988),3.7
Terminator 2: Judgment Day (1991),3.5
"Godfather, The (1972)",3.4
Jaws (1975),3.3
"Hunt for Red October, The (1990)",3.1
"Godfather: Part II, The (1974)",2.9
Speed (1994),2.7
"Sixth Sense, The (1999)",2.6
Galaxy Quest (1999),2.6
