In [15]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import warnings

# Install a global filter that suppresses FutureWarning messages so that
# deprecation notices do not clutter the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [16]:
def load_data():
    """Download the ratings and movies tables used by the recommender.

    Both CSV files are fetched over HTTP from the recommender-tutorial S3
    bucket on every call; nothing is cached locally.

    Returns:
        tuple: ``(ratings, movies)`` as pandas DataFrames, in that order.
    """
    ratings_url = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv"
    movies_url = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv"

    ratings_df = pd.read_csv(ratings_url)
    movies_df = pd.read_csv(movies_url)
    return ratings_df, movies_df

In [17]:
def preprocess_data(ratings, movies):
    """Summarise how often and how highly each movie was rated.

    Args:
        ratings: DataFrame with at least ``movieId`` and ``rating`` columns.
        movies: Unused by this function.

    Returns:
        DataFrame indexed by ``movieId`` with two columns, ``count`` (number
        of ratings) and ``mean`` (average rating).
    """
    per_movie = ratings.groupby('movieId')[['rating']]
    summary = per_movie.agg(['count', 'mean'])
    # Aggregating a one-column frame yields ('rating', <stat>) column pairs;
    # drop the outer 'rating' level so the columns are just the stat names.
    summary.columns = summary.columns.droplevel(0)
    return summary


In [18]:
def create_matrix(df):
    """Build a sparse movie-by-user rating matrix plus id/index lookup tables.

    Args:
        df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns,
            one row per rating.

    Returns:
        tuple: ``(X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper)``

        * ``X`` -- ``csr_matrix`` of shape ``(n_movies, n_users)`` holding the
          ratings; rows are movies, columns are users.
        * ``user_mapper`` / ``movie_mapper`` -- dict from original id to the
          column / row index used in ``X`` (ids are indexed in sorted order).
        * ``user_inv_mapper`` / ``movie_inv_mapper`` -- the reverse dicts,
          from index back to original id.
    """
    # One pass per id column: np.unique returns the sorted distinct ids and,
    # via return_inverse, the position of every row's id within that sorted
    # array -- exactly the matrix index each rating needs.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [19]:
def find_similar_movies(movie_id, X, k, movie_mapper, movie_inv_mapper, metric='cosine', show_distance=False):
    """Return the ``k`` movies whose rating vectors are closest to ``movie_id``.

    Args:
        movie_id: Original id of the query movie; must be a key of ``movie_mapper``.
        X: Rating matrix with one row per movie (as built by ``create_matrix``).
        k: Number of similar movies to return.
        movie_mapper: dict from movie id to row index in ``X``.
        movie_inv_mapper: dict from row index in ``X`` back to movie id.
        metric: Distance metric passed to ``NearestNeighbors``.
        show_distance: When False (default) the result is a list of movie ids.
            When True the result is a list of ``(movie_id, distance)`` tuples.

    Returns:
        list: Up to ``k`` neighbours ordered from nearest to farthest, never
        including the query movie itself. Fewer than ``k`` entries are
        returned only when ``X`` has fewer than ``k + 1`` rows.

    Raises:
        KeyError: If ``movie_id`` is not present in ``movie_mapper``.
    """
    movie_ind = movie_mapper[movie_id]

    # The query movie is itself a row of X, so ask for one extra neighbour;
    # clamp to the number of rows because kneighbors rejects larger requests.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    movie_vec = X[movie_ind].reshape(1, -1)
    # Always fetch distances: kneighbors changes its return shape depending on
    # return_distance, and having both arrays keeps the handling uniform.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    neighbours = []
    for dist, ind in zip(distances[0], indices[0]):
        # Skip the query by index rather than by position: when another movie
        # is tied at zero distance the query is not guaranteed to rank first.
        if ind == movie_ind:
            continue
        neighbour_id = movie_inv_mapper[int(ind)]
        neighbours.append((neighbour_id, float(dist)) if show_distance else neighbour_id)

    # If the query fell outside the k + 1 nearest (ties), trim the extra entry.
    return neighbours[:k]

In [20]:
def main(movie_id=3, k=10):
    """Print movie recommendations for one movie.

    Loads the ratings and movies tables, builds the movie-by-user matrix,
    finds the nearest neighbours of ``movie_id`` and prints each one with
    its rating count and average rating.

    Args:
        movie_id: Id of the movie to base the recommendations on. Defaults
            to 3, the value this demo was originally hard-coded with.
        k: Number of recommendations to print. Defaults to 10.

    Raises:
        KeyError: If ``movie_id`` has no ratings or no title in the loaded data.
    """
    ratings, movies = load_data()
    movie_stats = preprocess_data(ratings, movies)
    # Only the movie lookups are needed here; the user mappers are unused.
    X, _user_mapper, movie_mapper, _user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

    movie_titles = dict(zip(movies['movieId'], movies['title']))

    similar_ids = find_similar_movies(movie_id, X, k, movie_mapper, movie_inv_mapper)
    movie_title = movie_titles[movie_id]

    print(f"Movie Recommendations based on '{movie_title}':")
    print("------------------------------------------------")
    for i, sim_id in enumerate(similar_ids, start=1):
        similar_movie_title = movie_titles[sim_id]
        rating_count = movie_stats.loc[sim_id, 'count']
        avg_rating = movie_stats.loc[sim_id, 'mean']
        print(f"{i}. {similar_movie_title} (Ratings: {rating_count}, Average Rating: {avg_rating:.2f})")

In [21]:
# Run the demo only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()


Movie Recommendations based on 'Grumpier Old Men (1995)':
------------------------------------------------
1. Grumpy Old Men (1993) (Ratings: 29, Average Rating: 3.29)
2. Striptease (1996) (Ratings: 41, Average Rating: 2.41)
3. Nutty Professor, The (1996) (Ratings: 82, Average Rating: 2.73)
4. Twister (1996) (Ratings: 123, Average Rating: 3.32)
5. Father of the Bride Part II (1995) (Ratings: 49, Average Rating: 3.07)
6. Broken Arrow (1996) (Ratings: 84, Average Rating: 3.02)
7. Bio-Dome (1996) (Ratings: 31, Average Rating: 2.53)
8. Truth About Cats & Dogs, The (1996) (Ratings: 58, Average Rating: 3.39)
9. Sabrina (1995) (Ratings: 54, Average Rating: 3.19)
10. Birdcage, The (1996) (Ratings: 86, Average Rating: 3.49)
