In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from pymongo import MongoClient
from mongo_credentials import connection_string

In [2]:
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
mongo = MongoClient(connection_string)
# confirm that our new database was created
print(mongo.list_database_names())

['movie_recommendations', 'admin', 'local']


In [3]:
# assign the database to a variable name
db = mongo['movie_recommendations']
# review the collections in our new database
print(db.list_collection_names())

['ratings', 'movies', 'ratings_small', 'movies_small']


In [4]:
# Access the 'movies' collection
movies_collection = db['movies']

# Read data from the 'movies' collection into a Pandas DataFrame
movies_data = list(movies_collection.find())  # Convert the collection data to a list of dictionaries
movies_df = pd.DataFrame(movies_data)  # Create a DataFrame from the list of dictionaries

# Display the first few rows of the DataFrame
print(movies_df.head())


                        _id  movieId                               title  \
0  675640fbcaf6abb67dda95dd        1                    Toy Story (1995)   
1  675640fbcaf6abb67dda95de        2                      Jumanji (1995)   
2  675640fbcaf6abb67dda95df        3             Grumpier Old Men (1995)   
3  675640fbcaf6abb67dda95e0        4            Waiting to Exhale (1995)   
4  675640fbcaf6abb67dda95e1        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [5]:
# Extract the release year from the title column
movies_df["release_year"] = movies_df["title"].str.extract(r'\((\d{4})\)')

# Display the updated DataFrame
movies_df

Unnamed: 0,_id,movieId,title,genres,release_year
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,675640fbcaf6abb67dda95de,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,675640fbcaf6abb67dda95df,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,675640fbcaf6abb67dda95e0,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,675640fbcaf6abb67dda95e1,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...,...
87580,67564105caf6abb67ddbebf9,292731,The Monroy Affaire (2022),Drama,2022
87581,67564105caf6abb67ddbebfa,292737,Shelter in Solitude (2023),Comedy|Drama,2023
87582,67564105caf6abb67ddbebfb,292753,Orca (2023),Drama,2023
87583,67564105caf6abb67ddbebfc,292755,The Angry Breed (1968),Drama,1968


In [6]:
#cleaning up the genres column to a list instead of a string
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
movies_df.head()

Unnamed: 0,_id,movieId,title,genres,release_year
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,675640fbcaf6abb67dda95de,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,675640fbcaf6abb67dda95df,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,675640fbcaf6abb67dda95e0,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,675640fbcaf6abb67dda95e1,5,Father of the Bride Part II (1995),[Comedy],1995


In [7]:
# Access the 'ratings' collection
ratings_collection = db['ratings']

# Read data from the 'movies' collection into a Pandas DataFrame
ratings_data = list(ratings_collection.find())  # Convert the collection data to a list of dictionaries
ratings_df = pd.DataFrame(ratings_data)  # Create a DataFrame from the list of dictionaries

# Display the first few rows of the DataFrame
print(ratings_df.head())

                        _id  userId  movieId  rating  timestamp
0  67564f35caf6abb67d653dc7       1       17     4.0  944249077
1  67564f35caf6abb67d653dc8       1       25     1.0  944250228
2  67564f35caf6abb67d653dc9       1       29     2.0  943230976
3  67564f35caf6abb67d653dca       1       30     5.0  944249077
4  67564f35caf6abb67d653dcb       1       32     5.0  943228858


In [8]:
# Merge the DataFrames on movieId
consolidated_movies_df = movies_df.merge(ratings_df, on = 'movieId', how ='inner')
consolidated_movies_df.head()

Unnamed: 0,_id_x,movieId,title,genres,release_year,_id_y,userId,rating,timestamp
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d653ff6,10,2.5,1169265231
1,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65428a,11,3.0,850085076
2,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65448c,17,4.0,1027305751
3,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65456c,19,3.0,974704488
4,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65459b,20,5.0,1553184230


In [9]:
# Convert timestamp to year and store in a new column `rating_year`
consolidated_movies_df['rating_year'] = consolidated_movies_df['timestamp'].apply(
    lambda x: datetime.utcfromtimestamp(x).year
)

# Display the DataFrame
consolidated_movies_df.head()

Unnamed: 0,_id_x,movieId,title,genres,release_year,_id_y,userId,rating,timestamp,rating_year
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d653ff6,10,2.5,1169265231,2007
1,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65428a,11,3.0,850085076,1996
2,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65448c,17,4.0,1027305751,2002
3,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65456c,19,3.0,974704488,2000
4,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,67564f35caf6abb67d65459b,20,5.0,1553184230,2019


In [10]:
# Drop the non-beneficial ID columns, 'timestamp','release_year' and 'rating_year'.
consolidated_movies_df = consolidated_movies_df.drop(columns=['_id_x','release_year','_id_y','timestamp','rating_year'])
consolidated_movies_df

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",10,2.5
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",11,3.0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.0
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",19,3.0
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",20,5.0
...,...,...,...,...,...
3529994,292349,Totally Killer (2023),"[Comedy, Horror]",11190,4.0
3529995,292371,Pet Sematary: Bloodlines (2023),"[Fantasy, Horror]",18699,0.5
3529996,292467,Space Wars: Quest for the Deepstar (2023),"[Action, Adventure, Sci-Fi]",3367,0.5
3529997,292529,Something to Remind Me (2002),"[Drama, Thriller]",13783,2.5


In [11]:
consolidated_movies_df.dtypes

movieId      int64
title       object
genres      object
userId       int64
rating     float64
dtype: object

In [12]:
print(consolidated_movies_df.isnull().sum())

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64


**Data Preprocessing**:

*Collaborative Filtering* 

We will use a method called collaborative filtering to generate user recommendations. This approach operates on the principle that individuals with similar preferences tend to enjoy similar items.

The first step involves converting our dataset into a user-item matrix, also referred to as a "utility matrix." In this matrix, each row corresponds to a user, and each column represents a movie. A key advantage of collaborative filtering is its ability to generate recommendations without relying on additional information about the users or the items.

The create_X() function produces a sparse matrix along with four mapping dictionaries:

rating_mapper: Maps ratings to their corresponding row indices.

movie_mapper: Maps movie IDs to their corresponding column indices.

ratings_inv_mapper: Maps row indices back to ratings.

movie_inv_mapper: Maps column indices back to movie IDs.

These dictionaries are essential for linking the rows and columns of the utility matrix to their respective user and movie IDs.

The resulting user-item matrix is a scipy.sparse.csr_matrix, designed to efficiently store and handle sparse data.


In [13]:
import numpy as np
from scipy.sparse import csr_matrix

def create_X(ratings_df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        ratings_df: pandas dataframe containing columns (userId, movieId, rating)
    
    Returns:
        X: sparse matrix
        user_mapper: dict mapping user id's to user indices
        user_inv_mapper: dict mapping user indices to user id's
        movie_mapper: dict mapping movie id's to movie indices
        movie_inv_mapper: dict mapping movie indices to movie id's
    """
    # Number of unique users and movies
    M = ratings_df['userId'].nunique()
    N = ratings_df['movieId'].nunique()

    # Mapping dictionaries with explicit conversion to Python int
    user_mapper = {int(k): int(v) for k, v in zip(np.unique(ratings_df["userId"]), range(M))}
    movie_mapper = {int(k): int(v) for k, v in zip(np.unique(ratings_df["movieId"]), range(N))}
    
    user_inv_mapper = {int(v): int(k) for k, v in user_mapper.items()}
    movie_inv_mapper = {int(v): int(k) for k, v in movie_mapper.items()}
    
    # Map user and movie IDs to their matrix indices
    user_index = [user_mapper[i] for i in ratings_df['userId']]
    item_index = [movie_mapper[i] for i in ratings_df['movieId']]

    # Create the sparse matrix
    X = csr_matrix((ratings_df["rating"], (user_index, item_index)), shape=(M, N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
    X = csr_matrix((ratings_df["rating"], (user_index, item_index)), shape=(M, N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Example usage
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings_df)

In [14]:
X.shape

(22208, 44586)

Our X matrix contains 22208 users and 44586 movies

Evaluating Sparsity : Evaluating sparsity in collaborative filtering models is essential to address data challenges, improve performance, and ensure effective recommendations. Sparse data limits the system's ability to learn preferences, exacerbates cold start issues, and risks overfitting or biasing predictions. It influences algorithm suitability, often favoring matrix factorization over neighborhood-based methods, and highlights the need for hybrid approaches. Sparse datasets also demand efficient computational structures and can complicate optimization. Understanding sparsity aids in refining data collection, imputation, and preprocessing strategies, while revealing user behavior and item popularity trends. This evaluation is vital for designing robust, scalable recommendation systems. Here, we calculate sparsity by dividing the number of stored elements by total number of elements. The number of stored (non-empty) elements in our matrix ([nnz](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.nnz.html)) is equivalent to the number of ratings in our dataset.

In [15]:
n_total = X.shape[0] * X.shape[1]  # Total entries
n_ratings = X.nnz                 # Non-zero entries
density = n_ratings / n_total     # Matrix density
sparsity = 1 - density            # Matrix sparsity

print(f"Matrix density: {round(density * 100, 2)}%")
print(f"Matrix sparsity: {round(sparsity * 100, 2)}%")

Matrix density: 0.36%
Matrix sparsity: 99.64%


Usually a sparsity of 0.1 or above we are good to use this method of collaborative filtering else we have to use the Content based filtering. The **cold start problem** is when there are new users and movies in our matrix that do not have any ratings. In our Movielens dataset, all users and movies have at least one rating but in general, it's useful to check which users and movies have few interactions.

In [16]:
n_ratings_per_user = X.getnnz(axis=1)
len(n_ratings_per_user)

22208

In [17]:
print(f"Most active user rated {n_ratings_per_user.max()} movies.")
print(f"Least active user rated {n_ratings_per_user.min()} movies.")

Most active user rated 9577 movies.
Least active user rated 20 movies.


In [18]:
n_ratings_per_movie = X.getnnz(axis=0)
len(n_ratings_per_movie)

44586

In [19]:
print(f"Most rated movie has {n_ratings_per_movie.max()} ratings.")
print(f"Least rated movie has {n_ratings_per_movie.min()} ratings.")

Most rated movie has 11388 ratings.
Least rated movie has 1 ratings.


We will be Implementing Collaborative Filtering using the K-Nearest Neighbors algorithm, considering that the matrix is dense, to identify movies with similar user engagement patterns

In [20]:
import time
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Start the timer
start_time = time.time()

# Pivot the dataset to create the user-item matrix
user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# End the timer
end_time = time.time()

# Print the time taken
print(f"Time taken to pivot the dataset and create the user-item matrix: {end_time - start_time:.2f} seconds")

user_item_matrix

Time taken to pivot the dataset and create the user-item matrix: 36.61 seconds


movieId,1,2,3,4,5,6,7,8,9,10,...,292113,292139,292141,292313,292343,292349,292371,292467,292529,292619
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Method to remove noise ie., remove noise from the data, focusing on filtering out users and movies with too few ratings


In [21]:
import numpy as np
from scipy.sparse import csr_matrix

def create_cleaned_X(consolidated_movies_df, min_user_ratings=1000, min_movie_ratings=200):
    """
    Generates a sparse matrix from a cleaned ratings dataframe.
    
    Args:
        consolidated_movies_df: pandas dataframe containing columns (userId, movieId, rating)
        min_user_ratings: Minimum number of ratings a user must have to be included
        min_movie_ratings: Minimum number of ratings a movie must have to be included

    Returns:
        X: sparse matrix
        user_mapper: dict mapping user ids to user indices
        user_inv_mapper: dict mapping user indices to user ids
        movie_mapper: dict mapping movie ids to movie indices (includes only filtered movies)
        movie_inv_mapper: dict mapping movie indices to movie ids
    """
    # Filter movies and users based on minimum rating thresholds
    movie_counts = consolidated_movies_df.groupby('movieId')['rating'].count()
    user_counts = consolidated_movies_df.groupby('userId')['rating'].count()

    # Filter dataset to include only users and movies with enough ratings
    filtered_ratings = consolidated_movies_df[
        consolidated_movies_df['movieId'].isin(movie_counts[movie_counts >= min_movie_ratings].index) &
        consolidated_movies_df['userId'].isin(user_counts[user_counts >= min_user_ratings].index)
    ]
    
    # Number of unique users and movies in the filtered data
    M = filtered_ratings['userId'].nunique()
    N = filtered_ratings['movieId'].nunique()

    # Create movie_mapper only for filtered movies, ensure integer keys
    filtered_movies = np.unique(filtered_ratings["movieId"])
    movie_mapper = {int(movie_id): index for index, movie_id in enumerate(filtered_movies)}
    
    # Create inverse mapping for filtered movies
    movie_inv_mapper = {index: movie_id for movie_id, index in movie_mapper.items()}
    
    # User mappings are based on the filtered data (i.e., after applying the rating thresholds)
    user_mapper = dict(zip(np.unique(filtered_ratings["userId"]), list(range(M))))
    user_inv_mapper = dict(zip(list(range(M)), np.unique(filtered_ratings["userId"])))

    # Map user and movie IDs to their matrix indices based on the filtered data
    user_index = [user_mapper[i] for i in filtered_ratings['userId']]
    movie_index = [movie_mapper[int(i)] for i in filtered_ratings['movieId']]  # Ensure integer movie ID mapping

    # Create the sparse matrix
    X = csr_matrix((filtered_ratings["rating"], (user_index, movie_index)), shape=(M, N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Example usage
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_cleaned_X(consolidated_movies_df)

# To inspect the filtered dataset
print(f"Original dataset size: {consolidated_movies_df.shape}")
print(f"Filtered dataset size: {X.shape}")


Original dataset size: (3529999, 5)
Filtered dataset size: (379, 3080)


In [22]:
# Check movie_mapper keys to ensure it contains only filtered movie IDs
print(movie_mapper.keys())

dict_keys([1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 31, 32, 34, 36, 39, 41, 42, 43, 44, 45, 46, 47, 48, 50, 52, 57, 58, 60, 61, 62, 63, 64, 65, 66, 69, 70, 73, 74, 76, 79, 81, 82, 85, 86, 88, 89, 92, 93, 94, 95, 97, 100, 101, 102, 104, 105, 107, 110, 111, 112, 122, 123, 125, 132, 135, 140, 141, 144, 145, 147, 150, 151, 153, 154, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 168, 169, 170, 172, 173, 174, 175, 176, 177, 180, 181, 185, 186, 188, 193, 194, 195, 196, 198, 199, 203, 204, 207, 208, 213, 215, 216, 218, 222, 223, 224, 225, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 239, 246, 247, 248, 249, 252, 253, 256, 257, 259, 260, 261, 262, 265, 266, 267, 272, 273, 276, 277, 280, 281, 282, 288, 289, 290, 292, 293, 296, 299, 300, 303, 305, 306, 307, 308, 314, 315, 316, 317, 318, 319, 322, 327, 328, 329, 332, 333, 337, 338, 339, 342, 344, 345, 347, 348, 349, 350, 351, 353, 355, 356, 357, 358, 360, 361, 362, 364, 365, 366, 

calculate the sparsity matrix

In [23]:
n_total = X.shape[0] * X.shape[1]  # Total entries
n_ratings = X.nnz                 # Non-zero entries
density = n_ratings / n_total     # Matrix density
sparsity = 1 - density            # Matrix sparsity

print(f"Matrix density: {round(density * 100, 2)}%")
print(f"Matrix sparsity: {round(sparsity * 100, 2)}%")

Matrix density: 33.85%
Matrix sparsity: 66.15%


In [24]:
    # Calculate CSR data
    csr_data = {
        "data": X.data,         # Non-zero values (ratings)
        "indices": X.indices,   # Column indices for non-zero values
        "indptr": X.indptr,     # Row pointers (starting index of each row in 'data')
        "shape": X.shape        # Shape of the matrix (rows, cols)
    }

# Print CSR data
print("CSR Matrix Data:")
print("Non-zero values (data):", csr_data["data"])
print("Column indices (indices):", csr_data["indices"])
print("Row pointers (indptr):", csr_data["indptr"])
print("Shape of the matrix:", csr_data["shape"])

CSR Matrix Data:
Non-zero values (data): [4. 3. 4. ... 4. 3. 4.]
Column indices (indices): [   0    1    2 ... 3077 3078 3079]
Row pointers (indptr): [     0   1927   3347   4499   5459   6261   7246   8235   8825   9570
  10445  11366  11996  12808  13856  14647  16123  16924  17952  18697
  19640  20790  22111  22862  23503  24687  25767  26731  27776  28655
  29517  30579  31676  32661  33556  34691  35839  36526  37350  38243
  39649  40731  41694  42655  43450  44233  45174  46142  47133  47764
  48092  48838  50082  51749  52634  53695  55383  56550  57458  58166
  58833  59732  60556  62092  62551  63384  64304  65195  66736  67886
  69148  70075  71194  72161  73147  74275  75212  76221  77174  78003
  78723  79520  80912  81824  82619  83424  84672  85523  86966  88088
  89174  90093  91030  92356  93162  94052  94804  96311  97435  98252
  99072  99743 101148 102160 103300 103835 104628 105677 106830 107848
 109002 109693 110287 111355 112904 115356 116179 117376 118157 11910

Movie recommendations with K Nearest Neighbours

In [25]:
#Fit the Nearest Neighbors model
# Assuming X is the sparse matrix from the earlier function
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')

# Fit the Nearest Neighbors model with the CSR matrix
knn.fit(X)  # Pass the entire sparse matrix X (not its components)

In [26]:
def find_similar_movies(movieId, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        movieId: id of the movie of interest
        X: user-item utility matrix
        movie_mapper: mapping from movie IDs to matrix indices
        movie_inv_mapper: mapping from matrix indices to movie IDs
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Output: returns list of k similar movie IDs
    """
    # Check if the movie_id exists in the movie_mapper
    if movieId not in movie_mapper:
        print(f"Movie ID {movieId} not found in movie_mapper. Skipping.")
        return []  # Or return some default behavior, like an empty list

    X = X.T  # Transpose X to align movies with rows (movies are columns in the original matrix)
    movie_ind = movie_mapper[movieId]
    movie_vec = X[movie_ind].reshape(1, -1)  # Reshape the movie vector for kNN
    
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    
    neighbours = kNN.kneighbors(movie_vec, return_distance=False)
    neighbour_indices = neighbours[0][1:]  # Exclude the movie itself (first neighbor is the movie itself)
    
    # Use the inverse mapping to convert the indices back to movie IDs
    neighbour_ids = [movie_inv_mapper[n] for n in neighbour_indices]
    
    return neighbour_ids



In [27]:

# Training, testing and evaluation of the model

import time
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

def predict_rating(userId, movieId, X_train, movie_mapper, user_mapper, k, metric='cosine'):
    """
    Predict the rating for a user and a movie based on similar movies.
    """
    movie_index = movie_mapper[movieId]
    user_index = user_mapper[userId]
    
    # Get k similar movies
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X_train.T)  # Fit on movie-item matrix
    neighbours = kNN.kneighbors(X_train.T[movie_index].reshape(1, -1), return_distance=False)
    
    # Predict the rating as the average rating of the k similar movies
    total_rating = 0
    count = 0
    for neighbour in neighbours[0][1:]:  # Exclude the movie itself
        if X_train[user_index, neighbour] > 0:  # Only consider rated movies
            total_rating += X_train[user_index, neighbour]
            count += 1
    return total_rating / count if count > 0 else 0  # Avoid division by zero

def calculate_rmse_mae(actuals, predictions):
    """
    Calculate RMSE and MAE for given actual and predicted ratings.
    """
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    return rmse, mae

def train_test_knn(X, movie_mapper, user_mapper, k, metric='cosine', test_size=0.25, threshold=3.0):
    """
    Train and test the kNN model and calculate RMSE and MAE for both train and test sets.
    """
    # Start the timer
    start_time = time.time()
    
    # Split the data into train and test sets
    train_data, test_data = train_test_split(X, test_size=test_size, random_state=42)
    
    # Initialize lists to store predictions and actual ratings
    train_predictions = []
    train_actuals = []
    test_predictions = []
    test_actuals = []

    # Train RMSE and MAE Calculation
    for userId in range(train_data.shape[0]):
        for movieId in range(train_data.shape[1]):
            if train_data[userId, movieId] > 0:  # Only consider rated movies in train set
                predicted_rating = predict_rating(userId, movieId, train_data, movie_mapper, user_mapper, k, metric)
                train_predictions.append(predicted_rating)
                train_actuals.append(train_data[userId, movieId])

    # Test RMSE and MAE Calculation
    for userId in range(test_data.shape[0]):
        for movieId in range(test_data.shape[1]):
            if test_data[userId, movieId] > 0:  # Only consider rated movies in test set
                predicted_rating = predict_rating(userId, movieId, train_data, movie_mapper, user_mapper, k, metric)
                test_predictions.append(predicted_rating)
                test_actuals.append(test_data[userId, movieId])

    # Calculate RMSE and MAE for train and test sets
    train_rmse, train_mae = calculate_rmse_mae(train_actuals, train_predictions)
    test_rmse, test_mae = calculate_rmse_mae(test_actuals, test_predictions)
    
    # End the timer
    end_time = time.time()
    
    # Print results
    print(f"Train RMSE: {train_rmse:.2f}")
    print(f"Train MAE: {train_mae:.2f}")
    print(f"Test RMSE: {test_rmse:.2f}")
    print(f"Test MAE: {test_mae:.2f}")
    print(f"Time taken for training, testing, and evaluation: {end_time - start_time:.2f} seconds")

# Example Usage:
# Replace these placeholders with your actual data
X = np.random.rand(100, 50) * 5  # Random user-item matrix with ratings from 0 to 5
movie_mapper = {i: i for i in range(X.shape[1])}  # Example mapping
user_mapper = {i: i for i in range(X.shape[0])}  # Example mapping

train_test_knn(X, movie_mapper, user_mapper, k=10, metric='cosine', test_size=0.25)

Train RMSE: 1.34
Train MAE: 1.14
Test RMSE: 1.54
Test MAE: 1.30
Time taken for training, testing, and evaluation: 8.68 seconds


In [28]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def find_similar_movies(movieId, X, movie_mapper, movie_inv_mapper, metric='cosine', k=10):
    """
    Finds k-nearest neighbours for a given movie id, including the cosine distance.
    
    Input:
    - movieId: ID of the movie for which we want to find similar movies
    - X: User-item matrix (n_users x n_movies)
    - movie_mapper: A dictionary mapping movie_id to movie index in the matrix
    - movie_inv_mapper: A dictionary mapping matrix indices back to movie ids
    - metric: The similarity metric to be used (default is 'cosine')
    - k: The number of similar movies to return
    
    Output:
    - A list of movie ids (excluding the movie itself) and their cosine distances
    """
    # Check if movie_id exists in movie_mapper
    if movieId not in movie_mapper:
        raise ValueError(f"Movie ID {movieId} not found in the movie_mapper")

    # Get the index of the movie in the matrix
    movie_ind = movie_mapper[movieId]
    movie_vec = X.T[movie_ind].reshape(1, -1)  # Reshape the movie vector for kNN
    
    # Use NearestNeighbors to find similar movies
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X.T)  # Fit on the transposed X (movies as rows)
    
    # Find the k nearest neighbors
    distances, neighbours = kNN.kneighbors(movie_vec, return_distance=True)
    
    # Exclude the movie itself (index 0 corresponds to the movie itself)
    similar_movies = [(movie_inv_mapper[n], distances[0][i+1]) for i, n in enumerate(neighbours[0][1:])]
    
    return similar_movies

# Example Usage:
movie_titles = dict(zip(consolidated_movies_df['movieId'], consolidated_movies_df['title']))
movieId = 36

# Check if movie_id exists in movie_mapper
if movieId in movie_mapper:
    # Find similar movies
    similar_movies = find_similar_movies(movieId, X, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
    movie_title = movie_titles[movieId]

    print(f"Because you watched {movie_title}:")

    # Display recommended movie titles with their cosine distance
    for movie, distance in similar_movies:
        print(f"Movie: {movie_titles[movie]} | Cosine Distance: {distance:.2f}")
else:
    print(f"Movie ID {movieId} is not available in the movie_mapper.")

Because you watched Dead Man Walking (1995):
Movie: Get Shorty (1995) | Cosine Distance: 0.17
Movie: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) | Cosine Distance: 0.18
Movie: Leaving Las Vegas (1995) | Cosine Distance: 0.20
Movie: Father of the Bride Part II (1995) | Cosine Distance: 0.21
Movie: Mighty Aphrodite (1995) | Cosine Distance: 0.21
Movie: Waiting to Exhale (1995) | Cosine Distance: 0.21
Movie: Dead Man Walking (1995) | Cosine Distance: 0.21
Movie: Postman, The (Postino, Il) (1994) | Cosine Distance: 0.21
Movie: Home for the Holidays (1995) | Cosine Distance: 0.22
Movie: Balto (1995) | Cosine Distance: 0.22
