In [1]:
import pandas as pd

Importing the data needed to make the recommender

In [3]:
# Per-movie feature vectors, indexed by movieId (used later for content-based similarity).
movie_features = pd.read_csv('movies_preprocessed.csv', index_col='movieId')
# Long-format user reviews; pivoted into a user x movie matrix further down.
reviews200 = pd.read_csv('user_reviews_200moviesplus.csv')
# Only the title column is needed here, keyed by movieId, for printing readable results.
movie_titles = pd.read_csv('clean_movies_id_appended.csv', usecols=['movieId', 'original_title'], index_col='movieId')

FileNotFoundError: [Errno 2] No such file or directory: 'movies_preprocessed_unstandardized.csv'

Creating a function to make a matrix where every row is a user and every column is a movie. Then replacing null values for unseen movies with 0's

In [None]:
def create_user_review_matrix(user_reviews, user_id):
    """Pivot long-format reviews into a dense user x movie rating matrix.

    Parameters
    ----------
    user_reviews : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating' columns.
    user_id : hashable
        A user that is required to be present in the resulting matrix.

    Returns
    -------
    pandas.DataFrame
        One row per user and one column per movie; missing ratings are 0.

    Raises
    ------
    ValueError
        If ``user_id`` is not a row label of the pivoted matrix.
    """
    pivoted = user_reviews.pivot(index='userId', columns='movieId', values='rating')
    dense_matrix = pivoted.fillna(0)  # 0 marks "this user has not rated this movie"

    if user_id in dense_matrix.index:
        return dense_matrix

    raise ValueError("User ID not found in review matrix")

Creating the user-review matrix using only users with over 200 movies reviewed. Originally I wanted to use a larger sample, but decided to scale down the size of the data. 

In [None]:
# Build the user x movie matrix from the 200+-review users; user 3 must be present.
user_review_matrix = create_user_review_matrix(reviews200, 3)

# Keep only the movies that appear as columns of the user-review matrix.
movie_features = movie_features.loc[movie_features.index.isin(user_review_matrix.columns)]
movie_titles = movie_titles.loc[movie_titles.index.isin(user_review_matrix.columns)]

Next, I wanted to find the most similar users to a given user.

In [None]:
from sklearn.neighbors import NearestNeighbors

This function will return the k-nearest users to a given user, based on cosine similarity

In [None]:
def find_similar_users(review_matrix, user_vector, k=25):
    """Return the k users most similar to ``user_vector`` by cosine distance.

    Parameters
    ----------
    review_matrix : pandas.DataFrame
        User x movie rating matrix; its index supplies the returned user ids.
    user_vector : array-like of shape (1, n_movies)
        Rating vector of the query user, with the same column order as
        ``review_matrix``.
    k : int, default 25
        Number of neighbours to return. Fewer are returned when the matrix
        does not hold enough rows.

    Returns
    -------
    nearest_neighbors : list
        Index labels of the neighbouring users, closest first.
    neighbor_distances : numpy.ndarray
        Cosine distances matching ``nearest_neighbors``.
    """
    # One extra neighbour is requested so the query's own row can be discarded,
    # but never more than the number of fitted rows (kneighbors raises otherwise).
    n_query = min(k + 1, len(review_matrix))

    knn = NearestNeighbors(n_neighbors=n_query, metric='cosine', n_jobs=-1)
    knn.fit(review_matrix)
    distances, indices = knn.kneighbors(user_vector, n_neighbors=n_query)
    row_distances, row_positions = distances[0], indices[0]

    # Discard the closest hit only when it is the query itself (distance ~ 0).
    # For a user who is not a row of review_matrix the closest hit is a genuine
    # neighbour and must be kept.
    self_tolerance = 1e-6
    start = 1 if row_distances[0] <= self_tolerance else 0
    stop = start + k

    nearest_neighbors = [review_matrix.index[pos] for pos in row_positions[start:stop]]
    neighbor_distances = row_distances[start:stop]
    return nearest_neighbors, neighbor_distances

Testing this function on a user

In [None]:
import numpy as np

# User 3's rating row as a (1, n_movies) query array
user = np.asarray(user_review_matrix.loc[3]).reshape(1, -1)

neighbors, distances = find_similar_users(review_matrix=user_review_matrix, user_vector=user)
print(neighbors)
print(distances)

Next I needed to create a way to get a user's predicted rating for a given movie

In [None]:
def weighted_predict_ratings(movie_id, review_matrix, neighbors, distances, epsilon=0.001):
    """Predict a rating for ``movie_id`` as a distance-weighted neighbour average.

    Parameters
    ----------
    movie_id : hashable
        Column label of the movie in ``review_matrix``.
    review_matrix : pandas.DataFrame
        User x movie rating matrix in which 0 means "not rated".
    neighbors : sequence
        Row labels of the neighbouring users.
    distances : array-like
        Distance of each neighbour, positionally aligned with ``neighbors``.
        Any sequence is accepted, not only a NumPy array.
    epsilon : float, default 0.001
        Added to the squared distance so a zero distance does not divide by zero.

    Returns
    -------
    float
        Weighted average of the neighbours' ratings, or NaN when the movie is
        unknown or none of the neighbours has rated it.
    """
    if movie_id not in review_matrix.columns:
        return np.nan

    neighbors = list(neighbors)
    if not neighbors:
        return np.nan

    # A single vectorized lookup of every neighbour's rating for this movie
    neighbor_ratings = review_matrix.loc[neighbors, movie_id].to_numpy()

    # Positions (within `neighbors`) of the users who actually rated the movie
    rated_positions = np.flatnonzero(neighbor_ratings > 0)
    if rated_positions.size == 0:
        return np.nan

    ratings = neighbor_ratings[rated_positions]
    valid_distances = np.asarray(distances, dtype=float)[rated_positions]

    # Inverse squared distance: a smaller distance gives a larger weight
    weights = 1 / (valid_distances ** 2 + epsilon)

    return np.sum(ratings * weights) / np.sum(weights)

Testing this on the user

In [None]:
# Predicted rating of movie 1 for the user whose neighbours were found above
predicted_rating = weighted_predict_ratings(
    movie_id=1,
    review_matrix=user_review_matrix,
    neighbors=neighbors,
    distances=distances,
)
print(predicted_rating)

I also wanted to create a way to find similar movies, which is accomplished with this function. It finds the most similar neighbors to a given movie based on the vector of its attributes. It returns the movie ids.

In [None]:
def find_similar_movies(movie_vector, movies_df, movies_titles, num_recommendations=5):
    """Return the ids of the movies whose feature vectors are closest to ``movie_vector``.

    Parameters
    ----------
    movie_vector : array-like of shape (1, n_features)
        Feature vector to search around.
    movies_df : pandas.DataFrame
        Movie feature matrix indexed by movie id; the model is fitted on it.
    movies_titles : pandas.DataFrame
        Kept for interface compatibility. Ids are read from ``movies_df.index``
        because the neighbour positions refer to rows of ``movies_df``.
    num_recommendations : int, default 5
        Number of movie ids to return. Fewer are returned when ``movies_df``
        does not hold enough rows.

    Returns
    -------
    list
        Movie ids (index labels of ``movies_df``), closest first.
    """
    # Never request more neighbours than fitted rows (kneighbors raises otherwise).
    n_query = min(num_recommendations + 1, len(movies_df))

    knn = NearestNeighbors(n_neighbors=n_query, metric='cosine', n_jobs=-1)
    knn.fit(movies_df)
    distances, indices = knn.kneighbors(movie_vector, n_neighbors=n_query)
    row_distances, row_positions = distances[0], indices[0]

    # Discard the closest hit only when it is the query movie itself (distance ~ 0).
    # A composite vector that is not a row of movies_df keeps its true nearest movie.
    self_tolerance = 1e-6
    start = 1 if row_distances[0] <= self_tolerance else 0
    kept_positions = row_positions[start:start + num_recommendations]

    # Positions index rows of movies_df, so the ids must come from movies_df as well.
    similar_movies = movies_df.index[kept_positions].tolist()

    return similar_movies

Testing this on movie ID one, which is Toy Story

In [None]:
# Feature vector of movie 1 as a (1, n_features) query array
toy_story = np.asarray(movie_features.loc[1]).reshape(1, -1)

similar_ids = find_similar_movies(
    movie_vector=toy_story,
    movies_df=movie_features,
    movies_titles=movie_titles,
    num_recommendations=5,
)

for i in similar_ids:
    print(movie_titles.loc[i, 'original_title'])

Next, I needed to create a version of the recommender that allows me to test predicted ratings against actual ratings of new users.

In [None]:
def test_recommend(user_reviews, movies_df, movie_titles, review_matrix, user_id, test_users_matrix):
    """Recommend movies for one user and collect (true, predicted) rating pairs.

    Parameters
    ----------
    user_reviews : pandas.Series
        The user's known ratings indexed by movie id (0 = not rated).
    movies_df : pandas.DataFrame
        Movie feature matrix indexed by movie id.
    movie_titles : pandas.DataFrame
        Passed through to ``find_similar_movies``.
    review_matrix : pandas.DataFrame
        User x movie rating matrix used to find neighbours and fallback means.
    user_id : hashable
        Row label of this user in ``test_users_matrix``.
    test_users_matrix : pandas.DataFrame
        Matrix holding the user's full (un-split) ratings, used as ground truth.

    Returns
    -------
    (list, list) or (None, None)
        True and predicted ratings for recommended movies that have a
        ground-truth rating; ``(None, None)`` when the user has no usable
        favourite movie (rating >= 4).
    """
    true_ratings = []
    predicted_ratings = []

    # Movies the user has already rated in the data handed to the recommender
    reviewed_movies = set(user_reviews[user_reviews > 0].index)

    # Only movies rated >= 4 drive the content-based search. They must exist both
    # in the review matrix and in the feature table, otherwise .loc would raise.
    favorite_ids = user_reviews[user_reviews >= 4].index
    usable = favorite_ids.isin(review_matrix.columns) & favorite_ids.isin(movies_df.index)
    valid_favorite_ids = favorite_ids[usable]

    if len(valid_favorite_ids) == 0:
        return None, None

    # Average feature vector of the liked movies
    composite_vector = movies_df.loc[valid_favorite_ids].mean(axis=0).to_numpy().reshape(1, -1)

    user_ratings = user_reviews.to_frame().T

    similar_users, user_distances = find_similar_users(review_matrix, user_ratings)
    similar_movies = find_similar_movies(composite_vector, movies_df, movie_titles, 10)

    for movie in similar_movies:
        if movie in reviewed_movies:
            continue  # a rating is already known for this movie

        # Check for ground truth first so no prediction work is wasted.
        true_rating = test_users_matrix.loc[user_id, movie]
        if not true_rating > 0:
            continue

        rating = weighted_predict_ratings(movie, review_matrix, similar_users, user_distances)

        if pd.isna(rating) and movie in review_matrix.columns:
            # No neighbour rated it: fall back to the movie's mean rating
            movie_column = review_matrix[movie]
            rating = movie_column[movie_column > 0].mean()

        if pd.isna(rating):
            continue  # still no prediction; a NaN would break the error metrics

        true_ratings.append(true_rating)
        predicted_ratings.append(rating)

    return true_ratings, predicted_ratings

To test the recommender, I created several test groups of users from the large original review dataset.

These sample sets contain users with 10, 20, 50, and 100 reviews of movies.

In [None]:
# One review sample per cohort; each is pivoted with create_user_review_matrix below
# and later read through its 'movieId' and 'rating' columns.
reviews_test_10 = pd.read_csv('user_reviews_10_reviews.csv')
reviews_test_20 = pd.read_csv('user_reviews_20_reviews.csv')
reviews_test_50 = pd.read_csv('user_reviews_50_reviews.csv')
reviews_test_100 = pd.read_csv('user_reviews_100_reviews.csv')

Creating matrices out of these data samples, reindexing to account for movies missing from these reviews

In [None]:
# The second argument is a user id that create_user_review_matrix requires to be present
# in the pivoted matrix (it raises ValueError otherwise).
# reindex aligns the columns with user_review_matrix: movies missing from a sample become
# all-zero columns, and movies that are not in user_review_matrix are dropped.
test_users_matrix_10 = create_user_review_matrix(reviews_test_10, 182).reindex(columns=user_review_matrix.columns, fill_value=0)
test_users_matrix_20 = create_user_review_matrix(reviews_test_20, 24).reindex(columns=user_review_matrix.columns, fill_value=0)
test_users_matrix_50 = create_user_review_matrix(reviews_test_50, 99).reindex(columns=user_review_matrix.columns, fill_value=0)
test_users_matrix_100 = create_user_review_matrix(reviews_test_100, 8).reindex(columns=user_review_matrix.columns, fill_value=0)

I needed to create a function to split the user-review matrices into training and testing segments

In [None]:
from random import sample

def train_test_split_user(user_ratings, test_ratio= 0.2, seed=None):
    """Split one user's ratings into disjoint train and test rating vectors.

    Parameters
    ----------
    user_ratings : pandas.Series
        Ratings indexed by movie id; values > 0 count as rated.
    test_ratio : float, default 0.2
        Fraction of the rated movies to hold out. At least one movie is held
        out whenever the user has rated anything.
    seed : int or None, default None
        Seed for a private random generator so the split is reproducible.
        ``None`` uses the module-level ``random`` state.

    Returns
    -------
    train_ratings, test_ratings : numpy.ndarray of shape (1, n_movies)
        ``train_ratings`` has the held-out movies set to 0; ``test_ratings``
        keeps only the held-out movies. Both are all-zero copies of the input
        when the user has no rated movies.
    """
    import random

    reviewed_movies = user_ratings[user_ratings > 0].index.tolist()

    rng = random.Random(seed) if seed is not None else random

    if reviewed_movies:
        n_test = max(1, int(len(reviewed_movies) * test_ratio))
        # Never ask for more movies than exist (sample would raise)
        test_movies = rng.sample(reviewed_movies, min(n_test, len(reviewed_movies)))
    else:
        # Nothing rated: there is nothing to hold out
        test_movies = []

    train_ratings = user_ratings.copy()
    test_ratings = user_ratings.copy()

    if test_movies:
        train_ratings.loc[test_movies] = 0

    test_ratings.loc[~test_ratings.index.isin(test_movies)] = 0

    train_ratings = train_ratings.to_numpy().reshape(1, -1)
    test_ratings = test_ratings.to_numpy().reshape(1, -1)

    return train_ratings, test_ratings

I also needed a way to test the RMSE on the tested samples

In [None]:
from tqdm import tqdm   #This enables a progress bar, not necessary, but this can take a while to run

def testing_rmse(test_users_matrix):
    """Collect true and predicted ratings for every user of a test matrix.

    Each user's ratings are split with ``train_test_split_user``; the training
    part is passed to ``test_recommend`` together with the notebook-level
    ``movie_features``, ``movie_titles`` and ``user_review_matrix``.

    Parameters
    ----------
    test_users_matrix : pandas.DataFrame
        User x movie rating matrix of the test users.

    Returns
    -------
    (list, list, list)
        True ratings, predicted ratings, and the user id each pair belongs to.
    """
    all_true_ratings = []
    all_predicted_ratings = []
    user_ids = []

    for i in tqdm(test_users_matrix.index, desc="Testing RMSE", unit="user"):
        user_row = test_users_matrix.loc[i]

        # Hide part of the user's ratings so there is something to predict
        train_user, _ = train_test_split_user(user_row)
        train_user = pd.Series(train_user[0], index=user_row.index)

        tr, pr = test_recommend(train_user, movie_features, movie_titles, user_review_matrix, i, test_users_matrix)

        if tr is None:
            continue  # the user had no usable favourite movies

        all_true_ratings.extend(tr)
        all_predicted_ratings.extend(pr)
        user_ids.extend([i] * len(tr))

    return all_true_ratings, all_predicted_ratings, user_ids

Testing the RMSE on users with 10 movie reviews

This returns the RMSE and MAE computed on movies that would be recommended to the user and that the user has actually rated, but whose ratings were held out of the data used to generate the recommendations

In [None]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)    #Suppressing this warning: X does not have valid feature names, but NearestNeighbors was fitted with feature names

# True ratings, predicted ratings and the owning user id for every scored recommendation
tr, pr, uid = testing_rmse(test_users_matrix_10)

# Error of the model's predictions against the users' actual ratings
rmse = root_mean_squared_error(tr, pr)
mae = mean_absolute_error(tr, pr)
print(f'RMSE for 10 review users: {rmse}')
print(f'MAE for 10 review users: {mae}')

This function computes each movie's average rating (ignoring users who have not rated it). Using that average as the predicted rating gives a baseline to compare against our model

In [None]:
def mean_rating(user_review_matrix):
    """Compute every movie's mean rating over the users who rated it.

    Parameters
    ----------
    user_review_matrix : pandas.DataFrame
        User x movie rating matrix in which 0 means "not rated".

    Returns
    -------
    dict
        Maps each column label (movie id) to the mean of its ratings > 0;
        the value is NaN for a movie that nobody rated.
    """
    mean_ratings = {}

    for movie_id in tqdm(user_review_matrix.columns, desc="Calculating mean ratings", unit="movie"):
        # Filter the single column instead of row-filtering the whole frame
        movie_column = user_review_matrix[movie_id]
        mean_ratings[movie_id] = movie_column[movie_column > 0].mean()

    return mean_ratings

In [None]:
# Dict of movieId -> mean rating among the users of user_review_matrix; used as the baseline
mean_ratings = mean_rating(user_review_matrix)

Adding the mean rating to the reviews dataframe and finding the RMSE and MAE

In [None]:
# Baseline prediction: the movie's mean rating taken from mean_ratings
reviews_test_10['mean_rating'] = reviews_test_10['movieId'].map(mean_ratings)

# Movies without a mean rating have no baseline prediction. Score only the rows that
# have one: filling with 0 (this notebook's "not rated" marker) would inflate the error.
baseline_10 = reviews_test_10.dropna(subset=['mean_rating'])

rmse = root_mean_squared_error(baseline_10['rating'], baseline_10['mean_rating'])
mae = mean_absolute_error(baseline_10['rating'], baseline_10['mean_rating'])
print(f'RMSE for 10 review users: {rmse}')
print(f'MAE for 10 review users: {mae}')

The RMSE obtained from our model's predictions is an improvement over the baseline of average movie rating

Testing the RMSE on users with 20 reviews

In [None]:
# True ratings, predicted ratings and the owning user id for every scored recommendation
tr_20, pr_20, uid_20 = testing_rmse(test_users_matrix_20)

# Error of the model's predictions against the users' actual ratings
rmse = root_mean_squared_error(tr_20, pr_20)
mae = mean_absolute_error(tr_20, pr_20)
print(f'RMSE for 20 review users: {rmse}')
print(f'MAE for 20 review users: {mae}')

Comparing to the baseline of average movie rating

In [None]:
# Baseline prediction: the movie's mean rating taken from mean_ratings
reviews_test_20['mean_rating'] = reviews_test_20['movieId'].map(mean_ratings)

# Movies without a mean rating have no baseline prediction. Score only the rows that
# have one: filling with 0 (this notebook's "not rated" marker) would inflate the error.
baseline_20 = reviews_test_20.dropna(subset=['mean_rating'])

rmse = root_mean_squared_error(baseline_20['rating'], baseline_20['mean_rating'])
mae = mean_absolute_error(baseline_20['rating'], baseline_20['mean_rating'])
print(f'RMSE for 20 review users: {rmse}')
print(f'MAE for 20 review users: {mae}')

Testing the RMSE on users with 50 reviews

In [None]:
# True ratings, predicted ratings and the owning user id for every scored recommendation
tr_50, pr_50, uid_50 = testing_rmse(test_users_matrix_50)

# Error of the model's predictions against the users' actual ratings
rmse = root_mean_squared_error(tr_50, pr_50)
mae = mean_absolute_error(tr_50, pr_50)
print(f'RMSE for 50 review users: {rmse}')
print(f'MAE for 50 review users: {mae}')

Comparing to the baseline of average movie ratings

In [None]:
# Baseline prediction: the movie's mean rating taken from mean_ratings
reviews_test_50['mean_rating'] = reviews_test_50['movieId'].map(mean_ratings)

# Movies without a mean rating have no baseline prediction. Score only the rows that
# have one: filling with 0 (this notebook's "not rated" marker) would inflate the error.
baseline_50 = reviews_test_50.dropna(subset=['mean_rating'])

rmse = root_mean_squared_error(baseline_50['rating'], baseline_50['mean_rating'])
mae = mean_absolute_error(baseline_50['rating'], baseline_50['mean_rating'])
print(f'RMSE for 50 review users: {rmse}')
print(f'MAE for 50 review users: {mae}')

Testing the RMSE on users with 100 reviews

In [None]:
# True ratings, predicted ratings and the owning user id for every scored recommendation
tr_100, pr_100, uid_100 = testing_rmse(test_users_matrix_100)

# Error of the model's predictions against the users' actual ratings
rmse = root_mean_squared_error(tr_100, pr_100)
mae = mean_absolute_error(tr_100, pr_100)
print(f'RMSE for 100 review users: {rmse}')
print(f'MAE for 100 review users: {mae}')

Comparing to the baseline of average movie ratings

In [None]:
# Baseline prediction: the movie's mean rating taken from mean_ratings
reviews_test_100['mean_rating'] = reviews_test_100['movieId'].map(mean_ratings)

# Movies without a mean rating have no baseline prediction. Score only the rows that
# have one: filling with 0 (this notebook's "not rated" marker) would inflate the error.
baseline_100 = reviews_test_100.dropna(subset=['mean_rating'])

rmse = root_mean_squared_error(baseline_100['rating'], baseline_100['mean_rating'])
mae = mean_absolute_error(baseline_100['rating'], baseline_100['mean_rating'])
# Labels previously said "20 review users" although this cell scores the 100-review cohort
print(f'RMSE for 100 review users: {rmse}')
print(f'MAE for 100 review users: {mae}')

After running these tests multiple times, we can see that the model consistently out-performs the baseline. Also, the RMSE decreases as the user reviews more movies, which is as expected! There is a point of diminishing returns when a user has reviewed ~50 movies.