In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

## Task1: Data pre-processing

In [32]:
google_colab = False

local_root = 'Project2-data'
colab_root = '/content/drive/MyDrive/Colab Notebooks/Project2-data/'
root = colab_root if google_colab else local_root

# Load data
users = pd.read_csv(f'{root}/users.txt', sep=' ', names=['user_id'], header=None)
movies = pd.read_csv(f'{root}/movie_titles.txt', names=['movie_id', 'year', 'title'], header=None, on_bad_lines='skip')
movie_ids = movies['movie_id']

ratings_train = pd.read_csv(f'{root}/netflix_train.txt', sep=' ', names=['user_id', 'movie_id', 'rating', 'date'], header=None)
ratings_test = pd.read_csv(f'{root}/netflix_test.txt', sep=' ', names=['user_id', 'movie_id', 'rating', 'date'], header=None)

# Create matrix
matrix = ratings_train.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# Convert to numpy matrix for computation
vector_matrix = matrix.values

In [33]:
# Map user and movie IDs to matrix indices
user_id_to_index = {user_id: idx for idx, user_id in enumerate(matrix.index)}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(matrix.columns)}

# Task2: Collaborative filtering

In [34]:
def compute_user_similarities(matrix):
    norm_matrix = np.linalg.norm(matrix, axis=1)
    similarity_matrix = np.dot(matrix, matrix.T) / (norm_matrix[:, None] * norm_matrix)
    
    # Avoid division by zero
    similarity_matrix[np.isnan(similarity_matrix)] = 0
    return similarity_matrix

In [35]:
def userCF(matrix, user_id, movie_id, similarity_matrix, filter_only_rated=True):
    # Map user_id and movie_id to matrix indices
    user_index = user_id_to_index.get(user_id, None)
    movie_index = movie_id_to_index.get(movie_id, None)

    user_ratings = matrix.iloc[user_index].to_numpy()

    # Similar users to the target user
    similarities = similarity_matrix[user_index]

    # Only consider users who have rated the movie
    if filter_only_rated:
        rated_users = user_ratings != 0
        similarities = similarities[rated_users]
        user_ratings = user_ratings[rated_users]
    
    # Compute weighted sum of ratings for the movie
    numerator = np.dot(similarities, user_ratings)
    denominator = np.sum(np.abs(similarities))  # Sum of similarities

    if denominator == 0:
        return 0  # If no similar users, return 0 (or can return the average rating)
    return numerator / denominator

In [36]:
def userCF_rmse(similarity_matrix, matrix, ratings_test):
    # Create a matrix for the predicted ratings
    predicted_ratings = np.zeros_like(matrix.values)
    
    # Get the non-zero entries (rated items) in the test set
    test_ratings = ratings_test[['user_id', 'movie_id', 'rating']]
    
    # Map user and movie IDs to matrix indices
    user_index_test = test_ratings['user_id'].map(user_id_to_index)
    movie_index_test = test_ratings['movie_id'].map(movie_id_to_index)
    
    # Filter out rows where user or movie is not found in the matrix
    valid_test_ratings = test_ratings[~user_index_test.isna() & ~movie_index_test.isna()]
    user_index_test = user_index_test[valid_test_ratings.index]
    movie_index_test = movie_index_test[valid_test_ratings.index]
    actual_ratings = valid_test_ratings['rating'].values
    
    # Get predicted ratings for test data
    predicted_ratings = np.array([
        userCF(matrix, user_id, movie_id, similarity_matrix, filter_only_rated=True)
        for user_id, movie_id in zip(valid_test_ratings['user_id'], valid_test_ratings['movie_id'])
    ])
    
    # Compute RMSE
    rmse = np.sqrt(np.mean((predicted_ratings - actual_ratings) ** 2))
    return rmse


In [37]:
def compare_predictions(predictions=10, similarity_matrix=None):
    print(f"\nPredicting {predictions} random values")

    for i in range(predictions):
        actual_rating = 0

        while(actual_rating == 0):
            movie_index = random.randint(0, len(matrix.columns) - 1)
            user_index = random.randint(0, len(matrix.index) - 1)
            movie_id = matrix.columns[movie_index]
            user_id = matrix.index[user_index]
            actual_rating = matrix.iloc[user_index, movie_index]

        predicted_rating = userCF(matrix, user_id, movie_id, similarity_matrix, filter_only_rated=True)
        actual_rating = matrix.iloc[user_index, movie_index]
        print(f"user {user_id}, movie {movie_id}: {predicted_rating:.1f} (actual: {actual_rating:.1f})")

In [38]:
print("Computing user similarities")
similarity_matrix = compute_user_similarities(matrix)
print("Done computing similarities :) \n")

Computing user similarities
Done computing similarities :) 



In [39]:
# Evaluate RMSE for a specific test set size
print("Computing RMSE")
test_rmse = userCF_rmse(similarity_matrix=similarity_matrix, matrix=matrix, ratings_test=ratings_test)
print(f"Test RMSE: {test_rmse}")

Computing RMSE
Test RMSE: 0.9881447545647434


In [40]:
# We will now test 10 random values and see what is predicted
compare_predictions(predictions=10, similarity_matrix=similarity_matrix)


Predicting 10 random values
user 2345454, movie 1661: 3.3 (actual: 3.0)
user 1028463, movie 2068: 3.6 (actual: 5.0)
user 1658098, movie 3463: 3.1 (actual: 3.0)
user 743598, movie 2128: 2.3 (actual: 3.0)
user 2578830, movie 6042: 3.2 (actual: 3.0)
user 1886585, movie 7510: 3.1 (actual: 3.0)
user 780253, movie 4577: 3.1 (actual: 2.0)
user 1094019, movie 9189: 4.2 (actual: 5.0)
user 1419139, movie 6196: 3.0 (actual: 2.0)
user 981753, movie 438: 3.4 (actual: 3.0)


# Task3: Matrix Decomposition Algorithm

In [41]:
def matrix_decomposition(X_train, X_test, k=50, lambda_=0.5, alpha=1e-3, max_iter=100, tolerance=1e-4):
    print("Test")

In [42]:
def calculate_RMSE(X_pred, X_test):
    print("Test")

In [43]:
def check_predictions(X_pred, X_test, n=5):
     print("Test")

In [44]:
print("Computing matrix decomposition")

Computing matrix decomposition
