In [1]:
import numpy as np
import pandas as pd
from google.colab import drive

# Attach Google Drive so the ratings file is reachable from the Colab runtime
drive.mount('/content/drive')

# Read the ratings CSV from the mounted drive into a DataFrame
file_path = '/content/drive/My Drive/NUS/ratings.csv'
ratings = pd.read_csv(file_path)

# Preview the first five rows
display(ratings.head(5))


Mounted at /content/drive


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [2]:
# Task 3a - Build the user-item (utility) matrix: one row per userId, one column per movieId,
# each cell holding that user's rating for that movie (NaN where no rating exists).
user_item_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId',
)
display(user_item_matrix.head())


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Task 3b - Calculate the sparsity of the user-item matrix and explain the results.
num_users, num_movies = user_item_matrix.shape

# Count the cells that actually hold a rating. This is taken from the matrix itself
# rather than from the number of rows in `ratings`: pivot_table averages duplicate
# (userId, movieId) pairs into one cell and ignores NaN ratings, so the raw row count
# can overstate how many cells are filled.
num_ratings = int(user_item_matrix.count().sum())

# Sparsity = Ratio of missing entries to total entries
total_cells = num_users * num_movies
sparsity = 1 - (num_ratings / total_cells) if total_cells else float("nan")

# A value close to 100% means almost every (user, movie) cell is empty, i.e. each user
# has rated only a small fraction of the available movies.
print(f"Sparsity of the user-item matrix: {sparsity:.2%}")


Sparsity of the user-item matrix: 98.30%


In [4]:
# Task 3c - Use the k-nearest neighbours algorithm and matrix factorization to recommend movies based on user ratings. Implement a function to recommend the top 5 movies for a given user based on their rating history.
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

# Function to recommend top 5 movies for a given user
def recommend_movies(user_id, user_item_matrix, n_neighbors=5, n_components=20, top_n=5, random_state=42):
    """Recommend movies by averaging the ratings of a user's nearest neighbours.

    Users are embedded with TruncatedSVD (missing ratings treated as 0), the
    nearest users are found by cosine distance in that latent space, and each
    movie is scored by the mean rating those neighbours gave it.

    Note: movies the target user has already rated are not filtered out.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``user_item_matrix.index``.
    user_item_matrix : pandas.DataFrame
        Users as rows, movies as columns, NaN where no rating exists.
    n_neighbors : int, default 5
        Number of other users to average over.
    n_components : int, default 20
        Number of latent factors for the SVD.
    top_n : int, default 5
        Maximum number of movies to return.
    random_state : int or None, default 42
        Seed for TruncatedSVD so repeated calls give the same result.

    Returns
    -------
    pandas.Series
        Mean neighbour rating indexed by movie column label, sorted in
        descending order, at most ``top_n`` entries. Movies that none of the
        neighbours rated are omitted.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``user_item_matrix.index``.
    """
    # Look the user up by label. Using `user_id - 1` as a row position is only
    # correct when the index is exactly 1..N in order.
    user_pos = user_item_matrix.index.get_loc(user_id)

    # Fill NaN values with 0
    matrix_filled = user_item_matrix.fillna(0)

    # Perform matrix factorization using SVD (seeded for reproducibility)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    latent_matrix = svd.fit_transform(matrix_filled)

    # Fit the k-nearest neighbors model on the latent factors
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(latent_matrix)

    # Ask for one extra neighbour (the user itself is normally returned too),
    # but never more than the number of users available.
    k = min(n_neighbors + 1, latent_matrix.shape[0])
    user_latent_factors = latent_matrix[user_pos].reshape(1, -1)
    _, indices = model_knn.kneighbors(user_latent_factors, n_neighbors=k)

    # Drop the user's own row explicitly instead of assuming it is ranked first;
    # with tied distances another user can occupy position 0.
    similar_users = [pos for pos in indices.flatten() if pos != user_pos][:n_neighbors]

    # Aggregate ratings from similar users; movies nobody rated come out as NaN
    neighbour_means = user_item_matrix.iloc[similar_users].mean(axis=0)

    # Discard unscored movies, rank by aggregated rating and keep the best ones
    return neighbour_means.dropna().sort_values(ascending=False).head(top_n)

# Demo: run the neighbour-based recommender for a single user and show the result
user_id = 1
recommended_movies = recommend_movies(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
)
print(f"Top 5 recommended movies for user {user_id}:")
print(recommended_movies)


Top 5 recommended movies for user 1:
movieId
4915    5.0
2064    5.0
2116    5.0
7153    5.0
2117    5.0
dtype: float64


In [5]:
# Task 3d - Handle the cold-start problem for new users by using matrix factorization and item-based collaborative filtering to recommend movies similar to those the user has already rated.

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Function to recommend movies for new users based on matrix factorization and item-based collaborative filtering
def recommend_movies_for_new_user(rated_movies, user_item_matrix, top_n=5, n_components=20, random_state=42):
    """Recommend movies similar to the ones a new user has already rated.

    Each movie is embedded via the TruncatedSVD components of the user-item
    matrix (missing ratings treated as 0). Every movie not in ``rated_movies``
    is scored by its mean cosine similarity to the rated movies.

    Parameters
    ----------
    rated_movies : sequence
        Movie column labels the user has rated. Labels that are not columns of
        ``user_item_matrix`` are ignored; duplicates are counted once.
    user_item_matrix : pandas.DataFrame
        Users as rows, movies as columns, NaN where no rating exists.
        Column labels are assumed to be unique (as produced by a pivot).
    top_n : int, default 5
        Maximum number of movies to return.
    n_components : int, default 20
        Number of latent factors for the SVD.
    random_state : int or None, default 42
        Seed for TruncatedSVD so repeated calls give the same result.

    Returns
    -------
    pandas.Series
        Mean cosine similarity (not a predicted rating) indexed by movie
        column label, sorted in descending order, at most ``top_n`` entries.
        Empty when none of ``rated_movies`` is a column of the matrix.
    """
    movie_ids = user_item_matrix.columns

    # Translate the rated movie labels into column positions; get_indexer
    # returns -1 for labels that are not in the matrix, which are discarded.
    seed_positions = movie_ids.get_indexer(pd.Index(rated_movies).unique())
    seed_positions = seed_positions[seed_positions >= 0]
    if len(seed_positions) == 0:
        # Nothing to compare against, so there is no meaningful score to return
        return pd.Series(dtype=float)

    # Fill NaN values with 0
    matrix_filled = user_item_matrix.fillna(0)

    # Perform matrix factorization using SVD (seeded for reproducibility)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(matrix_filled)

    # One row of latent factors per movie
    item_factors = svd.components_.T

    # Only the rows for the rated movies are needed, so compute a
    # (rated x all movies) similarity block instead of the full item-item matrix.
    seed_similarity = cosine_similarity(item_factors[seed_positions], item_factors)

    # Score every movie by its mean similarity to the rated movies,
    # then remove the rated movies themselves from the candidates.
    scores = pd.Series(seed_similarity.mean(axis=0), index=movie_ids)
    scores = scores.drop(movie_ids[seed_positions])

    # Sort movies by score and return the top N
    return scores.sort_values(ascending=False).head(top_n)

# Demo: a new user whose only rated movies are 1 and 3
rated_movies = [1, 3]
recommended_movies_new_user = recommend_movies_for_new_user(
    rated_movies=rated_movies,
    user_item_matrix=user_item_matrix,
)
print("Recommended movies for a new user:")
print(recommended_movies_new_user)


Recommended movies for a new user:
5       0.618434
1073    0.597697
304     0.593231
141     0.566775
788     0.555087
dtype: float64


In [7]:
# Task 4a - Evaluate the performance of the recommender systems using RMSE
from sklearn.metrics import mean_squared_error

# Function to calculate RMSE for multiple users
def calculate_average_rmse(user_item_matrix, recommend_func, user_ids, is_content_based=False):
    """Average the per-user RMSE between a recommender's scores and the true ratings.

    For each user, the recommender's output is aligned to the movies that user
    actually rated. Only movies present in both contribute to that user's RMSE;
    a user with no such movie is skipped.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        Users as rows, movies as columns, NaN where no rating exists.
    recommend_func : callable
        Called as ``recommend_func(x, user_item_matrix)`` and expected to
        return a Series of scores indexed by movie label. ``x`` is the user id,
        or the list of the user's rated movie labels when
        ``is_content_based`` is True.
    user_ids : iterable
        Row labels of the users to evaluate.
    is_content_based : bool, default False
        Selects which first argument ``recommend_func`` receives (see above).

    Returns
    -------
    float or None
        Mean of the per-user RMSE values, or None when no user produced any
        overlap between scored and rated movies.
    """
    per_user_rmse = []

    for user_id in user_ids:
        observed = user_item_matrix.loc[user_id].dropna()

        # The recommender is seeded either with the user id or with the full
        # list of movies this user rated.
        seed = observed.index.tolist() if is_content_based else user_id
        scores = recommend_func(seed, user_item_matrix)

        # Keep only movies that were both scored and actually rated
        scores = scores.reindex(observed.index).dropna()
        observed = observed.reindex(scores.index).dropna()

        # No overlap for this user: nothing to measure
        if scores.empty or observed.empty:
            continue

        per_user_rmse.append(np.sqrt(mean_squared_error(observed, scores)))

    return np.mean(per_user_rmse) if per_user_rmse else None

# Evaluate both recommenders on the first 10 users of the matrix
user_ids = user_item_matrix.index[:10]
average_rmse_collaborative = calculate_average_rmse(
    user_item_matrix=user_item_matrix,
    recommend_func=recommend_movies,
    user_ids=user_ids,
)
average_rmse_content = calculate_average_rmse(
    user_item_matrix=user_item_matrix,
    recommend_func=recommend_movies_for_new_user,
    user_ids=user_ids,
    is_content_based=True,
)

print(f"Average RMSE for collaborative filtering: {average_rmse_collaborative}")
print(f"Average RMSE for content-based filtering: {average_rmse_content}")


Average RMSE for collaborative filtering: 1.5189990231222075
Average RMSE for content-based filtering: None


In [12]:
# Task 4b - Test performance on a subset of the movie dataset
# Draw 30 user rows at random; the fixed seed keeps the same users on every run
subset_user_item_matrix = user_item_matrix.sample(30, random_state=42)

# Define a modified recommend_movies function for the subset
def recommend_movies_subset(user_id, matrix, n_neighbors=5, n_components=20, top_n=5, random_state=42):
    """Recommend movies from the nearest neighbours found inside ``matrix``.

    Users are embedded with TruncatedSVD (missing ratings treated as 0), the
    nearest users are found by cosine distance in that latent space, and each
    movie is scored by the mean rating those neighbours gave it. The user is
    located by index label, so ``matrix`` may be any subset of users.

    Note: movies the target user has already rated are not filtered out.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``matrix.index``.
    matrix : pandas.DataFrame
        Users as rows, movies as columns, NaN where no rating exists.
    n_neighbors : int, default 5
        Number of other users to average over.
    n_components : int, default 20
        Number of latent factors for the SVD.
    top_n : int, default 5
        Maximum number of movies to return.
    random_state : int or None, default 42
        Seed for TruncatedSVD so repeated calls give the same result.

    Returns
    -------
    pandas.Series
        Mean neighbour rating indexed by movie column label, sorted in
        descending order, at most ``top_n`` entries. Movies that none of the
        neighbours rated are omitted.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``matrix.index``.
    """
    # Get the user's index in the matrix
    user_idx = matrix.index.get_loc(user_id)

    # Fill NaN values with 0
    matrix_filled = matrix.fillna(0)

    # Perform matrix factorization using SVD (seeded for reproducibility)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    latent_matrix = svd.fit_transform(matrix_filled)

    # Fit the k-nearest neighbors model on the latent factors
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(latent_matrix)

    # Ask for one extra neighbour (the user itself is normally returned too),
    # but never more than the number of users in the matrix.
    k = min(n_neighbors + 1, latent_matrix.shape[0])
    user_latent_factors = latent_matrix[user_idx].reshape(1, -1)
    _, indices = model_knn.kneighbors(user_latent_factors, n_neighbors=k)

    # Drop the user's own row explicitly instead of assuming it is ranked first;
    # with tied distances another user can occupy position 0.
    similar_users = [pos for pos in indices.flatten() if pos != user_idx][:n_neighbors]

    # Aggregate ratings from similar users; movies nobody rated come out as NaN
    neighbour_means = matrix.iloc[similar_users].mean(axis=0)

    # Discard unscored movies, rank by aggregated rating and keep the best ones
    return neighbour_means.dropna().sort_values(ascending=False).head(top_n)

# Evaluate both recommenders on every user of the 30-user subset
subset_users = subset_user_item_matrix.index

# Neighbour-based recommender restricted to the subset
rmse_collaborative_subset = calculate_average_rmse(
    user_item_matrix=subset_user_item_matrix,
    recommend_func=recommend_movies_subset,
    user_ids=subset_users,
)

# Item-similarity recommender restricted to the subset
rmse_content_subset = calculate_average_rmse(
    user_item_matrix=subset_user_item_matrix,
    recommend_func=recommend_movies_for_new_user,
    user_ids=subset_users,
    is_content_based=True,
)

# Helper function to format RMSE values
def format_rmse(value):
    """Render an RMSE with four decimal places, or "N/A" when it is None."""
    if value is None:
        return "N/A"
    return format(value, ".4f")

# --- Subset results ---
print("\nPerformance Analysis on Subset:")
print("-" * 30)
print(f"Subset size: {len(subset_user_item_matrix)} users")
print(f"RMSE for collaborative filtering: {format_rmse(rmse_collaborative_subset)}")
print(f"RMSE for content-based filtering: {format_rmse(rmse_content_subset)}")

# --- Results from the earlier run against the full matrix ---
print("\nComparison with Full Dataset:")
print("-" * 30)
print(f"Full dataset size: {len(user_item_matrix)} users")
print(f"Full dataset collaborative RMSE: {format_rmse(average_rmse_collaborative)}")
print(f"Full dataset content-based RMSE: {format_rmse(average_rmse_content)}")

# --- Relative change of the subset RMSE versus the earlier RMSE, in percent ---
# A difference is reported only when both RMSE values exist.
print("\nPerformance Differences:")
print("-" * 30)

# Neighbour-based recommender
if rmse_collaborative_subset is None or average_rmse_collaborative is None:
    print("Collaborative filtering: Not enough data")
else:
    collab_diff = (rmse_collaborative_subset - average_rmse_collaborative) / average_rmse_collaborative * 100
    print(f"Collaborative filtering: {collab_diff:+.2f}%")

# Item-similarity recommender
if rmse_content_subset is None or average_rmse_content is None:
    print("Content-based filtering: Not enough data")
else:
    content_diff = (rmse_content_subset - average_rmse_content) / average_rmse_content * 100
    print(f"Content-based filtering: {content_diff:+.2f}%")



Performance Analysis on Subset:
------------------------------
Subset size: 30 users
RMSE for collaborative filtering: 0.8395
RMSE for content-based filtering: N/A

Comparison with Full Dataset:
------------------------------
Full dataset size: 610 users
Full dataset collaborative RMSE: 1.5190
Full dataset content-based RMSE: N/A

Performance Differences:
------------------------------
Collaborative filtering: -44.73%
Content-based filtering: Not enough data


# 4c) Analysis of Recommender System Performance

This analysis is based on the results from Tasks 4a and 4b.

## 1. Collaborative Filtering Performance

### Full Dataset (610 users):
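- RMSE: 1.5190 (neighbours are drawn from all 610 users, but the RMSE is averaged over only the first 10 users, as selected in Task 4a)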
- This baseline performance indicates moderate prediction accuracy, considering that ratings are on a 1-5 scale
- The error of ~1.52 stars suggests room for improvement in the prediction accuracy

### Subset Performance (30 users):
- RMSE: 0.8395
- Surprisingly better performance on the smaller dataset
- 44.73% improvement compared to the full dataset

### Analysis of the Difference:
- The better performance on the subset might be due to:
  1. Less noise and variability in the smaller dataset
  2. More consistent rating patterns among the randomly selected users
  3. Potentially more dense rating matrix in the subset
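- However, this might not be representative of real-world performance due to the limited sample size. The two figures are also not directly comparable: the full-dataset RMSE was averaged over the first 10 users, whereas the subset RMSE was averaged over 30 randomly sampled users whose neighbours were restricted to that same subset. In both cases only movies that appear in the top-5 recommendations and were also rated by the user contribute to the error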

## 2. Content-Based Filtering Performance

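- Both the full dataset and the subset show "N/A" for RMSE
- This is a consequence of the evaluation setup rather than of sparsity: every movie the user has rated is passed in as `rated_movies`, and `recommend_movies_for_new_user` only scores movies that are *not* in that list. As a result there is no overlap between recommended and rated items, so no RMSE can be computed. Note also that the scores this function returns are cosine similarities, not predicted ratings, so they would not be directly comparable with star ratings even if an overlap existed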