TASK_2

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
from tqdm import tqdm
import sklearn

class MatrixFactorizationRecommender:
    def __init__(self, n_components=100, init='nndsvdar', random_state=42, max_iter = 200):
        """
        Initialize the recommender system with NMF
        
        Args:
            n_components (int): Number of latent factors
            init (str): NMF initialization method
            random_state (int): Random seed for reproducibility
        """
        self.model = NMF(
            n_components=n_components,
            init=init,
            random_state=random_state,
            max_iter= max_iter
        )
        self.user_features = None
        self.item_features = None
        
    def prepare_rating_matrix(self, ratings_df):
        """
        Convert ratings dataframe to user-item matrix
        
        Args:
            ratings_df (pd.DataFrame): DataFrame with columns ['user_id', 'movie_id', 'rating']
            
        Returns:
            pd.DataFrame: User-item rating matrix
        """
        return ratings_df.pivot(
            index='user_id', 
            columns='movie_id', 
            values='rating'
        ).fillna(np.nan)

    def fit(self, ratings_matrix):
        """
        Train the NMF model

        Args:
            ratings_matrix (pd.DataFrame): User-item rating matrix
        """
        # Fill missing values with row or column means
        ratings_matrix_filled = ratings_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)

        # Fit NMF model
        self.user_features = self.model.fit_transform(ratings_matrix_filled)
        self.item_features = self.model.components_

        # Store original matrix indices
        self.user_ids = ratings_matrix.index
        self.movie_ids = ratings_matrix.columns

        
    def predict(self, user_idx, movie_idx):
        """
        Predict rating for a specific user-movie pair
        
        Args:
            user_idx (int): User index
            movie_idx (int): Movie index
            
        Returns:
            float: Predicted rating
        """
        return np.dot(self.user_features[user_idx], self.item_features[:, movie_idx])
    
    def predict_all_users_for_movie(self, movie_id):
        """
        Predict ratings for all users for a specific movie
        
        Args:
            movie_id (int): Movie ID
            
        Returns:
            dict: Dictionary of user_id: predicted_rating pairs
        """
        try:
            movie_idx = np.where(self.movie_ids == movie_id)[0][0]
        except IndexError:
            print(f"Warning: Movie ID {movie_id} not found in training data.")
            return {}

        predictions = {}
        
        for user_idx, user_id in enumerate(self.user_ids):
            pred_rating = self.predict(user_idx, movie_idx)
            predictions[user_id] = max(0.5, min(5, pred_rating))  # Clip predictions to valid range
            
        return predictions



In [2]:
# !pip install scikit-learn


In [18]:
class ImprovedMatrixFactorizationRecommender(MatrixFactorizationRecommender):
    def __init__(self, n_components=100, init='nndsvdar', random_state=42, max_iter=200):
        super(ImprovedMatrixFactorizationRecommender, self).__init__(
            n_components=n_components, 
            init=init, 
            random_state=random_state, 
            max_iter=max_iter
        )
        self.user_biases = None
        self.item_biases = None
        self.global_mean = None
        
    def fit(self, ratings_matrix):
        """
        Train the improved NMF model with biases.
        """
        # Calculate biases
        self.global_mean = np.nanmean(ratings_matrix.values)
        
        # User biases
        user_means = ratings_matrix.mean(axis=1)
        self.user_biases = user_means - self.global_mean
        
        # Item biases
        item_means = ratings_matrix.mean(axis=0)
        self.item_biases = item_means - self.global_mean
        
        # Remove biases from ratings
        normalized_ratings = ratings_matrix.copy()
        for i, user_id in enumerate(ratings_matrix.index):
            for j, movie_id in enumerate(ratings_matrix.columns):
                if not np.isnan(ratings_matrix.iloc[i, j]):  # If rating exists
                    normalized_ratings.iloc[i, j] -= (
                        self.global_mean + 
                        self.user_biases[user_id] + 
                        self.item_biases[movie_id]
                    )
        
        # Clip any negative values to zero to ensure compatibility with NMF
        normalized_ratings = normalized_ratings.clip(lower=0).fillna(0)
        
        # Fit NMF on normalized ratings
        super().fit(normalized_ratings)
        
    def predict(self, user_id, movie_id):
        """
        Predict rating with bias terms.
        """
        try:
            # Map user_id and movie_id to indices
            user_idx = self.user_ids.get_loc(user_id)
            movie_idx = self.movie_ids.get_loc(movie_id)

            # Calculate baseline estimate
            baseline_estimate = (
                self.global_mean +
                self.user_biases[user_idx] +
                self.item_biases[movie_idx]
            )

            # Use NMF estimate
            nmf_estimate = np.dot(self.user_features[user_idx], self.item_features[:, movie_idx])

            return baseline_estimate + nmf_estimate

        except KeyError:
            print(f"KeyError: User ID {user_id} or Movie ID {movie_id} not found. Using global mean as fallback.")
            return self.global_mean  # Fallback to global mean if biases are unavailable
        except IndexError:
            print(f"IndexError: User or Movie ID out of bounds. Using global mean as fallback.")
            return self.global_mean  # Fallback if index is out of bounds




In [4]:
def evaluate_recommender(ratings_df, test_movies, recommender):
    """
    Evaluate recommender system using RMSE
    
    Args:
        ratings_df (pd.DataFrame): Complete ratings dataset
        test_movies (list): List of movie IDs to test
        recommender: Trained recommender system
        
    Returns:
        float: RMSE score
    """
    actual_ratings = []
    predicted_ratings = []
    
    # For each test movie with progress tracking
    for movie_id in tqdm(test_movies, desc="Evaluating Test Movies"):
        # Get actual ratings for this movie
        movie_ratings = ratings_df[ratings_df['movie_id'] == movie_id]
        
        # Get predictions for all users who rated this movie
        predictions = recommender.predict_all_users_for_movie(movie_id)
        
        for _, row in movie_ratings.iterrows():
            user_id = row['user_id']
            actual_rating = row['rating']
            predicted_rating = predictions.get(user_id)
            
            if predicted_rating is not None:
                actual_ratings.append(actual_rating)
                predicted_ratings.append(predicted_rating)
    
    return sqrt(mean_squared_error(actual_ratings, predicted_ratings))

In [5]:

ratings_df = pd.read_csv('ratings.dat', 
                         sep='::', 
                         names=['user_id', 'movie_id', 'rating', 'timestamp'],
                         engine='python')

# Randomly select 5 movies for testing
all_movies = ratings_df['movie_id'].unique()
test_movies = random.sample(list(all_movies), 5)

# Prepare training data
ratings_matrix = pd.pivot_table(
    ratings_df,
    values='rating',
    index='user_id',
    columns='movie_id'
)

#     # Train and evaluate original NMF
#     print("Training original NMF model...")
#     basic_recommender = MatrixFactorizationRecommender()
#     basic_recommender.fit(ratings_matrix)
#     basic_rmse = evaluate_recommender(ratings_df, test_movies, basic_recommender)
#     print(f"Basic NMF RMSE: {basic_rmse:.4f}")
    
#     # Train and evaluate improved NMF
#     print("\nTraining improved NMF model...")
#     improved_recommender = ImprovedMatrixFactorizationRecommender()
#     improved_recommender.fit(ratings_matrix)
#     improved_rmse = evaluate_recommender(ratings_df, test_movies, improved_recommender)
#     print(f"Improved NMF RMSE: {improved_rmse:.4f}")
    
#     return basic_rmse, improved_rmse, test_movies



In [6]:
# !pip install tqdm


In [7]:
# Train and evaluate original NMF
print("Training original NMF model...")
basic_recommender = MatrixFactorizationRecommender()
basic_recommender.fit(ratings_matrix)
basic_rmse = evaluate_recommender(ratings_df, test_movies, basic_recommender)
print(f"Basic NMF RMSE: {basic_rmse:.4f}")

Training original NMF model...


Evaluating Test Movies: 100%|████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 36.59it/s]

Basic NMF RMSE: 0.8874





In [8]:
# Check for negative values in the matrix
print("Negative values in ratings_matrix:", (ratings_matrix < 0).any().any())


Negative values in ratings_matrix: False


In [9]:
# Train and evaluate improved NMF
print("\nTraining improved NMF model...")
improved_recommender = ImprovedMatrixFactorizationRecommender()
improved_recommender.fit(ratings_matrix)
improved_rmse = evaluate_recommender(ratings_df, test_movies, improved_recommender)
print(f"Improved NMF RMSE: {improved_rmse:.4f}")


Training improved NMF model...


Evaluating Test Movies: 100%|████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.03it/s]

Improved NMF RMSE: 0.8676





In [10]:
print("Negative values in ratings_matrix:", (ratings_matrix < 0).any().any())

Negative values in ratings_matrix: False


In [11]:
print(basic_rmse)
print(improved_rmse)
print(test_movies)

0.8874316689626542
0.8676134310128855
[2106, 1473, 2267, 2332, 2940]


TASK 3

In [12]:
from surprise import Dataset, Reader, KNNBasic
import pandas as pd

ratings_df = pd.read_csv('ratings.dat', 
                         sep='::', 
                         names=['user_id', 'movie_id', 'rating', 'timestamp'],
                         engine='python')
# Load the ratings data into the Surprise format
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

# Train the kNN collaborative filtering model
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True}  # Set user-based collaborative filtering
knn_recommender = KNNBasic(sim_options=sim_options)
knn_recommender.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x209253ad310>

In [13]:
def get_top_n_recommendations(model, user_id, n=10):
    """
    Generate the top N recommendations for a given user.
    
    Args:
        model: The trained recommender model (either kNN from Surprise or custom model).
        user_id: ID of the user for whom recommendations are generated.
        n: Number of recommendations to return.
    
    Returns:
        List of top N recommended movie IDs.
    """
    # Get the list of all movie IDs
    all_movie_ids = ratings_df['movie_id'].unique()
    
    # Filter out movies that the user has already rated
    user_rated_movies = ratings_df[ratings_df['user_id'] == user_id]['movie_id'].values
    movie_candidates = [movie_id for movie_id in all_movie_ids if movie_id not in user_rated_movies]
    
    # Predict ratings for all candidate movies
    predictions = []
    for movie_id in movie_candidates:
        prediction = model.predict(user_id, movie_id)
        # Check if prediction is a float or has 'est' attribute (for Surprise models)
        estimated_rating = prediction.est if hasattr(prediction, 'est') else prediction
        predictions.append((movie_id, estimated_rating))
    
    # Sort by predicted rating and return the top N
    top_n_recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return [movie_id for movie_id, _ in top_n_recommendations]


In [19]:
# Randomly select test users who have rated more than 100 movies
import random
active_users = ratings_df.groupby('user_id').filter(lambda x: len(x) > 100)['user_id'].unique()
test_users = random.sample(list(active_users), 10)

# Generate recommendations for both models
knn_recommendations = {user_id: get_top_n_recommendations(knn_recommender, user_id) for user_id in test_users}
imfr_recommendations = {user_id: get_top_n_recommendations(improved_recommender, user_id) for user_id in test_users}


IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out of bounds.
IndexError: User or Movie index is out of bounds.. User or Movie ID might be out

In [17]:
self.user_ids = ratings_matrix.index  # Assumes ratings_matrix is a DataFrame with user IDs as the index
self.movie_ids = ratings_matrix.columns  # Assumes ratings_matrix columns are movie IDs


NameError: name 'self' is not defined