In [6]:
from google.colab import files

# Open file upload dialog
uploaded = files.upload()

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv


In [7]:
import pandas as pd

# Load the uploaded CSV file
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

ratings_df = ratings_df.drop(columns=["timestamp"])

merged_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")

print(merged_df.head())



   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating  
0       1     4.0  
1       5     4.0  
2       7     4.5  
3      15     2.5  
4      17     4.5  


In [8]:
from sklearn.model_selection import train_test_split

# BETTER DATA SPLIT METHOD
train_data = pd.DataFrame()
test_data = pd.DataFrame()

# Group by userId and split ratings for each user
for user_id, group in merged_df.groupby('userId'):
    # Split ratings into 80% training and 20% testing
    if len(group) > 1:  # Ensure the user has enough ratings to split
        train, test = train_test_split(group, test_size=0.2, random_state=42)
        train_data = pd.concat([train_data, train])
        test_data = pd.concat([test_data, test])
    else:
        # If the user has only one rating, keep it in the training set
        train_data = pd.concat([train_data, group])

# Reset index to clean up the final datasets
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


In [12]:
# Pivot the DataFrame to create a user-item matrix
user_item_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')

# Fill NaN values with 0
user_item_matrix = user_item_matrix.fillna(0)

print(user_item_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  191005  193565  193567  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [13]:
import pandas as pd

# Compute the item-item Pearson similarity matrix
item_similarity_df = user_item_matrix.corr(method='pearson')
print(item_similarity_df.head())

movieId    1         2         3         4         5         6         7       \
movieId                                                                         
1        1.000000  0.239761  0.140823 -0.043744  0.160495  0.207737  0.127775   
2        0.239761  1.000000  0.174224  0.064306  0.165577  0.178082  0.110075   
3        0.140823  0.174224  1.000000  0.097267  0.373652  0.157595  0.225478   
4       -0.043744  0.064306  0.097267  1.000000  0.049839  0.077240  0.148339   
5        0.160495  0.165577  0.373652  0.049839  1.000000  0.219734  0.348856   

movieId    8         9         10      ...    191005    193565    193567  \
movieId                                ...                                 
1        0.056205  0.085553  0.223165  ... -0.019698 -0.019698 -0.019698   
2        0.209226 -0.034542  0.266904  ... -0.016680 -0.016680 -0.016680   
3        0.271932  0.352537  0.068092  ... -0.009303 -0.009303 -0.009303   
4        0.193817 -0.013119  0.000492  ... -0.003646

In [6]:
#Predicting Ratings
def predict_ratings_with_baseline_and_all(test_data, user_item_matrix, item_similarity_df, train_data, k=20):
    predictions = []
    actuals = []
    all_predictions = []  # Store predictions for all unrated items

    # Overall average
    mu = train_data['rating'].mean()

    # User and item deviations
    user_avg = train_data.groupby('userId')['rating'].mean() - mu
    item_avg = train_data.groupby('movieId')['rating'].mean() - mu

    # Predict ratings for test_data
    for _, row in test_data.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        actual_rating = row['rating']

        if user_id in user_item_matrix.index and movie_id in item_similarity_df.columns:
            user_bias = user_avg.get(user_id, 0)
            item_bias = item_avg.get(movie_id, 0)
            baseline = mu + user_bias + item_bias

            user_ratings = user_item_matrix.loc[user_id]
            rated_items = user_ratings[user_ratings > 0].index

            # Get top-k similar items
            if movie_id in item_similarity_df.index:
                top_k_similar_items = (
                    item_similarity_df.loc[movie_id, rated_items]
                    .nlargest(k)
                    .index
                )
            else:
                top_k_similar_items = []

            numerator = 0
            denominator = 0

            for rated_item in top_k_similar_items:
                if rated_item in item_similarity_df.columns:
                    similarity = item_similarity_df.loc[movie_id, rated_item]
                    neighbor_baseline = mu + user_avg.get(user_id, 0) + item_avg.get(rated_item, 0)
                    numerator += similarity * (user_ratings[rated_item] - neighbor_baseline)
                    denominator += abs(similarity)

            predicted_rating = baseline + (numerator / denominator if denominator != 0 else 0)
        else:
            predicted_rating = train_data['rating'].mean()

        predictions.append(predicted_rating)
        actuals.append(actual_rating)

    # Predict ratings for all unrated items
    for user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings == 0].index  # Items not rated by the user

        for movie_id in unrated_items:
            if movie_id in item_similarity_df.columns:
                user_bias = user_avg.get(user_id, 0)
                item_bias = item_avg.get(movie_id, 0)
                baseline = mu + user_bias + item_bias

                rated_items = user_ratings[user_ratings > 0].index

                # Get top-k similar items
                if movie_id in item_similarity_df.index:
                    top_k_similar_items = (
                        item_similarity_df.loc[movie_id, rated_items]
                        .nlargest(k)
                        .index
                    )
                else:
                    top_k_similar_items = []

                numerator = 0
                denominator = 0

                for rated_item in top_k_similar_items:
                    if rated_item in item_similarity_df.columns:
                        similarity = item_similarity_df.loc[movie_id, rated_item]
                        neighbor_baseline = mu + user_avg.get(user_id, 0) + item_avg.get(rated_item, 0)
                        numerator += similarity * (user_ratings[rated_item] - neighbor_baseline)
                        denominator += abs(similarity)

                predicted_rating = baseline + (numerator / denominator if denominator != 0 else 0)
            else:
                predicted_rating = train_data['rating'].mean()

            all_predictions.append((user_id, movie_id, predicted_rating))

    return predictions, actuals, all_predictions


In [None]:
predictions, actuals, all_predictions = predict_ratings_with_baseline_and_all(
    test_data, user_item_matrix, item_similarity_df, train_data
)

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mean_squared_error(actuals, predictions))

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

MAE: 0.6649
RMSE: 0.8723


In [14]:
def generate_top_n_recommendations_with_mmr(all_predictions, train_data, item_similarity_df, n=10, lambda_mmr=0.5):

    recommendations = {}

    # Group predictions by user
    predictions_by_user = {}
    for user_id, movie_id, predicted_rating in all_predictions:
        if user_id not in predictions_by_user:
            predictions_by_user[user_id] = []
        predictions_by_user[user_id].append((movie_id, predicted_rating))

    # Generate top-N recommendations using MMR
    for user_id, predictions in predictions_by_user.items():
        # Get rated items from training data
        rated_items_train = train_data[train_data['userId'] == user_id]['movieId'].tolist()
        rated_items = set(rated_items_train)

        # Filter out already-rated items
        predictions = [(movie_id, rating) for movie_id, rating in predictions if movie_id not in rated_items]

        # Sort by predicted rating (relevance)
        predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

        # Apply MMR
        selected_items = []
        while len(selected_items) < n and predictions:
            # Calculate MMR scores
            mmr_scores = []
            for movie_id, relevance in predictions:
                # Compute diversity with already selected items
                diversity = max(
                    [item_similarity_df.get(movie_id, {}).get(selected, 0) for selected in selected_items]
                    if selected_items
                    else [0]
                )
                # Compute MMR score
                mmr_score = lambda_mmr * relevance - (1 - lambda_mmr) * diversity
                mmr_scores.append((movie_id, mmr_score, relevance))

            # Select item with highest MMR score
            mmr_scores = sorted(mmr_scores, key=lambda x: x[1], reverse=True)
            best_item = mmr_scores[0]
            selected_items.append(best_item[0])

            # Remove the selected item from predictions
            predictions = [p for p in predictions if p[0] != best_item[0]]

        recommendations[user_id] = selected_items

    return recommendations

top_n_recommendations = generate_top_n_recommendations_with_mmr(
    all_predictions, train_data, item_similarity_df, n=10, lambda_mmr=0.5
)


In [1]:
# Evaluating Recommendations
from sklearn.metrics import ndcg_score

def evaluate_recommendations(top_n_recommendations, test_data, n=10):

    precision_list = []
    recall_list = []
    ndcg_list = []

    for user_id, recommended_movies in top_n_recommendations.items():
        # items in the test set for this user
        true_items = test_data[test_data['userId'] == user_id]['movieId'].tolist()
        if not true_items:
            continue

        # Precision: Percentage of recommended items that are in the test set
        true_positives = len(set(recommended_movies).intersection(set(true_items)))
        precision = true_positives / n
        precision_list.append(precision)

        # Recall: Percentage of test items that are recommended
        recall = true_positives / len(true_items)
        recall_list.append(recall)

        # NDCG: Evaluates the ranking quality
        relevance = [1 if item in true_items else 0 for item in recommended_movies]
        ndcg = ndcg_score([relevance], [list(range(1, len(relevance) + 1))])
        ndcg_list.append(ndcg)

    # Compute average metrics
    avg_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    avg_recall = sum(recall_list) / len(recall_list) if recall_list else 0
    avg_ndcg = sum(ndcg_list) / len(ndcg_list) if ndcg_list else 0
    f_measure = (
        2 * avg_precision * avg_recall / (avg_precision + avg_recall)
        if (avg_precision + avg_recall) > 0
        else 0
    )

    return {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F-measure": f_measure,
        "NDCG": avg_ndcg,
    }

# Evaluate recommendations
metrics = evaluate_recommendations(top_n_recommendations, test_data)

# Print evaluation results
print("Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:3f}")

Evaluation Metrics:
Precision: 0.285
Recall: 0.185
F-measure: 0.223
NDCG: 0.670
