In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /gdrive; the ratings CSV is later read from
# '/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Import necessary libraries
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy

In [None]:
# Task 1: Data Preprocessing
# Read the ratings table from the mounted Drive.
ratings = pd.read_csv(r'/gdrive/My Drive/ratings.csv')

# The rating scale is taken from the observed minimum and maximum rating.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Surprise receives the columns in (user, item, rating) order.
data = Dataset.load_from_df(ratings.loc[:, ['userId', 'movieId', 'rating']], reader)

In [None]:
# Hold out 20% of the ratings for testing; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(data, random_state=42, test_size=0.2)

# Report how many users and items ended up in the training portion.
print(
    f"Number of users in training set: {trainset.n_users}",
    f"Number of items in training set: {trainset.n_items}",
    sep="\n",
)

Number of users in training set: 610
Number of items in training set: 8928


In [None]:
# Rating bounds carried by the trainset, as (lowest, highest).
trainset.rating_scale

(0.5, 5.0)

In [None]:
# Number of items in the training split (same figure the split cell prints).
trainset.n_items

8928

In [None]:
# Number of users in the training split (same figure the split cell prints).
trainset.n_users

610

In [None]:
# Number of ratings that landed in the training split.
trainset.n_ratings

80668

In [None]:
# Mean rating over the training split.
trainset.global_mean

3.503229285466356

In [None]:
# Item-side view of the training ratings. Per Surprise's Trainset docs, keys
# are inner item ids and values are lists of (inner user id, rating) pairs.
# NOTE(review): this displays the whole mapping; showing a few entries would
# keep the notebook output readable.
trainset.ir

defaultdict(list,
            {0: [(0, 4.5), (12, 2.0), (299, 3.0)],
             1: [(1, 3.0),
              (134, 3.5),
              (2, 2.5),
              (331, 5.0),
              (317, 3.5),
              (381, 3.0),
              (571, 3.0),
              (325, 5.0),
              (111, 3.0),
              (293, 3.0),
              (213, 4.0),
              (87, 3.0),
              (66, 3.0),
              (241, 4.0),
              (140, 3.0),
              (374, 3.0),
              (527, 4.0),
              (184, 4.0),
              (427, 3.0),
              (458, 4.0),
              (484, 4.0),
              (351, 4.0),
              (62, 2.5),
              (96, 3.5),
              (255, 3.0),
              (117, 3.0),
              (319, 3.0),
              (382, 4.0),
              (101, 3.5),
              (159, 5.0),
              (279, 4.0),
              (483, 4.0),
              (570, 4.0),
              (93, 4.0),
              (196, 3.0),
              (599, 4.0),
 

In [None]:
# User-side view of the training ratings. Per Surprise's Trainset docs, keys
# are inner user ids and values are lists of (inner item id, rating) pairs.
# NOTE(review): this displays the whole mapping; showing a few entries would
# keep the notebook output readable.
trainset.ur

defaultdict(list,
            {0: [(0, 4.5),
              (398, 3.0),
              (572, 4.0),
              (501, 2.5),
              (1348, 3.5),
              (1456, 4.0),
              (658, 2.0),
              (441, 4.5),
              (414, 3.5),
              (1503, 3.0),
              (1986, 4.0),
              (241, 3.0),
              (374, 4.5),
              (387, 5.0),
              (2317, 4.0),
              (2394, 3.5),
              (2609, 3.0),
              (2932, 3.5),
              (2024, 5.0),
              (628, 4.0),
              (3047, 3.5),
              (1110, 3.5),
              (738, 2.5),
              (363, 4.5),
              (1191, 2.5),
              (1144, 4.0),
              (643, 4.5),
              (3828, 5.0),
              (1875, 3.5),
              (723, 4.0),
              (3971, 4.5),
              (37, 3.0),
              (661, 4.0),
              (474, 4.0),
              (3184, 3.0),
              (365, 2.5),
              (2127, 5.0),
  

In [None]:
# NOTE(review): this cell repeats the previous `trainset.ur` cell verbatim and
# produces the same output; it is a candidate for removal.
trainset.ur

defaultdict(list,
            {0: [(0, 4.5),
              (398, 3.0),
              (572, 4.0),
              (501, 2.5),
              (1348, 3.5),
              (1456, 4.0),
              (658, 2.0),
              (441, 4.5),
              (414, 3.5),
              (1503, 3.0),
              (1986, 4.0),
              (241, 3.0),
              (374, 4.5),
              (387, 5.0),
              (2317, 4.0),
              (2394, 3.5),
              (2609, 3.0),
              (2932, 3.5),
              (2024, 5.0),
              (628, 4.0),
              (3047, 3.5),
              (1110, 3.5),
              (738, 2.5),
              (363, 4.5),
              (1191, 2.5),
              (1144, 4.0),
              (643, 4.5),
              (3828, 5.0),
              (1875, 3.5),
              (723, 4.0),
              (3971, 4.5),
              (37, 3.0),
              (661, 4.0),
              (474, 4.0),
              (3184, 3.0),
              (365, 2.5),
              (2127, 5.0),
  

In [None]:
# Task 2: Collaborative Filtering Algorithm
# User-based variant: neighbours are users (user_based=True), compared with
# the Pearson-baseline similarity at shrinkage 100.
from surprise import KNNBaseline

algo_user = KNNBaseline(
    sim_options=dict(name="pearson_baseline", user_based=True, shrinkage=100)
)
algo_user.fit(trainset)

# Score the held-out split; accuracy.rmse prints the value it returns.
predictions_user = algo_user.test(testset)
rmse_user = accuracy.rmse(predictions_user)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8862


In [None]:
# Item-based variant: same model and similarity settings as the user-based
# cell, but neighbours are items (user_based=False).
algo_item = KNNBaseline(
    sim_options=dict(name="pearson_baseline", user_based=False, shrinkage=100)
)
algo_item.fit(trainset)

# Score the held-out split; accuracy.rmse prints the value it returns.
predictions_item = algo_item.test(testset)
rmse_item = accuracy.rmse(predictions_item)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8600


In [None]:
# Side-by-side summary of the two test-set RMSE values (lower is better).
print(
    f"User-based CF RMSE: {rmse_user:.4f}",
    f"Item-based CF RMSE: {rmse_item:.4f}",
    sep="\n",
)

User-based CF RMSE: 0.8862
Item-based CF RMSE: 0.8600


# **Summarizing my findings and recommendations for improving the recommender system**

*  In the above analysis, I implemented user-based and item-based collaborative 
filtering algorithms using the KNNBaseline method with Pearson Baseline similarity and a shrinkage value of 100. The results show that the item-based approach (RMSE: 0.8600) performed better than the user-based approach (RMSE: 0.8862) in predicting users' movie ratings.

*  I tested with KNNBaseline, KNNWithMeans and KNNWithZScore methods with Pearson Baseline, Cosine and Pearson similarity. I found that the KNNBaseline method with Pearson Baseline similarity and a shrinkage value of 100 has better RMSE than others, so I selected those in my analysis. To further improve, I can add more features or optimize the hyperparameters.


In [None]:
# Bonus Task: Improving the Recommender System
# Hybrid recommender system
def hybrid_recommender(user_id, movie_id, user_weight=0.5):
    """Blend the user-based and item-based rating estimates for one pair.

    Args:
        user_id: User id, passed unchanged to ``algo_user.predict`` and
            ``algo_item.predict``.
        movie_id: Movie id, passed unchanged to both ``predict`` calls.
        user_weight: Share of the blend given to the user-based estimate; the
            remaining ``1 - user_weight`` goes to the item-based estimate.
            The default of 0.5 reproduces the plain average of the two.

    Returns:
        The weighted average of the two models' ``.est`` values.

    Raises:
        ValueError: If ``user_weight`` is not within ``[0, 1]``.
    """
    # Reject weights outside [0, 1] (this also catches NaN) so the result is
    # always a convex combination of the two estimates.
    if not 0.0 <= user_weight <= 1.0:
        raise ValueError(f"user_weight must be between 0 and 1, got {user_weight!r}")
    est_user = algo_user.predict(user_id, movie_id).est
    est_item = algo_item.predict(user_id, movie_id).est
    return user_weight * est_user + (1.0 - user_weight) * est_item

# Score every held-out (user, item, true rating) triple with the hybrid
# estimate. Each row is a 5-tuple: ids, true rating, estimate, and a None
# placeholder in the last slot.
predictions_hybrid = [
    (uid, iid, true_r, hybrid_recommender(uid, iid), None)
    for uid, iid, true_r in testset
]

# accuracy.rmse prints the score itself; the formatted line below repeats it
# with the hybrid label.
rmse_hybrid = accuracy.rmse(predictions_hybrid)

print(f"Hybrid CF RMSE: {rmse_hybrid:.4f}")

RMSE: 0.8529
Hybrid CF RMSE: 0.8529


# **Explaining my implementation and summarizing my findings**

* The hybrid recommender system averages the predictions of the user-based and item-based collaborative filtering algorithms, and it achieved an RMSE of 0.8529. 
The hybrid recommender system therefore outperformed both the user-based (RMSE: 0.8862) and item-based (RMSE: 0.8600) collaborative filtering algorithms.  

* By combining the capabilities of user- and item-based algorithms, the hybrid recommender system may have captured more complex patterns in the data while compensating for the limitations of the individual methods. The improved performance highlights the potential benefits of combining multiple algorithms into a recommender system.
The following steps could be taken to further improve the performance of the recommender system:
   1. Optimize hyperparameters
   2. Try different algorithms
   3. Incorporate additional features