# DES431 Project 2: Recommendation System

In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  

# 1. Preparing the Dataset
We provide two options to download the necessary dataset for Project 2: Recommendation System.

For this project, we used the following dataset:
1. ratings_train.csv
2. movies.csv
3. ratings_valid.csv


---


**Option 1**: Upload the .csv file onto google drive. Change the path of the variable "movie_path", "ratings_train_path", and "ratings_valid_path" to the correct path. The Google Drive must be "Mounted" first. 

The command below is used the Mount the Drive. This is a default option. If you want to mount the drive mannually, comment this code out and follow the step given after the code snippet.
```
from google.colab import drive
drive.mount('/content/drive')
```
This process can be done mannually by:
1. Select the "File" icon on the navigation tabe on the left (the fourth option_
2. Click the icon with "File and Google Drive Icon".
3. You drive is successfully mounted


---


**Option 2**: Upload the file from local storage onto the Google Colab directly using the following import and function.

```
from google.colab import files
uploaded = files.upload()
```

*REMARK: The default option is "Option 1". Option 2 is commented out on purpose.*

In [2]:
# Option 1: Load the .csv file from Google Drive into pandas dataframe
from google.colab import drive
drive.mount('/content/drive')

# Change each variable to the correspond file patch for movie, ratings_train, and ratings_valid
movie_path = "/content/drive/MyDrive/DES431_RecommendationSystem/movies.csv"
ratings_train_path = "/content/drive/MyDrive/DES431_RecommendationSystem/ratings_train.csv"
ratings_valid_path = "/content/drive/MyDrive/DES431_RecommendationSystem/ratings_valid.csv"

movies_df = pd.read_csv(movie_path)
ratings_df = pd.read_csv(ratings_train_path)
ratings_valid = pd.read_csv(ratings_valid_path)

Mounted at /content/drive


In [3]:
# Option 2: Upload the file from local storage
# from google.colab import files
# uploaded = files.upload()

# movies_df = pd.read_csv('movies.csv')
# ratings_df = pd.read_csv('ratings_train.csv')
# ratings_valid = pd.read_csv('ratings_valid.csv')

# 2. Constructing Utility Matrix

In [4]:
# User-item matrix
utility_matrix = ratings_df.pivot_table(index = "userId", columns = "movieId", values = "rating")

missing_columns = list(set(movies_df["movieId"]) - set(utility_matrix.columns))
for col in missing_columns:
    utility_matrix[col] = np.nan

utility_matrix = utility_matrix[sorted(utility_matrix.columns)]

utility_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


# 3. Global Bias

In [5]:
class GlobalBias:

    def __init__(self, utility_matrix):
        self.utility_matrix = utility_matrix
        self.overall_mean = np.nanmean(self.utility_matrix.values)

    def calculate_user_bias(self, user_id):
        user_rating_val = self.utility_matrix.loc[user_id].dropna().values
        user_rating_count = user_rating_val.shape[0]
        
        if user_rating_count == 0:
            user_mean = 0
        else:
            user_mean = user_rating_val.sum() / user_rating_count

        user_bias = user_mean - self.overall_mean

        return user_bias
        
    def calculate_movie_bias(self, movie_id):
        movie_rating_val = self.utility_matrix.loc[:,movie_id].dropna().values
        movie_rating_count = movie_rating_val.shape[0]
        if movie_rating_count == 0:
            movie_mean = 0
        else:
            movie_mean = movie_rating_val.sum() / movie_rating_count

        movie_bias = movie_mean - self.overall_mean

        return movie_bias

    def predict(self, user_id, movie_id):
        user_bias = self.calculate_user_bias(user_id)
        movie_bias = self.calculate_movie_bias(movie_id)
        
        # Find the predited rating given userId and movieId
        predicted_rating = self.overall_mean + user_bias + movie_bias
        
        return predicted_rating

user_id, movie_id = 4 , 45

global_bias_model = GlobalBias(utility_matrix)
predicted = global_bias_model.predict(user_id, movie_id)

predicted

3.4455772386081653

### Global Bias: model validation

In [6]:
from sklearn.metrics import mean_squared_error

r = ratings_valid[["userId","movieId"]]

predicted_ratings = []

for index, (user_id, movie_id) in enumerate(zip(r["userId"], r["movieId"])):
    pred_rat = global_bias_model.predict(user_id, movie_id)
    predicted_ratings.append(pred_rat)

r["predicted_rating"] = predicted_ratings

r_true = ratings_valid['rating'].to_numpy()
r_pred = r["predicted_rating"].to_numpy()

rmse = mean_squared_error(r_true, r_pred, squared = False)
print("RMSE: ", rmse)

RMSE:  0.888971186618481


# 4. Collaborative Filtering: User and Item based

## Item-based

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

class ItemBasedCollaborativeFiltering:

    def __init__(self, k = 15, method = "cosine"):
        self.k = k
        self.method = method
    
    def fit(self, utility_matrix):
        self.utility_matrix = utility_matrix
        
        if self.method == "pearson":
            self.item_similarity_matrix = utility_matrix.corr(method= self.method)
        elif self.method == "cosine":
            new_util_matrix = self.utility_matrix.fillna(0)
            new_util = new_util_matrix.copy()
            new_util_matrix = cosine_similarity(new_util_matrix.T)

            new_util_matrix_df = pd.DataFrame(
                data=new_util_matrix,
                index = new_util.columns,
                columns = new_util.columns
            )
            self.item_similarity_matrix = new_util_matrix_df
    
    def predict(self, user_id, movie_id):
        user_ratings = self.utility_matrix.loc[user_id]
        similar_movies = self.item_similarity_matrix[movie_id][user_ratings.notna()]
        top_similar_movies = similar_movies.nlargest(self.k+1)[1:]
        user_ratings_updated = user_ratings[top_similar_movies.index]
        if top_similar_movies.sum() == 0:
            predicted_rating = user_ratings.mean()
        else:
            predicted_rating = (user_ratings_updated * top_similar_movies).sum() / top_similar_movies.sum()
        
        if predicted_rating > 5:
            predicted_rating = 5
        
        if predicted_rating < 0.5:
            predicted_rating = 0.5 

        return predicted_rating

# Define Item-based Collaborative Filtering Object

# DEFAULT: Using cosine similarity and k=15 (15 nearest neighbours)
itembased_cf = ItemBasedCollaborativeFiltering()
itembased_cf.fit(utility_matrix)
predicted_rating = itembased_cf.predict(user_id=4, movie_id=45)
predicted_rating

3.646501776819902

### Item-based Model validation

In [8]:
r = ratings_valid[["userId","movieId"]]

predicted_ratings = []

for index, (user_id, movie_id) in enumerate(zip(r["userId"], r["movieId"])):
    pred_rat = itembased_cf.predict(user_id, movie_id)
    if pred_rat is None:
        print(f"{index}: {user_id}, {movie_id}")
    predicted_ratings.append(pred_rat)

r["predicted_rating"] = predicted_ratings

r_true = ratings_valid['rating'].to_numpy()
r_pred = r["predicted_rating"].to_numpy()

rmse = mean_squared_error(r_true, r_pred, squared = False)

print(f"RMSE: {rmse: .4f}")

RMSE:  0.8468


## User-based

In [9]:
class UserBasedCollaborativeFiltering:

    def __init__(self, k = 10):
        self.k = k
    
    def fit(self,utility_matrix):
        self.utility_matrix = utility_matrix

        user_similarity_matrix = cosine_similarity(utility_matrix.fillna(0))
        self.user_sim_df = pd.DataFrame(
            user_similarity_matrix, 
            index = self.utility_matrix.index, 
            columns = self.utility_matrix.index
        )

    def rescale(self, predicted_rating):
        # Rescale of the predicted score is outside of range 0.5 and 5.0
        if predicted_rating > 5:
            predicted_rating = 5
        
        if predicted_rating < 0.5:
            predicted_rating = 0.5 
        
        return predicted_rating

    def predict(self, user_id, movie_id):
        # Select only targeted rating for movieId
        target_movie_rating = self.utility_matrix[movie_id].dropna()

        # Select Top K Similar user
        # From user similarity matrix, select the target userId, select only user that have rated the movie 
        similar_users_simscore = self.user_sim_df[user_id][target_movie_rating.index] \
                                     .sort_values(ascending = False)[1:self.k+1]
        
        # Target user rating
        target_user_avrrating = ratings_df[ratings_df["userId"] == user_id]["rating"].mean()

        # Address the problem where there is no similar users
        if similar_users_simscore.sum() == 0:
            prediction = target_user_avrrating
        else: 
            # Select rating that similar users have given to the target movie
            similar_users_rating = target_movie_rating[similar_users_simscore.index]

            # Compute average rating of similar user
            similar_users_avrrating = ratings_df[ratings_df["userId"].isin(similar_users_simscore.index.tolist())] \
                                                            .groupby("userId").mean()["rating"] \
                                                            [similar_users_simscore.index]

            offset = (similar_users_simscore * (similar_users_rating - similar_users_avrrating)).sum() \
                                / similar_users_simscore.sum()
        
            predicted_rating = target_user_avrrating + offset
 
        return self.rescale(predicted_rating)

userbased_model = UserBasedCollaborativeFiltering()
userbased_model.fit(utility_matrix)

user_id, movie_id  = 4, 45
prediction = userbased_model.predict(user_id = user_id, movie_id = movie_id)
prediction

3.3420609627516202

### User-based Model validation

In [10]:
r = ratings_valid[["userId","movieId"]]

predicted_ratings = []

for index, (user_id, movie_id) in enumerate(zip(r["userId"], r["movieId"])):
    pred_rat = userbased_model.predict(user_id, movie_id)
    if pred_rat is None:
        print(f"{index}: {user_id}, {movie_id}")
    predicted_ratings.append(pred_rat)

r["predicted_rating"] = predicted_ratings

r_true = ratings_valid['rating'].to_numpy()
r_pred = r["predicted_rating"].to_numpy()

rmse = mean_squared_error(r_true, r_pred, squared = False)

print(f"RMSE: {rmse: .4f}")

RMSE:  0.8860


# 5. Latent Factor Model
Using Matrix Factorization techniques

For the Latent Factor Model, an extra package must be installed using the command in the next cell

In [11]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095441 sha256=14fe5d2ecea3d18ca8c79196ab8f4cb26f718962358fdccebe29651f862f4d3e
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [12]:
from surprise import Dataset, Reader, SVD
from sklearn.metrics import mean_squared_error

# Create a Reader object
reader = Reader(rating_scale=(0.5, 5))

# Load the data into a Surprise Dataset object
ratings_train = ratings_df.copy()
data = Dataset.load_from_df(ratings_train[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

# Define the SVD model and train it on the training set
svdmodel = SVD(n_factors=100, lr_all=0.01, reg_all=1)
svdmodel.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb93994b6d0>

### Latent Factor Model validation

In [13]:
r = ratings_valid[["userId","movieId"]]

predicted_ratings = []

for index, (user_id, movie_id) in enumerate(zip(r["userId"], r["movieId"])):
    pred_rat = svdmodel.predict(user_id, movie_id)
    predicted_ratings.append(pred_rat.est)

r["predicted_rating"] = predicted_ratings

r_true = ratings_valid['rating'].to_numpy()
r_pred = r["predicted_rating"].to_numpy()

rmse = mean_squared_error(r_true, r_pred, squared = False)
print("RMSE: ", rmse)

RMSE:  0.8697991632245714


# 6. Final: Ensemble Recommendation System

In [14]:
def predict_rating(df):
    '''
    Input:
        df = a dataframe with two columns: userId, movieId
    Output:
        a dataframe with three columns: userId, movieId, rating
    '''
    ############################## Model Construction ##################################
    # Global Bias Model
    global_bias_model = GlobalBias(utility_matrix)

    # Collaborative Filtering 
    # Item-based
    itembased_cf = ItemBasedCollaborativeFiltering()
    itembased_cf.fit(utility_matrix)

    # User-based
    userbased_model = UserBasedCollaborativeFiltering()
    userbased_model.fit(utility_matrix)

    # Laten Factor Model
    svdmodel = SVD(n_factors=100, lr_all=0.01, reg_all=1)
    svdmodel.fit(trainset)

    ############################## Model Prediction ##################################
    # Define list so storing the predicted values
    gb_pred = []
    itcf_pred = []
    ucf_pred = []
    svdm_pred = []

    # Predict the rating for each pair of userId and movieId
    for index, (user_id, movie_id) in enumerate(zip(r["userId"], r["movieId"])):
        # Global Bias
        gb = global_bias_model.predict(user_id, movie_id)
        gb_pred.append(gb)

        # Item-based
        itcf = itembased_cf.predict(user_id, movie_id)
        itcf_pred.append(itcf)

        # User-based 
        ucf = userbased_model.predict(user_id, movie_id)
        ucf_pred.append(ucf)

        # Latent Factor Model
        svd = svdmodel.predict(user_id, movie_id).est
        svdm_pred.append(svd)

    ############################## Construct Final Rating ##################################
    df["gb_pred"] = gb_pred
    df["itcf_pred"] = itcf_pred 
    df["ucf_pred"] = ucf_pred
    df["svdm_pred"] = svdm_pred

    # Calculate the weighted rating
    df["rating"] = 0.15*df["gb_pred"] + 0.4*df["itcf_pred"] + 0.2*df["ucf_pred"] + 0.25*df["svdm_pred"] 

    return df[["userId","movieId","rating"]]

# the ratnigs_valid.csv file is subject to change during the grading process
# this file can be change earlier in the problem when loading the necessary dataset
r = ratings_valid[["userId","movieId"]]

ratings_pred = predict_rating(r)

# 7. Rating Validation: Calculating RMSE
The result from the cell below is the RMSE we got using the Ensemble model

`Ensemble Recommendation System = Global Bias + Item-based Collaborative Filtering + User-based Collaborative Filtering + Latent Factor Model`

In [15]:
r_true = ratings_valid["rating"].to_numpy()
r_pred = r["rating"].to_numpy()

rmse = mean_squared_error(r_true, r_pred, squared=False)
print(f"RMSE = {rmse:.4f}")

RMSE = 0.8182
