<a href="https://colab.research.google.com/github/efandresena/large-scale-movie-recommendation/blob/main/mirindraf_mlas_practical_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Structure Supporting Genres and Titles

In [2]:
import numpy as np
from numba import njit, prange
import random


class CompactDatasetCSR:
    """
    Ratings dataset stored in CSR (Compressed Sparse Row) form for Numba.

    Ratings are buffered with `add_rating` and then frozen by `finalize`
    into two sets of parallel NumPy arrays (rather than structured arrays):

    * user-major:  `user_indptr`, `user_movie_ids`, `user_ratings`
    * movie-major: `movie_indptr`, `movie_user_ids`, `movie_ratings`

    For row `r`, its entries live in the half-open slice
    `indptr[r]:indptr[r + 1]` of the matching flat arrays.

    Raw user / movie ids are mapped to dense 0-based positions. The mapping
    can be shared between datasets (e.g. a train/test pair) so that the same
    position refers to the same user or movie in both.
    """

    def __init__(self, shared_index=None):
        """
        Args:
            shared_index: Optional tuple of shared index mappings from another dataset
                         Format: (userId_to_idx, idx_to_userId, movieId_to_idx, idx_to_movieId)
                         When given, this dataset reuses those objects and never
                         adds new ids to them.
        """
        if shared_index is not None:
            # Share index with another dataset (for train/test split)
            (self.userId_to_idx, self.idx_to_userId,
             self.movieId_to_idx, self.idx_to_movieId) = shared_index
            self._owns_index = False
        else:
            # Create new index
            self.userId_to_idx = {}
            self.idx_to_userId = []
            self.movieId_to_idx = {}
            self.idx_to_movieId = []
            self._owns_index = True

        # Temporary storage for ratings before finalization:
        # list of (user_idx, movie_idx, rating) tuples
        self._temp_ratings = []

        # CSR format for user ratings (row-based)
        self.user_indptr = None      # Pointer to start of each user's ratings
        self.user_movie_ids = None   # Flat array of movie indices
        self.user_ratings = None     # Flat array of ratings

        # CSR format for movie ratings (column-based)
        self.movie_indptr = None     # Pointer to start of each movie's ratings
        self.movie_user_ids = None   # Flat array of user indices
        self.movie_ratings = None    # Flat array of ratings

        # Movie features, keyed by the raw movieId (not the dense index)
        self.movie_titles = {}
        self.movie_genres = {}

        self._finalized = False

    @property
    def usr_size(self):
        """Number of users currently present in the index."""
        return len(self.idx_to_userId)

    @property
    def movie_size(self):
        """Number of movies currently present in the index."""
        return len(self.idx_to_movieId)

    def get_shared_index(self):
        """Return index mappings to share with another dataset.

        The returned objects are the live containers, not copies.
        """
        return (self.userId_to_idx, self.idx_to_userId,
                self.movieId_to_idx, self.idx_to_movieId)

    def add_rating(self, userId, movieId, rating_value):
        """Add a rating entry to the dataset.

        If this dataset owns its index, unseen ids are assigned the next free
        position. If the index is shared, the rating is silently skipped when
        either id is missing from the shared index.

        Raises:
            RuntimeError: if the dataset has already been finalized.
        """
        if self._finalized:
            raise RuntimeError("Cannot add ratings after finalization")

        # Only the owner of the index is allowed to grow it.
        if self._owns_index:
            if userId not in self.userId_to_idx:
                self.userId_to_idx[userId] = len(self.idx_to_userId)
                self.idx_to_userId.append(userId)
            if movieId not in self.movieId_to_idx:
                self.movieId_to_idx[movieId] = len(self.idx_to_movieId)
                self.idx_to_movieId.append(movieId)

        # "pos" is the dense row (user) / column (movie) index.
        user_pos = self.userId_to_idx.get(userId)
        movie_pos = self.movieId_to_idx.get(movieId)
        if user_pos is None or movie_pos is None:
            return

        self._temp_ratings.append((user_pos, movie_pos, rating_value))

    @staticmethod
    def _row_pointers(row_idx, n_rows):
        """Return the CSR indptr array (length n_rows + 1) for `row_idx`."""
        indptr = np.zeros(n_rows + 1, dtype=np.int64)
        # indptr[r + 1] is the number of entries in rows 0..r; rows without
        # entries therefore get an empty slice.
        indptr[1:] = np.cumsum(np.bincount(row_idx, minlength=n_rows))
        return indptr

    def finalize(self):
        """Convert to CSR format for Numba optimization.

        Builds the user-major arrays (ordered by user, then movie) and the
        movie-major arrays (ordered by movie, then user), then releases the
        temporary rating buffer. Calling it again is a no-op.
        """
        if self._finalized:
            return

        n_ratings = len(self._temp_ratings)
        print(f"Finalizing dataset to CSR format... ({n_ratings} ratings)")

        # Unpack the buffered tuples into flat arrays once, so that sorting
        # and gathering run inside NumPy instead of per-element Python loops.
        user_idx = np.fromiter((entry[0] for entry in self._temp_ratings),
                               dtype=np.int64, count=n_ratings)
        movie_idx = np.fromiter((entry[1] for entry in self._temp_ratings),
                                dtype=np.int64, count=n_ratings)
        ratings = np.array([entry[2] for entry in self._temp_ratings],
                           dtype=np.float32)

        # np.lexsort treats the LAST key as the primary one and is stable, so
        # duplicate (user, movie) pairs keep their insertion order.
        print("  Sorting by user...")
        by_user = np.lexsort((movie_idx, user_idx))

        print("  Building user CSR arrays...")
        self.user_indptr = self._row_pointers(user_idx, self.usr_size)
        self.user_movie_ids = movie_idx[by_user].astype(np.int32)
        self.user_ratings = ratings[by_user]

        print("  Sorting by movie...")
        by_movie = np.lexsort((user_idx, movie_idx))

        print("  Building movie CSR arrays...")
        self.movie_indptr = self._row_pointers(movie_idx, self.movie_size)
        self.movie_user_ids = user_idx[by_movie].astype(np.int32)
        self.movie_ratings = ratings[by_movie]

        # Free temporary memory
        self._temp_ratings = None

        self._finalized = True
        print(f"✓ Finalized: {self}")

    def add_movie_metadata(self, movieId, title, genres):
        """Store the title and genres for the raw `movieId`."""
        self.movie_titles[movieId] = title
        self.movie_genres[movieId] = genres

    def get_movie_metadata(self, movie_idx):
        """Return `(movieId, title, genres)` for a dense movie index.

        `title` / `genres` are None when no metadata was stored for the movie.
        """
        movieId = self.idx_to_movieId[movie_idx]
        return movieId, self.movie_titles.get(movieId), self.movie_genres.get(movieId)

    def __repr__(self):
        if self._finalized:
            total = len(self.user_ratings) if self.user_ratings is not None else 0
        else:
            total = len(self._temp_ratings)
        return f"CompactDatasetCSR(users={self.usr_size}, movies={self.movie_size}, ratings={total})"




In [None]:
# adding features
import csv

train = CompactDatasetCSR()

filepath = '/content/ml-25m/movies.csv'

# get_shared_index() hands back the dataset's own mapping objects, so a single
# lookup before the loop is enough.
_, _, movieId_to_idx, idx_to_movieId = train.get_shared_index()

# Insertion-ordered dict used as an ordered set: O(1) membership test while
# keeping genres in first-seen order.
genre_order = {}

# newline='' is the csv module's recommended way to open its input files.
# NOTE(review): encoding assumed to be UTF-8 for this file - confirm.
with open(filepath, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header
    for movieId, title, genres in reader:
        genre = genres.strip().split('|')
        genre_order.update(dict.fromkeys(genre))

        # Insert the title and genre, but only for movies present in the
        # dataset's index.
        # NOTE(review): `train` is created empty just above, so this guard
        # rejects every movie until ratings have been added to it. Also,
        # movieId is the raw CSV string here - confirm the rating loader
        # indexes movies under the same type.
        if movieId in movieId_to_idx:
            train.add_movie_metadata(movieId, title, genre)

list_genre = list(genre_order)
len(list_genre)


# Implementing the new feature in the training loop