In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import scipy
from helpers import *
from collaborative import * 
import joblib
import os
import time
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

2024-12-15 14:32:27.422383: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734269547.615774   52813 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734269547.670048   52813 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-15 14:32:28.032205: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Load the ratings and the final dataset to take a subset of movies
df_ratings = pd.read_csv('Data/TMdB/ratings.csv')
df_links = pd.read_csv('Data/TMdB/links.csv')
df_final_dataset = pd.read_csv('movie_metadata.csv')

In [19]:
# To have the same ids as the final dataset
df_links['imdbId'] = df_links['imdbId'].apply(format_imdb_id)

In [20]:
df_ratings_with_imdb_id = pd.merge(df_ratings, df_links, on='movieId', how='left')

In [21]:
df_ratings_with_imdb_id = df_ratings_with_imdb_id[df_ratings_with_imdb_id['imdbId'].isin(df_final_dataset['imdb_id'].unique())]

In [22]:
df_ratings_with_imdb_id.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,tt0112573,197.0
1,1,147,4.5,1425942435,tt0112461,10474.0
2,1,858,5.0,1425941523,tt0068646,238.0
3,1,1221,5.0,1425941546,tt0071562,240.0
4,1,1246,5.0,1425941556,tt0097165,207.0


In [23]:
df_ratings_with_imdb_id['imdbId'] = df_ratings_with_imdb_id['imdbId'].apply(unformat_imdb_id)

In [24]:
df_ratings = filter_ratings_dataframe(df_ratings_with_imdb_id, 5)
df_ratings.shape

Number of valid users: 255062
Number of valid movies: 19485


(25039215, 6)

In [25]:
user_map, reverse_user_map = create_mapping(df_ratings['userId'])
movie_map, reverse_movie_map = create_mapping(df_ratings['imdbId'])


Creating ID mappings...
Mapping created in 0.37 seconds
Total unique elements: 255062

Creating ID mappings...
Mapping created in 3.13 seconds
Total unique elements: 19485


In [26]:
df_ratings['sparse_user_id'] = df_ratings['userId'].map(user_map)
df_ratings['sparse_movie_id'] = df_ratings['imdbId'].map(movie_map)
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,sparse_user_id,sparse_movie_id
0,1,110,1.0,1425941529,112573,197.0,0,0
1,1,147,4.5,1425942435,112461,10474.0,0,1


In [27]:
df_ratings.drop(columns=['timestamp', 'userId', 'movieId', 'tmdbId', 'imdbId'], inplace=True)
df_ratings.head(2)

Unnamed: 0,rating,sparse_user_id,sparse_movie_id
0,1.0,0,0
1,4.5,0,1


In [41]:
df_ratings.to_csv('df_ratings_knn.csv')

In [12]:
n_items = 19485
n_users = 255062
ratings_matrix = create_sparse_ratings_matrix(df_ratings, n_users, n_items)

Global mean rating: 3.53


In [13]:
start = time.time()
train, test = split_into_train_and_test(df_ratings)
print(time.time()-start)


--- Performing Train-Test Split ---
Total ratings: 25039215
Training set size: 19934673 (79.61%)
Test set size: 5104542 (20.39%)
379.8217647075653


In [14]:
ratings_matrix_train = create_sparse_ratings_matrix(train, n_users, n_items)

Global mean rating: 3.53


In [15]:
ratings_matrix_test = create_sparse_ratings_matrix(test, n_users, n_items)

Global mean rating: 3.53


### SVD

In [3]:
class SparseSVDRecommender:
    def __init__(self, n_factors: int = 100, learning_rate: float = 0.005,
                 regularization: float = 0.02, n_epochs: int = 20):
        """
        Initialize the SVD recommender system with verbose logging and serialization.
        """
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.n_epochs = n_epochs

        # Model parameters
        self.user_factors = None
        self.item_factors = None
        self.user_biases = None
        self.item_biases = None
        self.global_mean = None

        # Mapping dictionaries
        self.user_id_map = {}
        self.movie_id_map = {}
        self.reverse_user_id_map = {}
        self.reverse_movie_id_map = {}


    def fit(self, ratings_matrix_train: csr_matrix, 
            user_id_map: dict = None, 
            movie_id_map: dict = None) -> 'SparseSVDRecommender3':
        """
        Train the model using a pre-computed sparse ratings matrix.
        
        Args:
            ratings_matrix_train (csr_matrix): Sparse training ratings matrix
            user_id_map (dict, optional): Mapping of original user IDs to matrix indices
            movie_id_map (dict, optional): Mapping of original movie IDs to matrix indices
        """
        print("\n--- Starting SVD Training ---")
        start_total_time = time.time()

        # If maps are not provided, create default mappings
        if user_id_map is None:
            self.user_id_map = {i: i for i in range(ratings_matrix_train.shape[0])}
            self.reverse_user_id_map = self.user_id_map
        else:
            self.user_id_map = user_id_map
            self.reverse_user_id_map = {idx: user for user, idx in user_id_map.items()}

        if movie_id_map is None:
            self.movie_id_map = {i: i for i in range(ratings_matrix_train.shape[1])}
            self.reverse_movie_id_map = self.movie_id_map
        else:
            self.movie_id_map = movie_id_map
            self.reverse_movie_id_map = {idx: movie for movie, idx in movie_id_map.items()}

            # Compute global mean
        self.global_mean = ratings_matrix_train.data.mean()
        print(f"Global mean rating: {self.global_mean:.2f}")

        # Get matrix dimensions
        n_users, n_items = ratings_matrix_train.shape
        print(f"Matrix dimensions: {n_users} users x {n_items} movies")

        # Initialize matrices
        self._init_matrices(n_users, n_items)

        # Get indices of non-zero elements
        users, items = ratings_matrix_train.nonzero()
        total_ratings = len(users)
        print(f"Total ratings to train on: {total_ratings}")

            # Training loop
        for epoch in range(self.n_epochs):
            epoch_start_time = time.time()

            # Shuffle the order of training examples
            shuffle_indices = np.random.permutation(len(users))

            # Track total error for the epoch
            total_error = 0

            for idx in shuffle_indices:
                u, i = users[idx], items[idx]
                r = ratings_matrix_train[u, i]

                # Compute current prediction
                pred = (self.global_mean +
                        self.user_biases[u] +
                        self.item_biases[i] +
                        self.user_factors[u] @ self.item_factors[i])

                # Compute error
                error = r - pred
                total_error += error ** 2

                # Update biases and factors
                self.user_biases[u] += self.learning_rate * (error - self.regularization * self.user_biases[u])
                self.item_biases[i] += self.learning_rate * (error - self.regularization * self.item_biases[i])

                user_factors_update = (error * self.item_factors[i] -
                                       self.regularization * self.user_factors[u])
                item_factors_update = (error * self.user_factors[u] -
                                       self.regularization * self.item_factors[i])

                self.user_factors[u] += self.learning_rate * user_factors_update
                self.item_factors[i] += self.learning_rate * item_factors_update

            # Print epoch summary
            rmse = np.sqrt(total_error / total_ratings)
            epoch_time = time.time() - epoch_start_time
            print(f"Epoch {epoch + 1}/{self.n_epochs}: RMSE = {rmse:.4f}, Time = {epoch_time:.2f}s")

        # Final training summary
        total_training_time = time.time() - start_total_time
        print(f"\n--- Training Complete ---")
        print(f"Total training time: {total_training_time:.2f} seconds")

        return self

    def evaluate(self, ratings_matrix_test: csr_matrix, 
         include_unrated: bool = False):
        """
        Evaluate model performance on test sparse matrix.
        
        Args:
            ratings_matrix_test (csr_matrix): Sparse test ratings matrix
            include_unrated (bool): Whether to include predictions for unrated items

        Returns:
            Dict with performance metrics, actual ratings, and predictions
        """
        print("\n--- Model Evaluation ---")
        
        # Compute predictions for test set
        predictions = []
        actual_ratings = []
        
        # Get test set non-zero indices
        test_users, test_items = ratings_matrix_test.nonzero()
        
        for idx in range(len(test_users)):
            u, i = test_users[idx], test_items[idx]
            actual_rating = ratings_matrix_test[u, i]
            
            # Translate back to original IDs if needed
            orig_user_id = self.reverse_user_id_map.get(u, u)
            orig_movie_id = self.reverse_movie_id_map.get(i, i)
            
            try:
                pred = self.predict(orig_user_id, orig_movie_id)
                predictions.append(pred)
                actual_ratings.append(actual_rating)
            except Exception as e:
                print(f"Prediction error for user {u}, movie {i}: {e}")
                continue
        
        # Compute metrics
        rmse = np.sqrt(mean_squared_error(actual_ratings, predictions))
        mae = mean_absolute_error(actual_ratings, predictions)
        
        print("Performance Metrics:")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"Total Test Ratings: {len(actual_ratings)}")
        
        return {
            'RMSE': rmse,
            'MAE': mae,
            'Total Test Ratings': len(actual_ratings)
        }, actual_ratings, predictions

    def save_model(self, filepath: str = 'svd_recommender_model.joblib'):
        """Save the entire model state to a file."""
        try:
            if self.user_factors is None:
                raise ValueError("Model must be trained before saving")

            model_state = {
                'user_factors': self.user_factors,
                'item_factors': self.item_factors,
                'user_biases': self.user_biases,
                'item_biases': self.item_biases,
                'global_mean': self.global_mean,
                'user_id_map': self.user_id_map,
                'movie_id_map': self.movie_id_map,
                'n_factors': self.n_factors,
                'learning_rate': self.learning_rate,
                'regularization': self.regularization,
                'n_epochs': self.n_epochs
            }

            joblib.dump(model_state, filepath)
            print(f"Model successfully saved to {filepath}")

        except Exception as e:
            print(f"Error saving model: {e}")

    def load_model(self, filepath: str = 'svd_recommender_model.joblib'):
        """Load a previously saved model state."""
        try:
            if not os.path.exists(filepath):
                raise FileNotFoundError(f"No model file found at {filepath}")

            model_state = joblib.load(filepath)

            # Restore model parameters
            self.user_factors = model_state['user_factors']
            self.item_factors = model_state['item_factors']
            self.user_biases = model_state['user_biases']
            self.item_biases = model_state['item_biases']
            self.global_mean = model_state['global_mean']

            # Restore mapping dictionaries
            self.user_id_map = model_state['user_id_map']
            self.movie_id_map = model_state['movie_id_map']

            # Recreate reverse mapping dictionaries
            self.reverse_user_id_map = {idx: user for user, idx in self.user_id_map.items()}
            self.reverse_movie_id_map = {idx: movie for movie, idx in self.movie_id_map.items()}

            print(f"Model successfully loaded from {filepath}")
            print(f"Loaded model details:")
            print(f"  - Latent Factors: {model_state['n_factors']}")
            print(f"  - Unique Users: {len(self.user_id_map)}")
            print(f"  - Unique Movies: {len(self.movie_id_map)}")

            return self

        except Exception as e:
            print(f"Error loading model: {e}")
            return None

    def _init_matrices(self, n_users: int, n_items: int):
        """Initialize model parameters with logging."""
        print(f"\nInitializing matrices with {self.n_factors} latent factors")
        self.user_factors = np.random.normal(0, 0.1, (n_users, self.n_factors))
        self.item_factors = np.random.normal(0, 0.1, (n_items, self.n_factors))
        self.user_biases = np.zeros(n_users)
        self.item_biases = np.zeros(n_items)

    def predict(self, user_id: int, movie_id: int) -> float:
        """Predict rating with error handling and logging."""
        if self.user_factors is None:
            print("Error: Model must be trained before making predictions")
            return None

        try:
            u_idx = self.user_id_map[user_id]
            m_idx = self.movie_id_map[movie_id]
        except KeyError:
            return self.global_mean

        prediction = (self.global_mean +
                      self.user_biases[u_idx] +
                      self.item_biases[m_idx] +
                      self.user_factors[u_idx] @ self.item_factors[m_idx])

        return prediction

    def recommend_items(self, user_id: int, n_items: int = 10,
                        exclude_rated: bool = True):
        """Recommend items with detailed logging."""
        if self.user_factors is None:
            print("Error: Model must be trained before making recommendations")
            return []

        try:
            u_idx = self.user_id_map[user_id]
        except KeyError:
            print(f"Warning: User {user_id} not found in training data")
            return []

        # Calculate predictions for all items
        user_vector = (self.global_mean +
                       self.user_biases[u_idx] +
                       self.user_factors[u_idx] @ self.item_factors.T)

        # Create array of (movie_id, prediction) tuples
        predictions = []
        for m_idx, pred in enumerate(user_vector):
            movie_id = self.reverse_movie_id_map[m_idx]
            predictions.append((movie_id, pred))

        # Sort by predicted rating
        predictions.sort(key=lambda x: x[1], reverse=True)

        # Optional: exclude already rated movies
        if exclude_rated:
            rated_movies = set(df_filtered[df_filtered['sparse_user_id'] == user_id]['sparse_movie_id'])
            predictions = [rec for rec in predictions if rec[0] not in rated_movies]

        return predictions

    def handle_new_user(self, user_ratings, n_items=10):
        """
        Incorporate new user ratings into recommendations without full retraining

        Args:
            user_ratings (pd.DataFrame): DataFrame with columns 'movieId' and 'rating'
            n_items (int): Number of recommendations to return

        Returns:
            List of recommended movie IDs
        """
        # Check if model is trained
        if self.user_factors is None:
            raise ValueError("Model must be trained first")

        # Validate movie IDs exist in original training data
        valid_movies = [mid for mid in user_ratings['sparse_movie_id'] if mid in self.movie_id_map]
        valid_ratings = user_ratings[user_ratings['sparse_movie_id'].isin(valid_movies)]

        if len(valid_movies) == 0:
            print("No valid movie ratings found")
            return []

        # Ensure consistent dimensionality
        new_user_factors = np.zeros(self.item_factors.shape[1])
        new_user_bias = 0

        for _, row in valid_ratings.iterrows():
            movie_idx = self.movie_id_map[row['sparse_movie_id']]

            # Safely handle potential shape mismatches
            movie_factors = self.item_factors[movie_idx]

            # Ensure correct dimensionality
            if len(movie_factors) != len(new_user_factors):
                # Truncate or pad the movie factors to match new_user_factors
                if len(movie_factors) > len(new_user_factors):
                    movie_factors = movie_factors[:len(new_user_factors)]
                else:
                    padded_factors = np.zeros(len(new_user_factors))
                    padded_factors[:len(movie_factors)] = movie_factors
                    movie_factors = padded_factors

            # Update user vector with weighted item factors
            new_user_factors += row['rating'] * movie_factors
            new_user_bias += row['rating'] - self.global_mean

        # Normalize by number of ratings
        new_user_factors /= len(valid_ratings)
        new_user_bias /= len(valid_ratings)

        # Compute predictions for all items
        predictions = (self.global_mean +
                       new_user_bias +
                       new_user_factors @ self.item_factors.T)

        predictions = np.clip(predictions, None, 5)

        def normalize_rating(predicted_rating, min_pred, max_pred, min_rating, max_rating):
            return min_rating + (predicted_rating - min_pred) * (max_rating - min_rating) / (max_pred - min_pred)

        def transform_rating(predicted_rating, min_rating, max_rating):
            range_width = max_rating - min_rating
            return min_rating + range_width / (1 + np.exp(-predicted_rating))

        #print("normalizing")
        predictions = transform_rating(predictions, 0.5, 5)  
        #predictions = normalize_rating(predictions, np.min(predictions), np.max(predictions), 0.5, 5)

        # Create and sort recommendations
        recommendations = [
            (self.reverse_movie_id_map[idx], pred)
            for idx, pred in enumerate(predictions)
        ]
        recommendations.sort(key=lambda x: x[1], reverse=True)

        # Exclude movies already rated
        rated_movies = set(valid_ratings['sparse_movie_id'])
        recommendations = [rec for rec in recommendations if rec[0] not in rated_movies]

        return recommendations

In [5]:
recommender = SparseSVDRecommender()

recommender.load_model('sample_svd_model14.joblib')

Model successfully loaded from sample_svd_model14.joblib
Loaded model details:
  - Latent Factors: 20
  - Unique Users: 255062
  - Unique Movies: 19485


<__main__.SparseSVDRecommender at 0x70b59274a9f0>

In [None]:
def run_svd_recommender_example():
    # Model file path
    MODEL_PATH = 'sample_svd_model15.joblib'
    
    # Initialize recommender
    recommender = SparseSVDRecommender(
        n_factors=100,  # Number of latent factors
        learning_rate=0.005,
        regularization=0.02,
        n_epochs=10
    )
    
    # Check if model exists
    if os.path.exists(MODEL_PATH):
        print("\n--- Loading Existing Model ---")
        loaded_recommender = recommender.load_model(MODEL_PATH)
        
        if loaded_recommender is not None:
            # Use loaded model for evaluation and recommendations
            print("Existing model loaded successfully!")
            
            # Evaluate the loaded model
            metrics, actual_ratings, predictions = loaded_recommender.evaluate(ratings_matrix_test)
            
            # Demonstrate recommendations for a specific user
            print("\n--- Sample Recommendations ---")
            # Pick a random user from the original dataset
            sample_user = df_ratings['sparse_user_id'].sample(1).values[0]
            
            # Prepare sample user ratings for new user recommendation
            sample_user_ratings = train[train['sparse_user_id'] == sample_user][['sparse_movie_id', 'rating']].copy()
            
            # Get recommendations
            recommendations = loaded_recommender.handle_new_user(
                sample_user_ratings, 
                n_items=100
            )
            
            print(f"Recommendations for User {sample_user}:")
            for movie_id, score in recommendations:
                print(f"Movie ID: {movie_id}, Predicted Rating: {score:.2f}")
        else:
            print("Failed to load existing model. Training new model.")
            # Proceed with training new model
            train_and_save_model(recommender, ratings_matrix_train, ratings_matrix_test, MODEL_PATH)
    else:
        print("\n--- No Existing Model Found. Training New Model ---")
        # Train and save new model
        train_and_save_model(recommender, ratings_matrix_train, ratings_matrix_test, MODEL_PATH)

# Helper function to train and save model
def train_and_save_model(recommender, train_matrix, test_matrix, model_path):
    # Fit the model
    recommender.fit(
        ratings_matrix_train=train_matrix,
    )
    
    # Evaluate the model
    metrics, actual_ratings, predictions = recommender.evaluate(ratings_matrix_test)
    
    # Save the model
    recommender.save_model(model_path)
    
    # Demonstrate recommendations for a specific user
    print("\n--- Sample Recommendations ---")
    # Pick a random user from the original dataset
    sample_user = df_ratings['sparse_user_id'].sample(1).values[0]
    
    # Prepare sample user ratings for new user recommendation
    sample_user_ratings = train[train['sparse_user_id'] == sample_user][['sparse_movie_id', 'rating']].copy()
    
    # Get recommendations
    recommendations = recommender.handle_new_user(
        sample_user_ratings, 
        n_items=100
    )
    
    print(f"Recommendations for User {sample_user}:")
    print(len(recommendations))
    for movie_id, score in recommendations[:10]:
        print(f"Movie ID: {movie_id}, Predicted Rating: {score:.2f}")

# Run the example
run_svd_recommender_example()

In [141]:
recommender = SparseSVDRecommender()

In [142]:
loaded_recommender = recommender.load_model('sample_svd_model2.joblib')

Model successfully loaded from sample_svd_model2.joblib
Loaded model details:
  - Latent Factors: 20
  - Unique Users: 255062
  - Unique Movies: 19485


In [143]:
new_ratings = [('112573', 3.5), ('97165', 2), ('468569', 5), ('1010048', 4.5), ('814335', 4.5), ('209144', 3.5), ('796366', 2.5), 
               ('119094', 4)]

In [144]:
movie_map

{'112573': 0,
 '112461': 1,
 '68646': 2,
 '71562': 3,
 '97165': 4,
 '88847': 5,
 '167404': 6,
 '91042': 7,
 '137523': 8,
 '209144': 9,
 '246578': 10,
 '280760': 11,
 '372784': 12,
 '829482': 13,
 '468569': 14,
 '371746': 15,
 '796366': 16,
 '417741': 17,
 '988045': 18,
 '926084': 19,
 '1392170': 20,
 '1515091': 21,
 '1645080': 22,
 '1659337': 23,
 '1853728': 24,
 '113041': 25,
 '113627': 26,
 '114746': 27,
 '110877': 28,
 '118002': 29,
 '116731': 30,
 '115685': 31,
 '76759': 32,
 '114924': 33,
 '111257': 34,
 '117247': 35,
 '117381': 36,
 '117060': 37,
 '117765': 38,
 '116629': 39,
 '116213': 40,
 '117218': 41,
 '86190': 42,
 '82096': 43,
 '117731': 44,
 '116743': 45,
 '118880': 46,
 '107290': 47,
 '107614': 48,
 '108052': 49,
 '103772': 50,
 '83866': 51,
 '88763': 52,
 '93010': 53,
 '94715': 54,
 '109445': 55,
 '106292': 56,
 '93779': 57,
 '119731': 58,
 '118708': 59,
 '118883': 60,
 '119488': 61,
 '119345': 62,
 '118971': 63,
 '118715': 64,
 '120902': 65,
 '118998': 66,
 '99674': 67,

In [145]:
new_ratings_df = pd.DataFrame(new_ratings, columns=['original_movie_id', 'rating'])
new_ratings_df['sparse_movie_id'] = new_ratings_df['original_movie_id'].map(movie_map)
ratings_to_pass_df = new_ratings_df.drop(columns=['original_movie_id'])
ratings_to_pass_df

Unnamed: 0,rating,sparse_movie_id
0,3.5,0
1,2.0,4
2,5.0,14
3,4.5,163
4,4.5,7045
5,3.5,9
6,2.5,16
7,4.0,351


In [146]:
for _, row in ratings_to_pass_df.iterrows():
    imdbid = format_imdb_id(reverse_movie_map[row['sparse_movie_id']])
    name = df_final_dataset[df_final_dataset['imdb_id'] == imdbid]['title'].values
    if name.size > 0:  # Check if a title is found
        name = name[0]  # Extract the first title
    else:
        name = "Unknown"  # Handle missing titles
    print(f"Movie name: {name}, Rating: {row['rating']}")

Movie name: Braveheart, Rating: 3.5
Movie name: Dead Poets Society, Rating: 2.0
Movie name: The Dark Knight, Rating: 5.0
Movie name: Slumdog Millionaire, Rating: 4.5
Movie name: The Stepfather, Rating: 4.5
Movie name: Memento, Rating: 3.5
Movie name: Star Trek, Rating: 2.5
Movie name: Face/Off, Rating: 4.0


In [147]:
recommendations = loaded_recommender.handle_new_user(ratings_to_pass_df, n_items=5)
for movie_id, score in recommendations[:10]:
    imdbid = format_imdb_id(reverse_movie_map[movie_id])
    name = df_final_dataset[df_final_dataset['imdb_id'] == imdbid]['title'].values
    print(f"Movie name: {name}, Predicted Rating: {score:.2f}")

Movie name: ['Water'], Predicted Rating: 4.90
Movie name: ['The Eye of Vichy'], Predicted Rating: 4.90
Movie name: ['The Lovers'], Predicted Rating: 4.90
Movie name: ['Out of Reach'], Predicted Rating: 4.90
Movie name: ['Trailer Park of Terror'], Predicted Rating: 4.90
Movie name: ['Catch a Fire'], Predicted Rating: 4.90
Movie name: ['Skinwalkers'], Predicted Rating: 4.90
Movie name: ["Finder's Fee"], Predicted Rating: 4.90
Movie name: ['Duplicate'], Predicted Rating: 4.90
Movie name: ['The Oblong Box'], Predicted Rating: 4.90


### Hyperparameters
- _n_factors=10, learning_rate=0.005, regularization=0,    n_epochs=5, RMSE: 0.8445, MAE: 0.6451, sample_svd_model9
  --> This model predicts outside 5_
- _n_factors=10, learning_rate=0.02, regularization=0.02, n_epochs=5, RMSE: 0.8095, MAE: 0.6168, sample_svd_model13
  --> This model predicts outside 5_
- _n_factors=30, learning_rate=0.01, regularization=0.02, n_epochs=5, RMSE: 0.8250, MAE: 0.6298, sample_svd_model15
  --> This model predicts outside 5_
  
  
- n_factors=50, learning_rate=0.005, regularization=0.02, n_epochs=1, RMSE: 0.8941, MAE: 0.6874, sample_svd_model1
- n_factors=20, learning_rate=0.005, regularization=0.02, n_epochs=1, RMSE: 0.8939, MAE: 0.6871, sample_svd_model2
- n_factors=10, learning_rate=0.005, regularization=0.02, n_epochs=1, RMSE: 0.8937, MAE: 0.6870, sample_svd_model3
- n_factors=10, learning_rate=0.005, regularization=0.02, n_epochs=3, RMSE: 0.8774, MAE: 0.6731, sample_svd_model4
- n_factors=10, learning_rate=0.005, regularization=0.1,  n_epochs=3, RMSE: 0.8803, MAE: 0.6767, sample_svd_model5
- n_factors=10, learning_rate=0.005, regularization=0.05, n_epochs=3, RMSE: 0.8789, MAE: 0.6747, sample_svd_model6
- n_factors=10, learning_rate=0.005, regularization=0.01, n_epochs=3, RMSE: 0.8763, MAE: 0.6717, sample_svd_model7
- n_factors=10, learning_rate=0.005, regularization=0,    n_epochs=3, RMSE: 0.8720, MAE: 0.6680, sample_svd_model8
- n_factors=10, learning_rate=0.005, regularization=0.02, n_epochs=5, RMSE: 0.8635, MAE: 0.6613, sample_svd_model10, predicted ratings valids
- n_factors=10, learning_rate=0.003, regularization=0.02, n_epochs=5, RMSE: 0.8767, MAE: 0.6722, sample_svd_model11, predicted ratings valids
- n_factors=10, learning_rate=0.01, regularization=0.02, n_epochs=5, RMSE: 0.8300, MAE: 0.6337, sample_svd_model12, predicted ratings valids
- __n_factors=20, learning_rate=0.01, regularization=0.02, n_epochs=5, RMSE: 0.8278, MAE: 0.6319, sample_svd_model14, predicted ratings valids__


    

In [16]:
# Comprehensive workflow demonstration
def comprehensive_svd_demo(df_filtered=df_filtered, model_path='aaaaaaa.joblib'):
    print("\n=== Comprehensive SVD Recommender Workflow ===")
    
    # Check if a saved model exists
    if os.path.exists(model_path):
        print("\nLoading existing model...")
        recommender = SparseSVDRecommender3()
        loaded_recommender = recommender.load_model(model_path)
        
        if loaded_recommender:
            # Perform evaluation on loaded model
            train_df, test_df = SparseSVDRecommender3.train_test_split(df_filtered)
            metrics = loaded_recommender.evaluate(test_df)
            return loaded_recommender, metrics
    
    # If no saved model, train a new one
    print("\nTraining new model...")
    recommender = SparseSVDRecommender3(
        n_factors=50,
        learning_rate=0.005,
        regularization=0.15,
        n_epochs=5
    )
    
    # Split data
    train_df, test_df = recommender.train_test_split(df_filtered)
    
    # Fit on training data
    recommender.fit(train_df)
    
    # Evaluate on test data
    metrics = recommender.evaluate(test_df)
    
    # Save the model
    recommender.save_model(model_path)
    
    return recommender, metrics

# Uncomment to use in your environment
recommender_model, performance = comprehensive_svd_demo(df_ratings)


=== Comprehensive SVD Recommender Workflow ===

Training new model...

--- Performing Stratified Train-Test Split ---
Total ratings: 25039215
Training set size: 19934673 (79.61%)
Test set size: 5104542 (20.39%)

--- Starting SVD Training ---

Creating ID mappings...


KeyError: 'sparse_user_id'

In [17]:
df_ratings

Unnamed: 0,userId,rating,imdbId
0,1,1.0,112573
1,1,4.5,112461
2,1,5.0,68646
3,1,5.0,71562
4,1,5.0,97165
...,...,...,...
26024284,270896,5.0,468569
26024285,270896,5.0,910970
26024286,270896,4.5,1010048
26024287,270896,4.5,421715
