# Train and Save Model
## Ramy - Model Training

This notebook trains the final model and saves it for deployment.

In [1]:
# Import required libraries (standard library first, then third-party)
import os
import pickle
import sys
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Suppress all warnings for the remainder of the session
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")
session_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Training session: {session_stamp}")

‚úÖ All libraries imported successfully!
Training session: 2025-11-29 21:38:52


In [2]:
# Create the production-ready MovieRecommender class

class MovieRecommender:
    """
    Production-ready Movie Recommendation System
    Uses Item-Based Collaborative Filtering with Cosine Similarity

    Recommendation scores are the sum of ``similarity * rating`` over the
    movies the caller has rated. They are not normalised, so they are only
    meaningful for ranking candidates against each other.

    Attributes:
        user_item_matrix: Pivot table of user ratings (users x movies, 0 = unrated)
        item_similarity_df: Item-item similarity matrix
        movies_df: Movie metadata (titles, genres)
        is_trained: Boolean flag indicating if model is trained
        training_info: Dictionary with training metadata
    """

    def __init__(self):
        """Initialize an untrained recommender with empty state."""
        self.user_item_matrix = None
        self.item_similarity_df = None
        self.movies_df = None
        self.is_trained = False
        self.training_info = {
            'trained_at': None,
            'num_users': 0,
            'num_movies': 0,
            'num_ratings': 0,
            'sparsity': 0.0,
            'model_version': '1.0'
        }
        print("üé¨ MovieRecommender initialized")

    def train(self, ratings_df, movies_df, verbose=True):
        """
        Train the recommendation model

        Builds the user-item pivot table, then the item-item cosine
        similarity matrix, and records summary statistics.

        Args:
            ratings_df (DataFrame): User ratings with columns [userId, movieId, rating]
            movies_df (DataFrame): Movie metadata with columns [movieId, title, genres]
            verbose (bool): Print training progress

        Returns:
            dict: Training statistics

        Raises:
            ValueError: If ratings_df contains no rows
        """
        # Fail early with a clear message instead of an obscure error from
        # the sparsity division or the similarity computation further down.
        if ratings_df.empty:
            raise ValueError("ratings_df is empty; cannot train a recommender")

        if verbose:
            print("=" * 80)
            print("TRAINING MOVIE RECOMMENDER")
            print("=" * 80)

        # Store movie metadata (copied so later caller-side edits do not leak in)
        self.movies_df = movies_df.copy()

        # Remove duplicates: keep only the last rating per (user, movie) pair
        ratings_df = ratings_df.drop_duplicates(['userId', 'movieId'], keep='last')

        if verbose:
            print(f"\nüìä Training data:")
            print(f"  ‚Ä¢ Users: {ratings_df['userId'].nunique():,}")
            print(f"  ‚Ä¢ Movies: {ratings_df['movieId'].nunique():,}")
            print(f"  ‚Ä¢ Ratings: {len(ratings_df):,}")
            print(f"  ‚Ä¢ Average rating: {ratings_df['rating'].mean():.2f}")

        # Create user-item matrix
        if verbose:
            print("\n‚è≥ Creating user-item matrix...")

        # Missing (user, movie) pairs are filled with 0, i.e. "unrated"
        self.user_item_matrix = ratings_df.pivot_table(
            index='userId',
            columns='movieId',
            values='rating',
            fill_value=0
        )

        # Calculate sparsity: percentage of cells without a positive rating
        num_ratings = (self.user_item_matrix > 0).sum().sum()
        total_cells = self.user_item_matrix.shape[0] * self.user_item_matrix.shape[1]
        sparsity = (1 - num_ratings / total_cells) * 100

        if verbose:
            print(f"  ‚úÖ Matrix shape: {self.user_item_matrix.shape}")
            print(f"  ‚úÖ Sparsity: {sparsity:.2f}%")

        # Calculate item-item similarity
        if verbose:
            print("\n‚è≥ Computing item-item similarity matrix...")

        # Transpose so that each row handed to cosine_similarity is one movie
        item_similarity = cosine_similarity(self.user_item_matrix.T)

        self.item_similarity_df = pd.DataFrame(
            item_similarity,
            index=self.user_item_matrix.columns,
            columns=self.user_item_matrix.columns
        )

        if verbose:
            print(f"  ‚úÖ Similarity matrix: {self.item_similarity_df.shape}")
            print(f"  ‚úÖ Memory usage: {self.item_similarity_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        # Update training info
        self.training_info = {
            'trained_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'num_users': self.user_item_matrix.shape[0],
            'num_movies': self.user_item_matrix.shape[1],
            'num_ratings': len(ratings_df),
            'sparsity': sparsity,
            'avg_rating': float(ratings_df['rating'].mean()),
            'model_version': '1.0',
            'algorithm': 'Item-Based Collaborative Filtering',
            'similarity_metric': 'Cosine Similarity'
        }

        self.is_trained = True

        if verbose:
            print("\n" + "=" * 80)
            print("‚úÖ TRAINING COMPLETE!")
            print("=" * 80)

        return self.training_info

    def recommend(self, user_ratings, n_recommendations=10, min_similarity=0.0):
        """
        Generate movie recommendations for a user

        Each candidate's score is the sum of ``similarity * rating`` over the
        rated movies whose similarity to the candidate is strictly greater
        than ``min_similarity``. Rated movies that are unknown to the model
        are ignored, and movies the user already rated are never returned.

        Args:
            user_ratings (dict): Dictionary of {movieId: rating}
            n_recommendations (int): Number of recommendations to return
            min_similarity (float): Minimum similarity threshold (0-1)

        Returns:
            DataFrame: Recommended movies with scores, titles, and genres,
            sorted by descending score (empty if nothing can be scored)

        Raises:
            RuntimeError: If the model has not been trained or loaded
            ValueError: If user_ratings is empty
        """
        if not self.is_trained:
            raise RuntimeError("‚ùå Model not trained! Call train() first.")

        if not user_ratings:
            raise ValueError("‚ùå user_ratings cannot be empty")

        similarity = self.item_similarity_df
        item_ids = similarity.index

        # Movies the user has already rated are never recommended back
        unrated = ~item_ids.isin(list(user_ratings))

        # Accumulate one whole similarity column per rated movie instead of
        # visiting every (rated movie, candidate) pair in a Python loop.
        totals = pd.Series(0.0, index=item_ids)
        has_score = pd.Series(False, index=item_ids)

        for movie_id, rating in user_ratings.items():
            # Skip if movie not in our similarity matrix
            if movie_id not in similarity.columns:
                continue

            column = similarity[movie_id]
            contributes = (column > min_similarity) & unrated
            totals = totals.add((column * rating).where(contributes, 0.0))
            has_score = has_score | contributes

        # Only candidates that received at least one contribution are scored
        scores = totals[has_score]

        # Handle case where no recommendations can be generated
        if scores.empty:
            return pd.DataFrame(columns=['movieId', 'title', 'genres', 'score'])

        # Get top N recommendations
        top_scores = scores.nlargest(n_recommendations)

        # Get movie details
        recommendations = self.movies_df[self.movies_df['movieId'].isin(top_scores.index)].copy()

        # Add scores
        recommendations['score'] = recommendations['movieId'].map(top_scores)

        # Sort by score
        recommendations = recommendations.sort_values('score', ascending=False)

        return recommendations[['movieId', 'title', 'genres', 'score']].reset_index(drop=True)

    def get_similar_movies(self, movie_id, n_similar=10, min_similarity=0.3):
        """
        Find movies similar to a given movie

        Args:
            movie_id (int): Movie ID to find similar movies for
            n_similar (int): Number of similar movies to return
            min_similarity (float): Minimum similarity threshold

        Returns:
            DataFrame: Similar movies with similarity scores, most similar
            first; the query movie itself is never included

        Raises:
            RuntimeError: If the model has not been trained or loaded
            ValueError: If movie_id is not part of the similarity matrix
        """
        if not self.is_trained:
            raise RuntimeError("‚ùå Model not trained! Call train() first.")

        if movie_id not in self.item_similarity_df.columns:
            raise ValueError(f"‚ùå Movie ID {movie_id} not found in training data")

        # Remove the query movie by label. Slicing off the first sorted row is
        # wrong when another movie ties with it at the top, or when its own
        # self-similarity does not pass the threshold.
        similarities = self.item_similarity_df[movie_id].drop(movie_id)

        # Filter by threshold, then keep the n most similar
        similar = (
            similarities[similarities >= min_similarity]
            .sort_values(ascending=False)
            .head(max(n_similar, 0))
        )

        # Get movie details
        similar_movies = self.movies_df[self.movies_df['movieId'].isin(similar.index)].copy()
        similar_movies['similarity'] = similar_movies['movieId'].map(similar)
        similar_movies = similar_movies.sort_values('similarity', ascending=False)

        return similar_movies[['movieId', 'title', 'genres', 'similarity']].reset_index(drop=True)

    def get_info(self):
        """Get model information and statistics (a status dict if untrained)"""
        if not self.is_trained:
            return {"status": "Model not trained"}

        return self.training_info

    def save(self, filepath='models/recommender_model.pkl'):
        """
        Save the trained model to disk

        Args:
            filepath (str): Path where to save the model

        Returns:
            str: The path the model was written to

        Raises:
            RuntimeError: If the model has not been trained
        """
        if not self.is_trained:
            raise RuntimeError("‚ùå Cannot save untrained model!")

        # Create directory if it doesn't exist. A bare filename has an empty
        # dirname, and os.makedirs('') raises, so only create when present.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Prepare model data
        model_data = {
            'user_item_matrix': self.user_item_matrix,
            'item_similarity_df': self.item_similarity_df,
            'movies_df': self.movies_df,
            'training_info': self.training_info
        }

        # Save to file
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

        file_size = os.path.getsize(filepath) / 1024**2
        print(f"‚úÖ Model saved to: {filepath}")
        print(f"üì¶ File size: {file_size:.2f} MB")

        return filepath

    def load(self, filepath='models/recommender_model.pkl'):
        """
        Load a trained model from disk

        Args:
            filepath (str): Path to the saved model

        Returns:
            MovieRecommender: This instance, with the stored state restored

        Raises:
            FileNotFoundError: If filepath does not exist
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"‚ùå Model file not found: {filepath}")

        # Load model data
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        # Restore model state
        self.user_item_matrix = model_data['user_item_matrix']
        self.item_similarity_df = model_data['item_similarity_df']
        self.movies_df = model_data['movies_df']
        self.training_info = model_data['training_info']
        self.is_trained = True

        file_size = os.path.getsize(filepath) / 1024**2
        print(f"‚úÖ Model loaded from: {filepath}")
        print(f"üì¶ File size: {file_size:.2f} MB")
        print(f"üìä Model info: {self.training_info['num_movies']} movies, {self.training_info['num_users']} users")

        return self


print("‚úÖ MovieRecommender class created!")

‚úÖ MovieRecommender class created!


In [3]:
# Load the sampled ratings and movie metadata written by the previous notebook
print("=" * 80)
print("LOADING TRAINING DATA")
print("=" * 80)

# Try to load from pickle (faster), fallback to CSV.
# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that this project produced itself.
try:
    ratings_df = pd.read_pickle('../../data/ratings_sample.pkl')
    movies_df = pd.read_pickle('../../data/movies_sample.pkl')
    print("‚úÖ Loaded from pickle files")
except Exception as pickle_error:
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and the reason for
    # the fallback is reported instead of silently discarded.
    print(f"Pickle load failed ({type(pickle_error).__name__}: {pickle_error}); trying CSV...")
    try:
        ratings_df = pd.read_csv('../../data/ratings_sample.csv')
        movies_df = pd.read_csv('../../data/movies_sample.csv')
        print("‚úÖ Loaded from CSV files")
    except Exception:
        print("‚ùå Data files not found! Please run notebook 01 first.")
        raise

print(f"\nüìä Dataset summary:")
print(f"  ‚Ä¢ Ratings: {len(ratings_df):,} rows")
print(f"  ‚Ä¢ Movies: {len(movies_df):,} rows")
print(f"  ‚Ä¢ Users: {ratings_df['userId'].nunique():,}")
print(f"  ‚Ä¢ Movies rated: {ratings_df['movieId'].nunique():,}")

LOADING TRAINING DATA
‚úÖ Loaded from pickle files

üìä Dataset summary:
  ‚Ä¢ Ratings: 100,000 rows
  ‚Ä¢ Movies: 10,329 rows
  ‚Ä¢ Users: 668
  ‚Ä¢ Movies rated: 10,283


In [4]:
# Build a fresh recommender and fit it on the loaded ratings
banner = "=" * 80
print("\n" + banner)
print("INITIALIZING AND TRAINING MODEL")
print(banner)

# Instantiate, then train with progress output enabled
recommender = MovieRecommender()
training_stats = recommender.train(ratings_df, movies_df, verbose=True)

# Echo every statistic returned by train()
print("\nüìä Training Statistics:")
for stat_name in training_stats:
    print(f"  ‚Ä¢ {stat_name}: {training_stats[stat_name]}")


INITIALIZING AND TRAINING MODEL
üé¨ MovieRecommender initialized
TRAINING MOVIE RECOMMENDER

üìä Training data:
  ‚Ä¢ Users: 668
  ‚Ä¢ Movies: 10,283
  ‚Ä¢ Ratings: 100,000
  ‚Ä¢ Average rating: 3.44

‚è≥ Creating user-item matrix...
  ‚úÖ Matrix shape: (668, 10283)
  ‚úÖ Sparsity: 98.54%

‚è≥ Computing item-item similarity matrix...
  ‚úÖ Similarity matrix: (10283, 10283)
  ‚úÖ Memory usage: 807.07 MB

‚úÖ TRAINING COMPLETE!

üìä Training Statistics:
  ‚Ä¢ trained_at: 2025-11-29 21:41:27
  ‚Ä¢ num_users: 668
  ‚Ä¢ num_movies: 10283
  ‚Ä¢ num_ratings: 100000
  ‚Ä¢ sparsity: 98.54419334044155
  ‚Ä¢ avg_rating: 3.437665
  ‚Ä¢ model_version: 1.0
  ‚Ä¢ algorithm: Item-Based Collaborative Filtering
  ‚Ä¢ similarity_metric: Cosine Similarity


In [5]:
# Test the trained model with various scenarios.
# Each scenario prints its results for eyeballing and also asserts basic
# invariants, so the closing "All tests passed" line is backed by real checks.


def _assert_valid_recommendations(recs_df, rated_profile):
    """Check that recommendations skip rated movies and are sorted by score."""
    if recs_df.empty:
        return
    repeated = set(recs_df['movieId']) & set(rated_profile)
    assert not repeated, f"Recommended already-rated movies: {sorted(repeated)}"
    assert recs_df['score'].is_monotonic_decreasing, "Recommendations are not sorted by score"


def _print_compact_recommendations(header, recs_df):
    """Print a header line followed by one bullet per recommendation."""
    print(header)
    for _, rec_row in recs_df.iterrows():
        print(f"  ‚Ä¢ {rec_row['title']} (Score: {rec_row['score']:.2f})")


print("=" * 80)
print("TESTING THE TRAINED MODEL")
print("=" * 80)

# Test 1: Simple user profile
# NOTE(review): the label says "action movies", but per the recorded output
# these IDs are Toy Story, Jumanji and The Usual Suspects.
print("\nüß™ TEST 1: User who likes action movies")
print("-" * 80)

test_profile_1 = {
    1: 5.0,    # Toy Story (1995)
    2: 4.5,    # Jumanji (1995)
    50: 4.0    # Usual Suspects, The (1995)
}

print("User's ratings:")
for movie_id, rating in test_profile_1.items():
    movie_info = movies_df[movies_df['movieId'] == movie_id]
    if not movie_info.empty:
        print(f"  ‚Ä¢ {movie_info['title'].values[0]}: {rating} ‚≠ê")

recs_1 = recommender.recommend(test_profile_1, n_recommendations=5)
_assert_valid_recommendations(recs_1, test_profile_1)
print("\nüé¨ Top 5 Recommendations:")
for idx, row in recs_1.iterrows():
    print(f"  {idx+1}. {row['title']}")
    print(f"     Score: {row['score']:.3f} | Genres: {row['genres']}")

# Test 2: Find similar movies
print("\n\nüß™ TEST 2: Find movies similar to a specific movie")
print("-" * 80)

# Use the movie from the first ratings row as the reference title
sample_movie_id = ratings_df['movieId'].iloc[0]
sample_movie = movies_df[movies_df['movieId'] == sample_movie_id]

if not sample_movie.empty:
    print(f"Base movie: {sample_movie['title'].values[0]}")
    print(f"Genres: {sample_movie['genres'].values[0]}")

    similar = recommender.get_similar_movies(sample_movie_id, n_similar=5)
    print("\nüé¨ Similar movies:")
    for idx, row in similar.iterrows():
        print(f"  {idx+1}. {row['title']}")
        print(f"     Similarity: {row['similarity']:.3f} | Genres: {row['genres']}")

# Test 3: Progressive recommendations (simulating new user)
print("\n\nüß™ TEST 3: Progressive recommendations (New user journey)")
print("-" * 80)

print("\nüë§ Step 1: User rates first movie")
progressive_ratings = {1: 5.0}
recs = recommender.recommend(progressive_ratings, n_recommendations=3)
_assert_valid_recommendations(recs, progressive_ratings)
_print_compact_recommendations("Recommendations after 1 rating:", recs)

print("\nüë§ Step 2: User rates 3 more movies")
progressive_ratings.update({2: 4.0, 50: 4.5, 100: 3.5})
recs = recommender.recommend(progressive_ratings, n_recommendations=3)
_assert_valid_recommendations(recs, progressive_ratings)
_print_compact_recommendations("Recommendations after 4 ratings:", recs)

print("\nüë§ Step 3: User rates 5 more movies")
progressive_ratings.update({150: 5.0, 200: 4.0, 250: 4.5, 300: 3.0, 350: 4.0})
recs = recommender.recommend(progressive_ratings, n_recommendations=3)
_assert_valid_recommendations(recs, progressive_ratings)
_print_compact_recommendations("Recommendations after 9 ratings:", recs)

print("\n‚úÖ All tests passed!")

TESTING THE TRAINED MODEL

üß™ TEST 1: User who likes action movies
--------------------------------------------------------------------------------
User's ratings:
  ‚Ä¢ Toy Story (1995): 5.0 ‚≠ê
  ‚Ä¢ Jumanji (1995): 4.5 ‚≠ê
  ‚Ä¢ Usual Suspects, The (1995): 4.0 ‚≠ê

üé¨ Top 5 Recommendations:
  1. Jurassic Park (1993)
     Score: 6.842 | Genres: Action|Adventure|Sci-Fi|Thriller
  2. Pulp Fiction (1994)
     Score: 6.621 | Genres: Comedy|Crime|Drama|Thriller
  3. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
     Score: 6.438 | Genres: Action|Adventure
  4. Star Wars: Episode VI - Return of the Jedi (1983)
     Score: 6.420 | Genres: Action|Adventure|Sci-Fi
  5. Star Wars: Episode IV - A New Hope (1977)
     Score: 6.385 | Genres: Action|Adventure|Sci-Fi


üß™ TEST 2: Find movies similar to a specific movie
--------------------------------------------------------------------------------
Base movie: Under Siege 2: Dark Territory (1995)
Genres: Actio

In [6]:
# Summarise the trained model: metadata first, then per-component memory footprint
print("=" * 80)
print("MODEL INFORMATION")
print("=" * 80)

model_info = recommender.get_info()

print("\nüìã Model Details:")
for field_name, field_value in model_info.items():
    print(f"  ‚Ä¢ {field_name}: {field_value}")

print("\nüíæ Model Memory Usage:")

# Measure each component once (in bytes) and reuse the figures for the total
user_item_bytes = recommender.user_item_matrix.memory_usage(deep=True).sum()
similarity_bytes = recommender.item_similarity_df.memory_usage(deep=True).sum()
movies_bytes = recommender.movies_df.memory_usage(deep=True).sum()

print(f"  ‚Ä¢ User-Item Matrix: {user_item_bytes / 1024**2:.2f} MB")
print(f"  ‚Ä¢ Item Similarity Matrix: {similarity_bytes / 1024**2:.2f} MB")
print(f"  ‚Ä¢ Movies DataFrame: {movies_bytes / 1024**2:.2f} MB")

total_memory = (user_item_bytes + similarity_bytes + movies_bytes) / 1024**2

print(f"  ‚Ä¢ Total: {total_memory:.2f} MB")

MODEL INFORMATION

üìã Model Details:
  ‚Ä¢ trained_at: 2025-11-29 21:41:27
  ‚Ä¢ num_users: 668
  ‚Ä¢ num_movies: 10283
  ‚Ä¢ num_ratings: 100000
  ‚Ä¢ sparsity: 98.54419334044155
  ‚Ä¢ avg_rating: 3.437665
  ‚Ä¢ model_version: 1.0
  ‚Ä¢ algorithm: Item-Based Collaborative Filtering
  ‚Ä¢ similarity_metric: Cosine Similarity

üíæ Model Memory Usage:
  ‚Ä¢ User-Item Matrix: 52.41 MB
  ‚Ä¢ Item Similarity Matrix: 807.07 MB
  ‚Ä¢ Movies DataFrame: 1.62 MB
  ‚Ä¢ Total: 861.11 MB


In [7]:
# Optionally persist the full trained model
# WARNING: the resulting pickle is a large file!

print("=" * 80)
print("SAVING MODEL")
print("=" * 80)

# Switch for writing the full model locally (large file, excluded from Git).
# Leave it False to skip the write; flip it to True to save.
save_full_model = False

if not save_full_model:
    print("\n‚ö†Ô∏è  Full model NOT saved (file would be too large)")
    print("üí° The model can be recreated by running this notebook")
    print("üí° For deployment, we'll save to Cloud Storage instead")
else:
    model_path = '../../models/saved_models/movie_recommender_v1.pkl'
    recommender.save(model_path)
    print(f"\n‚úÖ Full model saved to: {model_path}")
    print("‚ö†Ô∏è  Note: This file is excluded from Git due to size")

# Recap of what is and is not kept under version control
for summary_line in (
    "\nüì¶ What's saved:",
    "  ‚úÖ Notebook code (can regenerate model)",
    "  ‚úÖ Training pipeline (reproducible)",
    "  ‚ùå Large model file (excluded from Git)",
):
    print(summary_line)

SAVING MODEL

‚ö†Ô∏è  Full model NOT saved (file would be too large)
üí° The model can be recreated by running this notebook
üí° For deployment, we'll save to Cloud Storage instead

üì¶ What's saved:
  ‚úÖ Notebook code (can regenerate model)
  ‚úÖ Training pipeline (reproducible)
  ‚ùå Large model file (excluded from Git)


In [None]:
# Create a lightweight export for API deployment
print("=" * 80)
print("CREATING LIGHTWEIGHT MODEL EXPORT")
print("=" * 80)

# Instead of saving the huge matrices, save only essential info.
# NOTE(review): this export holds no similarity matrix, so it carries only
# metadata, the movie catalog and the precomputed samples below.
lightweight_export = {
    'model_info': recommender.get_info(),
    'movies_df': recommender.movies_df,  # Movie metadata is small
    'sample_recommendations': {}
}

# Generate sample recommendations for documentation
print("\n‚è≥ Generating sample recommendations...")

# Sample from the distinct user IDs. Sampling the raw 'userId' column draws
# rating rows, so a user with many ratings could be picked more than once and
# overwrite their own entry, leaving fewer than five samples.
unique_user_ids = ratings_df['userId'].drop_duplicates()
sample_users = unique_user_ids.sample(min(5, len(unique_user_ids)), random_state=42)

for user_id in sample_users:
    user_ratings_data = ratings_df[ratings_df['userId'] == user_id]
    # Build the profile from the user's first five ratings only
    user_profile = dict(zip(
        user_ratings_data['movieId'].head(5),
        user_ratings_data['rating'].head(5)
    ))

    recs = recommender.recommend(user_profile, n_recommendations=5)
    lightweight_export['sample_recommendations'][int(user_id)] = recs.to_dict('records')

# Save lightweight export
os.makedirs('../../models/exports', exist_ok=True)
export_path = '../../models/exports/model_info.pkl'

with open(export_path, 'wb') as f:
    pickle.dump(lightweight_export, f)

file_size = os.path.getsize(export_path) / 1024
print(f"\n‚úÖ Lightweight export saved to: {export_path}")
print(f"üì¶ File size: {file_size:.2f} KB (much smaller!)")
print("\nüí° This file contains:")
print("  ‚Ä¢ Model metadata and statistics")
print("  ‚Ä¢ Movie catalog")
print("  ‚Ä¢ Sample recommendations for documentation")