# Movie Recommender System - Phase 4: Collaborative Filtering


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load processed data
df = pd.read_csv('../data/processed/movies_processed.csv')
print(f"Movies loaded: {len(df)}")


Movies loaded: 4772


# 1. CREATE SYNTHETIC USER-MOVIE RATING MATRIX


In [2]:
print("\n[1] Creating Synthetic User-Movie Rating Matrix...")
print("-" * 60)

"""
Note: Dataset TMDB không có user ratings, vì vậy ta sẽ:
1. Sử dụng vote_average và vote_count để tạo synthetic ratings
2. Tạo user profiles dựa trên phân phối thực tế
3. Implement các thuật toán CF chuẩn để demo
"""

# Set random seed for reproducibility
np.random.seed(42)

# Tạo synthetic users
n_users = 1000
n_movies = len(df)

print(f"Creating {n_users} synthetic users for {n_movies} movies...")

# Tạo rating matrix sparse (chỉ ~5% có ratings)
def create_synthetic_ratings(df, n_users=1000, sparsity=0.95):
    """
    Create synthetic user-movie rating matrix

    Strategy:
    - Users rate movies based on movie's vote_average with some noise
    - Popular movies (high vote_count) get more ratings
    - Simulate realistic sparsity
    """
    n_movies = len(df)

    # Initialize sparse rating matrix
    ratings_dict = {'userId': [], 'movieId': [], 'rating': []}

    # Normalize vote_count for probability weighting
    vote_count_norm = df['vote_count'] / df['vote_count'].max()

    for user_id in range(n_users):
        # Each user rates 10-100 movies (realistic range)
        n_ratings = np.random.randint(10, 100)

        # Select movies to rate (weighted by popularity)
        probabilities = vote_count_norm ** 0.5  # Square root for smoother distribution
        probabilities = probabilities / probabilities.sum()

        movie_indices = np.random.choice(
            n_movies,
            size=n_ratings,
            replace=False,
            p=probabilities
        )

        for movie_idx in movie_indices:
            # Base rating from movie's vote_average
            base_rating = df.iloc[movie_idx]['vote_average']

            # Add user preference noise
            user_noise = np.random.normal(0, 1.5)
            rating = base_rating + user_noise

            # Clip to valid range [1, 10]
            rating = np.clip(rating, 1, 10)

            ratings_dict['userId'].append(user_id)
            ratings_dict['movieId'].append(movie_idx)
            ratings_dict['rating'].append(rating)

    return pd.DataFrame(ratings_dict)

# Create ratings
ratings_df = create_synthetic_ratings(df, n_users=n_users)
print(f"\nRatings created: {len(ratings_df)}")
print(f"Sparsity: {1 - len(ratings_df)/(n_users * n_movies):.4f}")
print(f"Avg ratings per user: {len(ratings_df)/n_users:.1f}")
print(f"Avg ratings per movie: {len(ratings_df)/n_movies:.1f}")

print("\nRatings distribution:")
print(ratings_df['rating'].describe())

# Save ratings
ratings_df.to_csv('../data/processed/synthetic_ratings.csv', index=False)
print("\nSaved: synthetic_ratings.csv")


[1] Creating Synthetic User-Movie Rating Matrix...
------------------------------------------------------------
Creating 1000 synthetic users for 4772 movies...

Ratings created: 54382
Sparsity: 0.9886
Avg ratings per user: 54.4
Avg ratings per movie: 11.4

Ratings distribution:
count    54382.000000
mean         6.478852
std          1.700962
min          1.000000
25%          5.327297
50%          6.500286
75%          7.669869
max         10.000000
Name: rating, dtype: float64

Saved: synthetic_ratings.csv


# 2. CREATE USER-ITEM MATRIX


In [3]:
print("\n[2] Creating User-Item Matrix...")
print("-" * 60)

# Pivot to create user-item matrix
user_item_matrix = ratings_df.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

print(f"User-Item Matrix shape: {user_item_matrix.shape}")
print(f"Memory size: {user_item_matrix.memory_usage().sum() / 1024**2:.2f} MB")


[2] Creating User-Item Matrix...
------------------------------------------------------------
User-Item Matrix shape: (1000, 4548)
Memory size: 34.71 MB


# 3. MATRIX FACTORIZATION (SVD)


In [4]:
print("\n[3] Matrix Factorization using SVD...")
print("-" * 60)

def matrix_factorization_svd(user_item_matrix, k=50):
    """
    Perform Matrix Factorization using SVD

    Parameters:
    -----------
    user_item_matrix : DataFrame
        User-Item rating matrix
    k : int
        Number of latent factors

    Returns:
    --------
    predictions : array
        Predicted ratings matrix
    """
    # Convert to numpy array
    R = user_item_matrix.values

    # Normalize by subtracting mean rating for each user
    user_ratings_mean = np.mean(R, axis=1)
    R_normalized = R - user_ratings_mean.reshape(-1, 1)

    # Perform SVD
    print(f"Performing SVD with k={k} latent factors...")
    U, sigma, Vt = svds(R_normalized, k=k)

    # Convert sigma to diagonal matrix
    sigma = np.diag(sigma)

    # Reconstruct the matrix
    predictions = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

    # Clip predictions to valid rating range
    predictions = np.clip(predictions, 1, 10)

    return predictions, U, sigma, Vt

# Apply SVD
predictions_svd, U, sigma, Vt = matrix_factorization_svd(user_item_matrix, k=50)
print(f"Predictions shape: {predictions_svd.shape}")

# Create predictions DataFrame
predictions_df = pd.DataFrame(
    predictions_svd,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

def get_user_recommendations_svd(user_id, predictions_df, user_item_matrix, df, top_n=10):
    """
    Get recommendations for a user using SVD predictions

    Parameters:
    -----------
    user_id : int
        User ID
    predictions_df : DataFrame
        Predicted ratings
    user_item_matrix : DataFrame
        Actual ratings matrix
    df : DataFrame
        Movies dataframe
    top_n : int
        Number of recommendations
    """
    # Get user's predictions
    user_predictions = predictions_df.loc[user_id]

    # Get movies user hasn't rated
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    # Get predictions for unrated movies
    recommendations = user_predictions[unrated_movies].sort_values(ascending=False).head(top_n)

    # Get movie details
    movie_details = df.iloc[recommendations.index][['title', 'genres', 'vote_average', 'release_year']].copy()
    movie_details['predicted_rating'] = recommendations.values

    return movie_details

# Test SVD recommendations
print("\n[TEST] SVD Recommendations for User 0:")
print(get_user_recommendations_svd(0, predictions_df, user_item_matrix, df, top_n=10))


[3] Matrix Factorization using SVD...
------------------------------------------------------------
Performing SVD with k=50 latent factors...
Predictions shape: (1000, 4548)

[TEST] SVD Recommendations for User 0:
                           title                                       genres  \
2965       2001: A Space Odyssey  ['Science Fiction', 'Mystery', 'Adventure']   
16                  The Avengers   ['Science Fiction', 'Action', 'Adventure']   
571         Inglourious Basterds       ['Drama', 'Action', 'Thriller', 'War']   
1399                     In Time    ['Action', 'Thriller', 'Science Fiction']   
3939                        올드보이   ['Drama', 'Thriller', 'Mystery', 'Action']   
111                 Transformers   ['Adventure', 'Science Fiction', 'Action']   
439               Shutter Island             ['Drama', 'Thriller', 'Mystery']   
79                    Iron Man 2   ['Adventure', 'Action', 'Science Fiction']   
26    Captain America: Civil War   ['Adventure', 'Action

# 4. USER-BASED COLLABORATIVE FILTERING


In [5]:
print("\n[4] User-Based Collaborative Filtering...")
print("-" * 60)

# Compute user-user similarity
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

print(f"User Similarity Matrix shape: {user_similarity_df.shape}")

def get_user_recommendations_ubcf(user_id, user_similarity_df, user_item_matrix, df,
                                  top_n=10, n_neighbors=30):
    """
    User-Based Collaborative Filtering

    Recommend based on similar users' preferences
    """
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:n_neighbors+1]

    # Get ratings from similar users
    similar_users_ratings = user_item_matrix.loc[similar_users.index]

    # Weight ratings by similarity
    weighted_ratings = similar_users_ratings.T.dot(similar_users)

    # Normalize by sum of similarities
    similarity_sum = similar_users.sum()
    predicted_ratings = weighted_ratings / similarity_sum

    # Get movies user hasn't rated
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    # Get top predictions
    recommendations = predicted_ratings[unrated_movies].sort_values(ascending=False).head(top_n)

    # Get movie details
    movie_details = df.iloc[recommendations.index][['title', 'genres', 'vote_average', 'release_year']].copy()
    movie_details['predicted_rating'] = recommendations.values

    return movie_details

# Test UBCF
print("\n[TEST] User-Based CF Recommendations for User 0:")
print(get_user_recommendations_ubcf(0, user_similarity_df, user_item_matrix, df, top_n=10))


[4] User-Based Collaborative Filtering...
------------------------------------------------------------
User Similarity Matrix shape: (1000, 1000)

[TEST] User-Based CF Recommendations for User 0:
                        title                                      genres  \
94    Guardians of the Galaxy  ['Action', 'Science Fiction', 'Adventure']   
686                       Hop           ['Animation', 'Comedy', 'Family']   
124                    Frozen        ['Animation', 'Adventure', 'Family']   
456                      Fury                  ['War', 'Drama', 'Action']   
4120      The Last Five Years     ['Comedy', 'Drama', 'Music', 'Romance']   
3256          American Psycho              ['Thriller', 'Drama', 'Crime']   
4535              Now Is Good                        ['Drama', 'Romance']   
239                   Gravity    ['Science Fiction', 'Thriller', 'Drama']   
583                  Big Fish           ['Adventure', 'Fantasy', 'Drama']   
1141                   8 Mile    

# 5. ITEM-BASED COLLABORATIVE FILTERING


In [6]:
print("\n[5] Item-Based Collaborative Filtering...")
print("-" * 60)

# Compute item-item similarity
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)

print(f"Item Similarity Matrix shape: {item_similarity_df.shape}")

def get_user_recommendations_ibcf(user_id, item_similarity_df, user_item_matrix, df,
                                  top_n=10, n_neighbors=30):
    """
    Item-Based Collaborative Filtering

    Recommend based on similarity to movies user has rated
    """
    # Get user's ratings
    user_ratings = user_item_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0]

    # Calculate weighted scores for all movies
    predicted_ratings = pd.Series(0.0, index=user_item_matrix.columns)

    for movie_id, rating in rated_movies.items():
        # Get similar movies
        similar_movies = item_similarity_df[movie_id]

        # Weight by user's rating
        predicted_ratings += similar_movies * rating

    # Get movies user hasn't rated
    unrated_movies = user_ratings[user_ratings == 0].index

    # Get top predictions
    recommendations = predicted_ratings[unrated_movies].sort_values(ascending=False).head(top_n)

    # Get movie details
    movie_details = df.iloc[recommendations.index][['title', 'genres', 'vote_average', 'release_year']].copy()
    movie_details['predicted_rating'] = recommendations.values

    return movie_details

# Test IBCF
print("\n[TEST] Item-Based CF Recommendations for User 0:")
print(get_user_recommendations_ibcf(0, item_similarity_df, user_item_matrix, df, top_n=10))


[5] Item-Based Collaborative Filtering...
------------------------------------------------------------
Item Similarity Matrix shape: (4548, 4548)

[TEST] Item-Based CF Recommendations for User 0:
                                             title  \
96                                       Inception   
25                                         Titanic   
26                      Captain America: Civil War   
426                               The Hunger Games   
1399                                       In Time   
124                                         Frozen   
229   Star Wars: Episode III - Revenge of the Sith   
65                                 The Dark Knight   
111                                   Transformers   
28                                  Jurassic World   

                                                 genres  vote_average  \
96    ['Action', 'Thriller', 'Science Fiction', 'Mys...           8.1   
25                     ['Drama', 'Romance', 'Thriller']       

# 6. SAVE MODELS

In [7]:
np.save('../results/models/collaborative/user_similarity.npy', user_similarity)
np.save('../results/models/collaborative/item_similarity.npy', item_similarity)
np.save('../results/models/collaborative/svd_U.npy', U)
np.save('../results/models/collaborative/svd_sigma.npy', sigma)
np.save('../results/models/collaborative/svd_Vt.npy', Vt)
user_item_matrix.to_csv('../data/processed/user_item_matrix.csv')

# 7. COMPREHENSIVE TESTING


In [8]:
print("\n[7] Comprehensive Testing")
print("=" * 60)

test_users = [0, 1, 2, 3, 4]

for user_id in test_users:
    print(f"\n{'='*60}")
    print(f"RECOMMENDATIONS FOR USER {user_id}")
    print(f"{'='*60}")

    print("\n[SVD Method]")
    print(get_user_recommendations_svd(user_id, predictions_df, user_item_matrix, df, top_n=5))

    print("\n[User-Based CF]")
    print(get_user_recommendations_ubcf(user_id, user_similarity_df, user_item_matrix, df, top_n=5))

    print("\n[Item-Based CF]")
    print(get_user_recommendations_ibcf(user_id, item_similarity_df, user_item_matrix, df, top_n=5))

print("\n" + "=" * 60)
print("COLLABORATIVE FILTERING COMPLETED!")
print("=" * 60)
print("\nAvailable functions:")
print("  - get_user_recommendations_svd(user_id, predictions_df, user_item_matrix, df, top_n)")
print("  - get_user_recommendations_ubcf(user_id, user_similarity_df, user_item_matrix, df, top_n)")
print("  - get_user_recommendations_ibcf(user_id, item_similarity_df, user_item_matrix, df, top_n)")


[7] Comprehensive Testing

RECOMMENDATIONS FOR USER 0

[SVD Method]
                      title                                       genres  \
2965  2001: A Space Odyssey  ['Science Fiction', 'Mystery', 'Adventure']   
16             The Avengers   ['Science Fiction', 'Action', 'Adventure']   
571    Inglourious Basterds       ['Drama', 'Action', 'Thriller', 'War']   
1399                In Time    ['Action', 'Thriller', 'Science Fiction']   
3939                   올드보이   ['Drama', 'Thriller', 'Mystery', 'Action']   

      vote_average  release_year  predicted_rating  
2965           7.9        1968.0          1.832946  
16             7.4        2012.0          1.747337  
571            7.9        2009.0          1.686181  
1399           6.7        2011.0          1.686178  
3939           8.0        2003.0          1.618400  

[User-Based CF]
                        title                                      genres  \
94    Guardians of the Galaxy  ['Action', 'Science Fiction', '