# Collaborative Filtering with NMF - Testing Notebook

This notebook tests the NMF-based collaborative filtering implementation for the Timbrality music recommendation system.

In [None]:
# Import required libraries
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
# Suppress every warning category for the rest of the kernel session to keep
# cell output compact.
# NOTE(review): a blanket 'ignore' also hides any warnings raised by the
# libraries and project modules used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Add the project to Python path.
# The absolute path of the current working directory is appended, so the
# `timbral` imports below presumably rely on the notebook being launched from
# the project root — TODO confirm.
sys.path.append(os.path.abspath('.'))

# Import our custom modules
from timbral.models.nmf_model import NMFModel
from timbral.utils.data_loader import DataLoader
from timbral.logic.trainer import ModelTrainer
from timbral.utils.redis_connector import RedisConnector

## 1. Create Sample Data

Let's create some synthetic user-item interaction data to test our collaborative filtering model.

In [None]:
# Build a reproducible synthetic interaction log of (user, item, rating) rows.
np.random.seed(42)

# Size of the simulated population and of the raw interaction log
n_users, n_items, n_interactions = 100, 200, 2000

# Draw order (users, then items, then ratings) is part of the reproducible
# sequence produced by the seed above, so it must not be rearranged.
user_ids = np.random.randint(0, n_users, size=n_interactions)
item_ids = np.random.randint(0, n_items, size=n_interactions)
ratings = np.random.uniform(1, 5, size=n_interactions)  # Ratings between 1 and 5

raw_interactions = pd.DataFrame(
    {'user_id': user_ids, 'item_id': item_ids, 'rating': ratings}
)

# Random draws can repeat a (user, item) pair; collapse each pair to a single
# row that carries the highest rating seen for it.
interactions_df = (
    raw_interactions
    .groupby(['user_id', 'item_id'])['rating']
    .max()
    .reset_index()
)

n_unique_pairs = len(interactions_df)
sparsity = 1 - n_unique_pairs / (n_users * n_items)

print(f"Generated {n_unique_pairs} unique user-item interactions")
print(f"Users: {interactions_df['user_id'].nunique()}")
print(f"Items: {interactions_df['item_id'].nunique()}")
print(f"Sparsity: {sparsity:.3f}")

interactions_df.head()

## 2. Initialize Data Loader and Create User-Item Matrix

In [None]:
# Turn the interaction log into a user-item matrix via the project's loader.
data_loader = DataLoader()
user_item_matrix = data_loader.create_user_item_matrix(interactions_df)

# Cells equal to 0 are counted as empty; cells greater than 0 as filled.
empty_cells = (user_item_matrix == 0).sum().sum()
filled_cells = (user_item_matrix > 0).sum().sum()

print(f"User-item matrix shape: {user_item_matrix.shape}")
print(f"Matrix sparsity: {empty_cells / user_item_matrix.size:.3f}")
print(f"Non-zero entries: {filled_cells}")

# Preview the top-left 10x10 corner of the matrix
user_item_matrix.iloc[:10, :10]

## 3. Train NMF Model

In [None]:
# Number of latent components for the factorization (reused by later cells)
n_components = 20

# The project trainer wraps model construction and fitting
trainer = ModelTrainer()

print(f"Training NMF model with {n_components} components...")
training_kwargs = {
    "user_item_matrix": user_item_matrix,
    "n_components": n_components,
    "random_state": 42,  # fixed seed so repeated runs are comparable
}
nmf_model = trainer.train_nmf_model(**training_kwargs)

print("\nModel training completed!")
# Report the shape of both learned factor matrices
for side, factors in (("User", nmf_model.user_factors), ("Item", nmf_model.item_factors)):
    print(f"{side} factors shape: {factors.shape}")

## 4. Test Model Predictions

In [None]:
# Spot-check the model on a handful of hand-picked (user, item) positions.
# The two arrays are paired element-wise: (0, 0), (1, 5), (2, 10), ...
sample_users = np.array([0, 1, 2, 5, 10])
sample_items = np.array([0, 5, 10, 15, 20])

sample_predictions = nmf_model.predict(sample_users, sample_items)

print("Sample predictions:")
for user, item, pred in zip(sample_users, sample_items, sample_predictions):
    # Positional lookup: the same integers index rows/columns of the matrix
    actual = user_item_matrix.iloc[user, item]
    print(f"User {user}, Item {item}: Predicted={pred:.3f}, Actual={actual:.3f}")

## 5. Generate Recommendations

In [None]:
# Ask the trained model for the best-scoring items of one user.
user_id = 0
top_k = 10

recommended_items, scores = nmf_model.get_top_recommendations(user_id, top_k)

# Row of known ratings for this user, looked up by position
user_ratings = user_item_matrix.iloc[user_id]

print(f"Top {top_k} recommendations for User {user_id}:")
for rank, (item, score) in enumerate(zip(recommended_items, scores), start=1):
    actual_rating = user_ratings.iloc[item]
    print(f"{rank}. Item {item}: Score={score:.3f}, Actual Rating={actual_rating:.3f}")

## 6. Evaluate Model Performance

In [None]:
# Hold out 20% of the interactions for evaluation
train_interactions, test_interactions = train_test_split(
    interactions_df, test_size=0.2, random_state=42
)

print(f"Train interactions: {len(train_interactions)}")
print(f"Test interactions: {len(test_interactions)}")

# Fit a fresh model on the training interactions only
train_matrix = data_loader.create_user_item_matrix(train_interactions)
train_model = NMFModel(n_components=n_components, random_state=42)
train_model.fit(train_matrix.values)

# Held-out (user, item, rating) triples as three parallel arrays
test_users = test_interactions['user_id'].to_numpy()
test_items = test_interactions['item_id'].to_numpy()
test_ratings = test_interactions['rating'].to_numpy()

# Translate raw ids to matrix positions; ids absent from the loader's mapping
# become -1.
# NOTE(review): assumes create_user_item_matrix() above refreshed
# data_loader.user_to_idx / item_to_idx for the training split — TODO confirm.
test_user_indices = [data_loader.user_to_idx.get(user, -1) for user in test_users]
test_item_indices = [data_loader.item_to_idx.get(item, -1) for item in test_items]

# Keep (position in test set, user index, item index) for every pair whose
# indices are known and fall inside the trained model's dimensions.
valid_indices = []
for position, (user_idx, item_idx) in enumerate(zip(test_user_indices, test_item_indices)):
    if user_idx < 0 or item_idx < 0:
        continue  # unknown user or item
    if user_idx >= train_model.n_users or item_idx >= train_model.n_items:
        continue  # outside the fitted matrix
    valid_indices.append((position, user_idx, item_idx))

if not valid_indices:
    print("No valid test samples found (all users/items unknown)")
else:
    # Unzip the triples; these names are reused by the performance-analysis cell
    original_indices, valid_users, valid_items = zip(*valid_indices)
    valid_ratings = test_ratings[list(original_indices)]

    predictions = train_model.predict(np.array(valid_users), np.array(valid_items))

    # Root mean squared error on the scorable held-out pairs
    rmse = np.sqrt(mean_squared_error(valid_ratings, predictions))
    print(f"\nTest RMSE: {rmse:.3f}")

    # Mean absolute error on the same pairs
    mae = np.abs(valid_ratings - predictions).mean()
    print(f"Test MAE: {mae:.3f}")

## 7. Visualize User and Item Embeddings

In [None]:
# 2x2 figure: value histograms on the top row, factor heatmaps on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row — distribution of every entry in each factor matrix
histogram_panels = [
    (axes[0, 0], nmf_model.user_factors, 'Distribution of User Factor Values'),
    (axes[0, 1], nmf_model.item_factors, 'Distribution of Item Factor Values'),
]
for ax, factors, title in histogram_panels:
    ax.hist(factors.flatten(), bins=50, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Factor Value')
    ax.set_ylabel('Frequency')

# Bottom row — heatmaps of a small slice of each factor matrix.
# NOTE(review): the item slice takes the first 10 columns and transposes, which
# matches the "First 10 Items" title only if item_factors is laid out as
# (components, items) — TODO confirm against NMFModel.
heatmap_panels = [
    (axes[1, 0], nmf_model.user_factors[:10],
     'User Factors Heatmap (First 10 Users)', 'User ID'),
    (axes[1, 1], nmf_model.item_factors[:, :10].T,
     'Item Factors Heatmap (First 10 Items)', 'Item ID'),
]
for ax, factor_slice, title, row_label in heatmap_panels:
    sns.heatmap(factor_slice, cmap='viridis', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Factor Dimension')
    ax.set_ylabel(row_label)

plt.tight_layout()
plt.show()

## 8. Test Model Saving and Loading

In [None]:
# Persist the trained model, reload it into a fresh object, and verify that the
# round trip preserves predictions.
model_path = "models/test_nmf_model.pkl"
# Derive the directory from model_path so the two cannot drift apart
os.makedirs(os.path.dirname(model_path), exist_ok=True)

nmf_model.save(model_path)
print(f"Model saved to {model_path}")

# Load into a brand-new instance so nothing is shared with nmf_model.
# NOTE(review): the .pkl extension suggests pickle-based persistence; if so,
# only load files this notebook (or another trusted source) wrote — confirm.
loaded_model = NMFModel()
loaded_model.load(model_path)

print("Model loaded successfully")
print(f"Loaded model components: {loaded_model.n_components}")
print(f"Loaded model users: {loaded_model.n_users}")
print(f"Loaded model items: {loaded_model.n_items}")

# Score the same (user 0, item 0) pair with both models
original_pred = nmf_model.predict([0], [0])
loaded_pred = loaded_model.predict([0], [0])

print("\nPrediction comparison:")
print(f"Original model: {original_pred[0]:.6f}")
print(f"Loaded model: {loaded_pred[0]:.6f}")
print(f"Difference: {abs(original_pred[0] - loaded_pred[0]):.10f}")

# Printing the difference alone can never fail; assert so that Run All stops
# here if persistence ever becomes lossy.
np.testing.assert_allclose(
    loaded_pred,
    original_pred,
    err_msg="Loaded model predictions differ from the original model",
)

## 9. Test Redis Caching (Optional)

In [None]:
# Exercise the Redis cache helpers; the round trips below only run when the
# connector exposes a truthy redis_client.
redis_connector = RedisConnector()

if not redis_connector.redis_client:
    print("Redis not available - caching will be disabled")
else:
    print("Redis connection successful!")

    # Small hand-written recommendation payload for the round trip
    sample_recommendations = [
        {"item_id": item_id, "score": score, "title": title}
        for item_id, score, title in [
            (1, 0.95, "Song A"),
            (2, 0.87, "Song B"),
            (3, 0.82, "Song C"),
        ]
    ]

    # Store the recommendations, then read them back for the same user
    recs_cached = redis_connector.set_recommendations(
        user_id=123, recommendations=sample_recommendations
    )
    print(f"Caching recommendations: {'Success' if recs_cached else 'Failed'}")

    cached_recs = redis_connector.get_recommendations(user_id=123)
    print(f"Retrieved recommendations: {cached_recs}")

    # Store the first five rows of the user factors, then read them back
    sample_embeddings = {"user_embeddings": nmf_model.user_factors[:5]}
    embeddings_cached = redis_connector.set_embeddings("test_embeddings", sample_embeddings)
    print(f"Caching embeddings: {'Success' if embeddings_cached else 'Failed'}")

    cached_embeddings = redis_connector.get_embeddings("test_embeddings")
    if cached_embeddings:
        print(f"Retrieved embeddings shape: {cached_embeddings['user_embeddings'].shape}")

## 10. Performance Analysis

In [None]:
# Sweep the number of NMF components and record, for each setting, the fit time
# and the RMSE on the held-out pairs prepared by the evaluation cell
# (reads train_matrix, valid_indices, valid_users, valid_items, valid_ratings).
import time  # kept in this cell so the cell runs on its own

components_range = [5, 10, 20, 30, 50]
rmse_scores = []
training_times = []

# Loop-invariant inputs, computed once
train_values = train_matrix.values
has_test_pairs = bool(valid_indices)

for n_comp in components_range:
    print(f"Testing {n_comp} components...")

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    fit_started = time.perf_counter()
    sweep_model = NMFModel(n_components=n_comp, random_state=42)
    sweep_model.fit(train_values)
    training_times.append(time.perf_counter() - fit_started)

    # Sweep-local names keep the evaluation cell's `predictions` and `rmse`
    # intact instead of silently overwriting them.
    if has_test_pairs:
        sweep_predictions = sweep_model.predict(np.array(valid_users), np.array(valid_items))
        rmse_scores.append(np.sqrt(mean_squared_error(valid_ratings, sweep_predictions)))
    else:
        # No scorable test pairs: keep list lengths aligned with a NaN placeholder
        rmse_scores.append(float('nan'))

# Plot results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.plot(components_range, rmse_scores, 'bo-')
ax1.set_xlabel('Number of Components')
ax1.set_ylabel('RMSE')
ax1.set_title('Model Performance vs Components')
ax1.grid(True)

ax2.plot(components_range, training_times, 'ro-')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Training Time (seconds)')
ax2.set_title('Training Time vs Components')
ax2.grid(True)

plt.tight_layout()
plt.show()

# Print summary
print("\nPerformance Summary:")
for n_comp, sweep_rmse, fit_seconds in zip(components_range, rmse_scores, training_times):
    print(f"{n_comp} components: RMSE={sweep_rmse:.3f}, Time={fit_seconds:.2f}s")

## Summary

This notebook demonstrates:

1. ✅ **NMF Model Implementation**: Successfully trains and makes predictions
2. ✅ **Data Loading**: Handles user-item interaction data and creates matrices
3. ✅ **Recommendations**: Generates top-k recommendations for users
4. ✅ **Model Persistence**: Saves and loads models correctly
5. ✅ **Evaluation**: Calculates RMSE and MAE metrics
6. ✅ **Redis Integration**: Caches recommendations and embeddings when a Redis server is available (the step is skipped otherwise)
7. ✅ **Performance Analysis**: Compares test RMSE and training time across different numbers of NMF components

