# RNN Model Evaluation Notebook

This notebook provides a comprehensive evaluation of the trained RNN models from the BARYONYX Lost & Found system.

## Overview

The BARYONYX system uses three RNN models:

1. **UserBehaviorLSTM**: Predicts a user's next likely action based on behavior sequences
2. **BidirectionalDescriptionRNN**: Classifies item descriptions into categories
3. **TemporalPatternRNN**: Predicts optimal time periods for finding items

## Table of Contents

1. [Setup and Imports](#setup)
2. [Load Trained Models](#load)
3. [User Behavior LSTM Evaluation](#behavior)
4. [Description RNN Evaluation](#description)
5. [Temporal Pattern RNN Evaluation](#temporal)
6. [Performance Analysis](#analysis)
7. [Visualization and Results](#visualization)
8. [Model Comparison and Final Report](#comparison)


## 1. Setup and Imports {#setup}


In [None]:
# Core imports
import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Machine Learning imports
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

# Add project root to path
sys.path.append('.')

# Import project modules
from rnn_models import RNNModelManager, UserBehaviorLSTM, BidirectionalDescriptionRNN, TemporalPatternRNN

# Report the runtime environment once all imports have succeeded.
print("✅ All imports successful!")
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()  # queried once, reused for both lines below
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Plot styling shared by the cells below: matplotlib defaults + seaborn "husl" palette.
plt.style.use('default')
sns.set_palette("husl")


## 2. Load Trained Models {#load}


In [None]:
# Initialize device: prefer the GPU when PyTorch can see one, otherwise use the CPU.
# `device` is a notebook-level global that the evaluation functions below rely on.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Directory and files the RNN manager is expected to load from.
model_dir = 'models/rnn_models'
model_files = {
    'user_behavior_lstm.pth': 'User Behavior LSTM',
    'description_birnn.pth': 'Description Bidirectional RNN',
    'temporal_pattern_rnn.pth': 'Temporal Pattern RNN',
    'vocab.pkl': 'Vocabulary',
    'temporal_data.pkl': 'Temporal Data'
}

# Initialize RNN manager
print(f"Loading RNN models from {model_dir}/...")
rnn_manager = RNNModelManager(device=str(device))

# Check if model files exist (informational only; loading is attempted regardless)
print("\n=== Model File Status ===")
for filename, description in model_files.items():
    filepath = os.path.join(model_dir, filename)
    if os.path.exists(filepath):
        file_size = os.path.getsize(filepath) / 1024  # KB
        print(f"✅ {description}: {filename} ({file_size:.1f} KB)")
    else:
        print(f"❌ {description}: {filename} - NOT FOUND")

# Load models. This is a deliberate best-effort: a failure is reported and the
# notebook continues with whatever models the manager already holds.
try:
    rnn_manager.load_models()
except Exception as e:
    print(f"❌ Error loading RNN models: {e}")
    print("Will proceed with evaluation using default models")
else:
    print("\n✅ RNN Models loaded successfully!")

    # Check model status. An attribute that exists but is None counts as
    # "not loaded" (a plain hasattr() check would report it as loaded).
    print("\n=== Model Status ===")
    for status_label, attr_name in (
        ("User Behavior Model", "user_behavior_model"),
        ("Description Model", "description_model"),
        ("Temporal Model", "temporal_model"),
    ):
        is_loaded = getattr(rnn_manager, attr_name, None) is not None
        print(f"{status_label}: {'✅ Loaded' if is_loaded else '❌ Not loaded'}")

    loaded_vocab = getattr(rnn_manager, 'vocab', None)
    print(f"Vocabulary: {'✅ Loaded' if loaded_vocab else '❌ Not loaded'}")
    print(f"Vocabulary Size: {len(loaded_vocab) if loaded_vocab else 0}")


## 3. User Behavior LSTM Evaluation {#behavior}


In [None]:
# User Behavior LSTM Evaluation
def evaluate_user_behavior_model(rnn_manager, num_test_samples=200):
    """Evaluate the User Behavior LSTM model on randomly generated sequences.

    Builds ``num_test_samples`` synthetic sequences (10 time steps of 10
    features each), runs them through ``rnn_manager.user_behavior_model`` in
    batches of 32, prints a metric report and returns the raw results.
    Tensors are placed on the notebook-level ``device``.

    NOTE: features and "next action" labels are both drawn at random and
    independently of each other, so the reported scores can only reflect
    chance-level agreement. The run exercises the inference path; it does not
    measure real predictive quality.

    Args:
        rnn_manager: Object exposing ``user_behavior_model``. The model is
            called as ``model(batch)`` and must return ``(logits, attention)``.
        num_test_samples: Number of synthetic sequences to generate (>= 1).

    Returns:
        dict with keys ``predictions``, ``true_labels``, ``probabilities``,
        ``attention_weights``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``confusion_matrix``. The confusion matrix always has one
        row/column per entry of the action list, in that order.

    Raises:
        ValueError: If ``num_test_samples`` is smaller than 1.
    """
    if num_test_samples < 1:
        raise ValueError("num_test_samples must be at least 1")

    print("🧠 Evaluating User Behavior LSTM...")

    actions = ['search', 'upload', 'view', 'browse', 'logout']
    class_ids = list(range(len(actions)))
    sequence_length = 10
    batch_size = 32

    # Generate test sequences
    test_sequences = []
    test_labels = []

    for _ in range(num_test_samples):
        features = []

        for step in range(sequence_length):
            # Every feature is already normalised to [0, 1] (or a sub-range of it).
            hour = np.random.uniform(0, 1)
            day_of_week = np.random.uniform(0, 1)
            action_type = np.random.uniform(0, 1)
            # Only every third step carries an item; the rest use 0.
            item_type = np.random.uniform(0, 1) if step % 3 == 0 else 0
            confidence = np.random.uniform(0.6, 1.0)
            search_count = np.random.uniform(0, 1)
            upload_count = np.random.uniform(0, 0.5)
            view_count = np.random.uniform(0, 0.8)
            time_since_last = np.random.uniform(0, 1)
            session_length = np.random.uniform(0, 1)

            features.append([hour, day_of_week, action_type, item_type, confidence,
                             search_count, upload_count, view_count,
                             time_since_last, session_length])

        # Label (next action) is sampled uniformly, independent of the features.
        next_action = np.random.choice(actions)

        test_sequences.append(features)
        test_labels.append(actions.index(next_action))

    # Convert to tensors
    X_test = torch.FloatTensor(test_sequences).to(device)
    y_test = torch.LongTensor(test_labels).to(device)

    # Evaluate model
    model = rnn_manager.user_behavior_model
    model.eval()

    predictions = []
    probabilities = []
    attention_weights = []

    with torch.no_grad():
        for start in range(0, len(X_test), batch_size):  # Batch processing
            batch_X = X_test[start:start + batch_size]
            outputs, attention = model(batch_X)
            probs = F.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)

            predictions.extend(preds.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())
            attention_weights.extend(attention.cpu().numpy())

    predictions = np.array(predictions)
    y_test_np = y_test.cpu().numpy()

    # Calculate metrics
    accuracy = accuracy_score(y_test_np, predictions)
    precision = precision_score(y_test_np, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test_np, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test_np, predictions, average='weighted', zero_division=0)

    print("\n=== User Behavior LSTM Results ===")
    print(f"Test samples: {len(test_sequences)}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Per-class performance: accuracy restricted to the samples whose true
    # label is the given class.
    print("\nPer-class Performance:")
    for class_id, action in enumerate(actions):
        class_mask = y_test_np == class_id
        if np.sum(class_mask) > 0:
            class_acc = accuracy_score(y_test_np[class_mask], predictions[class_mask])
            print(f"  {action}: {class_acc:.4f} ({np.sum(class_mask)} samples)")

    # Confusion matrix. Passing `labels` pins the shape to one row/column per
    # action even when a class never occurs, so the fixed-size print loop
    # below cannot index out of bounds.
    cm = confusion_matrix(y_test_np, predictions, labels=class_ids)
    print("\nConfusion Matrix:")
    print("Predicted ->", end="")
    for action in actions:
        print(f"{action:>8}", end="")
    print()
    for row, action in enumerate(actions):
        print(f"{action:>8}", end="")
        for col in class_ids:
            print(f"{cm[row, col]:>8}", end="")
        print()

    return {
        'predictions': predictions,
        'true_labels': y_test_np,
        'probabilities': probabilities,
        'attention_weights': attention_weights,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

# Run evaluation with the default sample count; the returned metrics dict is
# reused by the performance-analysis and visualization cells below.
behavior_results = evaluate_user_behavior_model(rnn_manager)


## 4. Description RNN Evaluation {#description}


In [None]:
# Description RNN Evaluation
def evaluate_description_model(rnn_manager, num_test_samples=200):
    """Evaluate the Description Bidirectional RNN model.

    The test set starts with 20 hand-written descriptions (5 per category)
    and is padded up to ``num_test_samples`` with generated
    ``"Lost <color> <item>"`` strings. Each description is lower-cased, split
    on whitespace, mapped through ``rnn_manager.vocab`` and padded/truncated
    to 20 tokens before being fed to ``rnn_manager.description_model`` in
    batches of 32. Tensors are placed on the notebook-level ``device``.

    Side effect: when ``rnn_manager`` has no (or an empty) ``vocab``, a
    vocabulary is built from the test descriptions and stored on the manager
    together with ``vocab_size``.
    NOTE(review): token ids from such an ad-hoc vocabulary are not the ids the
    model was trained with; results in that mode are not meaningful — confirm
    that a trained vocabulary is normally loaded.

    Args:
        rnn_manager: Object exposing ``description_model`` and, optionally,
            ``vocab`` (a ``dict`` mapping word -> index that contains
            ``'<PAD>'`` and ``'<UNK>'``). The model is called as
            ``model(batch)`` and must return ``(logits, attention)``.
        num_test_samples: Total number of descriptions to evaluate (>= 1).

    Returns:
        dict with keys ``predictions``, ``true_labels``, ``probabilities``,
        ``attention_weights``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``confusion_matrix``, ``descriptions`` and ``categories``.

    Raises:
        ValueError: If ``num_test_samples`` is smaller than 1.
        KeyError: If the vocabulary lacks ``'<PAD>'`` or ``'<UNK>'``.
    """
    if num_test_samples < 1:
        raise ValueError("num_test_samples must be at least 1")

    print("📝 Evaluating Description Bidirectional RNN...")

    categories = {
        0: 'phone',
        1: 'mouse',
        2: 'wallet',
        3: 'tumbler'
    }
    class_ids = list(categories)
    item_types = list(categories.values())
    colors = ['black', 'blue', 'red', 'white', 'silver', 'brown']
    max_length = 20
    batch_size = 32

    # Hand-written descriptions, keyed by category so each label travels with
    # its text instead of being derived from the position in a flat list.
    seed_descriptions = {
        'phone': [
            "Lost black iPhone 12 with cracked screen",
            "Found Samsung Galaxy phone with blue case",
            "Lost iPhone 13 Pro Max gold",
            "Found Google Pixel phone black",
            "Lost OnePlus phone with red case",
        ],
        'mouse': [
            "Lost computer mouse wireless black",
            "Found gaming mouse red with RGB",
            "Lost wireless mouse Logitech black",
            "Found optical mouse white",
            "Lost Bluetooth mouse silver",
        ],
        'wallet': [
            "Lost wallet brown leather with cards",
            "Found wallet black leather bifold",
            "Lost wallet red leather with money",
            "Found wallet blue canvas",
            "Lost wallet black synthetic leather",
        ],
        'tumbler': [
            "Lost tumbler stainless steel silver",
            "Found water bottle blue plastic",
            "Lost tumbler black with handle",
            "Found coffee cup white ceramic",
            "Lost tumbler red with straw",
        ],
    }
    seeded_samples = [
        (desc, cat_id)
        for cat_id, cat_name in categories.items()
        for desc in seed_descriptions[cat_name]
    ]

    # Generate test samples: hand-written ones first, then random fillers.
    all_descriptions = []
    all_labels = []
    for sample_idx in range(num_test_samples):
        if sample_idx < len(seeded_samples):
            desc, label = seeded_samples[sample_idx]
        else:
            item_type = np.random.choice(item_types)
            color = np.random.choice(colors)
            desc = f"Lost {color} {item_type}"
            label = item_types.index(item_type)

        all_descriptions.append(desc)
        all_labels.append(label)

    # Prepare data for model
    if not getattr(rnn_manager, 'vocab', None):
        print("Building vocabulary from test data...")
        # Index 0 is padding, words get 1..N in first-seen order, <UNK> is N+1.
        vocab = {'<PAD>': 0}
        for desc in all_descriptions:
            for word in desc.lower().split():
                vocab.setdefault(word, len(vocab))
        vocab['<UNK>'] = len(vocab)
        rnn_manager.vocab = vocab
        # Size is taken after the special tokens are added so that it covers
        # every index that can appear in a sequence.
        rnn_manager.vocab_size = len(vocab)

    vocab = rnn_manager.vocab
    pad_idx = vocab['<PAD>']
    unk_idx = vocab['<UNK>']

    # Convert texts to sequences
    def text_to_sequence(text, max_length=max_length):
        """Map ``text`` to exactly ``max_length`` vocabulary indices."""
        word_indices = [vocab.get(w, unk_idx) for w in text.lower().split()]
        word_indices = word_indices[:max_length]
        word_indices.extend([pad_idx] * (max_length - len(word_indices)))
        return word_indices

    # Convert to tensors
    X_test = torch.LongTensor([text_to_sequence(desc) for desc in all_descriptions]).to(device)
    y_test = torch.LongTensor(all_labels).to(device)

    # Evaluate model
    model = rnn_manager.description_model
    model.eval()

    predictions = []
    probabilities = []
    attention_weights = []

    with torch.no_grad():
        for start in range(0, len(X_test), batch_size):  # Batch processing
            batch_X = X_test[start:start + batch_size]
            outputs, attention = model(batch_X)
            probs = F.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)

            predictions.extend(preds.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())
            attention_weights.extend(attention.cpu().numpy())

    predictions = np.array(predictions)
    y_test_np = y_test.cpu().numpy()

    # Calculate metrics
    accuracy = accuracy_score(y_test_np, predictions)
    precision = precision_score(y_test_np, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test_np, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test_np, predictions, average='weighted', zero_division=0)
    # Fixed label list keeps the matrix at one row/column per category.
    cm = confusion_matrix(y_test_np, predictions, labels=class_ids)

    print("\n=== Description RNN Results ===")
    print(f"Test samples: {len(all_descriptions)}")
    print(f"Vocabulary size: {len(rnn_manager.vocab)}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Per-class performance: accuracy restricted to samples of that category.
    print("\nPer-class Performance:")
    for cat_id, cat_name in categories.items():
        class_mask = y_test_np == cat_id
        if np.sum(class_mask) > 0:
            class_acc = accuracy_score(y_test_np[class_mask], predictions[class_mask])
            print(f"  {cat_name}: {class_acc:.4f} ({np.sum(class_mask)} samples)")

    # Show some example predictions
    print("\nExample Predictions:")
    for idx in range(min(10, len(all_descriptions))):
        pred_cat = categories.get(predictions[idx], 'unknown')
        true_cat = categories.get(y_test_np[idx], 'unknown')
        correct = "✅" if predictions[idx] == y_test_np[idx] else "❌"
        print(f"  {correct} '{all_descriptions[idx][:30]}...' -> Pred: {pred_cat}, True: {true_cat}")

    return {
        'predictions': predictions,
        'true_labels': y_test_np,
        'probabilities': probabilities,
        'attention_weights': attention_weights,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'descriptions': all_descriptions,
        'categories': categories
    }

# Run evaluation with the default sample count; the returned metrics dict is
# reused by the performance-analysis and visualization cells below.
description_results = evaluate_description_model(rnn_manager)


## 5. Temporal Pattern RNN Evaluation {#temporal}


In [None]:
# Temporal Pattern RNN Evaluation
def evaluate_temporal_model(rnn_manager, num_test_samples=200):
    """Evaluate the Temporal Pattern RNN model on randomly generated sequences.

    Builds ``num_test_samples`` synthetic sequences (7 steps of 10 features
    each), runs them through ``rnn_manager.temporal_model`` in batches of 32,
    prints a metric report and returns the raw results. Tensors are placed on
    the notebook-level ``device``.

    NOTE: features and "optimal period" labels are both random and
    independent of each other, so the reported scores can only reflect
    chance-level agreement. The run exercises the inference path; it does not
    measure real predictive quality.

    Args:
        rnn_manager: Object exposing ``temporal_model``. The model is called
            as ``model(batch)`` and must return the class logits directly.
        num_test_samples: Number of synthetic sequences to generate (>= 1).

    Returns:
        dict with keys ``predictions``, ``true_labels``, ``probabilities``,
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``confusion_matrix``
        (one row/column per time period) and ``time_periods``.

    Raises:
        ValueError: If ``num_test_samples`` is smaller than 1.
    """
    if num_test_samples < 1:
        raise ValueError("num_test_samples must be at least 1")

    print("⏰ Evaluating Temporal Pattern RNN...")

    # Class id -> human-readable period; also fixes the confusion-matrix order.
    time_periods = {
        0: 'Early Morning (6-9 AM)',
        1: 'Morning (9-12 PM)',
        2: 'Afternoon (12-5 PM)',
        3: 'Evening (5-9 PM)',
        4: 'Night (9 PM-6 AM)'
    }
    class_ids = list(time_periods)
    sequence_length = 7  # e.g. 7 days of data
    batch_size = 32

    # Generate test temporal data
    test_sequences = []
    test_labels = []

    for _ in range(num_test_samples):
        features = []

        for day in range(sequence_length):
            # Every feature is normalised to [0, 1].
            hour = np.random.uniform(0, 1)
            day_of_week = day / 7.0  # the only non-random feature
            month = np.random.uniform(0, 1)
            season = np.random.uniform(0, 1)
            weather = np.random.uniform(0, 1)
            location_type = np.random.uniform(0, 1)
            item_category = np.random.uniform(0, 1)
            user_activity = np.random.uniform(0, 1)
            search_frequency = np.random.uniform(0, 1)
            success_rate = np.random.uniform(0, 1)

            features.append([hour, day_of_week, month, season, weather,
                             location_type, item_category, user_activity,
                             search_frequency, success_rate])

        # Label (optimal time period) is sampled uniformly, independent of
        # the features.
        optimal_period = np.random.randint(0, len(time_periods))

        test_sequences.append(features)
        test_labels.append(optimal_period)

    # Convert to tensors
    X_test = torch.FloatTensor(test_sequences).to(device)
    y_test = torch.LongTensor(test_labels).to(device)

    # Evaluate model
    model = rnn_manager.temporal_model
    model.eval()

    predictions = []
    probabilities = []

    with torch.no_grad():
        for start in range(0, len(X_test), batch_size):  # Batch processing
            batch_X = X_test[start:start + batch_size]
            outputs = model(batch_X)
            probs = F.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)

            predictions.extend(preds.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())

    predictions = np.array(predictions)
    y_test_np = y_test.cpu().numpy()

    # Calculate metrics
    accuracy = accuracy_score(y_test_np, predictions)
    precision = precision_score(y_test_np, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test_np, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test_np, predictions, average='weighted', zero_division=0)

    print("\n=== Temporal Pattern RNN Results ===")
    print(f"Test samples: {len(test_sequences)}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Per-class performance: accuracy restricted to samples of that period.
    print("\nPer-class Performance:")
    for period_id, period_name in time_periods.items():
        class_mask = y_test_np == period_id
        if np.sum(class_mask) > 0:
            class_acc = accuracy_score(y_test_np[class_mask], predictions[class_mask])
            print(f"  {period_name}: {class_acc:.4f} ({np.sum(class_mask)} samples)")

    # Confusion matrix. Passing `labels` pins the shape to one row/column per
    # period even when a class never occurs, so the fixed-size print loop
    # below cannot index out of bounds.
    cm = confusion_matrix(y_test_np, predictions, labels=class_ids)
    print("\nConfusion Matrix:")
    print("Predicted ->", end="")
    for period_name in time_periods.values():
        print(f"{period_name[:12]:>12}", end="")
    print()
    for row, period_name in enumerate(time_periods.values()):
        print(f"{period_name[:12]:>12}", end="")
        for col in range(len(time_periods)):
            print(f"{cm[row, col]:>12}", end="")
        print()

    # Show some example predictions
    print("\nExample Predictions:")
    for idx in range(min(10, len(test_sequences))):
        pred_period = time_periods.get(predictions[idx], 'unknown')
        true_period = time_periods.get(y_test_np[idx], 'unknown')
        correct = "✅" if predictions[idx] == y_test_np[idx] else "❌"
        print(f"  {correct} Sample {idx+1} -> Pred: {pred_period}, True: {true_period}")

    return {
        'predictions': predictions,
        'true_labels': y_test_np,
        'probabilities': probabilities,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'time_periods': time_periods
    }

# Run evaluation with the default sample count; the returned metrics dict is
# reused by the performance-analysis and visualization cells below.
temporal_results = evaluate_temporal_model(rnn_manager)


## 6. Performance Analysis {#analysis}


In [None]:
# Performance Analysis
def analyze_model_performance(behavior_results, description_results, temporal_results):
    """Print a side-by-side report for the three RNN models and return it.

    Each ``*_results`` argument must be a mapping with ``accuracy``,
    ``precision``, ``recall`` and ``f1`` entries. The report contains a metric
    table, the model with the highest accuracy, a star rating per model and a
    retraining recommendation per model.

    Returns:
        dict mapping model name -> dict with the four metrics plus the
        descriptive ``type`` and ``purpose`` strings.
    """
    metric_keys = ('accuracy', 'precision', 'recall', 'f1')

    def summarize(results, model_type, purpose):
        # Copy only the headline metrics and attach the descriptive fields.
        entry = {key: results[key] for key in metric_keys}
        entry['type'] = model_type
        entry['purpose'] = purpose
        return entry

    print("📊 RNN Model Performance Analysis")
    print("=" * 50)

    models = {
        'User Behavior LSTM': summarize(
            behavior_results, 'Sequence Classification', 'Predict next user action'),
        'Description RNN': summarize(
            description_results, 'Text Classification', 'Classify item descriptions'),
        'Temporal Pattern RNN': summarize(
            temporal_results, 'Time Series Classification', 'Predict optimal search times'),
    }

    # --- Metric table -----------------------------------------------------
    rule = "-" * 80
    column_titles = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    print("\n📈 Performance Summary")
    print(rule)
    print(" ".join([f"{'Model':<25}"] + [f"{title:<10}" for title in column_titles]))
    print(rule)
    for name, stats in models.items():
        cells = [f"{name:<25}"] + [f"{stats[key]:<10.4f}" for key in metric_keys]
        print(" ".join(cells))

    # --- Winner (first model wins a tie, following dict order) ------------
    top_name = max(models, key=lambda name: models[name]['accuracy'])
    print(f"\n🏆 Best Performing Model: {top_name}")
    print(f"   Accuracy: {models[top_name]['accuracy']:.4f}")

    # --- Star rating ------------------------------------------------------
    # Minimum accuracy for each rating, checked from best to worst.
    rating_tiers = (
        (0.9, "Excellent ⭐⭐⭐"),
        (0.8, "Very Good ⭐⭐"),
        (0.7, "Good ⭐"),
        (0.6, "Fair"),
    )
    print("\n📋 Model Effectiveness Analysis")
    print("-" * 50)
    for name, stats in models.items():
        acc = stats['accuracy']
        rating = next(
            (label for floor, label in rating_tiers if acc >= floor),
            "Needs Improvement",
        )
        print(f"{name}: {rating} (Acc: {acc:.3f}, F1: {stats['f1']:.3f})")

    # --- Recommendations --------------------------------------------------
    print("\n💡 Recommendations")
    print("-" * 30)
    for name, stats in models.items():
        acc = stats['accuracy']
        if acc < 0.7:
            advice = "Consider retraining with more data or hyperparameter tuning"
        elif acc < 0.8:
            advice = "Good performance, minor improvements possible"
        else:
            advice = "Excellent performance, ready for production"
        print(f"• {name}: {advice}")

    return models

# Run performance analysis; `performance_summary` maps each model name to its
# metrics and is passed to the visualization cell below.
performance_summary = analyze_model_performance(behavior_results, description_results, temporal_results)


## 7. Visualization and Results {#visualization}


In [None]:
# Create comprehensive visualizations
def create_evaluation_visualizations(behavior_results, description_results, temporal_results, performance_summary):
    """Draw a 3x3 dashboard summarising the RNN evaluation and print statistics.

    Panels: (1) accuracy/F1 bars, (2) behavior confusion matrix,
    (3) per-category description accuracy, (4) temporal confusion matrix,
    (5) accuracy pie chart, (6) precision vs recall, (7) radar chart,
    (8) training-set sizes, (9) complexity vs accuracy.

    NOTE: panels 8 and 9 plot hard-coded placeholder numbers (sample counts
    and parameter estimates); they are not derived from the arguments.

    Args:
        behavior_results: Mapping with a ``confusion_matrix`` entry.
        description_results: Mapping with ``categories`` (id -> name),
            ``true_labels`` and ``predictions`` (NumPy arrays).
        temporal_results: Mapping with ``confusion_matrix`` and
            ``time_periods`` (id -> label) entries.
        performance_summary: Mapping model name -> dict with ``accuracy``,
            ``precision``, ``recall`` and ``f1``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Set up the plotting style
    plt.style.use('default')
    plt.figure(figsize=(20, 15))

    models = list(performance_summary.keys())
    accuracies = [performance_summary[model]['accuracy'] for model in models]
    f1_scores = [performance_summary[model]['f1'] for model in models]

    # 1. Model Performance Comparison
    ax1 = plt.subplot(3, 3, 1)
    x = np.arange(len(models))
    width = 0.35

    ax1.bar(x - width/2, accuracies, width, label='Accuracy', alpha=0.8)
    ax1.bar(x + width/2, f1_scores, width, label='F1-Score', alpha=0.8)
    ax1.set_xlabel('Models')
    ax1.set_ylabel('Score')
    ax1.set_title('Model Performance Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels([m.replace(' ', '\n') for m in models], rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. User Behavior LSTM Confusion Matrix
    ax2 = plt.subplot(3, 3, 2)
    cm_behavior = behavior_results['confusion_matrix']
    actions = ['search', 'upload', 'view', 'browse', 'logout']
    sns.heatmap(cm_behavior, annot=True, fmt='d', cmap='Blues',
                xticklabels=actions, yticklabels=actions, ax=ax2)
    ax2.set_title('User Behavior LSTM\nConfusion Matrix')
    ax2.set_xlabel('Predicted')
    ax2.set_ylabel('Actual')

    # 3. Description RNN Category Performance
    ax3 = plt.subplot(3, 3, 3)
    categories = description_results['categories']
    cat_names = list(categories.values())
    true_labels = description_results['true_labels']
    predicted_labels = description_results['predictions']
    cat_accuracies = []

    for cat_id in categories:
        class_mask = true_labels == cat_id
        if np.sum(class_mask) > 0:
            cat_accuracies.append(
                accuracy_score(true_labels[class_mask], predicted_labels[class_mask]))
        else:
            # No samples of this category: plot a zero-height bar.
            cat_accuracies.append(0)

    bars = ax3.bar(range(len(cat_names)), cat_accuracies, alpha=0.8)
    ax3.set_xlabel('Categories')
    ax3.set_ylabel('Accuracy')
    ax3.set_title('Description RNN\nPer-Category Performance')
    ax3.set_xticks(range(len(cat_names)))
    ax3.set_xticklabels(cat_names, rotation=45, ha='right')
    ax3.grid(True, alpha=0.3)

    # Color bars based on performance
    for bar, cat_acc in zip(bars, cat_accuracies):
        if cat_acc >= 0.8:
            bar.set_color('green')
        elif cat_acc >= 0.6:
            bar.set_color('orange')
        else:
            bar.set_color('red')

    # 4. Temporal Pattern RNN Confusion Matrix
    ax4 = plt.subplot(3, 3, 4)
    cm_temporal = temporal_results['confusion_matrix']
    time_periods = list(temporal_results['time_periods'].values())
    # Drop the "(6-9 AM)"-style suffix so the tick labels stay short.
    time_labels = [period.split('(')[0].strip() for period in time_periods]

    sns.heatmap(cm_temporal, annot=True, fmt='d', cmap='Oranges',
                xticklabels=time_labels, yticklabels=time_labels, ax=ax4)
    ax4.set_title('Temporal Pattern RNN\nConfusion Matrix')
    ax4.set_xlabel('Predicted')
    ax4.set_ylabel('Actual')

    # 5. Model Accuracy Distribution
    ax5 = plt.subplot(3, 3, 5)
    model_names = [m.replace(' ', '\n') for m in models]
    colors = ['skyblue', 'lightgreen', 'lightcoral']
    if sum(accuracies) > 0:
        ax5.pie(accuracies, labels=model_names, autopct='%1.1f%%',
                colors=colors, startangle=90)
    else:
        # A pie chart cannot be normalised when every slice is zero (or NaN).
        ax5.text(0.5, 0.5, 'No non-zero accuracies to plot',
                 ha='center', va='center', transform=ax5.transAxes)
        ax5.set_axis_off()
    ax5.set_title('Model Accuracy\nDistribution')

    # 6. Precision vs Recall Scatter
    ax6 = plt.subplot(3, 3, 6)
    precisions = [performance_summary[model]['precision'] for model in models]
    recalls = [performance_summary[model]['recall'] for model in models]

    # Marker size is constant; accuracy is encoded by the colour map.
    scatter = ax6.scatter(precisions, recalls, s=200, alpha=0.7, c=accuracies,
                          cmap='viridis', edgecolors='black')

    for i, model in enumerate(models):
        ax6.annotate(model.replace(' ', '\n'), (precisions[i], recalls[i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    ax6.set_xlabel('Precision')
    ax6.set_ylabel('Recall')
    ax6.set_title('Precision vs Recall\n(Color = Accuracy)')
    ax6.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax6, label='Accuracy')

    # 7. Model Performance Radar Chart
    ax7 = plt.subplot(3, 3, 7, projection='polar')

    # Metrics for radar chart
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
    angles += angles[:1]  # Complete the circle

    for model in models:
        values = [performance_summary[model]['accuracy'],
                  performance_summary[model]['precision'],
                  performance_summary[model]['recall'],
                  performance_summary[model]['f1']]
        values += values[:1]  # Complete the circle

        ax7.plot(angles, values, 'o-', linewidth=2, label=model)
        ax7.fill(angles, values, alpha=0.25)

    ax7.set_xticks(angles[:-1])
    ax7.set_xticklabels(metrics)
    ax7.set_ylim(0, 1)
    ax7.set_title('Model Performance\nRadar Chart', pad=20)
    ax7.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    ax7.grid(True)

    # 8. Training Data Distribution
    ax8 = plt.subplot(3, 3, 8)

    # Placeholder values copied from an earlier training run's output; they
    # are not computed from the arguments of this function.
    sample_sizes = [750, 800, 600]
    model_labels = ['User\nBehavior', 'Description', 'Temporal']

    bars = ax8.bar(model_labels, sample_sizes, alpha=0.8, color=['lightblue', 'lightgreen', 'lightcoral'])
    ax8.set_ylabel('Training Samples')
    ax8.set_title('Training Data\nDistribution')
    ax8.grid(True, alpha=0.3)

    # Add value labels on bars
    for bar, size in zip(bars, sample_sizes):
        ax8.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
                 str(size), ha='center', va='bottom')

    # 9. Model Complexity vs Performance
    ax9 = plt.subplot(3, 3, 9)

    # Placeholder parameter-count estimates, not measured from the models.
    complexity = [50000, 75000, 40000]
    colors = ['red' if acc < 0.7 else 'orange' if acc < 0.8 else 'green'
              for acc in accuracies]

    ax9.scatter(complexity, accuracies, s=300, c=colors, alpha=0.7, edgecolors='black')

    for i, model in enumerate(models):
        ax9.annotate(model.replace(' ', '\n'), (complexity[i], accuracies[i]),
                     xytext=(10, 10), textcoords='offset points', fontsize=8)

    ax9.set_xlabel('Model Complexity (Parameters)')
    ax9.set_ylabel('Accuracy')
    ax9.set_title('Model Complexity\nvs Performance')
    ax9.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\n📊 Visualization Summary")
    print("=" * 40)
    print(f"Total models evaluated: {len(models)}")
    print(f"Average accuracy: {np.mean(accuracies):.3f}")
    print(f"Best accuracy: {np.max(accuracies):.3f}")
    print(f"Worst accuracy: {np.min(accuracies):.3f}")
    print(f"Standard deviation: {np.std(accuracies):.3f}")

# Create visualizations: draws the 3x3 dashboard and prints summary
# statistics; the function returns nothing, so no result is kept.
create_evaluation_visualizations(behavior_results, description_results, temporal_results, performance_summary)


## 8. Model Comparison and Final Report {#comparison}


In [None]:
# Generate comprehensive final report
def generate_final_report(behavior_results, description_results, temporal_results, performance_summary):
    """Print the final evaluation report and return its headline facts.

    The report is written to stdout in sections: executive summary, per-model
    analysis, key findings, recommendations, technical specifications, model
    file check and conclusion.

    Note:
        Reads ``device``, a name that is not defined inside this function and
        must already exist at module (notebook) level when it is called.

    Args:
        behavior_results: Accepted for interface compatibility; not read here.
        description_results: Accepted for interface compatibility; not read here.
        temporal_results: Accepted for interface compatibility; not read here.
        performance_summary: Non-empty mapping of model name -> metrics dict.
            Every metrics dict must provide the keys ``'accuracy'``,
            ``'precision'``, ``'recall'``, ``'f1'``, ``'type'`` and
            ``'purpose'``.

    Returns:
        dict with three keys:
            ``'summary'``: ``total_models``, ``avg_accuracy``, ``best_model``,
            ``worst_model`` and ``production_ready`` (count of models with
            accuracy >= 0.8).
            ``'findings'``: list of bullet-prefixed finding strings.
            ``'recommendations'``: list of bullet-prefixed recommendation
            strings (per-model entries first, then the general ones).

    Raises:
        ValueError: If ``performance_summary`` is empty.
        KeyError: If a metrics dict lacks one of the required keys.
    """
    # Fail before printing anything: an empty summary would otherwise emit a
    # half-written report and then die inside max() with an opaque message.
    if not performance_summary:
        raise ValueError(
            "performance_summary is empty; evaluate at least one model "
            "before generating the final report"
        )

    print("🎯 BARYONYX RNN Model Evaluation - Final Report")
    print("=" * 60)
    print(f"Evaluation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Device Used: {device}")
    print()

    # Executive Summary
    print("📋 EXECUTIVE SUMMARY")
    print("-" * 30)

    best_model = max(performance_summary.items(), key=lambda x: x[1]['accuracy'])
    worst_model = min(performance_summary.items(), key=lambda x: x[1]['accuracy'])
    avg_accuracy = np.mean([m['accuracy'] for m in performance_summary.values()])
    total_models = len(performance_summary)
    # Computed once and reused by the summary, the conclusion and the return value.
    production_ready = sum(1 for m in performance_summary.values() if m['accuracy'] >= 0.8)

    print(f"• Total Models Evaluated: {total_models}")
    print(f"• Average Accuracy: {avg_accuracy:.3f}")
    print(f"• Best Performing Model: {best_model[0]} ({best_model[1]['accuracy']:.3f})")
    print(f"• Worst Performing Model: {worst_model[0]} ({worst_model[1]['accuracy']:.3f})")
    print(f"• Production Ready Models: {production_ready}")
    print()

    # Detailed Model Analysis
    print("🔍 DETAILED MODEL ANALYSIS")
    print("-" * 35)

    for model_name, metrics in performance_summary.items():
        print(f"\n{model_name}:")
        print(f"  Purpose: {metrics['purpose']}")
        print(f"  Type: {metrics['type']}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1']:.4f}")

        # Performance assessment, bucketed by accuracy
        if metrics['accuracy'] >= 0.9:
            status = "🟢 EXCELLENT - Ready for production"
        elif metrics['accuracy'] >= 0.8:
            status = "🟡 GOOD - Minor improvements possible"
        elif metrics['accuracy'] >= 0.7:
            status = "🟠 FAIR - Consider retraining"
        else:
            status = "🔴 POOR - Needs significant improvement"

        print(f"  Status: {status}")

    # Key Findings
    print("\n🔍 KEY FINDINGS")
    print("-" * 20)

    findings = []

    # Check for high-performing models
    excellent_models = [name for name, m in performance_summary.items() if m['accuracy'] >= 0.9]
    if excellent_models:
        findings.append(f"• {len(excellent_models)} model(s) achieved excellent performance (≥90%): {', '.join(excellent_models)}")

    # Check for balanced performance. Balance is a per-model property, so it is
    # measured as the largest |precision - recall| gap; the spread of F1 across
    # models would only show that the models differ from each other.
    largest_pr_gap = max(abs(m['precision'] - m['recall']) for m in performance_summary.values())
    if largest_pr_gap < 0.1:
        findings.append("• Models show balanced precision and recall performance")
    else:
        findings.append("• Some models show imbalanced precision/recall - consider class weighting")

    # Check for consistent performance across models
    if np.std([m['accuracy'] for m in performance_summary.values()]) < 0.1:
        findings.append("• All models show consistent performance levels")
    else:
        findings.append("• Performance varies significantly across models - investigate training differences")

    # Check for overfitting; only the first matching model is reported.
    for name, metrics in performance_summary.items():
        if metrics['precision'] > metrics['recall'] + 0.1:
            findings.append(f"• {name} may be overfitting (precision >> recall)")
            break

    for finding in findings:
        print(finding)

    # Recommendations
    print("\n💡 RECOMMENDATIONS")
    print("-" * 25)

    recommendations = []

    # Model-specific recommendations
    for name, metrics in performance_summary.items():
        if metrics['accuracy'] < 0.7:
            recommendations.append(f"• {name}: Retrain with more data, try different architectures, or adjust hyperparameters")
        elif metrics['accuracy'] < 0.8:
            recommendations.append(f"• {name}: Consider data augmentation or ensemble methods")
        else:
            recommendations.append(f"• {name}: Monitor performance in production, consider A/B testing")

    # General recommendations
    recommendations.extend([
        "• Implement continuous monitoring for model drift",
        "• Set up automated retraining pipelines",
        "• Consider ensemble methods for critical predictions",
        "• Implement confidence thresholds for predictions",
        "• Regular evaluation with real-world data"
    ])

    for i, rec in enumerate(recommendations, 1):
        # The stored entries keep their bullet (callers receive them as-is);
        # drop it here so the numbered list does not read "1. • ...".
        numbered_text = rec[2:] if rec.startswith("• ") else rec
        print(f"{i}. {numbered_text}")

    # Technical Specifications
    print("\n⚙️ TECHNICAL SPECIFICATIONS")
    print("-" * 35)
    print(f"• PyTorch Version: {torch.__version__}")
    print(f"• CUDA Available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"• GPU Device: {torch.cuda.get_device_name(0)}")
    # NOTE(review): sample count and batch size are hard-coded text, not read
    # from the evaluation run - confirm they match the evaluation cells.
    print("• Evaluation Samples: 200 per model")
    print("• Batch Size: 32")
    print(f"• Device: {device}")

    # Model File Locations (paths are relative to the current working directory)
    print("\n📁 MODEL FILES")
    print("-" * 20)
    model_files = [
        "models/rnn_models/user_behavior_lstm.pth",
        "models/rnn_models/description_birnn.pth",
        "models/rnn_models/temporal_pattern_rnn.pth",
        "models/rnn_models/vocab.pkl",
        "models/rnn_models/temporal_data.pkl"
    ]

    for file_path in model_files:
        if os.path.exists(file_path):
            size = os.path.getsize(file_path) / 1024
            print(f"✅ {file_path} ({size:.1f} KB)")
        else:
            print(f"❌ {file_path} - Not found")

    # Conclusion
    print("\n🎯 CONCLUSION")
    print("-" * 15)

    if production_ready == total_models:
        conclusion = "All RNN models are performing excellently and are ready for production deployment."
    elif production_ready >= total_models * 0.7:
        conclusion = "Most RNN models are performing well. Consider minor improvements for underperforming models."
    else:
        conclusion = "Several models need improvement before production deployment. Focus on retraining and optimization."

    print(conclusion)
    print(f"\nOverall System Readiness: {production_ready}/{total_models} models production-ready")

    return {
        'summary': {
            'total_models': total_models,
            'avg_accuracy': avg_accuracy,
            'best_model': best_model[0],
            'worst_model': worst_model[0],
            'production_ready': production_ready
        },
        'findings': findings,
        'recommendations': recommendations
    }

# Print the final report and keep the returned dict (summary, findings,
# recommendations) so it can be passed to save_evaluation_results below.
final_report = generate_final_report(behavior_results, description_results, temporal_results, performance_summary)


## 9. Save Results and Export Data {#export}


In [None]:
# Save evaluation results and export data
def save_evaluation_results(behavior_results, description_results, temporal_results, performance_summary, final_report):
    """Write all evaluation artefacts to the ``evaluation_results`` directory.

    The directory is created relative to the current working directory if it
    does not exist. Seven files are written: a pickle and a JSON dump of the
    complete results, a CSV of per-model metrics, two confusion-matrix CSVs,
    a JSON file of raw predictions and a markdown summary report.

    Note:
        Reads ``device``, a name that is not defined inside this function and
        must already exist at module (notebook) level when it is called.

    Args:
        behavior_results: Mapping read for ``'confusion_matrix'``,
            ``'predictions'`` and ``'true_labels'``.
        description_results: Mapping read for ``'predictions'``,
            ``'true_labels'``, ``'descriptions'`` and ``'categories'``.
        temporal_results: Mapping read for ``'confusion_matrix'``,
            ``'predictions'``, ``'true_labels'`` and ``'time_periods'``
            (a mapping whose values are label strings).
        performance_summary: Mapping of model name -> metrics dict with the
            keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
            ``'type'`` and ``'purpose'``.
        final_report: Dict with ``'summary'``, ``'findings'`` and
            ``'recommendations'`` as used in the markdown report below.

    Returns:
        str: The results directory name (``'evaluation_results'``).
    """
    print("💾 Saving Evaluation Results...")

    # Create results directory
    results_dir = 'evaluation_results'
    os.makedirs(results_dir, exist_ok=True)

    # Sample the clock once so the JSON timestamp and the report date agree.
    evaluated_at = datetime.now()

    # NOTE(review): assumes the behavior confusion matrix is 5x5 with classes
    # in exactly this order - confirm against the evaluation cell.
    action_labels = ['search', 'upload', 'view', 'browse', 'logout']

    def convert_numpy(obj):
        """Recursively turn NumPy containers/scalars into JSON-friendly builtins."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # Covers every NumPy scalar (ints, floats, bool_, str_), not just
            # integers and floats; json cannot encode np.bool_ / np.int64.
            return obj.item()
        if isinstance(obj, dict):
            # json also rejects NumPy scalars used as dict keys.
            return {
                (key.item() if isinstance(key, np.generic) else key): convert_numpy(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            return [convert_numpy(item) for item in obj]
        return obj

    def strip_bullet(text):
        """Drop a leading "• " so markdown list markers are not doubled."""
        return text[2:] if text.startswith("• ") else text

    # Save individual model results
    results_data = {
        'behavior_results': behavior_results,
        'description_results': description_results,
        'temporal_results': temporal_results,
        'performance_summary': performance_summary,
        'final_report': final_report,
        'evaluation_timestamp': evaluated_at.isoformat(),
        'device_used': str(device)
    }

    # Save as pickle (lossless copy of the original Python objects)
    with open(os.path.join(results_dir, 'rnn_evaluation_results.pkl'), 'wb') as f:
        pickle.dump(results_data, f)

    # Save as JSON (NumPy values converted to builtins first)
    with open(os.path.join(results_dir, 'rnn_evaluation_results.json'), 'w', encoding='utf-8') as f:
        json.dump(convert_numpy(results_data), f, indent=2)

    # Create CSV summary; explicit columns keep the header even with no rows.
    summary_rows = [
        {
            'Model': model_name,
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1_Score': metrics['f1'],
            'Type': metrics['type'],
            'Purpose': metrics['purpose']
        }
        for model_name, metrics in performance_summary.items()
    ]
    df = pd.DataFrame(
        summary_rows,
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Type', 'Purpose']
    )
    df.to_csv(os.path.join(results_dir, 'model_performance_summary.csv'), index=False)

    # Create detailed confusion matrices
    # User Behavior Confusion Matrix
    cm_behavior_df = pd.DataFrame(
        behavior_results['confusion_matrix'],
        index=action_labels,
        columns=action_labels
    )
    cm_behavior_df.to_csv(os.path.join(results_dir, 'user_behavior_confusion_matrix.csv'))

    # Temporal Pattern Confusion Matrix; labels are the text before any "(".
    time_periods = list(temporal_results['time_periods'].values())
    time_labels = [period.split('(')[0].strip() for period in time_periods]
    cm_temporal_df = pd.DataFrame(
        temporal_results['confusion_matrix'],
        index=time_labels,
        columns=time_labels
    )
    cm_temporal_df.to_csv(os.path.join(results_dir, 'temporal_pattern_confusion_matrix.csv'))

    # Save model predictions
    predictions_data = {
        'user_behavior': {
            'predictions': behavior_results['predictions'].tolist(),
            'true_labels': behavior_results['true_labels'].tolist(),
            'actions': list(action_labels)
        },
        'description': {
            'predictions': description_results['predictions'].tolist(),
            'true_labels': description_results['true_labels'].tolist(),
            'descriptions': description_results['descriptions'][:50],  # First 50 descriptions
            'categories': description_results['categories']
        },
        'temporal': {
            'predictions': temporal_results['predictions'].tolist(),
            'true_labels': temporal_results['true_labels'].tolist(),
            'time_periods': temporal_results['time_periods']
        }
    }

    # Run the same conversion as the full dump: categories / time_periods are
    # passed through untouched above and may still hold NumPy values or keys.
    with open(os.path.join(results_dir, 'model_predictions.json'), 'w', encoding='utf-8') as f:
        json.dump(convert_numpy(predictions_data), f, indent=2)

    # Create a summary report; parts are collected and joined once at the end.
    report_summary = final_report['summary']
    report_parts = [f"""
# BARYONYX RNN Model Evaluation Report

**Evaluation Date:** {evaluated_at.strftime('%Y-%m-%d %H:%M:%S')}
**Device Used:** {device}

## Executive Summary

- **Total Models Evaluated:** {report_summary['total_models']}
- **Average Accuracy:** {report_summary['avg_accuracy']:.3f}
- **Best Model:** {report_summary['best_model']}
- **Production Ready:** {report_summary['production_ready']}/{report_summary['total_models']} models

## Model Performance

| Model | Accuracy | Precision | Recall | F1-Score |
|-------|----------|-----------|--------|----------|
"""]

    report_parts.extend(
        f"| {model_name} | {metrics['accuracy']:.4f} | {metrics['precision']:.4f} | {metrics['recall']:.4f} | {metrics['f1']:.4f} |\n"
        for model_name, metrics in performance_summary.items()
    )

    report_parts.append("""
## Key Findings

""")
    # Findings/recommendations may arrive prefixed with "• "; strip it so the
    # markdown list marker is not doubled ("- • ...", "1. • ...").
    report_parts.extend(f"- {strip_bullet(finding)}\n" for finding in final_report['findings'])

    report_parts.append("""
## Recommendations

""")
    report_parts.extend(
        f"{i}. {strip_bullet(rec)}\n"
        for i, rec in enumerate(final_report['recommendations'], 1)
    )

    report_parts.append("""
## Files Generated

- `rnn_evaluation_results.pkl` - Complete results (pickle format)
- `rnn_evaluation_results.json` - Complete results (JSON format)
- `model_performance_summary.csv` - Performance metrics summary
- `user_behavior_confusion_matrix.csv` - User Behavior LSTM confusion matrix
- `temporal_pattern_confusion_matrix.csv` - Temporal Pattern RNN confusion matrix
- `model_predictions.json` - Detailed predictions for all models
- `evaluation_summary.md` - This summary report

## Next Steps

1. Review model performance and identify areas for improvement
2. Implement recommended changes
3. Set up continuous monitoring
4. Deploy production-ready models
5. Schedule regular re-evaluation
""")

    # Explicit UTF-8: the findings can contain non-ASCII text (e.g. "≥") that
    # a locale-default codec such as cp1252 cannot encode.
    with open(os.path.join(results_dir, 'evaluation_summary.md'), 'w', encoding='utf-8') as f:
        f.write(''.join(report_parts))

    print(f"✅ Results saved to {results_dir}/")
    print("📁 Files created:")
    print("   - rnn_evaluation_results.pkl")
    print("   - rnn_evaluation_results.json")
    print("   - model_performance_summary.csv")
    print("   - user_behavior_confusion_matrix.csv")
    print("   - temporal_pattern_confusion_matrix.csv")
    print("   - model_predictions.json")
    print("   - evaluation_summary.md")

    return results_dir

# Write every evaluation artefact to disk; the call returns the name of the
# directory the files were written to.
results_directory = save_evaluation_results(behavior_results, description_results, temporal_results, performance_summary, final_report)
