# In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
from utils.data_loader import LibriSpeechDataLoader
from utils.evaluation import ModelEvaluator
from utils.model_utils import decode_predictions
import config

# Seed TensorFlow's and NumPy's global RNGs with the same fixed value
tf.random.set_seed(42)
np.random.seed(42)


def _load_saved_object(path):
    """Return the single Python object stored in the .npy file at ``path``."""
    # allow_pickle=True lets NumPy unpickle object arrays; .item() then
    # extracts the one stored element. Only load files you trust.
    return np.load(path, allow_pickle=True).item()


# Character/number mappings and feature metadata written to disk earlier
char_to_num = _load_saved_object('char_to_num.npy')
num_to_char = _load_saved_object('num_to_char.npy')
feature_info = _load_saved_object('feature_info.npy')

print("Comprehensive Model Evaluation")
print("=" * 50)

# The evaluator is built from the two mappings loaded above
evaluator = ModelEvaluator(char_to_num, num_to_char)

# For every configured feature type keep only the 'test' entry of its
# saved dataset bundle
test_datasets = {
    feature_type: _load_saved_object(f'feature_datasets_{feature_type}.npy')['test']
    for feature_type in config.FEATURE_CONFIG['feature_types']
}

# Load all trained models
models = {}


def _try_load_model(name, path, **load_kwargs):
    """Load the model at ``path`` into ``models[name]``; warn on failure.

    Loading is best-effort: any exception is reported and swallowed so that
    one missing or broken checkpoint does not prevent the others from being
    loaded.
    """
    try:
        models[name] = keras.models.load_model(path, **load_kwargs)
    except Exception as e:
        print(f"Warning: Could not load model '{name}' from {path}: {e}")


# Load baseline models (one per architecture and feature type)
for feature_type in config.FEATURE_CONFIG['feature_types']:
    for architecture in ('cnn', 'lstm', 'transformer'):
        _try_load_model(
            f'{architecture}_{feature_type}',
            f'models/{architecture}_{feature_type}_best.h5',
            # NOTE(review): 'CTCLayer' is mapped to None as in the original
            # code; if the checkpoints really contain that layer, the actual
            # class probably has to be supplied here -- TODO confirm.
            custom_objects={'CTCLayer': None},
        )

# Load tuned models (no custom_objects are passed for these)
for architecture in ('cnn', 'lstm', 'transformer'):
    _try_load_model(f'{architecture}_tuned', f'models/{architecture}_tuned_best.h5')

print(f"Loaded {len(models)} models for evaluation")

# Comprehensive evaluation function
def evaluate_model_comprehensive(model, test_dataset, model_name, num_samples=100,
                                 num_batches=10, samples_per_batch=5):
    """Compute test loss, WER and CER for one model on part of a test set.

    The loss is the batch-size-weighted mean over the first ``num_batches``
    batches. WER and CER are computed on up to ``samples_per_batch`` examples
    from each of those batches, capped at ``num_samples`` examples in total.
    Decoding and scoring are delegated to the module-level ``evaluator``.

    Args:
        model: Object exposing ``test_on_batch``. It is called with the
            inputs ``[features, labels, input_length, label_length]`` and a
            dummy target of zeros.
        test_dataset: Object exposing ``take(n)`` that yields
            ``(features, labels)`` batches.
            NOTE(review): the size of axis 1 of ``features`` / ``labels`` is
            used as the input / label length of every example in the batch,
            so padded positions are counted -- confirm this matches training.
        model_name: Name used in the progress output and stored in the result.
        num_samples: Maximum number of examples scored for WER/CER.
        num_batches: Number of leading batches used for the loss.
        samples_per_batch: Maximum number of examples scored per batch.

    Returns:
        dict with keys ``model``, ``test_loss``, ``wer_scores``,
        ``cer_scores``, ``predictions``, ``actuals``, ``avg_wer`` and
        ``avg_cer``. Numeric values are plain Python floats; the averages and
        the loss are ``inf`` when nothing was evaluated.
    """
    print(f"\nEvaluating {model_name}...")

    results = {
        'model': model_name,
        'test_loss': 0,
        'wer_scores': [],
        'cer_scores': [],
        'predictions': [],
        'actuals': []
    }

    total_loss = 0.0
    example_count = 0

    for features, labels in test_dataset.take(num_batches):
        batch_size = features.shape[0]

        # Create CTC inputs: every example gets the full axis-1 length
        input_length = np.ones((batch_size, 1)) * features.shape[1]
        label_length = np.ones((batch_size, 1)) * labels.shape[1]
        dummy_labels = np.zeros(batch_size)

        loss = model.test_on_batch(
            [features, labels, input_length, label_length],
            dummy_labels
        )
        # test_on_batch returns [loss, *metrics] instead of a scalar when the
        # model has metrics; the first entry is the loss.
        if isinstance(loss, (list, tuple)):
            loss = loss[0]

        # float() strips NumPy scalar types so the result stays JSON-friendly
        total_loss += float(loss) * batch_size
        example_count += batch_size

        # Score a few examples of this batch without exceeding num_samples
        remaining = num_samples - len(results['wer_scores'])
        for i in range(min(batch_size, samples_per_batch, remaining)):
            sample_features = features[i:i+1]  # slice keeps the batch axis
            sample_labels = labels[i]

            prediction = evaluator.predict_and_decode(model, sample_features, num_to_char)
            actual_text = evaluator.numbers_to_text(sample_labels.numpy())

            results['wer_scores'].append(float(evaluator.calculate_wer(actual_text, prediction)))
            results['cer_scores'].append(float(evaluator.calculate_cer(actual_text, prediction)))
            results['predictions'].append(prediction)
            results['actuals'].append(actual_text)

    results['test_loss'] = total_loss / example_count if example_count > 0 else float('inf')
    results['avg_wer'] = float(np.mean(results['wer_scores'])) if results['wer_scores'] else float('inf')
    results['avg_cer'] = float(np.mean(results['cer_scores'])) if results['cer_scores'] else float('inf')

    print(f"  Test Loss: {results['test_loss']:.4f}")
    print(f"  Average WER: {results['avg_wer']:.4f}")
    print(f"  Average CER: {results['avg_cer']:.4f}")

    return results

# Evaluate all models
all_results = []
feature_types = config.FEATURE_CONFIG['feature_types']

for model_name, model in models.items():
    # Determine the feature type from the model name. When several configured
    # types occur in the name (one being a substring of another), take the
    # longest one instead of whichever happens to come first in the config.
    matching_types = [ft for ft in feature_types if ft in model_name]
    feature_type = max(matching_types, key=len) if matching_types else None

    if not feature_type or feature_type not in test_datasets:
        # Make the skip visible; such models are otherwise dropped silently.
        print(f"Warning: Skipping {model_name}: no test dataset matches its name")
        continue

    results = evaluate_model_comprehensive(model, test_datasets[feature_type], model_name)
    all_results.append(results)

# Create results DataFrame
if not all_results:
    # Without this guard the column selection below fails with an opaque
    # KeyError when no model was evaluated.
    raise RuntimeError("No models were evaluated; cannot build the comparison table")

# Select only the scalar summary columns; the per-sample lists stay in
# all_results. Rows are ordered from best (lowest) to worst average WER.
results_df = pd.DataFrame(all_results, columns=['model', 'test_loss', 'avg_wer', 'avg_cer'])
results_df = results_df.sort_values('avg_wer')

print("\n" + "="*50)
print("Comprehensive Model Comparison")
print("="*50)
print(results_df.to_string(index=False))

# Visualize results
plt.figure(figsize=(15, 10))

# Plots 1-3: one bar chart per metric in slots 1..3 of the 2x2 grid.
# Each entry is (results_df column, title, y-axis label, extra bar kwargs);
# the first panel passes no color and therefore uses matplotlib's default.
metric_panels = [
    ('test_loss', 'Test Loss Comparison', 'Loss', {}),
    ('avg_wer', 'Word Error Rate (WER) Comparison', 'WER', {'color': 'orange'}),
    ('avg_cer', 'Character Error Rate (CER) Comparison', 'CER', {'color': 'green'}),
]
for slot, (column, title, ylabel, bar_kwargs) in enumerate(metric_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.bar(results_df['model'], results_df[column], alpha=0.7, **bar_kwargs)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45, ha='right')

# Plot 4: Model Type Performance
plt.subplot(2, 2, 4)

# The first keyword found in the model name decides its type (checked in
# this order); names containing none of them are grouped as 'Other'.
architecture_labels = (('cnn', 'CNN'), ('lstm', 'LSTM'), ('transformer', 'Transformer'))
model_types = [
    next((label for keyword, label in architecture_labels if keyword in name), 'Other')
    for name in results_df['model']
]

# Adds a column to results_df, so it is also present when the frame is
# written to CSV later in the script.
results_df['model_type'] = model_types
type_performance = results_df.groupby('model_type')['avg_wer'].mean()

plt.bar(type_performance.index, type_performance.values, alpha=0.7, color='red')
plt.title('Average WER by Model Type')
plt.ylabel('WER')

plt.tight_layout()
plt.savefig('comprehensive_evaluation.png', dpi=300, bbox_inches='tight')
plt.show()

# Feature type analysis
print("\n" + "="*50)
print("Feature Type Analysis")
print("="*50)

feature_types = config.FEATURE_CONFIG['feature_types']
feature_performance = {}
for feature_type in feature_types:
    # A model belongs to the longest configured feature type contained in its
    # name, so it is not also counted under a shorter type whose name happens
    # to be a substring of the real one.
    feature_models = [
        r for r in all_results
        if max((ft for ft in feature_types if ft in r['model']), key=len, default=None) == feature_type
    ]
    if feature_models:
        avg_wer = np.mean([r['avg_wer'] for r in feature_models])
        feature_performance[feature_type] = avg_wer
        print(f"{feature_type}: Average WER = {avg_wer:.4f}")

# Best model identification: the row of results_df with the smallest avg_wer
best_model_idx = results_df['avg_wer'].idxmin()
best_model = results_df.loc[best_model_idx]

# Banner: blank line, rule, heading, rule
separator = '=' * 50
print("\n" + separator, "BEST MODEL IDENTIFIED", separator, sep="\n")

print(f"Model: {best_model['model']}")
for label, column in (
    ("Test Loss", 'test_loss'),
    ("Word Error Rate", 'avg_wer'),
    ("Character Error Rate", 'avg_cer'),
):
    print(f"{label}: {best_model[column]:.4f}")

# Sample predictions from best model
print(f"\n{'='*50}")
print("Sample Predictions from Best Model")
print(f"{'='*50}")

best_model_name = best_model['model']
best_model_obj = models[best_model_name]

# Determine feature type for best model: the longest configured type that
# occurs in its name, so the result does not depend on the config order when
# one type name is a substring of another.
matching_types = [
    ft for ft in config.FEATURE_CONFIG['feature_types'] if ft in best_model_name
]
best_feature_type = max(matching_types, key=len) if matching_types else None

if best_feature_type:
    test_ds = test_datasets[best_feature_type]

    print("Sample predictions (showing first 5):")
    sample_count = 0
    for features, labels in test_ds.take(3):
        # Never take more examples from a batch than are still needed
        for i in range(min(features.shape[0], 5 - sample_count)):
            sample_features = features[i:i+1]  # slice keeps the batch axis
            sample_labels = labels[i]

            # Get prediction
            prediction = evaluator.predict_and_decode(best_model_obj, sample_features, num_to_char)

            # Get actual text
            actual_text = evaluator.numbers_to_text(sample_labels.numpy())

            # Calculate metrics
            wer = evaluator.calculate_wer(actual_text, prediction)
            cer = evaluator.calculate_cer(actual_text, prediction)

            print(f"\nSample {sample_count + 1}:")
            print(f"  Actual: '{actual_text}'")
            print(f"  Predicted: '{prediction}'")
            print(f"  WER: {wer:.4f}, CER: {cer:.4f}")

            sample_count += 1

        # Stop before another batch is fetched once enough samples were shown
        if sample_count >= 5:
            break
else:
    print(f"Warning: no feature type matches '{best_model_name}'; no samples shown")

# Save comprehensive results
def _to_json_builtin(value):
    """json fallback: convert NumPy scalars/arrays to plain Python values.

    Raises:
        TypeError: for any other type, mirroring json's own behaviour.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


comprehensive_results = {
    'all_results': all_results,
    'best_model': best_model.to_dict(),
    'feature_performance': feature_performance,
    'summary_stats': {
        'total_models_evaluated': len(all_results),
        'best_wer': best_model['avg_wer'],
        'best_cer': best_model['avg_cer'],
        'average_wer': results_df['avg_wer'].mean(),
        'average_cer': results_df['avg_cer'].mean()
    }
}

# Serialize before opening the file so a serialization error cannot leave a
# truncated JSON file behind.
serialized_results = json.dumps(comprehensive_results, indent=2, default=_to_json_builtin)
with open('comprehensive_evaluation_results.json', 'w') as f:
    f.write(serialized_results)

# Save results DataFrame
results_df.to_csv('model_evaluation_results.csv', index=False)

print(f"\n{'='*50}")
print("EVALUATION COMPLETED!")
print(f"{'='*50}")
print("Results saved to:")
print("- comprehensive_evaluation_results.json")
print("- model_evaluation_results.csv")
print("- comprehensive_evaluation.png")
print(f"\nBest model: {best_model['model']}")
print(f"Best WER: {best_model['avg_wer']:.4f}")