# 📊 Model Evaluation for Insurance LLaMA

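This notebook evaluates the fine-tuned LLaMA model on insurance-specific tasks: claim classification, policy summarization, FAQ generation, compliance checking, and contract question answering.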

## What this notebook does:
1. Load the fine-tuned model and test datasets
2. Run comprehensive evaluations for each task type
3. Calculate task-appropriate metrics (ROUGE, BLEU, F1, accuracy, exact match)
4. Generate sample predictions and compare with ground truth
5. Create evaluation reports and visualizations
6. Analyze model performance and identify areas for improvement

**⚠️ Important: Make sure you have a trained model from notebook 03**

## 1. Import Libraries and Setup

In [None]:
import os
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import warnings
from datetime import datetime
from tqdm.auto import tqdm

# Core ML libraries
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig
)
from peft import PeftModel
from datasets import Dataset, load_from_disk

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer data used by word_tokenize / sent_tokenize. Older NLTK releases
# read the pickled 'punkt' models, while newer releases look up 'punkt_tab'
# instead, so both are fetched to keep tokenization working on either.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Report the runtime environment the evaluation will use.
print("✅ Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Configuration and Paths

In [None]:
# Model and data paths
BASE_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
LORA_MODEL_PATH = Path("outputs/final_model/lora_model")
TOKENIZED_DATA_DIR = Path("data/tokenized")
EVALUATION_RESULTS_DIR = Path("outputs/evaluation")

# Create the evaluation results directory. parents=True is required because
# the parent "outputs" directory may not exist yet (e.g. on a fresh checkout);
# without it mkdir raises FileNotFoundError.
EVALUATION_RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Generation configuration (passed as keyword arguments to model.generate)
GENERATION_CONFIG = {
    "max_new_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "do_sample": True,
    "pad_token_id": None,  # Will be set when tokenizer is loaded
    "repetition_penalty": 1.1,
}

# Insurance task types: task identifier -> human-readable task category
TASK_TYPES = {
    'CLAIM_CLASSIFICATION': 'Classification',
    'POLICY_SUMMARIZATION': 'Summarization',
    'FAQ_GENERATION': 'Generation',
    'COMPLIANCE_CHECK': 'Analysis',
    'CONTRACT_QA': 'Question Answering',
}

# Evaluation metrics configuration
ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
CLASSIFICATION_METRICS = ['accuracy', 'precision', 'recall', 'f1']

print("Configuration loaded:")
print(f"- Base model: {BASE_MODEL_NAME}")
print(f"- LoRA model path: {LORA_MODEL_PATH}")
print(f"- Evaluation results: {EVALUATION_RESULTS_DIR}")
print(f"- Max new tokens: {GENERATION_CONFIG['max_new_tokens']}")

## 3. Load Fine-tuned Model

In [None]:
def load_finetuned_model() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load the base model with the LoRA adapters applied, plus its tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair, or ``(None, None)`` when the adapter
        directory is missing or any loading step raises.

    Side effects:
        Sets ``GENERATION_CONFIG['pad_token_id']`` from the loaded tokenizer.
    """
    print(f"Loading fine-tuned model from {LORA_MODEL_PATH}...")

    # Nothing to load if the training notebook has not produced adapters yet.
    adapter_dir = LORA_MODEL_PATH
    if not adapter_dir.exists():
        print(f"❌ LoRA model not found at {LORA_MODEL_PATH}")
        print("Please run notebook 03_finetuning_lora.ipynb first to train the model")
        return None, None

    try:
        # The tokenizer is read from the adapter directory.
        tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

        print(f"Loading base model: {BASE_MODEL_NAME}")
        base_kwargs = {
            "torch_dtype": torch.float16,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, **base_kwargs)

        print(f"Loading LoRA adapters...")
        model = PeftModel.from_pretrained(base_model, adapter_dir)

        # Generation needs a pad token; fall back to EOS when none is defined.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        GENERATION_CONFIG['pad_token_id'] = tokenizer.pad_token_id

        print(f"✅ Model loaded successfully")
        print(f"  Tokenizer vocab size: {len(tokenizer)}")
        print(f"  Model device: {next(model.parameters()).device}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return None, None

    return model, tokenizer

def load_baseline_model() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load the untouched base model and tokenizer for comparison runs.

    Returns:
        A ``(model, tokenizer)`` pair, or ``(None, None)`` if loading raises.
    """
    print(f"Loading baseline model: {BASE_MODEL_NAME}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        load_kwargs = {
            "torch_dtype": torch.float16,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, **load_kwargs)

        # Fall back to EOS as the pad token when the tokenizer defines none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"✅ Baseline model loaded")
    except Exception as e:
        print(f"⚠️ Could not load baseline model: {e}")
        return None, None

    return model, tokenizer

# Load models
print("Loading fine-tuned model...")
finetuned_model, finetuned_tokenizer = load_finetuned_model()

# Bind the baseline names up front so they exist on every path; otherwise a
# failed fine-tuned load would leave them undefined and any later reference
# would raise NameError instead of seeing "no baseline available".
baseline_model, baseline_tokenizer = None, None

if finetuned_model is None:
    print("❌ Cannot proceed without a fine-tuned model")
else:
    print("\nOptionally loading baseline model for comparison...")
    baseline_model, baseline_tokenizer = load_baseline_model()

## 4. Load Test Datasets

In [None]:
def load_test_datasets() -> Dict[str, Dataset]:
    """Collect every tokenized test split that exists on disk.

    Looks for the combined split first, then one split per entry of
    ``TASK_TYPES``. Missing directories are skipped silently; load failures
    are reported and skipped.

    Returns:
        Mapping of ``'combined'`` / task identifier to the loaded dataset.
        Empty when nothing could be loaded.
    """
    print(f"Loading test datasets from {TOKENIZED_DATA_DIR}...")

    loaded = {}

    def _load_into(key, path, ok_label, fail_label):
        # Load one split into ``loaded`` if its directory exists.
        if not path.exists():
            return
        try:
            split = load_from_disk(path)
            loaded[key] = split
            print(f"✅ {ok_label} test dataset: {len(split)} examples")
        except Exception as e:
            print(f"⚠️ Could not load {fail_label} test dataset: {e}")

    # Combined split first, then each individual task split.
    _load_into('combined', TOKENIZED_DATA_DIR / "combined" / "test_hf", "Combined", "combined")
    for task_name in TASK_TYPES:
        _load_into(task_name, TOKENIZED_DATA_DIR / task_name.lower() / "test_hf", task_name, task_name)

    if not loaded:
        print("❌ No test datasets found")
        print("Please run the previous notebooks to create test data")

    return loaded

def load_original_test_data() -> List[Dict]:
    """Load the original (pre-tokenization) test examples from JSON.

    Reads ``data/processed/combined/test.json`` relative to the current
    working directory.

    Returns:
        The list of example dicts, or an empty list when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON list.
    """
    original_test_path = Path("data/processed/combined/test.json")

    if not original_test_path.exists():
        print(f"⚠️ Original test data not found at {original_test_path}")
        return []

    try:
        # Read as UTF-8 explicitly: the platform default encoding (e.g. on
        # Windows) would otherwise mis-decode non-ASCII text.
        with open(original_test_path, 'r', encoding='utf-8') as f:
            original_test_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Could not load original test data: {e}")
        return []

    # Callers iterate the result and call .get() on each item, so anything
    # other than a list would fail later in a confusing way.
    if not isinstance(original_test_data, list):
        print(
            "⚠️ Could not load original test data: expected a JSON list, "
            f"got {type(original_test_data).__name__}"
        )
        return []

    print(f"✅ Original test data loaded: {len(original_test_data)} examples")
    return original_test_data

# Load test data only when there is a model to evaluate; otherwise fall back
# to empty containers so later cells can still run.
if finetuned_model is None:
    test_datasets = {}
    original_test_data = []
else:
    test_datasets = load_test_datasets()
    original_test_data = load_original_test_data()

    if test_datasets:
        print(f"\n📊 Test Data Summary:")
        total_examples = sum(map(len, test_datasets.values()))
        print(f"  Total test examples: {total_examples}")

        # Per-split breakdown
        for name, dataset in test_datasets.items():
            print(f"  {name}: {len(dataset)} examples")

## 5. Evaluation Functions

In [None]:
def generate_response(model, tokenizer, prompt: str, generation_config: dict) -> str:
    """Run one prompt through the model and return only the newly generated text.

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        prompt: Full prompt text.
        generation_config: Keyword arguments forwarded to ``model.generate``.

    Returns:
        The decoded continuation with surrounding whitespace stripped, or an
        empty string if any step raises (the error is printed).
    """
    try:
        # Encode the prompt and move every tensor onto the model's device.
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048,
        )
        model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            generated = model.generate(**model_inputs, **generation_config, use_cache=True)

        # The output echoes the prompt tokens first; keep only what follows.
        prompt_len = model_inputs['input_ids'].shape[1]
        continuation = generated[0][prompt_len:]
        return tokenizer.decode(continuation, skip_special_tokens=True).strip()

    except Exception as e:
        print(f"⚠️ Error generating response: {e}")
        return ""

def calculate_rouge_scores(predictions: List[str], references: List[str]) -> Dict[str, float]:
    """Compute mean ROUGE precision, recall and F-measure over all pairs.

    Args:
        predictions: Model outputs.
        references: Expected outputs, paired with ``predictions`` by position
            (extra items in the longer list are ignored, as with ``zip``).

    Returns:
        Dict with keys ``"<rouge_type>_precision"``, ``"<rouge_type>_recall"``
        and ``"<rouge_type>_fmeasure"`` for every type in ``ROUGE_TYPES``.
        All values are 0.0 when there are no pairs to score.
    """
    scorer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)

    # Score each pair exactly once; the result already carries precision,
    # recall and F-measure for every requested ROUGE type.
    pair_scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]

    avg_scores = {}
    for rouge_type in ROUGE_TYPES:
        for component in ("precision", "recall", "fmeasure"):
            values = [getattr(score[rouge_type], component) for score in pair_scores]
            # np.mean([]) would return NaN with a warning; report 0.0 instead.
            avg_scores[f"{rouge_type}_{component}"] = float(np.mean(values)) if values else 0.0

    return avg_scores

def calculate_bleu_scores(predictions: List[str], references: List[str]) -> Dict[str, float]:
    """Compute sentence-level BLEU (mean and standard deviation) over all pairs.

    Both texts are lower-cased and word-tokenized before scoring. A pair whose
    BLEU computation raises is scored as 0.0, and the number of such pairs is
    reported once at the end.

    Args:
        predictions: Model outputs.
        references: Expected outputs, paired with ``predictions`` by position.

    Returns:
        Dict with ``'bleu_score'`` (mean) and ``'bleu_std'`` (standard
        deviation). Both are 0.0 when there are no pairs to score.
    """
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    failed_pairs = 0

    for pred, ref in zip(predictions, references):
        # Tokenize
        pred_tokens = word_tokenize(pred.lower())
        ref_tokens = [word_tokenize(ref.lower())]

        # Calculate BLEU. Catch Exception rather than using a bare except so
        # KeyboardInterrupt / SystemExit still stop a long evaluation run.
        try:
            bleu_scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothing))
        except Exception:
            failed_pairs += 1
            bleu_scores.append(0.0)

    if failed_pairs:
        print(f"⚠️ BLEU could not be computed for {failed_pairs} example(s); scored as 0.0")

    # np.mean / np.std of an empty list would return NaN with a warning.
    if not bleu_scores:
        return {'bleu_score': 0.0, 'bleu_std': 0.0}

    return {
        'bleu_score': float(np.mean(bleu_scores)),
        'bleu_std': float(np.std(bleu_scores)),
    }

def calculate_classification_metrics(predictions: List[str], references: List[str]) -> Dict[str, float]:
    """Compute accuracy and weighted precision / recall / F1 for label strings.

    Labels are compared after stripping whitespace and lower-casing, the same
    normalization the exact-match comparisons elsewhere in this notebook use.
    Without it, a correct prediction would count as wrong whenever the
    reference label differs only in case or padding.

    Args:
        predictions: Predicted labels.
        references: Expected labels, paired with ``predictions`` by position.

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
        All values are 0.0 when there are no labels to score.
    """
    y_true = [str(label).strip().lower() for label in references]
    y_pred = [str(label).strip().lower() for label in predictions]

    # Nothing to score: return zeros rather than passing empty input on.
    if not y_true and not y_pred:
        return {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    return {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
    }

def extract_answer_from_response(response: str, task_type: str) -> str:
    """Reduce a raw model response to the short answer used for scoring.

    For ``'CLAIM_CLASSIFICATION'`` the first matching label pattern wins and
    the captured label is returned lower-cased. For every other task, and for
    classification responses that match no pattern, the first sentence of the
    stripped response is returned.

    Args:
        response: Raw generated text.
        task_type: Task identifier of the example.

    Returns:
        The extracted answer, or ``""`` for an empty response.
    """
    import re

    if not response:
        return ""

    cleaned = response.strip()

    if task_type == 'CLAIM_CLASSIFICATION':
        # Tried in order; e.g. "This is a X claim", "X claim", "Category: X".
        label_patterns = (
            r'This is an? ([\w_]+) claim',
            r'([\w_]+) claim',
            r'Category: ([\w_]+)',
            r'Type: ([\w_]+)',
        )
        attempts = (re.search(pattern, cleaned, re.IGNORECASE) for pattern in label_patterns)
        first_hit = next((hit for hit in attempts if hit), None)
        if first_hit is not None:
            return first_hit.group(1).lower().replace(' ', '_')

    # NOTE(review): generation tasks are scored on this first sentence only,
    # not the whole response — confirm that is intended for ROUGE/BLEU.
    sentences = sent_tokenize(cleaned)
    if sentences:
        return sentences[0]

    # No sentence found: fall back to at most the first 100 characters.
    return cleaned[:100]

print("✅ Evaluation functions defined")

## 6. Run Comprehensive Evaluation

In [None]:
def evaluate_model_on_dataset(model, tokenizer, test_examples: List[Dict], task_type: str) -> Dict:
    """Generate a prediction for every example and score them for one task.

    Args:
        model: Model passed through to ``generate_response``.
        tokenizer: Tokenizer passed through to ``generate_response``.
        test_examples: Dicts read via the ``'instruction'``, ``'input'`` and
            ``'output'`` keys (each defaults to ``''`` when absent).
        task_type: Task identifier; selects which metrics are computed.

    Returns:
        Dict with ``'task_type'``, ``'num_examples'``, ``'metrics'`` and
        ``'samples'`` (up to five prompt/prediction/reference records).
    """
    print(f"\nEvaluating {task_type} ({len(test_examples)} examples)...")

    prompts, predictions, references = [], [], []

    for example in tqdm(test_examples, desc=f"Evaluating {task_type}"):
        instruction = example.get('instruction', '')
        input_text = example.get('input', '')
        expected_output = example.get('output', '')

        # Wrap the example in [INST] ... [/INST] markers before generating.
        prompt = f"[INST] {instruction}\n\n{input_text} [/INST]"
        raw_response = generate_response(model, tokenizer, prompt, GENERATION_CONFIG)

        prompts.append(prompt)
        predictions.append(extract_answer_from_response(raw_response, task_type))
        references.append(expected_output)

    def _same_text(left, right):
        # Case- and whitespace-insensitive equality.
        return left.lower().strip() == right.lower().strip()

    # Pick metrics by task family; unknown task types get no metrics.
    metrics = {}
    if task_type == 'CLAIM_CLASSIFICATION':
        metrics.update(calculate_classification_metrics(predictions, references))
    elif task_type == 'CONTRACT_QA':
        # QA: ROUGE plus the fraction of exact matches.
        rouge_scores = calculate_rouge_scores(predictions, references)
        exact_matches = [_same_text(pred, ref) for pred, ref in zip(predictions, references)]
        metrics.update(rouge_scores)
        metrics['exact_match'] = np.mean(exact_matches)
    elif task_type in ('POLICY_SUMMARIZATION', 'FAQ_GENERATION', 'COMPLIANCE_CHECK'):
        # Free-text tasks: ROUGE and BLEU.
        rouge_scores = calculate_rouge_scores(predictions, references)
        bleu_scores = calculate_bleu_scores(predictions, references)
        metrics.update(rouge_scores)
        metrics.update(bleu_scores)

    # Keep the first few examples for manual review; long prompts are cut
    # to 200 characters and marked with an ellipsis.
    samples = [
        {
            'prompt': prompt[:200] + '...' if len(prompt) > 200 else prompt,
            'prediction': pred,
            'reference': ref,
            'match': _same_text(pred, ref),
        }
        for prompt, pred, ref in zip(prompts[:5], predictions[:5], references[:5])
    ]

    return {
        'task_type': task_type,
        'num_examples': len(test_examples),
        'metrics': metrics,
        'samples': samples,
    }

def run_full_evaluation() -> Dict[str, Dict]:
    """Evaluate the fine-tuned model on every task in the original test data.

    Reads the module-level ``original_test_data``, ``finetuned_model`` and
    ``finetuned_tokenizer``. Examples without a ``'task_type'`` key are
    treated as ``'POLICY_SUMMARIZATION'``.

    Returns:
        Mapping of task identifier to the result dict produced by
        ``evaluate_model_on_dataset``; empty when there is no test data.
    """
    if not original_test_data:
        print("❌ No test data available for evaluation")
        return {}

    print(f"🔍 Starting comprehensive evaluation...")
    print(f"Evaluating on {len(original_test_data)} test examples")

    # Bucket examples by task, keeping first-seen task order.
    by_task = {}
    for example in original_test_data:
        by_task.setdefault(example.get('task_type', 'POLICY_SUMMARIZATION'), []).append(example)

    print(f"\nTask distribution:")
    for task_type, bucket in by_task.items():
        print(f"  {task_type}: {len(bucket)} examples")

    # Evaluate each non-empty bucket in turn.
    evaluation_results = {}
    for task_type, bucket in by_task.items():
        if not bucket:
            continue
        evaluation_results[task_type] = evaluate_model_on_dataset(
            finetuned_model, finetuned_tokenizer, bucket, task_type
        )

    return evaluation_results

# Evaluation needs both a loaded model and test examples; without either,
# leave an empty result set so later cells can still run.
if finetuned_model is None or not original_test_data:
    print("⚠️ Skipping evaluation - model or test data not available")
    evaluation_results = {}
else:
    evaluation_results = run_full_evaluation()

    print(f"\n✅ Evaluation completed!")
    print(f"Evaluated {len(evaluation_results)} task types")

## 7. Results Analysis and Visualization

In [None]:
def create_results_summary(evaluation_results: Dict[str, Dict]) -> pd.DataFrame:
    """Build a one-row-per-task table of the headline metrics.

    Args:
        evaluation_results: Mapping of task identifier to a result dict with
            ``'num_examples'`` and ``'metrics'`` entries.

    Returns:
        DataFrame with task columns plus one column per headline metric that
        is present, formatted to three decimals. Empty for empty input.
    """
    if not evaluation_results:
        return pd.DataFrame()

    # (metric key, column label) in the order the columns should appear.
    headline_metrics = (
        ('accuracy', 'Accuracy'),
        ('f1', 'F1 Score'),
        ('rouge1_fmeasure', 'ROUGE-1'),
        ('rouge2_fmeasure', 'ROUGE-2'),
        ('rougeL_fmeasure', 'ROUGE-L'),
        ('bleu_score', 'BLEU'),
        ('exact_match', 'Exact Match'),
    )

    rows = []
    for task_type, results in evaluation_results.items():
        row = {
            'Task Type': task_type,
            'Task Name': TASK_TYPES.get(task_type, task_type),
            'Examples': results['num_examples'],
        }
        metrics = results['metrics']
        # Only metrics that were actually computed for this task get a cell.
        row.update(
            {label: f"{metrics[key]:.3f}" for key, label in headline_metrics if key in metrics}
        )
        rows.append(row)

    return pd.DataFrame(rows)

def plot_evaluation_metrics(evaluation_results: Dict[str, Dict]):
    """Draw a 2x2 figure summarizing the evaluation and save it as a PNG.

    Panels: primary metric per task (bar), ROUGE scores per task (grouped
    bar), share of test examples per task (pie), and the mean of each
    headline metric across tasks (horizontal bar).

    Args:
        evaluation_results: Mapping of task identifier to a result dict with
            ``'metrics'`` and ``'num_examples'`` entries.

    Side effects:
        Writes ``evaluation_plots.png`` into ``EVALUATION_RESULTS_DIR`` and
        shows the figure. Prints a message and returns without plotting when
        ``evaluation_results`` is empty.
    """
    
    if not evaluation_results:
        print("No evaluation results to plot")
        return
    
    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")
    
    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('LLaMA Insurance Model Evaluation Results', fontsize=16, fontweight='bold')
    
    # Plot 1: Task Performance Overview
    task_names = []
    primary_scores = []
    
    for task_type, results in evaluation_results.items():
        task_names.append(TASK_TYPES.get(task_type, task_type))
        metrics = results['metrics']
        
        # Choose primary metric based on task type.
        # Precedence is accuracy, then ROUGE-1 F-measure, then exact match,
        # so a task that has both ROUGE and exact match is shown by ROUGE-1.
        if 'accuracy' in metrics:
            primary_scores.append(metrics['accuracy'])
        elif 'rouge1_fmeasure' in metrics:
            primary_scores.append(metrics['rouge1_fmeasure'])
        elif 'exact_match' in metrics:
            primary_scores.append(metrics['exact_match'])
        else:
            # No recognized metric for this task: plot a zero-height bar.
            primary_scores.append(0.0)
    
    axes[0, 0].bar(task_names, primary_scores, color='skyblue', alpha=0.7)
    axes[0, 0].set_title('Primary Metric by Task')
    axes[0, 0].set_ylabel('Score')
    axes[0, 0].set_ylim(0, 1)
    axes[0, 0].tick_params(axis='x', rotation=45)
    
    # Add value labels on bars
    for i, score in enumerate(primary_scores):
        axes[0, 0].text(i, score + 0.01, f'{score:.3f}', ha='center', va='bottom')
    
    # Plot 2: ROUGE Scores (for text generation tasks)
    rouge_data = {'Task': [], 'ROUGE-1': [], 'ROUGE-2': [], 'ROUGE-L': []}
    
    for task_type, results in evaluation_results.items():
        metrics = results['metrics']
        # Only tasks that have ROUGE-1 contribute a row to this panel.
        if 'rouge1_fmeasure' in metrics:
            rouge_data['Task'].append(TASK_TYPES.get(task_type, task_type))
            rouge_data['ROUGE-1'].append(metrics.get('rouge1_fmeasure', 0))
            rouge_data['ROUGE-2'].append(metrics.get('rouge2_fmeasure', 0))
            rouge_data['ROUGE-L'].append(metrics.get('rougeL_fmeasure', 0))
    
    if rouge_data['Task']:
        rouge_df = pd.DataFrame(rouge_data)
        rouge_df.set_index('Task').plot(kind='bar', ax=axes[0, 1], alpha=0.7)
        axes[0, 1].set_title('ROUGE Scores by Task')
        axes[0, 1].set_ylabel('ROUGE Score')
        axes[0, 1].legend(loc='upper right')
        axes[0, 1].tick_params(axis='x', rotation=45)
    else:
        # Placeholder text keeps the panel from being left blank.
        axes[0, 1].text(0.5, 0.5, 'No ROUGE scores available', ha='center', va='center', transform=axes[0, 1].transAxes)
        axes[0, 1].set_title('ROUGE Scores by Task')
    
    # Plot 3: Sample Size Distribution
    sample_sizes = [results['num_examples'] for results in evaluation_results.values()]
    task_labels = [TASK_TYPES.get(task_type, task_type) for task_type in evaluation_results.keys()]
    
    axes[1, 0].pie(sample_sizes, labels=task_labels, autopct='%1.1f%%', startangle=90)
    axes[1, 0].set_title('Test Sample Distribution')
    
    # Plot 4: Metric Comparison
    metric_comparison = {'Metric': [], 'Average Score': []}
    
    # Collect all metrics
    # (headline metrics only: every *_fmeasure plus accuracy, f1,
    # exact_match and bleu_score; precision/recall/std are left out)
    all_metrics = {}
    for results in evaluation_results.values():
        for metric, value in results['metrics'].items():
            if metric.endswith('_fmeasure') or metric in ['accuracy', 'f1', 'exact_match', 'bleu_score']:
                if metric not in all_metrics:
                    all_metrics[metric] = []
                all_metrics[metric].append(value)
    
    for metric, values in all_metrics.items():
        if values:  # Only include metrics with values
            # Turn e.g. 'rouge1_fmeasure' into the axis label 'Rouge1'.
            metric_comparison['Metric'].append(metric.replace('_fmeasure', '').replace('_', ' ').title())
            metric_comparison['Average Score'].append(np.mean(values))
    
    if metric_comparison['Metric']:
        axes[1, 1].barh(metric_comparison['Metric'], metric_comparison['Average Score'], alpha=0.7)
        axes[1, 1].set_title('Average Metric Scores')
        axes[1, 1].set_xlabel('Average Score')
        axes[1, 1].set_xlim(0, 1)
    else:
        axes[1, 1].text(0.5, 0.5, 'No metrics available', ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Average Metric Scores')
    
    plt.tight_layout()
    
    # Save plot
    # (saved before plt.show() so the file is written from the live figure)
    plot_file = EVALUATION_RESULTS_DIR / 'evaluation_plots.png'
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    print(f"✅ Evaluation plots saved to: {plot_file}")
    
    plt.show()

def display_sample_predictions(evaluation_results: Dict[str, Dict]):
    """Print up to three sample predictions per task for manual inspection.

    Args:
        evaluation_results: Mapping of task identifier to a result dict whose
            ``'samples'`` entry is a list of dicts with ``'prompt'``,
            ``'prediction'``, ``'reference'`` and ``'match'`` keys.

    Does nothing when ``evaluation_results`` is empty.
    """
    if not evaluation_results:
        return

    banner = "=" * 80
    print("\n" + banner)
    print("SAMPLE PREDICTIONS FOR MANUAL REVIEW")
    print(banner)

    for task_type, results in evaluation_results.items():
        print(f"\n🔍 {task_type} ({TASK_TYPES.get(task_type, task_type)})")
        print("-" * 60)

        # Only the first three stored samples are shown.
        shown = results['samples'][:3]
        for number, sample in enumerate(shown, start=1):
            verdict = '✅' if sample['match'] else '❌'
            print(f"\nExample {number}:")
            print(f"Prompt: {sample['prompt']}")
            print(f"Prediction: {sample['prediction']}")
            print(f"Reference: {sample['reference']}")
            print(f"Match: {verdict}")
            print("-" * 40)

# Summarize, plot and print samples - but only when something was evaluated.
if not evaluation_results:
    print("⚠️ No evaluation results to analyze")
else:
    print("\n📊 Generating results analysis...")

    # Summary table
    results_summary = create_results_summary(evaluation_results)

    print("\n📈 EVALUATION RESULTS SUMMARY")
    print("=" * 50)
    if results_summary.empty:
        print("No results to display")
    else:
        print(results_summary.to_string(index=False))

    # Charts, then sample predictions for manual review
    plot_evaluation_metrics(evaluation_results)
    display_sample_predictions(evaluation_results)

## 8. Save Evaluation Results

In [None]:
def save_evaluation_results(evaluation_results: Dict[str, Dict]):
    """Persist the evaluation as JSON, CSV and a Markdown report.

    Args:
        evaluation_results: Mapping of task identifier to a result dict with
            ``'task_type'``, ``'num_examples'``, ``'metrics'`` and
            ``'samples'`` entries.

    Side effects:
        Writes ``evaluation_results.json`` and, when the summary table is not
        empty, ``evaluation_summary.csv`` into ``EVALUATION_RESULTS_DIR``,
        then calls ``create_evaluation_report``. Prints a message and returns
        without writing anything when ``evaluation_results`` is empty.
    """
    if not evaluation_results:
        print("No evaluation results to save")
        return

    print(f"💾 Saving evaluation results to {EVALUATION_RESULTS_DIR}...")

    results_file = EVALUATION_RESULTS_DIR / 'evaluation_results.json'

    # Copy only the fields that belong in the JSON payload.
    exported_fields = ('task_type', 'num_examples', 'metrics', 'samples')
    json_results = {
        task_type: {field: results[field] for field in exported_fields}
        for task_type, results in evaluation_results.items()
    }

    # Run-level context stored alongside the per-task results.
    evaluation_metadata = {
        'evaluation_date': datetime.now().isoformat(),
        'model_path': str(LORA_MODEL_PATH),
        'base_model': BASE_MODEL_NAME,
        'generation_config': GENERATION_CONFIG,
        'total_examples': sum(results['num_examples'] for results in evaluation_results.values()),
        'tasks_evaluated': list(evaluation_results.keys()),
    }

    payload = {'metadata': evaluation_metadata, 'results': json_results}
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    print(f"✅ Detailed results saved to: {results_file}")

    # Summary table as CSV (skipped when there is nothing to tabulate).
    results_summary = create_results_summary(evaluation_results)
    if not results_summary.empty:
        csv_file = EVALUATION_RESULTS_DIR / 'evaluation_summary.csv'
        results_summary.to_csv(csv_file, index=False)
        print(f"✅ Summary CSV saved to: {csv_file}")

    # Markdown report
    create_evaluation_report(evaluation_results, evaluation_metadata)

def create_evaluation_report(evaluation_results: Dict[str, Dict], metadata: Dict):
    """Write a Markdown evaluation report to ``evaluation_report.md``.

    The report has an overview built from ``metadata``, the generation
    configuration as JSON, one section per task with all of its metrics and
    up to two sample predictions, and a fixed closing section of analysis
    and recommendations.

    Args:
        evaluation_results: Mapping of task identifier to a result dict with
            ``'num_examples'``, ``'metrics'`` and ``'samples'`` entries.
        metadata: Dict read via the ``'evaluation_date'``, ``'base_model'``,
            ``'model_path'``, ``'total_examples'``, ``'tasks_evaluated'`` and
            ``'generation_config'`` keys.

    Side effects:
        Writes the report into ``EVALUATION_RESULTS_DIR`` and prints its path.
    """
    
    report_file = EVALUATION_RESULTS_DIR / 'evaluation_report.md'
    
    # Header: run overview and generation settings. The template text starts
    # at column 0 because its indentation would end up in the Markdown.
    report_content = f"""
# LLaMA Insurance Model Evaluation Report

## Overview
- **Evaluation Date**: {metadata['evaluation_date']}
- **Base Model**: {metadata['base_model']}
- **Fine-tuned Model**: {metadata['model_path']}
- **Total Test Examples**: {metadata['total_examples']}
- **Tasks Evaluated**: {len(metadata['tasks_evaluated'])}

## Generation Configuration
```json
{json.dumps(metadata['generation_config'], indent=2)}
```

## Results by Task
"""
    
    # One section per evaluated task.
    for task_type, results in evaluation_results.items():
        task_name = TASK_TYPES.get(task_type, task_type)
        metrics = results['metrics']
        
        report_content += f"""
### {task_name} ({task_type})
- **Examples**: {results['num_examples']}

#### Metrics
"""
        
        # Floats are shown to four decimals; anything else is shown as-is.
        for metric, value in metrics.items():
            if isinstance(value, float):
                report_content += f"- **{metric.replace('_', ' ').title()}**: {value:.4f}\n"
            else:
                report_content += f"- **{metric.replace('_', ' ').title()}**: {value}\n"
        
        report_content += "\n#### Sample Predictions\n"
        
        # Only the first two samples go in the report. The "..." after the
        # input is appended unconditionally, even when the prompt is shorter
        # than 150 characters.
        for i, sample in enumerate(results['samples'][:2], 1):
            report_content += f"""
**Example {i}:**
- **Input**: {sample['prompt'][:150]}...
- **Prediction**: {sample['prediction']}
- **Reference**: {sample['reference']}
- **Match**: {'✅' if sample['match'] else '❌'}

"""
    
    # Closing section: fixed text that does not depend on the results above.
    report_content += """
## Analysis and Recommendations

### Strengths
- The model shows competency across multiple insurance-specific tasks
- Fine-tuning has successfully adapted the base LLaMA model to the insurance domain

### Areas for Improvement
- Consider increasing training data for tasks with lower performance
- Experiment with different generation parameters for better outputs
- Add more diverse examples to improve generalization

### Next Steps
1. Collect additional training data for underperforming tasks
2. Fine-tune generation parameters based on task-specific requirements
3. Consider ensemble methods or multi-stage training
4. Deploy model for user testing and feedback collection

---
*Report generated automatically by LLaMA Insurance Evaluation Pipeline*
"""
    
    # strip() removes the blank line the first template starts with.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report_content.strip())
    
    print(f"✅ Evaluation report saved to: {report_file}")

# Persist everything when there are results; otherwise list the prerequisites.
if not evaluation_results:
    print("\n".join((
        "⚠️ No evaluation results to save",
        "Please ensure you have:",
        "1. A trained model from notebook 03",
        "2. Test data from notebook 01",
        "3. Processed datasets from notebook 02",
    )))
else:
    save_evaluation_results(evaluation_results)

    print("\n🎉 Evaluation Complete!")
    print(f"\nResults saved to: {EVALUATION_RESULTS_DIR}")

    # Files the save step is expected to have produced
    print("\n".join((
        "\nFiles created:",
        "- evaluation_results.json (detailed results)",
        "- evaluation_summary.csv (summary table)",
        "- evaluation_report.md (comprehensive report)",
        "- evaluation_plots.png (visualizations)",
    )))

    print("\n".join((
        "\nNext steps:",
        "1. Review the evaluation report for insights",
        "2. Run 05_inference_demo.ipynb to test the model interactively",
        "3. Consider additional training or data collection based on results",
    )))