# 2. Evaluate trên tập Test

Notebook này chạy đánh giá hệ thống trên tập test và tính các metrics:
- **BERTScore**: Semantic similarity
- **LLM-as-a-Judge**: Expert scoring với Gemini API



## 1. Cài đặt thư viện


In [None]:
# Packages presumably required by the project's pipeline module (not visible
# in this notebook -- confirm against its imports), plus pillow, pandas and
# tqdm, which the cells below import directly.
%pip install -q transformers accelerate bitsandbytes qwen-vl-utils sentence-transformers rank-bm25 underthesea wikipedia pillow pandas tqdm
# Packages for the metrics and plots: bert-score, the Gemini client library,
# matplotlib and seaborn.
%pip install -q bert-score google-generativeai matplotlib seaborn


## 2. Import thư viện


In [None]:
import json
import sys
import os
from pathlib import Path
from PIL import Image
import pandas as pd
from tqdm import tqdm
import torch

# Add src to path (if running in Kaggle)
sys.path.insert(0, '/kaggle/working/code/src' if Path('/kaggle/working').exists() else '../src')

from pipeline import RAGVQAPipeline
from evaluation_metrics import VQAEvaluator 


## 3. Cấu hình


In [None]:
# --- Input locations -------------------------------------------------------
TEST_DATA_PATH = "/kaggle/input/vqa-test/vqa_test.json"
IMAGES_DIR = "/kaggle/input/vqa-images/images_flat"
KB_PATH = "/kaggle/input/vietnamese-knowledge-base/knowledge_base.json"

# --- Output location -------------------------------------------------------
# Every artefact produced by this notebook is written below this directory,
# so it is created up front (including any missing parents).
OUTPUT_DIR = "/kaggle/working/results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Run-size settings -----------------------------------------------------
# MAX_SAMPLES: an integer caps how many test items are used; None uses all.
MAX_SAMPLES = None
# BATCH_SIZE: samples are processed one at a time.
BATCH_SIZE = 1


# EVALUATION METRICS CONFIG

USE_BERT_SCORE = True
USE_LLM_JUDGE = True  # Set to False if no Gemini API key

# Gemini API key, taken from the GEMINI_API_KEY environment variable.
# Get free API key at: https://makersuite.google.com/app/apikey
# Surrounding whitespace is stripped and a blank value is normalised to None,
# so an exported-but-empty variable is treated as "not set" instead of being
# handed to the judge as a key.
GEMINI_API_KEY = (os.getenv("GEMINI_API_KEY") or "").strip() or None

# Without a usable key the LLM judge cannot run, so switch it off and say so.
if GEMINI_API_KEY is None and USE_LLM_JUDGE:
    print("‚ö†Ô∏è  WARNING: GEMINI_API_KEY not set. LLM Judge will be disabled.")
    print("Get free API key at: https://makersuite.google.com/app/apikey")
    USE_LLM_JUDGE = False


## 4. Load Test Data


In [None]:
print("Loading test data...")
# Read the whole UTF-8 JSON file in one go and parse it.
test_data = json.loads(Path(TEST_DATA_PATH).read_text(encoding='utf-8'))

# A falsy MAX_SAMPLES (None or 0) leaves the data untouched; otherwise only
# the first MAX_SAMPLES entries are kept.
test_data = test_data[:MAX_SAMPLES] if MAX_SAMPLES else test_data

print(f"Loaded {len(test_data)} test samples")


## 5. Initialize Pipeline


In [None]:
# Build the VQA pipeline once; the prediction loop below reuses this single
# instance for every sample.
# NOTE(review): use_4bit=True presumably requests 4-bit quantised model
# loading -- confirm against RAGVQAPipeline in pipeline.py.
print("Initializing VQA pipeline...")
pipeline = RAGVQAPipeline(use_4bit=True)
print("‚úì Pipeline ready!")


## 6. Run Predictions

Generate predictions for all test samples.


In [None]:
def _extract_ground_truth(item):
    """Return the reference answer stored on a test item.

    The answer is expected at ``item['answer']['answer']``. A missing or null
    ``answer`` (or a dict without the inner key) yields ``''``; an ``answer``
    that is not a dict is returned as its string form instead of raising.
    """
    answer = item.get('answer')
    if isinstance(answer, dict):
        return answer.get('answer', '')
    return '' if answer is None else str(answer)


predictions = []

print("\n" + "="*80)
print("GENERATING PREDICTIONS")
print("="*80)

images_root = Path(IMAGES_DIR)

for i, item in enumerate(tqdm(test_data, desc="Processing")):
    # Resolve the per-sample metadata BEFORE anything that can fail, so the
    # error record written in the except branch always describes this sample
    # (never an undefined name or a leftover from the previous iteration).
    image_id = item.get('image_id', f'img_{i:06d}')
    question = item.get('question', '')
    ground_truth = _extract_ground_truth(item)

    try:
        # Only the file name of the recorded path is used; images are looked
        # up directly inside IMAGES_DIR.
        image_path = images_root / Path(item['image_path']).name
        if not image_path.exists():
            # Samples without an image are skipped and produce no record.
            print(f"Image not found: {image_path}")
            continue

        # convert() returns an independent in-memory copy, so the underlying
        # file handle can be released as soon as the conversion is done.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')

        # A sample without a question raises KeyError here and is recorded
        # as an error row below.
        question = item['question']

        # Get prediction
        result = pipeline.process(
            image=image,
            question=question,
            return_intermediate=True
        )

        predictions.append({
            'image_id': image_id,
            'question': question,
            'prediction': result['answer'],
            'ground_truth': ground_truth,
            'caption': result.get('caption', ''),
            'ocr': result.get('ocr', ''),
            'num_retrieved': len(result.get('retrieved_docs', []))
        })

    except Exception as e:
        # Per-sample boundary: one failing sample must not abort the run.
        # The failure is kept as an "ERROR: ..." prediction so it stays
        # visible in the saved output.
        print(f"Error processing sample {i}: {e}")
        predictions.append({
            'image_id': image_id,
            'question': question,
            'prediction': f'ERROR: {str(e)}',
            'ground_truth': ground_truth,
            'caption': '',
            'ocr': '',
            'num_retrieved': 0
        })

print(f"\n‚úì Completed predictions on {len(predictions)} samples")


## 7. Save Predictions


In [None]:
# Write the raw predictions to disk; the evaluation step further down is
# given this file path rather than the in-memory list.
predictions_file = "{}/predictions.json".format(OUTPUT_DIR)
with Path(predictions_file).open('w', encoding='utf-8') as sink:
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    json.dump(predictions, sink, ensure_ascii=False, indent=2)
print(f"‚úì Predictions saved to {predictions_file}")


## 8. Run Evaluation Metrics


Đánh giá predictions bằng:
1. **BERTScore** - Semantic similarity
2. **LLM-as-a-Judge** - Expert scoring với Gemini


In [None]:
banner = "=" * 80
print("\n" + banner)
print("EVALUATING WITH METRICS")
print(banner)

# Collect the evaluator settings in one place, then build the evaluator.
evaluator_options = {
    "use_bert_score": USE_BERT_SCORE,
    "use_llm_judge": USE_LLM_JUDGE,
    "gemini_api_key": GEMINI_API_KEY,
    "bert_model": "bert-base-multilingual-cased",
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}
evaluator = VQAEvaluator(**evaluator_options)

# Score the saved predictions; detailed results go to evaluation_output and
# the aggregate statistics come back as `stats`.
evaluation_output = "{}/evaluation_results.json".format(OUTPUT_DIR)
stats = evaluator.evaluate_dataset(
    predictions_file=predictions_file,
    output_file=evaluation_output,
    # Delay between judge calls, meant to stay under the API rate limit.
    llm_judge_delay=1.5,
)

print("\n‚úì Evaluation completed!")


## 9. Display Results


In [None]:
rule = "=" * 80
print("\n" + rule)
print("EVALUATION STATISTICS")
print(rule)

# The same five summary statistics are reported for each enabled metric.
# Labels are padded to a common width so the values line up; a statistic
# missing from `stats` is shown as 0.
stat_keys = ('mean', 'std', 'min', 'max', 'median')

if USE_BERT_SCORE:
    print("\nüìä BERTScore Results:")
    bert_stats = stats.get('bert_score', {})
    for stat_key in stat_keys:
        label = f"{stat_key.capitalize()}:"
        print(f"  {label:<8}{bert_stats.get(stat_key, 0):.4f}")

if USE_LLM_JUDGE:
    print("\nüìä LLM Judge Results:")
    llm_stats = stats.get('llm_judge', {})
    for stat_key in stat_keys:
        label = f"{stat_key.capitalize()}:"
        # Every value except the standard deviation is shown as "x/5".
        scale_suffix = "" if stat_key == 'std' else "/5"
        print(f"  {label:<8}{llm_stats.get(stat_key, 0):.2f}{scale_suffix}")

print("\n" + rule)


## 10. Compare with Baseline


In [None]:
# Zero-shot baseline figures quoted from the report. Named once here so the
# table row and the comparisons below cannot drift apart.
BASELINE_BERT_PCT = 42.1   # BERTScore, in percent
BASELINE_LLM_SCORE = 3.5   # LLM judge score on the 1-5 scale

print("\nüìà Comparison with Baseline (from report):")
print("\n" + "="*80)
print(f"{'Method':<30} {'BERTScore (%)':<20} {'LLM Judge (1-5)'}")
print("="*80)
print(f"{'Baseline (Zero-shot)':<30} {BASELINE_BERT_PCT:<20.1f} {BASELINE_LLM_SCORE:.1f}")

if USE_BERT_SCORE and USE_LLM_JUDGE:
    # The mean is multiplied by 100 so it is comparable with the baseline,
    # which is expressed in percent.
    bert_pct = stats['bert_score']['mean'] * 100
    llm_score = stats['llm_judge']['mean']
    print(f"{'Your Result (RAG)':<30} {bert_pct:<20.1f} {llm_score:.1f}")
    print("="*80)

    # Check if better than baseline
    if bert_pct > BASELINE_BERT_PCT and llm_score > BASELINE_LLM_SCORE:
        print("\n‚úÖ Your result is BETTER than baseline! Great job!")
    elif bert_pct > BASELINE_BERT_PCT or llm_score > BASELINE_LLM_SCORE:
        print("\n‚ö†Ô∏è  Your result is partially better than baseline.")
    else:
        print("\n‚ùå Your result needs improvement.")
elif USE_BERT_SCORE:
    bert_pct = stats['bert_score']['mean'] * 100
    print(f"{'Your Result (RAG)':<30} {bert_pct:<20.1f} {'N/A (no LLM Judge)'}")
    print("="*80)
elif USE_LLM_JUDGE:
    # Only the LLM judge ran: still report the row instead of leaving the
    # table with a baseline line and nothing to compare it with.
    llm_score = stats['llm_judge']['mean']
    print(f"{'Your Result (RAG)':<30} {'N/A (no BERTScore)':<20} {llm_score:.1f}")
    print("="*80)
else:
    # No metric was enabled, so there is nothing to compare.
    print(f"{'Your Result (RAG)':<30} {'N/A':<20} {'N/A'}")
    print("="*80)


## 11. Visualize Results


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def _draw_score_histogram(ax, scores, mean_value, mean_label, *, xlabel, title,
                          color, **hist_kwargs):
    """Draw one score histogram on `ax` with a dashed red line at the mean.

    Extra keyword arguments (e.g. ``bins``) are forwarded to ``ax.hist``.
    """
    ax.hist(scores, color=color, edgecolor='black', **hist_kwargs)
    ax.axvline(mean_value, color='red', linestyle='--', label=mean_label)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)


# Load evaluation results
with open(evaluation_output, 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

results_list = eval_data['results']

# Extract scores, leaving out samples for which a metric produced no value.
bert_scores = [r['bert_score'] for r in results_list if r.get('bert_score') is not None]
llm_scores = [r['llm_judge_score'] for r in results_list if r.get('llm_judge_score') is not None]

viz_path = f"{OUTPUT_DIR}/evaluation_distributions.png"

# One panel per enabled metric. Sizing the grid from this list (instead of
# from USE_LLM_JUDGE alone) keeps the number of axes and the way they are
# indexed in agreement for every flag combination, including LLM-judge-only.
enabled_metrics = [
    metric
    for metric, enabled in (('bert', USE_BERT_SCORE), ('llm', USE_LLM_JUDGE))
    if enabled
]

if enabled_metrics:
    # squeeze=False always returns a 2-D array of axes, so a single panel
    # needs no special casing.
    fig, axes = plt.subplots(
        1,
        len(enabled_metrics),
        figsize=(7 * len(enabled_metrics), 5),
        squeeze=False,
    )
    panel_for = dict(zip(enabled_metrics, axes[0]))

    if USE_BERT_SCORE:
        bert_mean = stats['bert_score']['mean']
        _draw_score_histogram(
            panel_for['bert'],
            bert_scores,
            bert_mean,
            f"Mean: {bert_mean:.4f}",
            xlabel='BERTScore',
            title='BERTScore Distribution',
            color='skyblue',
            bins=20,
        )

    if USE_LLM_JUDGE:
        llm_mean = stats['llm_judge']['mean']
        llm_ax = panel_for['llm']
        _draw_score_histogram(
            llm_ax,
            llm_scores,
            llm_mean,
            f"Mean: {llm_mean:.2f}",
            xlabel='LLM Judge Score',
            title='LLM Judge Score Distribution',
            color='lightgreen',
            # Half-integer bin edges give one bar per score 1..5, centred on
            # its tick (scores are presumably whole numbers on the 1-5 scale
            # -- confirm against the evaluator).
            bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5],
        )
        llm_ax.set_xticks([1, 2, 3, 4, 5])

    plt.tight_layout()
    plt.savefig(viz_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n‚úì Visualization saved to {viz_path}")
else:
    print("No evaluation metric is enabled; skipping the distribution plots.")


## 12. Sample Detailed Results


In [None]:
# Show the first few evaluated samples in full, as a quick sanity check.
print("\n" + "="*80)
print("SAMPLE DETAILED RESULTS")
print("="*80)

# NOTE(review): the keys read below ('predicted_answer', 'llm_judge_reasoning',
# ...) come from the evaluator's output file, whose schema is not visible in
# this notebook -- confirm against evaluation_metrics.py.
for i, result in enumerate(results_list[:3]):
    print(f"\n{'='*80}")
    print(f"Sample {i+1}: {result['image_id']}")
    print(f"{'='*80}")
    print(f"\n‚ùì Question: {result['question']}")
    print(f"\n‚úÖ Ground Truth: {result['ground_truth']}")
    print(f"\nü§ñ Prediction: {result['predicted_answer']}")

    # Test against None rather than truthiness: a score of 0 is a real
    # result and must still be shown. Only a missing score is skipped.
    sample_bert = result.get('bert_score')
    if sample_bert is not None:
        print(f"\nüìä BERTScore: {sample_bert:.4f}")

    sample_judge = result.get('llm_judge_score')
    if sample_judge is not None:
        print(f"üìä LLM Judge: {sample_judge}/5")
        print(f"üí≠ Reasoning: {result.get('llm_judge_reasoning', 'N/A')}")


## 13. Export to CSV


In [None]:
def _count_non_empty(frame, column):
    """Count the rows of `frame` whose `column` holds a non-empty value.

    Returns 0 when the column does not exist (for instance when the result
    list is empty or the evaluator did not carry the field over). Missing
    values (NaN/None) count as empty, just like ''.
    """
    if column not in frame.columns:
        return 0
    values = frame[column]
    return int((values.notna() & (values != '')).sum())


# Create DataFrame for easy viewing
df = pd.DataFrame(results_list)

# Save to CSV. 'utf-8-sig' writes a byte-order mark at the start of the file.
csv_path = f"{OUTPUT_DIR}/evaluation_results.csv"
df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"\n‚úì Results exported to {csv_path}")

# Display summary. Optional columns are read defensively so the summary
# still prints when a column is absent from the evaluation output.
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total samples evaluated: {len(df)}")
print(f"Samples with caption: {_count_non_empty(df, 'caption')}")
print(f"Samples with OCR: {_count_non_empty(df, 'ocr')}")
print(f"Average retrieved docs: {df.get('num_retrieved', pd.Series([0])).mean():.2f}")
print("="*80)


## 14. Kết luận



### Files được tạo:
1. `predictions.json` - Raw predictions
2. `evaluation_results.json` - Detailed evaluation results
3. `evaluation_results.csv` - CSV format for easy viewing
4. `evaluation_distributions.png` - Visualizations

### 🎯 Kết quả mong đợi:
- BERTScore > 42.1%
- LLM Judge > 3.5/5


