# 2. Evaluate trên tập Test

Notebook này chạy đánh giá hệ thống trên tập test, lưu kết quả dự đoán (JSON/CSV) và in các thống kê tóm tắt.


## 1. Cài đặt thư viện


In [None]:
# Install the third-party packages used by this notebook (-q keeps pip quiet).
# NOTE(review): no versions are pinned here, so the installed versions can
# change between runs — consider pinning for reproducible evaluation.
%pip install -q transformers accelerate bitsandbytes qwen-vl-utils sentence-transformers rank-bm25 underthesea wikipedia pillow pandas tqdm


## 2. Import thư viện


In [None]:
import json
import sys
from pathlib import Path
from PIL import Image
import pandas as pd
from tqdm import tqdm
import torch

# Make the project's `src` directory importable. On Kaggle the code lives
# under /kaggle/working; otherwise fall back to the path relative to the
# notebook's working directory.
running_on_kaggle = Path('/kaggle/working').exists()
if running_on_kaggle:
    src_dir = '/kaggle/working/code/src'
else:
    src_dir = '../src'
# Insert at the front so this checkout wins over any other module named the same.
sys.path.insert(0, src_dir)

from pipeline import RAGVQAPipeline


## 3. Cấu hình


In [None]:
# Input locations (Kaggle dataset mounts).
TEST_DATA_PATH = "/kaggle/input/vqa-test/vqa_test.json"  # Adjust to match your dataset
IMAGES_DIR = "/kaggle/input/vqa-images/images_flat"  # Adjust to match your dataset
KB_PATH = "/kaggle/input/vietnamese-knowledge-base/knowledge_base.json"

# Output directory for the evaluation artifacts; created up front so the
# later save step cannot fail on a missing folder.
OUTPUT_DIR = "/kaggle/working/results"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Evaluation settings
MAX_SAMPLES = None  # Set to a number to limit the run, None (or 0) for all samples
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible notebook;
# samples are processed one at a time by the evaluation loop.
BATCH_SIZE = 1  # Process one at a time


## 4. Load Test Data


In [None]:
# Read the test set from disk and optionally truncate it for a quick run.
print("Loading test data...")
with open(TEST_DATA_PATH, encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# A falsy MAX_SAMPLES (None or 0) means "use every sample".
test_data = test_data[:MAX_SAMPLES] if MAX_SAMPLES else test_data

print(f"Loaded {len(test_data)} test samples")


## 5. Initialize Pipeline


In [None]:
# Build the RAG-VQA pipeline once; the evaluation loop reuses this instance.
# NOTE(review): use_4bit presumably enables 4-bit quantized model loading —
# confirm against RAGVQAPipeline in the project's pipeline module.
print("Initializing pipeline...")
pipeline_kwargs = {'use_4bit': True}
pipeline = RAGVQAPipeline(**pipeline_kwargs)
print("Pipeline ready!")


## 6. Run Evaluation


In [None]:
# Run the pipeline on every test sample and collect one record per sample.
#
# Each record holds the sample index, question, ground-truth answer, the
# pipeline's prediction, and the intermediate outputs (caption, OCR text,
# number of retrieved documents). A sample that raises is still recorded,
# with its prediction set to "ERROR: <message>", so one bad sample does not
# abort the whole run. Samples whose image file is missing are skipped and
# counted separately.
results = []
num_missing_images = 0  # samples skipped because their image file was absent

for i, item in enumerate(tqdm(test_data, desc="Evaluating")):
    # Reset the per-sample fields before anything can fail. Otherwise the
    # error record below would reuse the previous sample's question/answer,
    # or hit a NameError when the very first sample fails early.
    question = ''
    ground_truth = ''
    try:
        question = item['question']

        # The ground truth is expected under item['answer']['answer'];
        # tolerate a missing/null field or a plain (non-dict) value.
        answer_field = item.get('answer') or {}
        if isinstance(answer_field, dict):
            ground_truth = answer_field.get('answer', '')
        else:
            ground_truth = str(answer_field)

        # Only the file name of 'image_path' is used: images are looked up
        # directly inside the flat IMAGES_DIR folder.
        image_path = Path(IMAGES_DIR) / Path(item['image_path']).name
        if not image_path.exists():
            print(f"Image not found: {image_path}")
            num_missing_images += 1
            continue

        # The context manager closes the underlying file; convert() returns
        # a separate, fully loaded image that stays valid afterwards.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Get prediction (intermediate outputs are requested for the report)
        result = pipeline.process(
            image=image,
            question=question,
            return_intermediate=True
        )

        results.append({
            'id': i,
            'question': question,
            'ground_truth': ground_truth,
            'prediction': result['answer'],
            'caption': result.get('caption', ''),
            'ocr': result.get('ocr', ''),
            'num_retrieved': len(result.get('retrieved_docs', []))
        })

    except Exception as e:
        # Deliberately broad: keep evaluating the remaining samples and
        # record the failure in place of a prediction.
        print(f"Error processing sample {i}: {e}")
        results.append({
            'id': i,
            'question': question,
            'ground_truth': ground_truth,
            'prediction': f'ERROR: {str(e)}',
            'caption': '',
            'ocr': '',
            'num_retrieved': 0
        })

print(f"\nCompleted evaluation on {len(results)} samples")
if num_missing_images:
    print(f"Skipped {num_missing_images} samples with missing images")


## 7. Save Results


In [None]:
# Persist the evaluation records and print a short summary.
#
# The records are written twice: as JSON (full fidelity, non-ASCII text kept
# readable) and as CSV with a UTF-8 BOM ('utf-8-sig').

# Save to JSON
output_json = f"{OUTPUT_DIR}/evaluation_results.json"
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"Results saved to {output_json}")

# Save to CSV for easy viewing
df = pd.DataFrame(results)
output_csv = f"{OUTPUT_DIR}/evaluation_results.csv"
df.to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"Results saved to {output_csv}")

# Display summary
print("\n=== Evaluation Summary ===")
print(f"Total samples: {len(results)}")
if df.empty:
    # With no records the DataFrame has no columns, so the column lookups
    # below would raise KeyError; report the situation instead.
    print("No samples were evaluated; summary statistics are unavailable.")
else:
    print(f"Average retrieved docs: {df['num_retrieved'].mean():.2f}")
    print(f"Samples with caption: {(df['caption'] != '').sum()}")
    print(f"Samples with OCR: {(df['ocr'] != '').sum()}")
    # Failed samples are stored with a prediction of the form "ERROR: <message>".
    num_failed = int(df['prediction'].astype(str).str.startswith('ERROR: ').sum())
    print(f"Failed samples: {num_failed}")


## 8. Sample Results


In [None]:
# Preview the first three records: the question plus the first 100
# characters of the ground truth and of the prediction.
for sample_no, record in enumerate(results[:3], start=1):
    print(f"\n=== Sample {sample_no} ===")
    print(f"Question: {record['question']}")
    truth_preview = record['ground_truth'][:100]
    print(f"Ground Truth: {truth_preview}...")
    prediction_preview = record['prediction'][:100]
    print(f"Prediction: {prediction_preview}...")
