# In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Identifiers of the dataset and pretrained checkpoints loaded below, kept in
# one place so they are easy to swap.
DATASET_NAME = "rag-datasets/mini-bioasq"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# The tokenizer and the seq2seq model are both loaded from this single name so
# the two can never point at different checkpoints by accident.
GENERATOR_MODEL_NAME = "t5-small"

# Load dataset
dataset = load_dataset(DATASET_NAME)

# Load embedding model (used to embed questions and candidate contexts)
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Load generator model and its matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_MODEL_NAME)

# Function to retrieve relevant contexts
def retrieve_contexts(question, contexts, k=3):
    """Return the ``k`` contexts most similar to ``question``.

    The question and every candidate context are embedded with the
    module-level ``embedder`` and ranked by cosine similarity.

    Args:
        question: The question text to match against.
        contexts: A sequence of candidate context strings.
        k: Maximum number of contexts to return. Defaults to 3.

    Returns:
        A list of at most ``k`` items from ``contexts``, ordered from most to
        least similar. The list is empty when ``contexts`` is empty or ``k``
        is not positive.
    """
    # Guard first: ``[-0:]`` would select *every* context, a negative ``k``
    # would slice from the wrong end, and an empty candidate list cannot be
    # scored at all.
    if k <= 0 or not contexts:
        return []

    question_embedding = embedder.encode([question])
    context_embeddings = embedder.encode(contexts)
    # cosine_similarity returns a (1, n_contexts) matrix; take its only row.
    similarities = cosine_similarity(question_embedding, context_embeddings)[0]

    # argsort is ascending, so reverse it and keep the first k indices.
    top_k_indices = np.argsort(similarities)[::-1][:k]
    return [contexts[i] for i in top_k_indices]

# Function to generate answer
def generate_answer(question, context):
    """Generate an answer string for ``question`` given ``context``.

    The two inputs are combined into a single ``question: ... context: ...``
    prompt, tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed to the module-level ``model``.

    Args:
        question: The question text.
        context: The supporting text placed after the question in the prompt.

    Returns:
        The decoded text of the first sequence returned by ``model.generate``,
        with special tokens removed.
    """
    prompt = f"question: {question} context: {context}"
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # Beam search with 4 beams, capped at 64 generated tokens.
    generated = model.generate(
        encoded.input_ids,
        max_length=64,
        num_beams=4,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Evaluate RAG system
def evaluate_rag(dataset, num_samples=100):
    """Evaluate the retrieve-then-generate pipeline on the ``test`` split.

    For each evaluated sample the contexts are ranked with
    ``retrieve_contexts``, the retrieved contexts are joined with spaces and
    passed to ``generate_answer``. A sample counts as correct when the
    generated answer is non-empty and occurs, case-insensitively, as a
    substring of the first reference answer.

    Args:
        dataset: Mapping with a ``'test'`` entry that supports ``len()`` and
            integer indexing. Each sample is read through the keys
            ``'question'``, ``'contexts'`` and ``'answers'``.
            NOTE(review): confirm these keys match the loaded dataset's
            actual schema.
        num_samples: Maximum number of samples to evaluate, taken from the
            start of the split. Defaults to 100.

    Returns:
        The accuracy as a float in ``[0, 1]``; ``0.0`` when no sample was
        evaluated. The value is also printed with two decimals.
    """
    test_split = dataset['test']
    # Clamp to what is available: a split shorter than ``num_samples`` must
    # not raise IndexError, and a non-positive request evaluates nothing.
    total = max(0, min(num_samples, len(test_split)))

    correct = 0
    for i in range(total):
        sample = test_split[i]
        question = sample['question']
        contexts = sample['contexts']
        true_answer = sample['answers'][0]

        retrieved_contexts = retrieve_contexts(question, contexts)
        generated_answer = generate_answer(question, " ".join(retrieved_contexts))

        # The empty string is a substring of every string, so an empty
        # generation must be rejected explicitly instead of scoring a hit.
        candidate = generated_answer.strip().lower()
        if candidate and candidate in true_answer.lower():
            correct += 1

    # Divide by the number of samples actually evaluated; avoid 0/0.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy:.2f}")
    return accuracy

# Run evaluation only when executed directly (script or notebook cell), so
# importing this module does not trigger a full evaluation pass.
if __name__ == "__main__":
    evaluate_rag(dataset)