In [8]:
import os
from pathlib import Path

from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama


# Load environment variables from .env
load_dotenv()

# Resolve the project root: when the kernel's working directory is "notebooks",
# step up one level; otherwise treat the working directory itself as the root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd
DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"

# Fail early with an actionable message rather than the bare error iterdir() raises.
if not DATA_RAW_DIR.is_dir():
    raise FileNotFoundError(
        f"Raw data directory not found (or not a directory): {DATA_RAW_DIR}. "
        "Create it and add the source PDF(s) before running the rest of the notebook."
    )

print("Project root:", PROJECT_ROOT)
print("Raw data dir:", DATA_RAW_DIR)
# iterdir() yields entries in arbitrary order; sort so the listing is stable across runs.
print("Files:", sorted(DATA_RAW_DIR.iterdir()))


Project root: c:\Users\gowsi\OneDrive\Documents\my-git-code\rag-eval-lab
Raw data dir: c:\Users\gowsi\OneDrive\Documents\my-git-code\rag-eval-lab\data\raw
Files: [WindowsPath('c:/Users/gowsi/OneDrive/Documents/my-git-code/rag-eval-lab/data/raw/Evals_Paper.pdf')]


In [9]:
# Global LlamaIndex settings (starting values; tune later).
#
# Hosted alternative, kept for reference:
#   Settings.llm = OpenAI(model="gpt-4o-mini")  # cheap + decent for RAG
#   Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Local LLM served through Ollama.
Settings.llm = Ollama(
    model="llama3.2",                   # or another small model you pulled
    base_url="http://127.0.0.1:11434",  # spell out the endpoint explicitly
    request_timeout=300.0,
)

# Embedding model loaded through the HuggingFace integration.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chunking parameters; worth experimenting with later.
Settings.chunk_size = 1024
Settings.chunk_overlap = 200


2025-11-27 22:49:21,957 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [10]:
# Load the paper: read only the PDFs that sit directly in the raw-data directory.
loader = SimpleDirectoryReader(
    input_dir=str(DATA_RAW_DIR),
    required_exts=[".pdf"],      # only PDFs
    recursive=False              # do not descend into sub-directories
)

documents = loader.load_data()

# Guard the documents[0] access below: an empty result would otherwise surface
# as an opaque IndexError instead of pointing at the real problem.
assert documents, f"No documents were loaded from {DATA_RAW_DIR}"

# Display how many documents were produced and the metadata of the first one.
len(documents), documents[0].metadata


(16,
 {'page_label': '1',
  'file_name': 'Evals_Paper.pdf',
  'file_path': 'c:\\Users\\gowsi\\OneDrive\\Documents\\my-git-code\\rag-eval-lab\\data\\raw\\Evals_Paper.pdf',
  'file_type': 'application/pdf',
  'file_size': 2613163,
  'creation_date': '2025-11-25',
  'last_modified_date': '2025-11-25'})

In [11]:
# Build a vector index from the loaded documents.
index = VectorStoreIndex.from_documents(documents)

# Query engine on top of the index: retrieve the 5 most similar chunks per
# question and answer using the "compact" response mode.
query_engine = index.as_query_engine(response_mode="compact", similarity_top_k=5)


2025-11-27 22:49:26,545 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/show "HTTP/1.1 200 OK"


In [12]:
question = "What types of evaluation methods does this paper discuss for LLMs?"

# Reuse the `query_engine` configured right after the index was built.
# Re-creating it here would shadow that engine and silently drop any options
# set there (such as response_mode), and would duplicate the top-k value.
response = query_engine.query(question)
print(response)

2025-11-27 22:50:16,038 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


This paper discusses various evaluation methods for LLMs, including:

1. Automating Evaluations of Prompts
2. Using automated evaluation pipelines in code or with other LLMs
3. Public PE tools like promptfoo and ChainForge that support both code-based and LLM-based evaluators
4. Assertion selectivity and its impact on LLM output quality confidence
5. Sampling grades from LLM outputs, including strategies such as random, highest, lowest, and alternating sampling policies
6. Choosing aligned assertions for each criterion, using the highest alignment score among candidate assertions


In [13]:
# response.source_nodes contains retrieved chunks + metadata
for i, node in enumerate(response.source_nodes, start=1):
    # A node's score may be missing (None); formatting None with ".4f" raises
    # TypeError, so fall back to a placeholder in that case.
    score_label = "n/a" if node.score is None else f"{node.score:.4f}"
    print(f"--- Source {i} | score={score_label} ---")
    # Print only the first 400 chars, flattened to one line, for readability
    print(node.text[:400].replace("\n", " "))
    print()


--- Source 1 | score=0.5815 ---
Who Validates the Validators? Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences InputsPrompt Under TestLLM OutputsMetrics Evaluator LLMMetric Prompt Metric (Code)… Test Results … iterate (a) Typical Evaluation Pipeline InputsPrompt Under TestLLM Outputs Test Results Candidate Criteria Candidate Assertions edit criteria grade LLM outputs Selected Assertions Alignment Report Car

--- Source 2 | score=0.5451 ---
Who Validates the Validators? Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences Figure 3: The Table View, showing inputs, LLM outputs, and evaluation results per criteria for the NER task (Sec. 6). generated no good assertions, and participants deleted the criteria without complaints. P8 said, “I like that it tries, because maybe there will be a good implementation!” This sugg

--- Source 3 | score=0.5434 ---
One could imagine interfaces similar to creating a “pull request” for a new assertion and soliciting