## Import Statements

In [1]:
import os
import glob
import numpy as np
from typing import List, Dict
from sentence_transformers import SentenceTransformer, CrossEncoder
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from dotenv import load_dotenv
from neo4j import GraphDatabase

  from .autonotebook import tqdm as notebook_tqdm


## Setup Configuration

In [2]:
# Load environment variables from the .env file one directory above the notebook.
load_dotenv(dotenv_path="../.env")
# Neo4j connection settings; os.environ.get yields None for any variable that is not set.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

#QWEN_API = os.environ.get("QWEN3_KEY")
# NOTE(review): the key stored in GEMINI_API is read from a variable named GEMMA_KEY - confirm the name is intentional.
GEMINI_API=os.environ.get("GEMMA_KEY")

# Glob patterns (relative to the notebook's working directory) selecting the PDFs to ingest.
PDF_GLOB_GRANT = "./Papers/GrantEssay/*.pdf"
PDF_GLOB_AVA = "./Papers/AvaBiology/*.pdf"
# Name of the Neo4j vector index used by get_or_create_vectorstore().
INDEX_NAME = "essay_chunk_agentspace"

# Model identifiers: the embedding model is loaded by GemmaEmbeddings.
# NOTE(review): RERANKER_MODEL_NAME is not referenced anywhere in the visible cells - confirm it is still needed.
EMBEDDING_MODEL_NAME = "google/embeddinggemma-300m"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Text-splitter settings passed to RecursiveCharacterTextSplitter in split_documents().
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

## Embeddings

In [3]:
class GemmaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    Every vector handed back is scaled to unit L2 norm; a vector whose norm is
    not positive is returned unscaled.

    NOTE(review): documents and queries are encoded with the same plain
    ``encode`` call - confirm whether the chosen model expects distinct
    query/document prompts.
    """

    def __init__(self, model_name: str):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def _normalize(self, v):
        """Return ``v`` divided by its L2 norm, or ``v`` itself when the norm is not > 0."""
        length = np.linalg.norm(v)
        if length > 0:
            return v / length
        return v

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` in a single call and return one normalized vector per text."""
        encoded = self.model.encode(texts, convert_to_numpy=True)
        unit_vectors = []
        for row in encoded:
            unit_vectors.append(self._normalize(row).tolist())
        return unit_vectors

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its normalized vector."""
        batch = self.model.encode([text], convert_to_numpy=True)
        return self._normalize(batch[0]).tolist()

## Data Ingestion

In [4]:
def load_pdfs(path_glob: str):
    """Load every PDF matching ``path_glob`` and return the loaded documents.

    Args:
        path_glob: Glob pattern selecting the PDF files to load.

    Returns:
        A flat list containing everything each file's ``PyMuPDFLoader.load()``
        returned, in sorted file-path order. Empty when nothing matches.
    """
    docs = []
    # glob.glob() yields matches in arbitrary, filesystem-dependent order.
    # Sorting makes the document order (and therefore the chunk ids assigned
    # downstream) reproducible from run to run.
    for file in sorted(glob.glob(path_glob)):
        docs.extend(PyMuPDFLoader(file).load())
    return docs


def split_documents(docs):
    """Split ``docs`` into overlapping text chunks and number them.

    The splitter is configured from the module-level ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` constants. Each resulting chunk gets a ``chunk_id``
    metadata entry equal to its position in the returned list, so numbering
    starts again at 0 on every call.

    Returns:
        The list of chunks produced by the splitter, with ``chunk_id`` set.
    """
    chunks = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    ).split_documents(docs)

    # Stamp each chunk with its running position.
    position = 0
    for chunk in chunks:
        chunk.metadata["chunk_id"] = position
        position += 1
    return chunks

## Creating Graph Functions

### Connect to Neo4j Driver

In [5]:
def get_driver():
    """Build and return a Neo4j driver for the configured URI and credentials.

    Every call constructs a new driver; to share one connection, call this
    once and pass the returned driver to the functions that need it.
    """
    credentials = (NEO4J_USERNAME, NEO4J_PASSWORD)
    return GraphDatabase.driver(NEO4J_URI, auth=credentials)

### Create or connect to the vector store (the shared driver keeps embeddings and relationships in the same graph)

In [6]:
def get_or_create_vectorstore(embeddings, driver):
    """Connect to the existing vector index, or create an empty one.

    Args:
        embeddings: Embedding object handed to ``Neo4jVector``.
        driver: Neo4j driver used only for the index-existence check; the
            vector store itself connects with the module-level URI and
            credentials.

    Returns:
        A ``Neo4jVector`` bound to ``INDEX_NAME`` on ``Chunk`` nodes.
    """
    # Ask Neo4j whether a VECTOR index with the configured name already exists.
    with driver.session() as session:
        record = session.run(
            """
            SHOW INDEXES
            YIELD name, type
            WHERE name = $index_name AND type = 'VECTOR'
            RETURN count(*) > 0 as exists
            """,
            index_name=INDEX_NAME,
        ).single()
    index_exists = record["exists"] if record else False

    # Both branches must describe the store identically, so the settings are
    # defined once and shared.
    store_settings = dict(
        embedding=embeddings,
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        index_name=INDEX_NAME,
        node_label="Chunk",
        text_node_property="text",
        embedding_node_property="embedding",
    )

    if index_exists:
        print(f"✅ Connecting to existing vector index: {INDEX_NAME}")
        return Neo4jVector(**store_settings)

    print(f"🆕 Creating new vector index: {INDEX_NAME}")
    # An empty document list means no chunks are inserted at this point.
    vectorstore = Neo4jVector.from_documents(documents=[], **store_settings)
    print("✅ Created empty vector store")
    return vectorstore

## Add documents to vector store and graph 

In [7]:
def build_graph_relationships(chunks, driver):
    """Create Document nodes and link each chunk to its document and successor.

    For every distinct ``source`` found in the chunks' metadata this merges a
    ``Document`` node (storing the number of chunks seen in this call), a
    ``PART_OF`` edge from each chunk to that document, and a ``NEXT`` edge
    between consecutive chunks ordered by ``chunk_id``.

    Chunk nodes are matched on ``chunk_id`` AND source. ``chunk_id`` is only
    unique within one ingestion call, so matching on it alone can attach
    edges to chunks of documents ingested in an earlier call.

    Args:
        chunks: Objects with a ``metadata`` dict holding ``chunk_id`` and,
            optionally, ``source`` (missing sources are grouped as "unknown").
        driver: Neo4j driver used to open the session.
    """
    by_source: Dict[str, List] = {}
    for c in chunks:
        by_source.setdefault(c.metadata.get("source", "unknown"), []).append(c)

    with driver.session() as session:
        for source, source_chunks in by_source.items():
            ordered = sorted(source_chunks, key=lambda x: x.metadata["chunk_id"])

            session.run(
                "MERGE (d:Document {name: $name}) SET d.chunk_count = $n",
                {"name": source, "n": len(ordered)},
            )

            for i, c in enumerate(ordered):
                # coalesce() lets chunks stored without a source property
                # still match the "unknown" bucket used above.
                session.run(
                    """
                    MATCH (d:Document {name: $source})
                    MATCH (c:Chunk {chunk_id: $cid})
                    WHERE coalesce(c.source, 'unknown') = $source
                    MERGE (c)-[:PART_OF]->(d)
                    """,
                    {"source": source, "cid": c.metadata["chunk_id"]},
                )

                if i < len(ordered) - 1:
                    session.run(
                        """
                        MATCH (c1:Chunk {chunk_id: $c1})
                        WHERE coalesce(c1.source, 'unknown') = $source
                        MATCH (c2:Chunk {chunk_id: $c2})
                        WHERE coalesce(c2.source, 'unknown') = $source
                        MERGE (c1)-[:NEXT]->(c2)
                        """,
                        {
                            "source": source,
                            "c1": c.metadata["chunk_id"],
                            "c2": ordered[i + 1].metadata["chunk_id"],
                        },
                    )



def add_documents(path, vectorstore, driver):
    """Ingest the PDFs matching ``path`` into the vector store and the graph.

    The PDFs are loaded, split into chunks, added to ``vectorstore``, and then
    linked to Document nodes via ``build_graph_relationships``.

    Args:
        path: Glob pattern selecting the PDF files to ingest.
        vectorstore: Store whose ``add_documents`` receives the chunks.
        driver: Neo4j driver forwarded to ``build_graph_relationships``.

    Returns:
        None.
    """
    pdfs = load_pdfs(path)
    chunks = split_documents(pdfs)
    if not chunks:
        # Nothing matched (or the PDFs produced no chunks): skip the database
        # round trips and make the no-op visible instead of silent.
        print(f"No chunks produced for {path!r}; nothing was ingested.")
        return
    vectorstore.add_documents(chunks)
    build_graph_relationships(chunks, driver)


## Simple Vector Embedding Retriever

In [8]:
class SimpleSemanticRetriever:
    """Simple retriever using only semantic similarity (no MMR, no reranking, no graph)."""

    def __init__(self, vectorstore):
        """Keep a reference to the vector store that will be searched."""
        self.vectorstore = vectorstore

    def _search(self, query: str, k: int):
        """Return ``(document, score)`` pairs for the ``k`` nearest chunks.

        Uses the store's ``similarity_search_with_score`` when it provides one,
        so the reported score is the store's own similarity value. Otherwise
        falls back to plain ``similarity_search`` with a rank-based placeholder
        score, clamped at 0 so it never goes negative for long result lists.
        """
        scored_search = getattr(self.vectorstore, "similarity_search_with_score", None)
        if callable(scored_search):
            try:
                return [(doc, float(score)) for doc, score in scored_search(query, k=k)]
            except NotImplementedError:
                # The method exists but is not implemented by this store.
                pass

        docs = self.vectorstore.similarity_search(query, k=k)
        return [(doc, max(0.0, 1.0 - (rank * 0.1))) for rank, doc in enumerate(docs)]

    def retrieve(self, query: str, top_k: int = 20, rerank_k: int = 6):
        """
        Simple semantic search - just vector similarity.

        Args:
            query: Search query
            top_k: Not used, kept for compatibility
            rerank_k: Number of results to return

        Returns:
            List of dicts with same format as HybridRetriever: ``text``,
            ``score``, ``source``, ``chunk_id`` and ``context`` (always None,
            since no graph context is fetched).
        """
        return [
            {
                "text": doc.page_content,
                "score": score,
                "source": doc.metadata.get("source", "unknown"),
                # Fall back to the result position when the chunk has no id.
                "chunk_id": doc.metadata.get("chunk_id", i),
                "context": None,  # No graph context
            }
            for i, (doc, score) in enumerate(self._search(query, rerank_k))
        ]

## Context Evaluation

### Make Precision and Recall Evaluator

In [9]:
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, ContextPrecision
from ragas.dataset_schema import SingleTurnSample
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import List, Dict
import asyncio
import json


class RagasEvaluator:
    """Scores retrieved contexts with the RAGAS context-recall and context-precision metrics.

    Both metrics share one LLM judge, created in ``__init__``.
    """

    def __init__(self):
        """Create the judge LLM and the two RAGAS metric objects that use it."""
        # Alternative judge kept for reference (currently disabled):
        #   ChatOpenAI(model="qwen3", api_key=QWEN_API,
        #              base_url="https://ellm.nrp-nautilus.io/v1", temperature=0)
        judge = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash-lite",
            google_api_key=GEMINI_API,
            temperature=0,
        )

        self.llm = LangchainLLMWrapper(judge)
        self.context_recall = LLMContextRecall(llm=self.llm)
        self.context_precision = ContextPrecision(llm=self.llm)

    async def evaluate_recall(
        self,
        user_input: str,
        retrieved_contexts: List[str],
        reference: str
    ) -> float:
        """Score context recall for one query and return the metric's result."""
        return await self.context_recall.single_turn_ascore(
            SingleTurnSample(
                user_input=user_input,
                retrieved_contexts=retrieved_contexts,
                reference=reference,
            )
        )

    async def evaluate_precision(
        self,
        user_input: str,
        retrieved_contexts: List[str],
        reference: str
    ) -> float:
        """Score context precision for one query and return the metric's result."""
        return await self.context_precision.single_turn_ascore(
            SingleTurnSample(
                user_input=user_input,
                retrieved_contexts=retrieved_contexts,
                reference=reference,
            )
        )

  from ragas.metrics import LLMContextRecall, ContextPrecision
  from ragas.metrics import LLMContextRecall, ContextPrecision


### Retrieval tester: a helper class for running the evaluators against a retriever

In [10]:
class RetrievalTester:
    """Runs a retriever over test cases and scores the results with an evaluator.

    Attributes:
        retriever: Object exposing ``retrieve(query=..., top_k=..., rerank_k=...)``
            that returns a list of dicts with at least ``"text"`` and ``"source"``.
        evaluator: Object exposing the async ``evaluate_recall`` and
            ``evaluate_precision`` methods (normally a ``RagasEvaluator``).
    """

    # Metric keys evaluated/printed for each value of the ``metric`` argument.
    _METRICS_BY_MODE = {
        "both": ("recall", "precision"),
        "recall": ("recall",),
        "precision": ("precision",),
    }
    # Column-aligned labels used by print_results.
    _METRIC_LABELS = {"recall": "Recall:   ", "precision": "Precision:"}

    # The annotation is a string so the class can be defined before (or
    # without) RagasEvaluator being in scope.
    def __init__(self, retriever, evaluator: "RagasEvaluator"):
        self.retriever = retriever
        self.evaluator = evaluator

    def format_retrieval_results(self, results: List[Dict]) -> List[str]:
        """
        Convert retriever results to a list of context strings.

        Args:
            results: Output of the retriever's ``retrieve()``

        Returns:
            List of context strings for RAGAS (the ``"text"`` of each result)
        """
        return [r["text"] for r in results]

    def retrieve_and_format(
        self,
        query: str,
        top_k: int = 20,
        rerank_k: int = 6
    ) -> tuple[List[str], List[Dict]]:
        """
        Retrieve documents and format for evaluation.

        Returns:
            Tuple of (formatted_contexts, raw_results)
        """
        raw_results = self.retriever.retrieve(
            query=query,
            top_k=top_k,
            rerank_k=rerank_k
        )
        return self.format_retrieval_results(raw_results), raw_results

    async def _evaluate_cases(
        self,
        test_cases: List[Dict[str, str]],
        top_k: int,
        rerank_k: int,
        metrics: tuple
    ) -> List[Dict]:
        """Retrieve and score every test case for the requested metrics.

        Shared implementation of the three public ``test_*`` methods.
        ``metrics`` is an ordered tuple of metric keys ("recall",
        "precision"); each key ``m`` is scored by ``evaluator.evaluate_<m>``.
        """
        results = []
        total = len(test_cases)

        for i, case in enumerate(test_cases, 1):
            print(f"Processing query {i}/{total}: {case['query'][:50]}...")

            formatted_contexts, raw_results = self.retrieve_and_format(
                query=case['query'],
                top_k=top_k,
                rerank_k=rerank_k
            )

            scores = {}
            for name in metrics:
                # Looked up lazily so an evaluator only needs the methods
                # for the metrics actually requested.
                scorer = getattr(self.evaluator, f"evaluate_{name}")
                scores[name] = await scorer(
                    user_input=case['query'],
                    retrieved_contexts=formatted_contexts,
                    reference=case['reference']
                )

            results.append({
                "query": case['query'],
                "reference": case['reference'],
                "num_retrieved": len(raw_results),
                **scores,
                "retrieved_sources": [r["source"] for r in raw_results],
                "retrieved_chunks": [{"text": r["text"], "source": r["source"]} for r in raw_results]
            })
            print("  " + ", ".join(f"{name.capitalize()}: {scores[name]:.3f}" for name in metrics))

        return results

    async def test_from_json(
        self,
        json_file_path: str,
        top_k: int = 20,
        rerank_k: int = 6
    ) -> List[Dict]:
        """
        Load test cases from JSON and evaluate.

        Args:
            json_file_path: Path to JSON file with test cases
            top_k: Number of initial candidates
            rerank_k: Number of final results

        Returns:
            List of detailed results for each query
        """
        with open(json_file_path, 'r', encoding="utf-8") as f:
            test_cases = json.load(f)

        return await self.test_from_list(test_cases, top_k, rerank_k)

    async def test_from_list(
        self,
        test_cases: List[Dict[str, str]],
        top_k: int = 20,
        rerank_k: int = 6
    ) -> List[Dict]:
        """
        Evaluate an in-memory list of test cases on both recall and precision.

        This is the method ``test_from_json`` delegates to; it is equivalent
        to ``test_both``.
        """
        return await self.test_both(test_cases, top_k, rerank_k)

    async def test_recall_only(
        self,
        test_cases: List[Dict[str, str]],
        top_k: int = 20,
        rerank_k: int = 6
    ) -> List[Dict]:
        """
        Test RECALL only (fast).

        Args:
            test_cases: List of dicts with 'query' and 'reference' keys
            top_k: Number of initial candidates
            rerank_k: Number of final results

        Returns:
            List of results with recall scores
        """
        return await self._evaluate_cases(test_cases, top_k, rerank_k, ("recall",))

    async def test_precision_only(
        self,
        test_cases: List[Dict[str, str]],
        top_k: int = 20,
        rerank_k: int = 6
    ) -> List[Dict]:
        """
        Test PRECISION only (slower).

        Args:
            test_cases: List of dicts with 'query' and 'reference' keys
            top_k: Number of initial candidates
            rerank_k: Number of final results

        Returns:
            List of results with precision scores
        """
        return await self._evaluate_cases(test_cases, top_k, rerank_k, ("precision",))

    async def test_both(
        self,
        test_cases: List[Dict[str, str]],
        top_k: int = 20,
        rerank_k: int = 6
    ) -> List[Dict]:
        """
        Test BOTH recall and precision (slowest).

        Args:
            test_cases: List of dicts with 'query' and 'reference' keys
            top_k: Number of initial candidates
            rerank_k: Number of final results

        Returns:
            List of results with both recall and precision scores
        """
        return await self._evaluate_cases(test_cases, top_k, rerank_k, ("recall", "precision"))

    def print_results(self, results: List[Dict], metric: str = "both", show_chunks: bool = False):
        """
        Pretty print evaluation results.

        Args:
            results: List of result dictionaries
            metric: "both", "recall", or "precision"; any other value prints
                no metric lines
            show_chunks: Whether to print the actual retrieved chunks
        """
        keys = self._METRICS_BY_MODE.get(metric, ())

        print("\n" + "="*80)
        print(f"EVALUATION RESULTS - {metric.upper()}")
        print("="*80)

        for i, result in enumerate(results, 1):
            print(f"\n[Query {i}]")
            print(f"Query: {result['query']}")
            print(f"Reference: {result['reference']}...")
            print("\nMetrics:")

            for key in keys:
                print(f"  {self._METRIC_LABELS[key]} {result[key]:.3f}")

            print("\nRetrieval Info:")
            print(f"  Chunks Retrieved: {result['num_retrieved']}")

            if show_chunks:
                print("\n  Retrieved Chunks:")
                for j, chunk in enumerate(result['retrieved_chunks'], 1):
                    print(f"\n    [{j}] Source: {chunk['source']}")
                    print(f"    Text: {chunk['text']}...")

            print("-" * 80)

        # Overall statistics
        print(f"\n{'='*80}")
        print("OVERALL STATISTICS")
        print(f"{'='*80}")

        # Averages are undefined for an empty result list; skip them rather
        # than dividing by zero.
        if results:
            for key in keys:
                average = sum(r[key] for r in results) / len(results)
                print(f"Average {self._METRIC_LABELS[key]} {average:.3f}")

        print(f"Total Queries:     {len(results)}")

    def save_results(self, results: List[Dict], output_path: str):
        """Save results to JSON file (UTF-8, indented by 2 spaces)."""
        with open(output_path, 'w', encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to: {output_path}")

### Do some tests of the precision and recall evaluators

In [11]:
# Connect to everything: open the Neo4j driver, load the embedding model,
# then attach to the vector index (or create it when it does not exist yet).
driver = get_driver()
embeddings = GemmaEmbeddings(EMBEDDING_MODEL_NAME)
vectorstore = get_or_create_vectorstore(embeddings, driver)

‚úÖ Connecting to existing vector index: essay_chunk_agentspace


  vectorstore = Neo4jVector(


In [31]:
# Ingest the biology PDFs into the vector store and graph.
# NOTE(review): this call is unconditional - nothing in this cell checks whether
# the documents were already ingested; run it only when they are missing.
add_documents(PDF_GLOB_AVA, vectorstore, driver)

In [32]:
# Test it
# Smoke test of the evaluation pipeline with a single hand-written case.
# NOTE(review): presumably a negative control (the question is unrelated to the
# ingested PDFs, so low scores are expected) - confirm the intent.

simple_retriever = SimpleSemanticRetriever(vectorstore)
evaluator = RagasEvaluator()
tester = RetrievalTester(retriever=simple_retriever, evaluator=evaluator)

test_cases = [
    {
        "query": "Where is the Eiffel Tower?",
        "reference": "The Eiffel Tower is in Paris, France."
    }
]

# Run tests
# Both metrics are computed, but only recall is printed below.
results = await tester.test_both(test_cases)
tester.print_results(results, metric="recall")

  self.llm = LangchainLLMWrapper(gemma_llm)


Processing query 1/1: Where is the Eiffel Tower?...
  Recall: 0.000, Precision: 0.000

EVALUATION RESULTS - RECALL

[Query 1]
Query: Where is the Eiffel Tower?
Reference: The Eiffel Tower is in Paris, France....

Metrics:
  Recall:    0.000

Retrieval Info:
  Chunks Retrieved: 6
--------------------------------------------------------------------------------

OVERALL STATISTICS
Average Recall:    0.000
Total Queries:     1


### Test the Biology set of questions on both precision and recall

In [25]:
# Biology evaluation set: each case pairs a question with a reference passage
# that the retrieved contexts are scored against.
test_cases = [
    {
        "query": "What is the tdk gene and what does it encode in E. coli?",
        "reference": "We developed the tdk (thymidine deoxykinase) single-gene knockout system as a reporter gene in E. coli. The TDK protein is required to phosphorylate thymidine and thymdine derivatives such as 5-fluorodeoxyuridine. This is also true for AZT, as we find that a large majority of the mutants resistant to 100 ng/mL AZT have mutations that inactivate the tdk-encoded thymidine deoxykinase. AZT (30-azido-30-deoxythymidine) is a DNA replication chain blocker that is used in anti-retroviral therapy."
    },
    {
        "query": "How many mutationally prone regions (MPRs) were identified in the tdk gene versus the thyA gene?",
        "reference": "We can define 5 mutationally prone regions (MPRs) in the gene and extends the analysis of Mashiach et al. (2021) to a second gene-protein system. We previously analyzed mutational sites in the E. coli thyA gene, and showed that some regions of the gene were more prone to mutations at known base substitution sites than other regions (Mashiach et al. 2021). Analyzing over 1100 mutations from different treatments and mutator strains, we defined three 'MPRs' (mutationally prone regions). Placed side by side, the two studies, each involving a similar number of mutations, reveal 8 MPR regions of somewhat similar heights and sizes."
    },
    {
        "query": "What specific hotspot did cisplatin create in the tdk gene and why is it significant?",
        "reference": "It is evident from Figure 2 that there is a major hotspot at position 499, which has 102 of the 411 base changes (24%). Note that this site involves a G:C→T:A change at an AGG triplet, one of the favored triplets based on previous work in mammalian cell lines. In fact, the region from 484 to 507 has all the earmarks of an mutation prone region (MPR; Mashiach et al., 2021), with 174 of the 411 base substitution mutations (42%) in a 23-base pair region. There are four CGA triplets at which we have found G:C→T:A mutations that yield AZT-resistant mutants, at positions 313, 355, 406, and 496. The number of occurrences at each of these sites in the total CPT treated sets are, respectively, 0, 0, 0, and 21 (p = 421/4 = 420; p < 10−12). Thus, the site in the MPR, 496, is hot, and the other sites that are outside this region are 'cold.'"
    },
    {
        "query": "What are the advantages of using the tdk/AZT system compared to other reporter gene systems?",
        "reference": "The tdk/AZT-resistant system we developed here is an ideal gene reporter for mutations. The tdk gene is only 618 bp long, meaning that one can sequence the whole gene easily with a single primer pair. AZT-resistant mutants appear on LB plates containing 100 ng/mL AZT. This is an advantage because here mutants appear after 1 day, whereas in some reporter systems, such as the thyA/trimethoprim resistance system, it takes 2 days for full mutant colonies to appear. Also, any mutation resulting in an inactive gene product, including base substitutions, insertions, and deletions will show up in this system. Another advantage is the low background of spontaneous base substitutions."
    },
    {
        "query": "What are the key conclusions about what determines mutational hotspots based on these studies?",
        "reference": "The elements necessary for a significant hotspot using CPT as an example are: 1. Preferential lesion target and type of mutation, for example, G:C→T:A, and A:T→T:A at the 50-end of a pur–pur sequence. 2. Preferential nearest neighbors in many cases. 3. Preferential location in a gene, namely in an MPR. 4. To observe the full complement of hotspots one needs to unmask the effect of repair systems, particularly the UvrA,B,C excision repair systems, as they can preferentially remove lesions, camouflaging certain hotspots. Therefore, we can attribute the differences in base substitution mutation rates at sites with the identical nearest neighbors to the crucial element in determining which sites are true 'hotspots,' namely being in a particular region of the gene. The larger implications are that mutation frequency is significantly influenced by the structure of regions of DNA, and this points to the value of future studies of what these structures are and how changes in these structures affect mutability."
    }
]

In [26]:
# Test it
# Build a fresh evaluator/retriever/tester and score the test cases currently
# bound to `test_cases` on both recall and precision.

evaluator = RagasEvaluator()
simple_retriever= SimpleSemanticRetriever(vectorstore)

tester = RetrievalTester(retriever=simple_retriever, evaluator=evaluator)

# Run tests
# show_chunks=True also prints every retrieved chunk under each query.
results = await tester.test_both(test_cases)
tester.print_results(results,  show_chunks=True)

Processing query 1/5: What is the tdk gene and what does it encode in E....
  Recall: 0.250, Precision: 0.333
Processing query 2/5: How many mutationally prone regions (MPRs) were id...
  Recall: 1.000, Precision: 0.450
Processing query 3/5: What specific hotspot did cisplatin create in the ...
  Recall: 0.000, Precision: 0.000
Processing query 4/5: What are the advantages of using the tdk/AZT syste...
  Recall: 1.000, Precision: 0.967
Processing query 5/5: What are the key conclusions about what determines...
  Recall: 0.667, Precision: 0.750

EVALUATION RESULTS - BOTH

[Query 1]
Query: What is the tdk gene and what does it encode in E. coli?
Reference: We developed the tdk (thymidine deoxykinase) single-gene knockout system as a reporter gene in E. coli. The TDK protein is required to phosphorylate thymidine and thymdine derivatives such as 5-fluorodeoxyuridine. This is also true for AZT, as we find that a large majority of the mutants resistant to 100 ng/mL AZT have mutations that i

### Test Grants Essays for precision and recall

In [27]:
# Essay evaluation set: each case pairs a question with a reference passage
# that the retrieved contexts are scored against.
test_cases = [
    {
        "query": "What is the main theme of Bisclavret according to the essay?",
        "reference": "Through the contrasting lives and fates of the knight and his wife in Bisclavret, Marie de France emphasizes the importance of selfless over selfish love. Of the prevailing themes throughout the different lays, one of the most ubiquitous is that of selfless versus selfish love — that is, love for another person purely out of adoration and respect for the other person as opposed to love for one's own self-gain."
    },
    {
        "query": "How does the knight in Bisclavret demonstrate selfless love?",
        "reference": "The knight, because of his love and affection for his wife, does not wish to explain a condition that could very well scare her away. He exhibits compassion and does not want to frighten her and rupture a loving relationship. His love is expressed through his long trips away so that he can deal with his affliction on his own and his desire to love her and not lose her is expressed through his belief that he will be destroyed if he scares her away. The knight's selflessness and loyalty is further exuded when his wife condemns him to a life stuck in his wolf form; when the king stumbles across the werewolf and decides to adopt it, the wolf shows clearly that it loved him. The relationship between the knight and the king is that of mutual love and respect, an unwavering loyalty that was not found in the knight's wife."
    },
    {
        "query": "How does the wife's behavior exemplify selfish love in Bisclavret?",
        "reference": "After hearing the news her husband gives her, she quickly decides that she will not lie with him any more and immediately turns away to a knight she never loved, offering him her love and her body. The wife exhibits self-centered love because she appears to be mortified at the notion of being wed to a werewolf and not a man, suggesting a shallow, more physical form of love in which she seeks him for what he can physically provide. She cannot look past her loving husband's physical affliction and see him as a man who loves her unconditionally. Her fate also serves as Marie's lesson on selfish love, for the king banished the woman from the country and many of the women in the family were born without noses and lived noseless after the knight bit off her nose as a wolf."
    },
    {
        "query": "How does Whitman's work relate to Emerson's Transcendentalism?",
        "reference": "Whitman's sociability complements Emerson's solitude and self-reliance, emphasizing the interconnectedness of individuals and the potential for genuine connections to enhance individuality and self-understanding, which in turn empowers individuals to contribute to the communal American project. Like Emerson, who promoted poets to break free from the mold of convention and tradition, Whitman invites the reader to transcend the preconceived notion that life is an individual and disjointed experience for every person."
    },
    {
        "query": "What does Emerson say about the role of the poet in 'The Poet'?",
        "reference": "The poet is the sayer, the namer, and represents beauty. The poet being 'the sayer' underlines that the poet's role in society is to provide insight and meaning to things in life and nature, giving them the unique power to enunciate these ideas to others. Being 'the namer' expresses Emerson's belief that the poet can assign meaning and significance to things through language, implying that the poet has the power to create, shape, and define reality through their diction. The poet being 'the representation of beauty' not only highlights the innate beauty found in nature and the self but also implies that the poet serves as the nexus between people and the aesthetic aspects of life. By saying that the poet is not a 'permissive potentate,' Emerson gives power to the poet through monarchical diction, with the choice of 'sovereign' and 'emperor,' giving them agency to tap into their inner vision."
    },
    {
        "query": "How does Whitman express the concept of shared humanity in 'Crossing Brooklyn Ferry'?",
        "reference": "By questioning 'What is it then between us,' Whitman questions the perceived divisions that separate people as individuals, dismissing their significance. He answers the rhetorical question asserting that 'Whatever it is, it avails not,' emphasizing his belief that external factors have no real consequence in comparison to shared humanity. Whitman continues his answer by repeating the words 'I too,' which further grounds the universality of human experiences and emotions across space and time. His vivid depiction of bustling city life in Manhattan, as well as the other shared experiences of life, build on the Transcendentalist focus on the beauty of nature, and propose that if as a society, Americans cannot find commonalities among each other, then they should look to nature as a basis for the shared experiences of humanity."
    }
]

In [28]:
# Test it adding in grants docuemnts

add_documents(PDF_GLOB_GRANT, vectorstore)

# Run tests
# Run tests
results = await tester.test_both(test_cases)
tester.print_results(results,  show_chunks=True)

Processing query 1/6: What is the main theme of Bisclavret according to ...
  Recall: 1.000, Precision: 0.710
Processing query 2/6: How does the knight in Bisclavret demonstrate self...
  Recall: 0.600, Precision: 1.000
Processing query 3/6: How does the wife's behavior exemplify selfish lov...
  Recall: 0.750, Precision: 0.639
Processing query 4/6: How does Whitman's work relate to Emerson's Transc...
  Recall: 1.000, Precision: 0.710
Processing query 5/6: What does Emerson say about the role of the poet i...
  Recall: 1.000, Precision: 1.000
Processing query 6/6: How does Whitman express the concept of shared hum...
  Recall: 1.000, Precision: 1.000

EVALUATION RESULTS - BOTH

[Query 1]
Query: What is the main theme of Bisclavret according to the essay?
Reference: Through the contrasting lives and fates of the knight and his wife in Bisclavret, Marie de France emphasizes the importance of selfless over selfish love. Of the prevailing themes throughout the different lays, one of the m