In [4]:
!pip install llama_index

Collecting llama_index
  Downloading llama_index-0.12.10-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama_index)
  Downloading llama_index_agent_openai-0.4.1-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama_index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.10 (from llama_index)
  Downloading llama_index_core-0.12.10.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama_index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama_index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama_index)
  Downloading llama_index_llms_openai-0.3.13-py3-none-any.whl.metadata (3.3 kB)


In [5]:
import nest_asyncio
from llama_index.core import SimpleDirectoryReader, Settings, SummaryIndex, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner
from llama_index.core.objects import ObjectIndex
from pathlib import Path

In [6]:
import os

In [7]:
# Apply nest_asyncio
nest_asyncio.apply()

# Set API key from environment variable
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Check if the API key is set
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable is not set")


In [8]:
# Define papers
papers = [
    "paper_1.pdf",
    "paper_2.pdf",
    "paper_3.pdf",
    "paper_4.pdf"
]


In [9]:
# Set up LLM and embedding model
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [10]:
# Function to get document tools
def get_doc_tools(paper, paper_name):
    documents = SimpleDirectoryReader(input_files=[paper]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    nodes = splitter.get_nodes_from_documents(documents)
    
    summary_index = SummaryIndex(nodes)
    vector_index = VectorStoreIndex(nodes)
    
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    vector_query_engine = vector_index.as_query_engine()
    
    summary_tool = QueryEngineTool.from_defaults(
        query_engine=summary_query_engine,
        description=f"Useful for summarization questions related to {paper_name}"
    )
    vector_tool = QueryEngineTool.from_defaults(
        query_engine=vector_query_engine,
        description=f"Useful for retrieving specific context from {paper_name}"
    )
    
    return vector_tool, summary_tool

In [11]:
# Create tools for each paper
paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    vector_tool, summary_tool = get_doc_tools(paper, Path(paper).stem)
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]

Getting tools for paper: paper_1.pdf
Getting tools for paper: paper_2.pdf
Getting tools for paper: paper_3.pdf
Getting tools for paper: paper_4.pdf


In [12]:
# Combine all tools
all_tools = [t for paper in papers for t in paper_to_tools_dict[paper]]

In [13]:
# Create object index for tool retrieval
obj_index = ObjectIndex.from_objects(all_tools, index_cls=VectorStoreIndex)
obj_retriever = obj_index.as_retriever(similarity_top_k=3)


In [16]:
# Create agent
llm = OpenAI(model="gpt-4o")
agent_worker = FunctionCallingAgentWorker.from_tools(
    tool_retriever=obj_retriever,
    llm=llm, 
    system_prompt=""" \
You are an agent designed to answer queries about TSTR methodology and synthetic data quality testing.
Please always use the tools provided to answer questions about TSTR and synthetic data evaluation. Do not rely on prior knowledge.\
""",
    verbose=True
)
agent = AgentRunner(agent_worker)

In [17]:
response1 = agent.query(
    "Explain the key components of the TSTR methodology for evaluating synthetic data quality."
)
print(str(response1))

response2 = agent.query(
    "Compare and contrast the approaches used in different papers for testing synthetic data quality using TSTR. "
    "Analyze the methodology in each paper first."
)
print(str(response2))

Added user message to memory: Explain the key components of the TSTR methodology for evaluating synthetic data quality.
=== Calling Function ===
Calling function: query_engine_tool with args: {"input": "Explain the key components of the TSTR methodology for evaluating synthetic data quality."}
=== Function Output ===
The TSTR (Train on Synthetic, Test on Real) methodology for evaluating synthetic data quality involves training a machine learning model on synthetic data and then testing its performance on real data. This approach assesses how well the synthetic data can replicate the patterns and relationships present in the real data. The key components of TSTR include:

1. **Training Phase**: A model is trained using the synthetic dataset. This phase focuses on capturing the underlying patterns and relationships that the synthetic data is meant to represent.

2. **Testing Phase**: The trained model is then evaluated using a real dataset. The performance metrics obtained during this ph