# RAG Response Evals: Build Dataset

Build a dataset using a combination of user queries and synthetic LLM-generated queries.

Reuse data from previous 'vector search cutoff' experiments.
- LLM-generated queries tended to be more complex than user queries.
- Aim for 100 questions: 50 purely LLM-generated (complex) and 50 simple ones, combining real user queries with few-shot LLM-generated 'user-like' queries.

In [1]:
import pandas as pd

## Load previous queries

In [2]:
df_user = pd.read_csv("retrieval_relevance_evaluations_user_queries.csv")

In [5]:
df_synth = pd.read_csv("retrieval_relevance_evaluations.csv")

In [9]:
# Distinct query strings taken from each earlier experiment's results
user_queries = [*df_user["query"].unique()]
synth_queries = [*df_synth["query"].unique()]

In [10]:
len(user_queries), len(synth_queries)

(16, 20)

In [36]:
user_queries

['What were the first civilizations?',
 'when did julius cesar rule?',
 'how does the author define barbarians?',
 'why are Sunnis and Shia called that?',
 'who were the guptas?',
 'name the Chinese dynasties',
 'what was hellenization?',
 'Can you tell me about groups that moved into europe during the roman empire?',
 'Who were the Magyars?',
 'who were the scythians?',
 "what does 'doge' mean?",
 'tell me about the antonine age in rome',
 'Tell me about the Roman Empire',
 'who were the Seljuks?',
 'What groups had interactions with the Magyars?',
 'What were the main causes of World War I?']

## Generate more queries

In [None]:
# use openai llm to generate a set of questions to ask about a world history book
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langsmith import traceable
from pydantic import BaseModel, Field
from typing import List

# Output schema passed to JsonOutputParser by the question generators in this notebook.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic presumably copies a model docstring into the JSON schema, which would alter
# the format instructions sent to the LLM; confirm before adding one.
class HistoryQuestions(BaseModel):
    # The generated question strings.
    questions: List[str] = Field(description="List of world history questions")


### Simple questions

In [None]:

def format_few_shot_examples(queryList):
    """Render each query as a "- " bullet and join the bullets with newlines."""
    as_bullet = "- {}".format
    return "\n".join(map(as_bullet, queryList))

@traceable
def generate_simple_history_questions_few_shot(good_example_list, bad_example_list, num_questions=20):
    """Ask the LLM for simple, user-style world history questions via few-shot prompting.

    Args:
        good_example_list: Queries whose style the model is told to mimic.
        bad_example_list: Queries whose style the model is told to avoid.
        num_questions: Number of questions requested in the prompt.

    Returns:
        The parsed JSON response from the chain (in this notebook's run, a
        dict with a "questions" list).
    """
    # Parser bound to the HistoryQuestions schema, plus the chat model
    question_parser = JsonOutputParser(pydantic_object=HistoryQuestions)
    chat_model = ChatOpenAI(model="gpt-4o", temperature=0.8)

    # Few-shot prompt contrasting good (simple) and bad (analytical) examples
    few_shot_prompt = PromptTemplate.from_template(
    """
    You are a world history expert.
        
    Generate {num_questions} diverse, specific questions that could be asked about a comprehensive world history book. 
    
    The questions should cover:
    - Different time periods (ancient, medieval, modern, contemporary)
    - Various civilizations and regions (Europe, Asia, Africa, Americas, Middle East)

    Below is a set of good example questions. These are simple, factual questions, not complex analytical ones. Try to mimic this style.
    Good examples:
    {good_examples}

    Below is a set of bad example questions. These are more complex and analytical. Try to avoid this style.
    Bad examples:
    {bad_examples}

    While the questions should be simple, they should still cover esoteric topics.

    {format_instructions}
    """
    )

    # Each step adds one bullet-formatted example block to the chain input
    add_good_examples = RunnablePassthrough.assign(
        good_examples=lambda payload: format_few_shot_examples(payload["good_example_list"])
    )
    add_bad_examples = RunnablePassthrough.assign(
        bad_examples=lambda payload: format_few_shot_examples(payload["bad_example_list"])
    )

    pipeline = add_good_examples | add_bad_examples | few_shot_prompt | chat_model | question_parser

    chain_input = {
        "good_example_list": good_example_list,
        "bad_example_list": bad_example_list,
        "num_questions": num_questions,
        "format_instructions": question_parser.get_format_instructions(),
    }
    return pipeline.invoke(chain_input)


In [52]:
# Hand-picked user queries (positions 0, 1 and 3) used as the "good" few-shot examples
user_queries_selected = [user_queries[position] for position in (0, 1, 3)]
user_queries_selected

['What were the first civilizations?',
 'when did julius cesar rule?',
 'why are Sunnis and Shia called that?']

In [53]:
synth_queries[:3]

['What were the key factors that led to the fall of the Western Roman Empire?',
 'How did the spread of Islam in the 7th century influence trade and cultural exchanges across Africa and Europe?',
 'What were the primary motivations behind the European Age of Exploration during the 15th and 16th centuries?']

In [55]:
# Request 34 simple questions: together with the 16 user queries this gives 50 simple queries
simple_synth_queries = generate_simple_history_questions_few_shot(
    good_example_list=user_queries_selected,
    bad_example_list=synth_queries[:3],
    num_questions=34,
)

In [56]:
simple_synth_queries

{'questions': ['What were the major inventions of the Bronze Age?',
  'Who was the first emperor of China?',
  'When did the Byzantine Empire fall?',
  'What is the significance of the Indus Valley Civilization?',
  'Who founded the Mali Empire?',
  'What were the key features of the Aztec civilization?',
  'When did the Mongol Empire reach its greatest extent?',
  'Who was the ruler of the Ottoman Empire during the Siege of Vienna?',
  "What were the causes of the Thirty Years' War?",
  'Who was the first president of the United States?',
  'When did the Industrial Revolution begin?',
  'What was the Meiji Restoration?',
  'Who was the leader of the Zulu Kingdom during the Anglo-Zulu War?',
  'What was the significance of the Mayan calendar?',
  'When did the Berlin Wall fall?',
  'Who was the last Tsar of Russia?',
  'What was the primary language of the Sumerians?',
  'Who was the founder of the Persian Empire?',
  'When was the Magna Carta signed?',
  'What was the main religion of

### Complex questions

In [57]:
from langchain_core.prompts import ChatPromptTemplate

# reused from 'investigate_vector_search_cutoff.ipynb'
def generate_history_questions(num_questions=20):
    """Ask the LLM for a varied set of world history questions.

    Args:
        num_questions: Number of questions requested in the prompt.

    Returns:
        The parsed JSON response from the chain (indexed by "questions"
        elsewhere in this notebook).
    """
    # Parser bound to the HistoryQuestions schema, plus the chat model
    question_parser = JsonOutputParser(pydantic_object=HistoryQuestions)
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

    # System persona followed by the user request template
    system_message = ("system", "You are a world history expert who creates thoughtful, educational questions.")
    user_message = ("user", """Generate {num_questions} diverse, specific questions that could be asked about a comprehensive world history book. 
    
    The questions should cover:
    - Different time periods (ancient, medieval, modern, contemporary)
    - Various civilizations and regions (Europe, Asia, Africa, Americas, Middle East)
    - Different types of historical topics (political events, cultural developments, economic systems, wars, social movements, technological advances)
    - Both broad conceptual questions and specific factual questions
    
    Make the questions natural and realistic - the kind a student or researcher might ask when studying world history.
    
    Examples of good questions:
    - What were the main causes of World War I?
    - How did the Silk Road impact trade between East and West?
    - What role did the printing press play in the Renaissance?
    
    {format_instructions}""")
    chat_prompt = ChatPromptTemplate.from_messages([system_message, user_message])

    pipeline = chat_prompt | chat_model | question_parser

    chain_input = {
        "num_questions": num_questions,
        "format_instructions": question_parser.get_format_instructions(),
    }
    return pipeline.invoke(chain_input)

In [58]:
# generate all 50, instead of augmenting old ones (avoid overlap)
complex_synth_queries = generate_history_questions(num_questions=50)

In [74]:
len(complex_synth_queries["questions"])

53

In [76]:
# The model returned 53 questions (count shown above) although 50 were requested; keep the first 50.
complex_synth_queries["questions"] = complex_synth_queries["questions"][:50]

## Combine into dataset

In [61]:
# Real user queries, tagged with their origin and complexity
df_user_queries = pd.DataFrame({"query": user_queries}).assign(
    source="user",
    complexity="simple",
)

In [63]:
# Few-shot generated 'user-like' queries, tagged with their origin and complexity
df_simple_synth = pd.DataFrame({"query": simple_synth_queries["questions"]}).assign(
    source="synth",
    complexity="simple",
)

In [77]:
# Purely LLM-generated queries, tagged with their origin and complexity
df_complex_synth = pd.DataFrame({"query": complex_synth_queries["questions"]}).assign(
    source="synth",
    complexity="complex",
)

In [78]:
df_full = pd.concat([df_user_queries, df_simple_synth, df_complex_synth], ignore_index=True)

In [79]:
df_full.to_csv("eval_dataset_queries.csv", index=False)

## Upload to LangSmith

In [68]:
# create dataset in langsmith
from langsmith import Client

In [80]:
client = Client()
dataset_name = "History Book Eval Queries"

In [83]:
# client.delete_dataset(dataset_name="History Book Eval Queries")

In [85]:
dataset = client.create_dataset(
  dataset_name=dataset_name, description="A dataset of queries for evaluating a history book retrieval system.",
)

In [86]:
# One example per dataset row: the query is the input, its tags go into metadata
examples = [
  {
    "inputs": {"question": query},
    "metadata": {"source": source, "complexity": complexity},
  }
  for query, source, complexity in zip(df_full["query"], df_full["source"], df_full["complexity"])
]

In [87]:
client.create_examples(
  dataset_id=dataset.id,
  examples=examples
)

{'example_ids': ['45d27b47-d078-4ff0-b0c4-23905b1c849d',
  '8708d042-17f6-4a1a-baa4-4ec6bb22ee95',
  'b9b63ea1-0783-4af3-b368-c1202c698f10',
  'cd024e14-7330-4853-887a-1fb1980d806e',
  '4c8a2461-458d-41a0-a107-1a28d91dba96',
  'ba1d39c2-7973-49a3-927f-e5d1bfbd9490',
  '9e34314f-e8ab-4158-bd97-bbe5b218672c',
  '35ff45df-16b5-476f-af90-4c16eedfb843',
  'c422022b-5b7f-4366-b25d-04f95d93326f',
  '3a969e37-67ae-4b56-a6f0-f76296c845c2',
  '56866db6-7bf4-45f1-8f98-359562b9a78a',
  'b0551619-9acd-46d8-82e5-546a6e637e3e',
  'dadf947a-c1f5-49bc-a485-4e705a9dd555',
  'ba306d39-9dd2-464a-8c4e-8f451ceb93dc',
  'c7028512-14ac-4fca-8783-e5803c44dc58',
  '11285b73-c420-40cd-9f68-e0aa6da5a193',
  '49b4a759-675f-4d55-afd3-2359c70520c6',
  '6d8ad2e6-7345-4623-b109-70770db02e4d',
  '884ff068-a67d-413c-a8de-d6c0ac240205',
  '776e8ad0-e481-4c71-a6ee-24e3ff84eb65',
  '1c1be83f-d0b8-4a1b-bb8d-eac93fa4e26f',
  '2ac3cf60-1a93-4e03-a7d5-43e33909cc40',
  '60c37788-2482-49ba-8e38-a479279bb804',
  '79936e54-0faf-4b