In [1]:
# Setup and Imports

from typing import List, Dict, Any
from dotenv import load_dotenv
from langsmith import Client
from openai import OpenAI
from tqdm import tqdm
import json
from langsmith.wrappers import wrap_openai

# Load variables from the local .env file before any client is constructed.
load_dotenv()

# LangSmith client, used further down to upload the generated examples.
langsmith_client = Client()

# Plain OpenAI client plus a LangSmith-wrapped version of it; the wrapped
# client is the one used for all question-generation calls.
base_client = OpenAI()
openai_client = wrap_openai(base_client)

In [2]:
# Split the document into large, overlapping chunks. We're not using
# the chunks for retrieval, so they don't need to be small.

# In fact, since we're using the chunks to generate evaluation
# questions, it's better if they have more context.

# Note: we don't use the markdown parser because it splits the
# chunks by section, and we want to split by chunk size.

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file import FlatReader

# Read the manual with FlatReader for ".md" files so chunk boundaries are
# decided by the splitter below, not by markdown sections.
docs = SimpleDirectoryReader(
    input_files=["./data_2/Tesla Cybertruck Owners Manual.md"],
    file_extractor={".md": FlatReader()},
    filename_as_id=True,
).load_data()

# Large chunks with generous overlap: more context per generated question.
parser = SentenceSplitter(chunk_size=2000, chunk_overlap=400)
nodes = parser.get_nodes_from_documents(docs)

In [None]:
# Report how many chunks were produced and preview the leading ones
# (fewer than three are shown if the document produced fewer chunks).

print(f"\nTotal number of chunks: {len(nodes)}\n\n")

print("\nFirst 3 chunks:")
for chunk_number, chunk in enumerate(nodes[:3], start=1):
    print(f"\n--- Chunk {chunk_number} ---")
    print(chunk.text)
    print("\n" + "=" * 80)

In [5]:
# Leverage multiple prompts to generate a diverse set of questions

# Prompt template for fact-lookup questions phrased like search-bar or forum
# queries. It is filled in with str.format(text=<chunk text>): {text} is the
# only placeholder, and the braces of the JSON example are doubled so they
# stay literal after formatting.
# The requested JSON has a top-level "questions" list; each entry carries
# question, answer, supporting_text, question_type ("factual"),
# relevance_level (common | rare | unlikely) and relevance_reasoning.
FACTUAL_PROMPT = """Generate 2-3 questions that real Cybertruck owners would actually type into a search bar or ask in an owners' forum. These should feel completely natural and conversational.

Write questions as if they were being typed into a search bar or asked in a forum. For example:

Instead of:
- "How do I activate the climate control system?"
- "What should I do if the touchscreen becomes unresponsive?"
- "How does one optimize range in cold weather?"

Write:
- "how to turn on AC in cybertruck"
- "screen frozen - what now?"
- "battery draining fast in cold weather"

Make questions feel real by:
1. Using natural search patterns
   - "how to..."
   - "why is my..."
   - "help with..."
2. Including context and emotion
   - "stuck at supercharger"
   - "help! frunk won't open"
   - "confused about ride height settings"
3. Writing like real people
   - Use contractions (I'm, won't, can't)
   - OK to use incomplete sentences
   - Include emotional context ("Help!", "Confused about...", "Worried about...")
4. Adding situational details
   - "in rain"
   - "with kids"
   - "while camping"

For each question, evaluate its real-world relevance:
- "common": Everyday, urgent needs:
  * "trunk won't close"
  * "phone key not working"
  * "what's this warning light mean"

- "rare": Occasional situations:
  * "winterizing cybertruck"
  * "car wash settings?"
  * "towing setup help"

- "unlikely": Technical/administrative:
  * Manual details
  * Specifications
  * Legal info

Text: {text}

Provide your response in the following JSON format:
{{
    "questions": [
        {{
            "question": "Natural, search-like question",
            "answer": "Clear, helpful answer",
            "supporting_text": "Relevant excerpt from source text",
            "question_type": "factual",
            "relevance_level": "common|rare|unlikely",
            "relevance_reasoning": "Brief explanation of why this question fits the chosen relevance level"
        }}
    ]
}}"""

# Prompt template for decision/"how do features work together" questions
# phrased like owners' forum posts. It is filled in with
# str.format(text=<chunk text>): {text} is the only placeholder, and the
# braces of the JSON example are doubled so they stay literal after formatting.
# The requested JSON has a top-level "questions" list; each entry carries
# question, answer, supporting_text, question_type ("reasoning"),
# relevance_level (common | rare | unlikely) and relevance_reasoning.
REASONING_PROMPT = """Generate 2-3 questions that real Cybertruck owners would ask when trying to understand how features work together or make decisions about using their vehicle. These should feel like real forum posts or search queries.

Write questions as if they were being posted in an owners' forum. For example:

Instead of:
- "What is the optimal charging strategy?"
- "How does ambient temperature affect range?"
- "What are the considerations for child safety?"

Write:
- "best way to charge for long road trip?"
- "losing tons of range in cold - what helps?"
- "safest seats for car seats?"

Make questions feel real by:
1. Using natural patterns
   - "better to..."
   - "best way to..."
   - "tips for..."
2. Including context and emotion
   - "worried about range"
   - "confused about charging"
   - "need advice on settings"
3. Writing like real people
   - Use contractions (I'm, won't, can't)
   - OK to use incomplete sentences
   - Include emotional context ("Help!", "Confused about...", "Worried about...")
4. Adding situational details
   - "for camping"
   - "in winter"
   - "with full family"

For each question, evaluate its real-world relevance:
- "common": Everyday decisions:
  * "faster charging vs battery life?"
  * "seat heaters or cabin heat?"
  * "best settings for commute"

- "rare": Occasional planning:
  * "road trip planning help"
  * "winter driving tips"
  * "towing affects on range?"

- "unlikely": Technical/theoretical:
  * System details
  * Technical specs
  * Legal considerations

Text: {text}

Provide your response in the following JSON format:
{{
    "questions": [
        {{
            "question": "Natural, forum-style question",
            "answer": "Practical, helpful answer",
            "supporting_text": "Relevant excerpt from source text",
            "question_type": "reasoning",
            "relevance_level": "common|rare|unlikely",
            "relevance_reasoning": "Brief explanation of why this question fits the chosen relevance level"
        }}
    ]
}}"""

In [None]:
# Smoke-test both prompts against the first chunk only. Read the printed
# questions and tune the prompt wording before running the full
# generation loop.

# Pieces shared by the two requests below.
_system_message = {"role": "system", "content": "You are a skilled question generator."}
_sample_text = nodes[0].text

# Factual questions
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        _system_message,
        {"role": "user", "content": FACTUAL_PROMPT.format(text=_sample_text)},
    ],
    response_format={"type": "json_object"},
)
factual_questions = json.loads(response.choices[0].message.content)
print("Factual Questions Generated:")
print(json.dumps(factual_questions, indent=2))

# Reasoning questions
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        _system_message,
        {"role": "user", "content": REASONING_PROMPT.format(text=_sample_text)},
    ],
    response_format={"type": "json_object"},
)
reasoning_questions = json.loads(response.choices[0].message.content)
print("\nReasoning Questions Generated:")
print(json.dumps(reasoning_questions, indent=2))

In [None]:
# Sample random chunks of the document to generate evaluation
# questions.

import random
NUM_CHUNKS = 10

# Fields a generated question must contain to become a dataset example.
REQUIRED_QUESTION_KEYS = (
    "question",
    "answer",
    "question_type",
    "supporting_text",
    "relevance_level",
)


def _generate_questions(prompt_template: str, text: str) -> List[Dict[str, Any]]:
    """Run one prompt against one chunk and return the parsed question list.

    Args:
        prompt_template: Template with a single ``{text}`` placeholder
            (FACTUAL_PROMPT or REASONING_PROMPT).
        text: Chunk text substituted into the template.

    Returns:
        The list found under the "questions" key of the model's JSON reply,
        or an empty list when the reply cannot be parsed or has no such list.
        A bad reply is reported and skipped so that one failure does not
        throw away the examples already collected.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a skilled question generator."},
            {"role": "user", "content": prompt_template.format(text=text)},
        ],
        response_format={"type": "json_object"},
    )
    try:
        # content may be None or truncated JSON; both are handled below.
        payload = json.loads(response.choices[0].message.content)
    except (TypeError, json.JSONDecodeError) as exc:
        tqdm.write(f"Skipping unparseable model response: {exc}")
        return []

    questions = payload.get("questions") if isinstance(payload, dict) else None
    if not isinstance(questions, list):
        tqdm.write("Skipping model response without a 'questions' list")
        return []
    return questions


def _build_example(question: Dict[str, Any], node) -> Dict[str, Any]:
    """Convert one generated question plus its source chunk into an example dict."""
    return {
        "question": question["question"],
        "answer": question["answer"],
        "metadata": {
            "chunk_id": node.node_id,
            "question_type": question["question_type"],
            "supporting_text": question["supporting_text"],
            # Normalised so the "common" filter below is not defeated by
            # casing or stray whitespace in the model output.
            "relevance_level": str(question["relevance_level"]).strip().lower(),
            "source_position": getattr(node, "start_char_idx", None),
            "filename": node.metadata.get("filename", "unknown"),
        },
    }


# 1. Sample chunks. random.sample raises ValueError when asked for more
# items than exist, so cap the request at the number of available chunks.
random_chunks = random.sample(nodes, min(NUM_CHUNKS, len(nodes)))
print(f"Selected {len(random_chunks)} random chunks")

# 2. Generate Questions for Random Chunks
candidate_examples = []

for node in tqdm(random_chunks):
    # Factual first, then reasoning: same order as the examples are stored.
    for prompt_template in (FACTUAL_PROMPT, REASONING_PROMPT):
        for question in _generate_questions(prompt_template, node.text):
            is_complete = isinstance(question, dict) and all(
                key in question for key in REQUIRED_QUESTION_KEYS
            )
            if not is_complete:
                tqdm.write(f"Skipping malformed question: {question!r}")
                continue
            candidate_examples.append(_build_example(question, node))

# 3. Filter for Common Questions
common_examples = [
    ex for ex in candidate_examples
    if ex["metadata"]["relevance_level"] == "common"
]
print(f"\nFound {len(common_examples)} common questions")

In [None]:
# Print selected examples for inspection: question, answer, key metadata
# and the supporting excerpt (wrapped to 70 columns for readability).

import textwrap

# "\n", not "\G": the latter is an invalid escape sequence that prints a
# literal backslash and triggers a SyntaxWarning on recent Python versions.
print("\nGenerated common examples:")
print("=" * 80)

for i, example in enumerate(common_examples, 1):
    metadata = example['metadata']
    print(f"\nExample {i}:")
    print("-" * 40)
    print(f"Question: {example['question']}")
    print(f"Answer: {example['answer']}")
    print("\nMetadata:")
    print(f"  Question Type: {metadata['question_type']}")
    print(f"  Relevance Level: {metadata['relevance_level']}")
    print(f"  Source File: {metadata['filename']}")
    print(f"  Chunk ID: {metadata['chunk_id']}")
    print("\nSupporting Text:")
    print(textwrap.fill(metadata['supporting_text'], width=70))
    print("=" * 80)

In [11]:
# Upload every "common" example to the LangSmith dataset, one request per
# example. Prerequisite: the dataset must already exist (create it in the
# LangSmith UI and apply the chat schema).
# Note: each run of this cell issues create_example again for every item.

dataset_name = "rag_evaluation_dataset"

for item in tqdm(common_examples):
    langsmith_client.create_example(
        dataset_name=dataset_name,
        # Chat-style input: a single user turn containing the question.
        inputs={
            "messages": [
                {"role": "user", "content": item["question"]},
            ],
        },
        # Chat-style reference output: the assistant's expected answer.
        outputs={
            "message": {
                "role": "assistant",
                "content": item["answer"],
            },
        },
        metadata=item["metadata"],
    )

print("Complete!")

100%|██████████| 36/36 [00:21<00:00,  1.71it/s]

Complete!



