# Imports

In [17]:
import openai
import os
import json

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

from langsmith import Client

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [2]:
# Connect to the Qdrant instance that stores the product collection.
# The endpoint can be overridden with the QDRANT_URL environment variable (e.g. via .env);
# when it is not set, the previous local default is used.
qdrant_client = QdrantClient(url=os.environ.get("QDRANT_URL", "http://localhost:6333"))

# Download all the data from Qdrant

In [3]:
# Fetch the first page (up to 100 records) of the collection: payloads only, no vectors.
# The scroll result is stored as returned; later cells read the records from index 0.
scroll_options = dict(
    collection_name="Amazon-items-collection-00",
    limit=100,
    offset=None,
    with_payload=True,
    with_vectors=False,
)
all_points = qdrant_client.scroll(**scroll_options)

In [4]:
# Inspect the payload of the first fetched record (index 0 of `all_points` holds the records).
all_points[0][0].payload

{'description': 'KEEPRO Pencil 2nd Generation for iPad, Magnetic Wireless Charge Tilt Sensitivity Palm Rejection Active Pen for Apple iPad Pro 11" 4/3/2/1, iPad Pro 12.9" 6/5/4/3, iPad Air 4/5, iPad Mini 6 [Compatibility]- ONLY compatible with iPad mini (6th generation), iPad Air (4th and 5th generation), iPad Pro 12.9-inch (3rd, 4th, 5th and 6th generation), iPad Pro 11-inch (1st, 2nd, 3rd and 4th generation), check and confirm your device before place the order (Note: If the pen doesn\'t charge, fully charge your iPad first then try charging the pen again)[Charging and Pairs Magnetically]- Charges wirelessly, attaches and pairs magnetically to the compatible iPad, this pen is a preferable alternative to the Apple Pencil 2nd Generation[Tilt Sensitivity & Pixel Precision]- Pixel-perfect precision and industry-leading low latency with tilt sensitivity making drawing, sketching, coloring, taking notes, and marking up PDFs, as easy and natural as a real pencil[Native Palm Rejection]- Rest

In [5]:
# Reduce every fetched record to the two fields the prompt needs:
# its `parent_asin` (used as the chunk ID) and its description text.
all_context = [
    {"id": record.payload["parent_asin"], "text": record.payload["description"]}
    for record in all_points[0]
]

# Render a prompt to generate a synthetic eval reference dataset

In [6]:
import json

# JSON schema describing the structure the model is asked to return:
# an array of objects, one per generated question.
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "Suggested question.",
            },
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "string",
                    "description": "ID of the chunk that could be used to answer the question.",
                },
            },
            "answer_example": {
                "type": "string",
                "description": "Suggested answer grounded in the context.",
            },
            "reasoning": {
                "type": "string",
                "description": "Reasoning why the question could be answered with the chunks.",
            },
        },
    },
}

# Instructions for the generator model. The chunk count is taken from `all_context`
# so the prompt always matches the number of chunks that are actually sent.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(all_context)} chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with IDs of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
The questions should imitate a potential real user of this RAG system.
As an output I need you to provide me the list of questions and the IDs of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Construct 10 that could use multiple chunks in the answer.
Construct 15 questions that could use single chunk in the answer.
Construct 5 questions that can't be answered with the available chunks.

<OUTPUT JSON SCHEMA>
{json.dumps(output_schema, indent=2)}
</OUTPUT JSON SCHEMA>

I need to be able to parse the json output.
"""

# The chunks are serialized as JSON (not as a Python repr) so that quoting and escaping
# are consistent with the JSON output requested above; ensure_ascii=False keeps
# non-ASCII characters in the descriptions readable instead of \u-escaped.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{json.dumps(all_context, ensure_ascii=False, indent=2)}
"""

In [7]:
# Print the rendered system prompt to check that the JSON schema was interpolated.
print(SYSTEM_PROMPT)


I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with IDs of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
The questions should imitate a potential real user of this RAG system.
As an output I need you to provide me the list of questions and the IDs of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Construct 10 that could use multiple chunks in the answer.
Construct 15 questions that could use single chunk in the answer.
Construct 5 questions that can't be answered with the available chunks.

<OUTPUT JSON SCHEMA>
{
  "type": "array",
  "items": {
    "type": "obj

In [8]:
# Ask the model for the synthetic eval set: the instructions go in the system turn,
# the product chunks in the user turn.
chat_messages = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(role="user", content=USER_PROMPT),
]
response = openai.chat.completions.create(
    model="gpt-5-mini",
    messages=chat_messages,
    reasoning_effort="minimal",
)

# Show the raw reply text before it is parsed.
generated_text = response.choices[0].message.content
print(generated_text)

[
  {
    "question": "Is the KEEPRO Pencil compatible with my iPad Pro 12.9-inch (6th generation)?",
    "chunk_ids": [
      "B0BF18F6R7"
    ],
    "answer_example": "Yes. The KEEPRO Pencil 2nd Generation is listed as compatible with iPad Pro 12.9-inch (3rd, 4th, 5th and 6th generation), so it will work with your 6th-generation iPad Pro 12.9\".",
    "reasoning": "Chunk B0BF18F6R7 is the product listing for the KEEPRO Pencil and explicitly enumerates the compatible iPad models including iPad Pro 12.9\" 6th generation."
  },
  {
    "question": "How many extra tips come with the KEEPRO Pencil and are they compatible with Apple Pencil tips?",
    "chunk_ids": [
      "B0BF18F6R7"
    ],
    "answer_example": "The KEEPRO Pencil includes 3 extra replaceable tips, and the tip is compatible with Apple Pencil (1st generation) and Apple Pencil (2nd generation).",
    "reasoning": "The KEEPRO listing (B0BF18F6R7) describes included accessories and states tip compatibility with Apple Pencil m

In [9]:
import json
# Parse the model reply into a list of question records and validate it up front, so a
# malformed reply fails here rather than part-way through the LangSmith upload further down.
raw_reply = response.choices[0].message.content
if raw_reply is None:
    raise ValueError("The model returned no text content, so there is nothing to parse.")

json_output = json.loads(raw_reply)
if not isinstance(json_output, list):
    raise ValueError(
        f"Expected a JSON array of question records, got {type(json_output).__name__}."
    )

# Keys that the upload cell reads from every record.
required_keys = {"question", "chunk_ids", "answer_example"}
invalid_positions = [
    position
    for position, record in enumerate(json_output)
    if not isinstance(record, dict) or not required_keys <= record.keys()
]
if invalid_positions:
    raise ValueError(
        f"Records at positions {invalid_positions} are missing one of {sorted(required_keys)}."
    )

In [10]:
# Preview only the first few parsed records instead of dumping the whole list.
json_output[:3]

[{'question': 'Is the KEEPRO Pencil compatible with my iPad Pro 12.9-inch (6th generation)?',
  'chunk_ids': ['B0BF18F6R7'],
  'answer_example': 'Yes. The KEEPRO Pencil 2nd Generation is listed as compatible with iPad Pro 12.9-inch (3rd, 4th, 5th and 6th generation), so it will work with your 6th-generation iPad Pro 12.9".',
  'reasoning': 'Chunk B0BF18F6R7 is the product listing for the KEEPRO Pencil and explicitly enumerates the compatible iPad models including iPad Pro 12.9" 6th generation.'},
 {'question': 'How many extra tips come with the KEEPRO Pencil and are they compatible with Apple Pencil tips?',
  'chunk_ids': ['B0BF18F6R7'],
  'answer_example': 'The KEEPRO Pencil includes 3 extra replaceable tips, and the tip is compatible with Apple Pencil (1st generation) and Apple Pencil (2nd generation).',
  'reasoning': 'The KEEPRO listing (B0BF18F6R7) describes included accessories and states tip compatibility with Apple Pencil models.'},
 {'question': 'What maximum coverage area can

In [11]:
# Count the generated questions.
# NOTE(review): SYSTEM_PROMPT asks for 30 questions, but the recorded output of this
# cell is 43 — confirm whether the extra questions are acceptable.
len(json_output)

43

In [12]:
# Look up the records of a single product by filtering on its `parent_asin` payload field.
asin_filter = Filter(
    must=[
        FieldCondition(
            key="parent_asin",
            match=MatchValue(value="B0CBMPG524"),
        )
    ]
)
scroll_result = qdrant_client.scroll(
    collection_name="Amazon-items-collection-00",
    scroll_filter=asin_filter,
    limit=100,
    with_payload=True,
    with_vectors=False,
)
points = scroll_result[0]  # index 0 holds the matching records

In [13]:
# Inspect the payload of the first record that matched the filter.
points[0].payload

{'description': 'Open Ear Headphones, Bluetooth 5.3 Earbuds with 60H Playtime IPX7 Waterproof Wireless Earbuds Immersive Premium Sound True Wireless Open Ear Earbuds with Earhooks for Running, Walking and Workouts 【Open-ear Design Headphones】Feature with a new generation of true open-ear wireless earbuds design, the headphones can rest gently and firmly fit your ears without entering your ear canal, which will reduce stress and hearing loss after extended wear. There is no pinching of the auricle, no blockage of the ear canal, and no pain or damage to hearing.【Powerful Stereo Sound】Equipped with 16.2 millimeters vibrating diaphragm speaker driver, bluetooth headphones providing pure balanced audio and clarity output for all music genres with soft audio and immerse yourself in the wonderful world of music.【Hear Your Surroundings】Open earbuds that rest on your ears without covering them, you can hear your music and your surroundings at the same time. Whether you are cycling, walking, run

In [14]:
def get_description(parent_asin: str) -> str:
    """Return the stored description of the product with the given parent ASIN.

    Args:
        parent_asin: Value matched against the `parent_asin` payload field.

    Returns:
        The `description` payload field of the first matching record.

    Raises:
        IndexError: If the collection holds no record with that `parent_asin`.
    """
    matches = qdrant_client.scroll(
        collection_name="Amazon-items-collection-00",
        scroll_filter=Filter(
            must=[
                FieldCondition(
                    key="parent_asin",
                    match=MatchValue(value=parent_asin),
                )
            ]
        ),
        # Only the first match is used, so there is no need to fetch up to 100 records.
        limit=1,
        with_payload=True,
        with_vectors=False,
    )[0]

    if not matches:
        # Same exception type the bare `[0]` lookup raised before, but with a message
        # that names the ASIN that could not be resolved.
        raise IndexError(
            f"No product with parent_asin={parent_asin!r} found in the collection."
        )

    return matches[0].payload["description"]

In [15]:
# Smoke-test the helper with the same ASIN used in the manual lookup above.
get_description("B0CBMPG524")

'Open Ear Headphones, Bluetooth 5.3 Earbuds with 60H Playtime IPX7 Waterproof Wireless Earbuds Immersive Premium Sound True Wireless Open Ear Earbuds with Earhooks for Running, Walking and Workouts 【Open-ear Design Headphones】Feature with a new generation of true open-ear wireless earbuds design, the headphones can rest gently and firmly fit your ears without entering your ear canal, which will reduce stress and hearing loss after extended wear. There is no pinching of the auricle, no blockage of the ear canal, and no pain or damage to hearing.【Powerful Stereo Sound】Equipped with 16.2 millimeters vibrating diaphragm speaker driver, bluetooth headphones providing pure balanced audio and clarity output for all music genres with soft audio and immerse yourself in the wonderful world of music.【Hear Your Surroundings】Open earbuds that rest on your ears without covering them, you can hear your music and your surroundings at the same time. Whether you are cycling, walking, running, working ou

# Create the eval dataset in LangSmith

In [20]:
# Create the LangSmith client. The API key is read from the LANGSMITH_API_KEY
# environment variable; a missing variable raises KeyError here.
client = Client(api_key=os.environ["LANGSMITH_API_KEY"])

In [23]:
dataset_name = "rag-evaluation-dataset"

# Reuse the dataset when it already exists, otherwise create it. Checking for existence
# explicitly avoids catching every exception and matching on the error text; any other
# failure (auth, network, ...) propagates unchanged.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f"Using existing dataset: {dataset_name}")
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Dataset for evaluating RAG pipeline"
    )
    print(f"Created new dataset: {dataset_name}")

Using existing dataset: rag-evaluation-dataset


In [24]:
# Questions already stored in the dataset; used so that re-running this cell against an
# existing dataset does not append duplicate examples.
existing_questions = {
    (stored_example.inputs or {}).get("question")
    for stored_example in client.list_examples(dataset_id=dataset.id)
}

# Build every example before uploading anything: a chunk ID that cannot be resolved
# then fails here instead of leaving a partially uploaded dataset behind.
new_examples = [
    {
        "inputs": {"question": item["question"]},
        "outputs": {
            "ground_truth": item["answer_example"],
            "reference_context_ids": item["chunk_ids"],
            "reference_descriptions": [
                get_description(chunk_id) for chunk_id in item["chunk_ids"]
            ],
        },
    }
    for item in json_output
    if item["question"] not in existing_questions
]

for example_fields in new_examples:
    client.create_example(dataset_id=dataset.id, **example_fields)