In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import pandas as pd
import openai


In [None]:
### Create the Qdrant collection that will store the item embeddings

qdrant_client = QdrantClient(
    url="http://localhost:6333"
)

# The Qdrant server keeps its collections between kernel restarts, and
# create_collection fails when the name is already taken. Guarding the call
# keeps this cell safe to re-run.
if not qdrant_client.collection_exists(collection_name='Amazon-items-collection-02'):
    qdrant_client.create_collection(
        collection_name='Amazon-items-collection-02',
        vectors_config=VectorParams(
            # 1536 is the default output size of OpenAI's text-embedding-3-small,
            # the model used to embed the items in this notebook.
            size=1536, distance=Distance.COSINE)
        )

True

In [None]:
# Load the item metadata; the file is JSON Lines (one JSON record per line).
df_items = pd.read_json(
    path_or_buf="data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)

def preprocess_data(row):
    """Build the text to embed for one item: its title followed by its features.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'title' entry and
            a 'features' entry that normally holds a list of strings.

    Returns:
        The title and every feature joined by single spaces. When 'features' is
        empty or is not a list/tuple (e.g. None or NaN for a missing value),
        only the title is returned, without a trailing space.
    """
    features = row['features']
    if not isinstance(features, (list, tuple)):
        # A missing value is not a list of strings; treat it as "no features"
        # instead of failing inside str.join.
        features = []
    return ' '.join([str(row['title']), *(str(feature) for feature in features)])

# Text that will be embedded for every item (title followed by its features).
df_items['preprocessed_title'] = df_items.apply(preprocess_data, axis=1)

# Fixed random_state so the same 50 items are drawn on every run.
df_sample = df_items.sample(n=50, random_state=25)

# Preview of the preprocessed text. It has to be the last expression of the
# cell to be rendered; placed mid-cell its result was silently discarded.
df_items['preprocessed_title'].head(2)

In [None]:
def get_embeddings(text, model='text-embedding-3-small'):
    """Embed a single text with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding of the first (and only) item in the response.
    """
    result = openai.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return first_item.embedding


# Smoke test of the embedding call. Show the vector length and a short preview
# instead of dumping every component into the notebook output.
hello_embedding = get_embeddings("Hello, world!")
len(hello_embedding), hello_embedding[:5]

[-0.019143931567668915,
 -0.025292053818702698,
 -0.0017211713129654527,
 0.01883450709283352,
 -0.03382139280438423,
 -0.019682060927152634,
 -0.02102738246321678,
 0.05160655081272125,
 -0.03218010067939758,
 -0.03043118305504322,
 -0.0021508336067199707,
 -0.028924422338604927,
 -0.0024871639907360077,
 -0.03148053586483002,
 0.010291713289916515,
 0.01856544241309166,
 -0.04614454507827759,
 0.04140901193022728,
 0.00043050304520875216,
 0.04116685315966606,
 0.053651440888643265,
 0.0018481360748410225,
 0.004564004950225353,
 0.009955382905900478,
 0.04781274124979973,
 0.002164286794140935,
 -0.00984775647521019,
 0.038422394543886185,
 0.0009131372789852321,
 -0.05209086835384369,
 0.051122233271598816,
 -0.032529886811971664,
 -0.01408552099019289,
 -0.012605667114257812,
 0.013271600939333439,
 0.01856544241309166,
 0.0016320437425747514,
 -0.0008479732787236571,
 -0.012773832306265831,
 -0.029677802696824074,
 -0.004510191734880209,
 -0.015309764072299004,
 0.025668743997812

In [None]:
data_to_embed = df_sample['preprocessed_title'].tolist()

# One point per text. The point id is the text's position in data_to_embed and
# the original text travels along in the payload under "text".
pointStructs = [
    PointStruct(id=position, vector=get_embeddings(chunk), payload={"text": chunk})
    for position, chunk in enumerate(data_to_embed)
]

In [None]:
# Send every prepared point to the collection in a single upsert request.
qdrant_client.upsert(
    wait=True,
    points=pointStructs,
    collection_name="Amazon-items-collection-02",
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Let's create synthetic evaluation data

import json


# Fields of a single generated record, in the order they appear in the prompt.
_question_record_properties = {
    "question": {
        "type": "string",
        "description": "Suggested question.",
    },
    "chunk_ids": {
        "type": "array",
        "items": {
            "type": "integer",
            "description": "Index of the chunk that could be used to answer the question.",
        },
    },
    "answer_example": {
        "type": "string",
        "description": "Suggested answer grounded in the context.",
    },
    "reasoning": {
        "type": "string",
        "description": "Reasoning why the question could be answered with the chunks.",
    },
}

# JSON Schema of the expected reply: an array of question records.
batch_output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": _question_record_properties,
    },
}


# Task description for the question generator. The chunk count is taken from
# data_to_embed so the prompt stays correct if the sample size changes; with
# the current 50-item sample the rendered text is unchanged.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(data_to_embed)} chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that can't be answered with the available chunks.

<OUTPUT JSON SCHEMA>
{json.dumps(batch_output_schema, indent=2)}
</OUTPUT JSON SCHEMA>

I need to be able to parse the json output.
"""


# The chunk ids are the positions in data_to_embed, i.e. the same ids the
# points were stored under in Qdrant. The list is serialized as JSON rather
# than as a Python repr so the quoting is uniform; ensure_ascii=False keeps
# non-ASCII product text readable instead of turning it into \u escapes.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{json.dumps([{"id": i, "text": data} for i, data in enumerate(data_to_embed)], ensure_ascii=False)}
"""




In [None]:
# Inspect the fully rendered user prompt (the chunk list).
print(USER_PROMPT)


Here is the list of chunks, each list element is a dictionary with id and text:
[{'id': 0, 'text': 'Bluetooth Car Adapter, LDNIO Bluetooth FM Transmitter for Car, 43W PD&QC 3.0 Three USB Port Car Bluetooth Adapter with LED Display, Hands-Free Calling, and AUX Input for All Smartphones Audio Player Fast Charging Type C Multi Ports: USB Type C Durable Fast Connect & Clear Bluetooth Sound 3 in 1 Value'}, {'id': 1, 'text': 'Bluetooth Multi-Device Keyboard, Dual Channel Universal Rechargeable Wireless Keyboard with Integrated Stand for iPad Smartphone Tablet MacBook iOS Windows Android Devices - Pink 【Easy-Switch to 2 Devices】 Simply press the FN+BT1/BT2 to switch typing between 2 connected Bluetooth devices, work with your smartphone and tablet on the slot stablely. 【Type Anywhere in Comfort】0.46in thick and 11.34in long, the compact Bluetooth keyboard is small enough to tuck into your briefcase and light enough to hold in hand. Minimalist layout lets you multitask at home or on the go. 【

In [None]:
# Inspect the fully rendered system prompt, including the embedded JSON schema.
print(SYSTEM_PROMPT)


I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that can't be answered with the available chunks.

<OUTPUT JSON SCHEMA>
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "question": {
        "type": "string",
       

In [None]:
# Single-turn request: the system message carries the task and output schema,
# the user message carries the chunk list.
generation_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]
response = openai.chat.completions.create(messages=generation_messages, model='gpt-4.1')

In [None]:
# Show the raw text of the first choice returned by the model.
print(response.choices[0].message.content)

```json
[
  {
    "question": "Which products are available for wireless Bluetooth audio playback?",
    "chunk_ids": [0,2,5,9,23,47,49],
    "answer_example": "We have the following products that support wireless Bluetooth audio playback: Bluetooth Car Adapter (id:0), Active Noise Cancelling Wireless Earbuds (id:2), Glorious Model D Wireless Gaming Mouse (id:5, note: it's a mouse, not an audio device but mentions Bluetooth), maio Bluetooth Speaker (id:9), Tiksounds Wireless Earbuds (id:47), Razer Leviathan V2 X PC Soundbar (id:49), and the co2CREA Soft Silicone Case (id:23, case for Bose Bluetooth speaker - actual speaker sold separately).",
    "reasoning": "These chunks mention Bluetooth-enabled audio devices like speakers, earbuds, and adapters, as well as cases for Bluetooth speakers."
  },
  {
    "question": "Do you have any accessories compatible with iPhone 14?",
    "chunk_ids": [13,22,26,39],
    "answer_example": "Yes, we have several accessories compatible with iPhone 14, 

In [None]:
import json
import re

# A reply without content becomes an empty string, so the failure below is a
# JSON decoding error rather than an AttributeError.
raw_reply = response.choices[0].message.content or ""

# The model tends to wrap its JSON in a ```json ... ``` fence. Take the fenced
# body when there is one; the greedy match runs to the LAST fence, so
# backticks inside string values are preserved and any prose around the fence
# is ignored. Without a fence the whole reply is parsed as-is.
fenced = re.search(r"```(?:json)?\s*(.*)```", raw_reply, flags=re.DOTALL)
json_output = json.loads(fenced.group(1) if fenced else raw_reply)

In [None]:
# Display the parsed question records.
json_output

[{'question': 'Which products are available for wireless Bluetooth audio playback?',
  'chunk_ids': [0, 2, 5, 9, 23, 47, 49],
  'answer_example': "We have the following products that support wireless Bluetooth audio playback: Bluetooth Car Adapter (id:0), Active Noise Cancelling Wireless Earbuds (id:2), Glorious Model D Wireless Gaming Mouse (id:5, note: it's a mouse, not an audio device but mentions Bluetooth), maio Bluetooth Speaker (id:9), Tiksounds Wireless Earbuds (id:47), Razer Leviathan V2 X PC Soundbar (id:49), and the co2CREA Soft Silicone Case (id:23, case for Bose Bluetooth speaker - actual speaker sold separately).",
  'reasoning': 'These chunks mention Bluetooth-enabled audio devices like speakers, earbuds, and adapters, as well as cases for Bluetooth speakers.'},
 {'question': 'Do you have any accessories compatible with iPhone 14?',
  'chunk_ids': [13, 22, 26, 39],
  'answer_example': 'Yes, we have several accessories compatible with iPhone 14, including a CarPlay USB A 

In [None]:
import os
from langsmith import Client

# The API key is read from the environment; a missing variable raises KeyError.
client = Client(api_key=os.environ['LANGSMITH_API_KEY'])

# Dataset that will receive the generated question/answer examples.
dataset_name = "rag-evaluation-dataset"
dataset = client.create_dataset(
    description='Dataset for evaluating RAG pipeline',
    dataset_name=dataset_name,
)

In [None]:
# Upload one LangSmith example per generated question, attaching the text of
# every referenced chunk as stored in Qdrant.
for item in json_output:
    # Questions that cannot be answered may come back without chunk ids.
    requested_ids = item.get("chunk_ids") or []

    # Fetch all chunks of this question in one request instead of one request
    # per chunk. The records are indexed by id so the result order is not
    # relied upon.
    records = (
        qdrant_client.retrieve(
            collection_name="Amazon-items-collection-02",
            ids=requested_ids,
            with_payload=True,
        )
        if requested_ids
        else []
    )
    text_by_id = {record.id: record.payload["text"] for record in records}

    # The model may cite an id that is not in the collection. Drop such ids
    # (with a visible warning) so context_ids and contexts stay aligned,
    # rather than failing half-way through the upload with an IndexError.
    known_ids = [chunk_id for chunk_id in requested_ids if chunk_id in text_by_id]
    unknown_ids = [chunk_id for chunk_id in requested_ids if chunk_id not in text_by_id]
    if unknown_ids:
        print(f"Skipping unknown chunk ids {unknown_ids} for question: {item['question']!r}")

    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": known_ids,
            "contexts": [text_by_id[chunk_id] for chunk_id in known_ids],
        },
    )