In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import openai

import pandas as pd

In [5]:
# Initialize QdrantClient
qdrant_client = QdrantClient(
    url="http://localhost:6333"
)

# # Create collection
# qdrant_client.create_collection(
#     collection_name="Amazon-items-collection-00",
#     vectors_config=VectorParams(
#         size=1536, 
#         distance=Distance.COSINE
#     ),
# )

In [4]:
df_items = pd.read_json(
    "../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True
    )

In [8]:
df_items.head(5)

# The columns `title`, `features`, and `description` contain the description we're looking for

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Industrial & Scientific,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...",4.4,119,[【Fast Charging Cord】These USB C cables provid...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Type-C Charger Cable ', 'url': 'ht...",RAVODOI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'RAVODOI', 'Connector Type': 'USB Ty...",B09R4Y2HKY,,,
1,All Electronics,"SNESH-2 Pack USB-C Female to USB Male Adapter,...",4.5,352,[🔹(Light & compact) Easy to carry and light we...,[],4.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'USB Male & Female Adapter', 'url':...",SNESH,"[Electronics, Computers & Accessories, Compute...",{'Package Dimensions': '3.54 x 2.4 x 0.35 inch...,B09JV5FM2S,,,
2,All Electronics,USB C Docking Station Dual Monitor for MacBook...,3.9,1193,[【18-in-1Docking Station】With USB C Docking St...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],ZMUIPNG,"[Electronics, Computers & Accessories, Laptop ...","{'Product Dimensions': '3.94""L x 1.18""W x 3.94...",B09SFN9NRX,,,
3,Camera & Photo,[2023 Upgraded] Telescopes for Adults Astronom...,4.1,219,[🎁【2023 All New Experience】The newly upgraded ...,[],169.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Good picture quality', 'url': 'htt...",HUTACT,"[Electronics, Camera & Photo, Binoculars & Sco...","{'Product Dimensions': '32.5""D x 5.5""W x 9.7""H...",B09TP3SZ7C,,,
4,AMAZON FASHION,"Laptop Bag 15.6 Inch, Laptop Briefcase Messeng...",4.5,222,"[Leather,Mesh, Imported, Multi-pockets and Lar...",[],24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],KPIQIU,"[Electronics, Computers & Accessories, Laptop ...",{'Product Dimensions': '16 x 2 x 12 inches; 1....,B0B5H7T7XZ,,,


In [6]:
# Concatenate the title, and features. Then, we'll embed it to the database
def preprocess_data(row):
    return f"{row['title']} {' '.join(row['features'])} {' '.join(row['description'])}"

# # No `description` column
# def preprocess_data(row):
#     return f"{row['title']} {' '.join(row['features'])}"

In [7]:
df_items['preprocess_data'] = df_items.apply(preprocess_data, axis=1)

In [23]:
df_items.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,preprocess_data
0,Industrial & Scientific,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...",4.4,119,[【Fast Charging Cord】These USB C cables provid...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Type-C Charger Cable ', 'url': 'ht...",RAVODOI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'RAVODOI', 'Connector Type': 'USB Ty...",B09R4Y2HKY,,,,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T..."
1,All Electronics,"SNESH-2 Pack USB-C Female to USB Male Adapter,...",4.5,352,[🔹(Light & compact) Easy to carry and light we...,[],4.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'USB Male & Female Adapter', 'url':...",SNESH,"[Electronics, Computers & Accessories, Compute...",{'Package Dimensions': '3.54 x 2.4 x 0.35 inch...,B09JV5FM2S,,,,"SNESH-2 Pack USB-C Female to USB Male Adapter,..."
2,All Electronics,USB C Docking Station Dual Monitor for MacBook...,3.9,1193,[【18-in-1Docking Station】With USB C Docking St...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],ZMUIPNG,"[Electronics, Computers & Accessories, Laptop ...","{'Product Dimensions': '3.94""L x 1.18""W x 3.94...",B09SFN9NRX,,,,USB C Docking Station Dual Monitor for MacBook...
3,Camera & Photo,[2023 Upgraded] Telescopes for Adults Astronom...,4.1,219,[🎁【2023 All New Experience】The newly upgraded ...,[],169.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Good picture quality', 'url': 'htt...",HUTACT,"[Electronics, Camera & Photo, Binoculars & Sco...","{'Product Dimensions': '32.5""D x 5.5""W x 9.7""H...",B09TP3SZ7C,,,,[2023 Upgraded] Telescopes for Adults Astronom...
4,AMAZON FASHION,"Laptop Bag 15.6 Inch, Laptop Briefcase Messeng...",4.5,222,"[Leather,Mesh, Imported, Multi-pockets and Lar...",[],24.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],KPIQIU,"[Electronics, Computers & Accessories, Laptop ...",{'Product Dimensions': '16 x 2 x 12 inches; 1....,B0B5H7T7XZ,,,,"Laptop Bag 15.6 Inch, Laptop Briefcase Messeng..."


In [24]:
# Subsample the data once more, only 50 rows.
df_sample = df_items.sample(n=50, random_state=42)

In [8]:
# Start calculating some embeddings on the text data. Create a function to calculate the embeddings.
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

In [9]:
get_embedding("What is the best way to learn AI?")

[-0.031235190108418465,
 -0.008484674617648125,
 0.014977925457060337,
 0.0010658646933734417,
 0.010138160549104214,
 -0.04180784523487091,
 0.0065958392806351185,
 0.028362711891531944,
 -0.022617753595113754,
 0.06507733464241028,
 0.027155786752700806,
 -0.05175289511680603,
 0.0020276322029531,
 -0.06613942980766296,
 -0.00024496784317307174,
 -0.0010764251928776503,
 -0.010983007028698921,
 0.02577989362180233,
 0.05078735575079918,
 0.011580434627830982,
 0.015231379307806492,
 0.02313673123717308,
 0.021024614572525024,
 0.03483182191848755,
 0.04931490868330002,
 -0.04991837218403816,
 0.03963538259267807,
 0.04337684437632561,
 -0.044318243861198425,
 0.0111459419131279,
 0.04190439730882645,
 -0.015364141203463078,
 -0.0625186562538147,
 -0.00770620908588171,
 0.0017153405351564288,
 0.01626933366060257,
 -0.01339685544371605,
 0.028362711891531944,
 -0.013384786434471607,
 -0.01567794196307659,
 0.050642527639865875,
 0.005687628872692585,
 -0.011833888478577137,
 0.0149055

In [35]:
data_to_embed = df_sample["preprocess_data"].to_list()
pointstructs = []

# We use PointStruct to create a list of points to insert into the database.
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data)
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={"text": data}
        )
    )

# Took me 38sec to process 50 rows

In [36]:
pointstructs

[PointStruct(id=0, vector=[0.012753646820783615, -0.013963313773274422, 0.010360382497310638, -0.010615873150527477, -0.05038889870047569, -0.0017180403228849173, -0.03793766722083092, 0.04039871320128441, 0.004398079123347998, -0.009354065172374249, -0.027488645166158676, 0.016685064882040024, -0.03067966364324093, 0.08701261878013611, 0.018280575051903725, 0.007372713182121515, -0.03689485043287277, 0.0070442259311676025, -0.01829100213944912, 0.015162552706897259, 0.011210278607904911, 0.018718557432293892, 0.03585203364491463, 0.03234817087650299, 0.023463374003767967, 0.021075323224067688, -0.03199361264705658, -0.02686295472085476, -0.024860747158527374, 0.052099116146564484, -0.01295178197324276, -0.021283887326717377, -0.018051154911518097, -0.050764311105012894, -0.03958531841635704, -0.01920868083834648, -0.018228434026241302, 0.01648692972958088, -0.0206790529191494, -9.597171447239816e-05, 0.020981470122933388, 0.04169180616736412, 0.011001714505255222, 0.001878373324871063

In [37]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-00",
    wait=True,
    points=pointstructs
)

# Took 0.0 seconds to insert 50 rows.
# Now, we have our data as vectors in the database.
# We can now query the database.
# We can use the `qdrant_client.search` method to search the database.

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
def retrieve_data(query):
    query_embedding = get_embedding(query)
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-00",
        query=query_embedding,
        limit=10
    )
    return results

In [14]:
retrieve_data('Best earphones for gaming')

QueryResponse(points=[ScoredPoint(id=21, version=0, score=0.53769386, payload={'text': "RUSAM GA33 A Enhanced Edition TWS Game Earbuds True Wireless Stereo Headset Low Latency Bluetooth 5.2 Headphones Wireless Deep Bass Touch Control Earbuds HD Physical Noise Cancellation Earphones,White 【Advanced Bluetooth 5.2 Technology】RUSAM GA33 A built-in Bluetooth 5.2 chip,stable connection within 10 meters, Even if you cover your ears with your hands, you won't be disconnected,smooth like a wired headset. 【Game/Music Mode】The game/music mode can be switched by key operation to meet the needs of different usage scenarios.In music mode,13mm Dual Coil Dynamic Drivers，strong bass effect,heavy and low frequency all around the ear, shock more penetrating. 【Low Latency Game Mode】45ms low latency,there is a sound effect adjustment function in the game mode,and the subtle sound can also be captured to clearly distinguish the enemy's position. 【Long battery life】Built-in 40MAH battery in the headset, 400M

In [16]:
retrieve_data('Best earphones for gaming').points

[ScoredPoint(id=21, version=0, score=0.53769386, payload={'text': "RUSAM GA33 A Enhanced Edition TWS Game Earbuds True Wireless Stereo Headset Low Latency Bluetooth 5.2 Headphones Wireless Deep Bass Touch Control Earbuds HD Physical Noise Cancellation Earphones,White 【Advanced Bluetooth 5.2 Technology】RUSAM GA33 A built-in Bluetooth 5.2 chip,stable connection within 10 meters, Even if you cover your ears with your hands, you won't be disconnected,smooth like a wired headset. 【Game/Music Mode】The game/music mode can be switched by key operation to meet the needs of different usage scenarios.In music mode,13mm Dual Coil Dynamic Drivers，strong bass effect,heavy and low frequency all around the ear, shock more penetrating. 【Low Latency Game Mode】45ms low latency,there is a sound effect adjustment function in the game mode,and the subtle sound can also be captured to clearly distinguish the enemy's position. 【Long battery life】Built-in 40MAH battery in the headset, 400MAH battery in the cha

In [17]:
retrieve_data('Best earphones for gaming').points[0].payload['text']

"RUSAM GA33 A Enhanced Edition TWS Game Earbuds True Wireless Stereo Headset Low Latency Bluetooth 5.2 Headphones Wireless Deep Bass Touch Control Earbuds HD Physical Noise Cancellation Earphones,White 【Advanced Bluetooth 5.2 Technology】RUSAM GA33 A built-in Bluetooth 5.2 chip,stable connection within 10 meters, Even if you cover your ears with your hands, you won't be disconnected,smooth like a wired headset. 【Game/Music Mode】The game/music mode can be switched by key operation to meet the needs of different usage scenarios.In music mode,13mm Dual Coil Dynamic Drivers，strong bass effect,heavy and low frequency all around the ear, shock more penetrating. 【Low Latency Game Mode】45ms low latency,there is a sound effect adjustment function in the game mode,and the subtle sound can also be captured to clearly distinguish the enemy's position. 【Long battery life】Built-in 40MAH battery in the headset, 400MAH battery in the charging box, can listen to music continuously for 3-4 hours, and the

# Creating and using synthetic data

In [3]:
qdrant_client = QdrantClient(url="http://localhost:6333")

qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [10]:
# Subsample the data once more, only 50 rows.
df_sample = df_items.sample(n=50, random_state=25)

In [12]:
df_sample.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author',
       'preprocess_data'],
      dtype='object')

In [13]:
# Start calculating some embeddings on the text data. Create a function to calculate the embeddings.
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

data_to_embed = df_sample["preprocess_data"].to_list()
pointstructs = []

# We use PointStruct to create a list of points to insert into the database.
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data)
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={"text": data}
        )
    )

qdrant_client.upsert(
    collection_name="Amazon-items-collection-02",
    wait=True,
    points=pointstructs
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [14]:
import json

output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "Suggested question."
            },
            # Indexes and not text chunks because LLMs will probably hallucinate the text chunks.
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "description": "Index of the chunk that could be used to answer the question."
                },
            },
            "answer_example": {
                "type": "string",
                "description": "Suggested answer grounded in the context."
            },
            # They respond with better accuracy when they have a reason.
            "reasoning": {
                "type": "string",
                "description": "Reasoning why the question could be answered with the chunks"
            }
        }
    }
}

SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output, I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use a single chunk.
Also, include 5 answers that can't be answered within the context of the chunks.

<OUTPUT JSON SCHEMA>
{json.dumps(output_schema,indent=2)}
</OUTPUT JSON SCHEMA>

I need to be able to parse the JSON output.
"""

USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{[{"id": i, "text": data} for i, data in enumerate(data_to_embed)]}
"""

In [15]:
print(SYSTEM_PROMPT)


I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output, I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use a single chunk.
Also, include 5 answers that can't be answered within the context of the chunks.

<OUTPUT JSON SCHEMA>
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "question": {
        "type": "string",

In [16]:
print(USER_PROMPT)



Here is the list of chunks, each list element is a dictionary with id and text:
[{'id': 0, 'text': 'Bluetooth Car Adapter, LDNIO Bluetooth FM Transmitter for Car, 43W PD&QC 3.0 Three USB Port Car Bluetooth Adapter with LED Display, Hands-Free Calling, and AUX Input for All Smartphones Audio Player Fast Charging Type C Multi Ports: USB Type C Durable Fast Connect & Clear Bluetooth Sound 3 in 1 Value '}, {'id': 1, 'text': 'Bluetooth Multi-Device Keyboard, Dual Channel Universal Rechargeable Wireless Keyboard with Integrated Stand for iPad Smartphone Tablet MacBook iOS Windows Android Devices - Pink 【Easy-Switch to 2 Devices】 Simply press the FN+BT1/BT2 to switch typing between 2 connected Bluetooth devices, work with your smartphone and tablet on the slot stablely. 【Type Anywhere in Comfort】0.46in thick and 11.34in long, the compact Bluetooth keyboard is small enough to tuck into your briefcase and light enough to hold in hand. Minimalist layout lets you multitask at home or on the go. 

In [17]:
response = openai.chat.completions.create(
    model="gpt-4.1",
    messages=[  
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
)

print(response.choices[0].message.content)

```json
[
  {
    "question": "Which products do you have that support Bluetooth audio connectivity for cars?",
    "chunk_ids": [0, 12, 29],
    "answer_example": "We offer several products that support Bluetooth audio connectivity for cars. These include the LDNIO Bluetooth FM Transmitter for cars (Chunk 0), the Corehan 10 inch Touchscreen Car Stereo with wireless Apple CarPlay and Android Auto (Chunk 12), and the Wireless CarPlay Adapter for Factory Wired CarPlay (Chunk 29).",
    "reasoning": "Chunks 0, 12, and 29 all feature Bluetooth solutions aimed at car audio or infotainment systems."
  },
  {
    "question": "Do you have any keyboards that can connect to multiple devices at once?",
    "chunk_ids": [1, 25, 28],
    "answer_example": "Yes, we have several keyboards that connect to multiple devices: the Bluetooth Multi-Device Keyboard (Chunk 1), the Nasuque Bluetooth Keyboard for Mac (Chunk 25), and the Samsers Ultra Slim Rechargeable Keyboard and Mouse Combo (Chunk 28).",
    

In [21]:
json_output = response.choices[0].message.content
json_output = json_output.replace("```json", "")
json_output = json_output.replace("```", "")
json_output = json.loads(json_output)

In [22]:
json_output

[{'question': 'Which products do you have that support Bluetooth audio connectivity for cars?',
  'chunk_ids': [0, 12, 29],
  'answer_example': 'We offer several products that support Bluetooth audio connectivity for cars. These include the LDNIO Bluetooth FM Transmitter for cars (Chunk 0), the Corehan 10 inch Touchscreen Car Stereo with wireless Apple CarPlay and Android Auto (Chunk 12), and the Wireless CarPlay Adapter for Factory Wired CarPlay (Chunk 29).',
  'reasoning': 'Chunks 0, 12, and 29 all feature Bluetooth solutions aimed at car audio or infotainment systems.'},
 {'question': 'Do you have any keyboards that can connect to multiple devices at once?',
  'chunk_ids': [1, 25, 28],
  'answer_example': 'Yes, we have several keyboards that connect to multiple devices: the Bluetooth Multi-Device Keyboard (Chunk 1), the Nasuque Bluetooth Keyboard for Mac (Chunk 25), and the Samsers Ultra Slim Rechargeable Keyboard and Mouse Combo (Chunk 28).',
  'reasoning': 'All three chunks descri

In [23]:
from langsmith import Client
import os

client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

# Create a dataset stored on LangSmith.
dataset_name = "rag-evaluation-dataset"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Dataset for evaluating the RAG pipeline.",
)

In [24]:
# Now we push the examples to the dataset
for item in json_output:
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": item["chunk_ids"],
            "contexts": [
                qdrant_client.retrieve(
                    collection_name="Amazon-items-collection-02",
                    ids=item["chunk_ids"],
                    with_payload=True,
                )[0].payload["text"]
                for id in item["chunk_ids"]
        ],
    },
)