###  This implementation fuses Contextual Vector and Exact Keyword search

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, Prefetch, Filter, FieldCondition, MatchText, FusionQuery

import pandas as pd

In [2]:
import openai
from dotenv import load_dotenv
import os

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
df_items = pd.read_json('../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl', lines=True)

In [4]:
def preprocess_data(row):
    return f"{row['title']} {''.join(row['features'])}"

In [5]:
def extract_first_large_image(row):
    return row['images'][0].get('large', '')

In [6]:
df_items["preprocessed_data"] = df_items.apply(preprocess_data, axis=1)
df_items["first_large_image"] = df_items.apply(extract_first_large_image, axis=1)

In [7]:
df_items.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,preprocessed_data,first_large_image
0,Sports & Outdoors,Lowrance Hook2-4x Bullet Skimmer Ce Row One Size,4.2,314,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Lowrance,"[Electronics, Car & Vehicle Electronics, Marin...",{'Item Package Dimensions L x W x H': '10.63 x...,B0773K75DM,,,,Lowrance Hook2-4x Bullet Skimmer Ce Row One Size,https://m.media-amazon.com/images/I/51Vh9xbPx2...
1,Cell Phones & Accessories,"Maxjoy Airpod Pro 2 Case 2022 Crystal Clear, A...",4.2,145,[【Perfect Compatibility】: This protective case...,[],10.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'AIRSPO Clear AirPods Pro 2 Case', ...",Maxjoy,"[Electronics, Headphones, Earbuds & Accessorie...",{'Package Dimensions': '3.58 x 3.15 x 1.3 inch...,B0BJ6986CY,,,,"Maxjoy Airpod Pro 2 Case 2022 Crystal Clear, A...",https://m.media-amazon.com/images/I/41O+wOzuPS...


In [8]:
qdrant_client = QdrantClient(
    url="http://localhost:6333"
)

In [9]:
qdrant_client.create_collection(collection_name="Amazon-items-collection-01-hybrid", vectors_config=VectorParams(size=1536, distance=Distance.COSINE))

True

In [10]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [11]:
df_sample = df_items.sample(50, random_state=25)

In [12]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

In [13]:
data_to_embed = df_sample[["preprocessed_data", "first_large_image", "rating_number", "price", "average_rating"]].to_dict(orient="records")

In [14]:
pointstructs = []
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data["preprocessed_data"])
    pointstructs.append(
        PointStruct(
        id=i,
        vector=embedding,
        payload={
            "text": data["preprocessed_data"],
            "first_large_image": data["first_large_image"],
            "average_rating": data["average_rating"],
            "rating_number": data["rating_number"],
            "price": data["price"],

        }
        )
    )


In [15]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid",
    wait=True,
    points=pointstructs,
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
def retrieve_data(query, k=5):
    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-01-hybrid",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=20
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [17]:
retrieve_data("earphones").points

[ScoredPoint(id=26, version=2, score=0.75, payload={'text': 'Wireless Earbuds Headphones Bluetooth 5.3, 50H Playtime Over Ear Buds with Noise Cancelling Mic, LED Display, Stereo Bass Bluetooth Earbuds with Earhooks, IP7 Waterproof Earphones for Sports Workout Superior Clear Call and Immersive Stereo SoundWireless Earbuds,DETACHABLE EARHOOK FOR SPORTSONE BUTTON CONTROL DESIGN', 'first_large_image': 'https://m.media-amazon.com/images/I/51mrgqFpwpL._AC_.jpg', 'average_rating': 4.9, 'rating_number': 139, 'price': 59.99}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=6, version=2, score=0.6666667, payload={'text': 'WeurGhy Wireless Earbuds, Bluetooth 5.1 Headphones with HD Microphone, Deep Bass in Ear Sports Earphones with LED Display, 80 Hours of Playtime, IPX7 Waterproof Earbuds for Workout Running Powerful Deep Bass and Clear Calls: Wireless earbuds has four powerful built-in microphones to effectively remove ambient noise and make calls clearer. 10mm speaker drivers pr

In [18]:
import instructor
from pydantic import BaseModel
from openai import OpenAI

In [19]:
class RAGGenerationResponse(BaseModel): 
    answer: str

client = instructor.from_openai(OpenAI())

prompt = """
You are a helpful assistant.
Return an answer to the question.
Question: What is your name
""" 

response, raw_response = client.chat.completions.create_with_completion(
    model="gpt-4.1",
    response_model=RAGGenerationResponse,
    messages=[{"role": "user", "content": prompt}],
    temperature=0.5,
)

In [20]:
response

RAGGenerationResponse(answer="I am an AI assistant created by OpenAI. I don't have a personal name, but you can call me Assistant.")

In [21]:
raw_response

ChatCompletion(id='chatcmpl-C3HQMsnErk6qRQa9CUC2lHXABXwzV', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nxws0LOMSgpvhGdSVSrxGTpB', function=Function(arguments='{"answer":"I am an AI assistant created by OpenAI. I don\'t have a personal name, but you can call me Assistant."}', name='RAGGenerationResponse'), type='function')]))], created=1754897998, model='gpt-4.1-2025-04-14', object='chat.completion', service_tier='default', system_fingerprint='fp_799e4ca3f1', usage=CompletionUsage(completion_tokens=28, prompt_tokens=92, total_tokens=120, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [22]:
import yaml
from jinja2 import Template
from langsmith import Client

ls_client = Client()


def prompt_template_config(yaml_file, prompt_key):

    with open(yaml_file, "r") as f:
        config = yaml.safe_load(f)

    template_content = config["prompts"][prompt_key]

    template = Template(template_content)

    return template


def prompt_template_registry(prompt_name):

    template_content = ls_client.pull_prompt(prompt_name).messages[1].content

    template = Template(template_content)

    return template

In [23]:
print(ls_client.pull_prompt('rag-prompt').messages[1])

prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Context: {context}\nQuestion: {question}') additional_kwargs={}
