## Dependencies

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams,
    Distance,
    PayloadSchemaType,
    PointStruct,
    SparseVectorParams,
    Document,
    Prefetch,
    FusionQuery,
)
from qdrant_client import models

import pandas as pd
import openai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
qdrant_client = QdrantClient(url="http://localhost:6333")

### Create a Qdrant Collection for Hybrid Search

In [3]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-01-hybrid-search",
    vectors_config={
        "text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE),
    },
    sparse_vectors_config={"bm25": SparseVectorParams(modifier=models.Modifier.IDF)},
)

True

In [4]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid-search",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

### Define Embedding Functions

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [6]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]

    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i : i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1

    return all_embeddings

### Process and embed Amazon items data

In [7]:
df_items = pd.read_json(
    "../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)

In [8]:
def preprocess_description(row):
    return f"{row['title']} {' '.join(row['features'])}"


def extract_first_large_image(row):
    return row["images"][0].get("large", "")


df_items["description"] = df_items.apply(preprocess_description, axis=1)
df_items["image"] = df_items.apply(extract_first_large_image, axis=1)

In [9]:
df_sample = df_items.sample(n=500, random_state=42)

In [10]:
data_to_embed = df_sample[
    ["description", "image", "rating_number", "price", "average_rating", "parent_asin"]
].to_dict(orient="records")

In [11]:
text_to_embed = [data["description"] for data in data_to_embed]

In [12]:
text_to_embed

['KEEPRO Pencil 2nd Generation for iPad, Magnetic Wireless Charge Tilt Sensitivity Palm Rejection Active Pen for Apple iPad Pro 11" 4/3/2/1, iPad Pro 12.9" 6/5/4/3, iPad Air 4/5, iPad Mini 6 [Compatibility]- ONLY compatible with iPad mini (6th generation), iPad Air (4th and 5th generation), iPad Pro 12.9-inch (3rd, 4th, 5th and 6th generation), iPad Pro 11-inch (1st, 2nd, 3rd and 4th generation), check and confirm your device before place the order (Note: If the pen doesn\'t charge, fully charge your iPad first then try charging the pen again) [Charging and Pairs Magnetically]- Charges wirelessly, attaches and pairs magnetically to the compatible iPad, this pen is a preferable alternative to the Apple Pencil 2nd Generation [Tilt Sensitivity & Pixel Precision]- Pixel-perfect precision and industry-leading low latency with tilt sensitivity making drawing, sketching, coloring, taking notes, and marking up PDFs, as easy and natural as a real pencil [Native Palm Rejection]- Rest your palm o

In [13]:
embeddings = get_embeddings_batch(text_to_embed)

Processed 100 of 500
Processed 200 of 500
Processed 300 of 500
Processed 400 of 500
Processed 500 of 500


In [14]:
points_structs = []

for i, (embedding, data) in enumerate(zip(embeddings, data_to_embed), 1):
    points_structs.append(
        PointStruct(
            id=i,
            vector={
                "text-embedding-3-small": embedding,
                "bm25": Document(text=data["description"], model="qdrant/bm25"),
            },
            payload=data,
        )
    )

In [15]:
points_structs[0]

PointStruct(id=1, vector={'text-embedding-3-small': [0.010869191028177738, -0.014954255893826485, 0.011192244477570057, -0.011265192180871964, -0.05060478672385216, -0.0003106785879936069, -0.038787275552749634, 0.04174686223268509, 0.0026130869518965483, -0.01020745187997818, -0.02903314121067524, 0.014506149105727673, -0.03105483017861843, 0.08687015622854233, 0.01433941163122654, 0.004861435852944851, -0.03586937114596367, 0.009087185375392437, -0.01874753087759018, 0.01695510372519493, 0.010098030790686607, 0.017246894538402557, 0.034014418721199036, 0.03065882995724678, 0.021738382056355476, 0.02112353779375553, -0.03311820700764656, -0.026427870616316795, -0.02307227998971939, 0.05206373706459999, -0.013255619443953037, -0.020269008353352547, -0.019477006047964096, -0.04881235957145691, -0.04114244133234024, -0.01925816386938095, -0.01727815717458725, 0.015516994521021843, -0.018059739843010902, 0.0006089821108616889, 0.02403102070093155, 0.040016964077949524, 0.00806591939181089

In [16]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search", points=points_structs
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Hybrid Retrieval

In [19]:
def retrieve_data(query, qdrant_client, k=5):
    query_embedding = get_embedding(query)
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-01-hybrid-search",
        prefetch=[
            Prefetch(query=query_embedding, limit=20, using="text-embedding-3-small"),
            Prefetch(
                query=Document(text=query, model="qdrant/bm25"),
                using="bm25",
                limit=20,
            ),
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )
    retrieved_context_ids = []
    retrieved_context = []
    retrieved_context_ratings = []
    similarity_scores = []

    for result in results.points:
        retrieved_context_ids.append(result.payload["parent_asin"])
        retrieved_context.append(result.payload["description"])
        retrieved_context_ratings.append(result.payload["average_rating"])
        similarity_scores.append(result.score)

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "retrieved_context_ratings": retrieved_context_ratings,
        "similarity_scores": similarity_scores,
    }

In [20]:
results = retrieve_data("can i get some earphones?", qdrant_client=qdrant_client, k=5)

In [21]:
results

{'retrieved_context_ids': ['B09YMRF553',
  'B09T7YT8HC',
  'B09VB5M3L5',
  'B0C34FJG3P',
  'B0C13NF5JC'],
 'retrieved_context': ["EUQQ Wireless Earbuds Bluetooth 5.1 True Wireless Earbuds,with Microphone Wireless Earphones in-Ear,Ear Phone Wireless Earbuds,with Charging Case audifonos Bluetooth inalambricos 【Advanced Bluetooth 5.1&Bass Sound】Equipped with latest bluetooth 5.1 technology,EUQQ true wireless earbuds has more stable connection, long transmission range with low power consumption.The bluetooth earphones also support HSP, HFP, A2DP, AVRCP, which greatly improve the transmission speed and providing you with a low-latency listening experience when you listening to music,watch videos or playing games. 【Hi-Fi Stereo Sound Quality】 Face of one pair of Hi-Fi bluetooth earphones, heard the backdrop of real, rich bass, tenor out so melodious, exquisitely carved treble then so ethereal and elegant. Each earbud has a build-in microphoneand CVC8.0 noise reduction which can make others h