In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

import pandas as pd
import openai
import json
import tiktoken

### Load Amazon dataset (Items)

In [2]:
df_items = pd.read_json("../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [3]:
df_items_sample = df_items.sample(n=50, random_state=25)

### Define functions to preprocess title and features data and extract image url from the first large image in the images list

In [4]:
def preprocess_items_data(row):
    return f"{row['title']} {' '.join(row['features'])}"

def extract_first_large_image(row):
    return row['images'][0].get('large', '')

In [5]:
df_items_sample["preprocessed_data"] = df_items_sample.apply(preprocess_items_data, axis=1)
df_items_sample["first_large_image"] = df_items_sample.apply(extract_first_large_image, axis=1)

### Load Amazon dataset (reviews)

In [6]:
df_reviews = pd.read_json("../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [7]:
len(df_reviews)

96107

In [8]:
df_reviews_sample = df_reviews[df_reviews['parent_asin'].isin(df_items_sample['parent_asin'])]

In [9]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
23,5,Great monitor,[[VIDEOID:ece6af781d39ac0dc456e49c2f10f7f1]] I...,[{'small_image_url': 'https://m.media-amazon.c...,B0B6H779BG,B0C53DQ4P5,AFGDRVPCP742YM5MMLFIKZCGNNRQ,2022-09-12 18:03:58.316,0,True
29,5,Awesome Extension Cord,This cord is great ‼️ I was having trouble wit...,[],B09TT41YN6,B0BM9L7X81,AGJHJIPO33F2GHD4AOH2XDXHI2QQ,2023-02-03 16:53:40.189,0,True


### Define functions to preprocess reviews data

In [10]:
def preprocess_reviews_data(row):
    return f"{row['title']} {row['text']}"

def token_count(row, model="text-embedding-3-small"):

    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(row["preprocessed_data"]))

In [11]:
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)


In [12]:
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)


In [13]:
df_reviews_sample = df_reviews_sample[df_reviews_sample["preprocessed_data_token_count"] < 8192]

In [14]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,preprocessed_data,preprocessed_data_token_count
23,5,Great monitor,[[VIDEOID:ece6af781d39ac0dc456e49c2f10f7f1]] I...,[{'small_image_url': 'https://m.media-amazon.c...,B0B6H779BG,B0C53DQ4P5,AFGDRVPCP742YM5MMLFIKZCGNNRQ,2022-09-12 18:03:58.316,0,True,Great monitor [[VIDEOID:ece6af781d39ac0dc456e4...,62
29,5,Awesome Extension Cord,This cord is great ‼️ I was having trouble wit...,[],B09TT41YN6,B0BM9L7X81,AGJHJIPO33F2GHD4AOH2XDXHI2QQ,2023-02-03 16:53:40.189,0,True,Awesome Extension Cord This cord is great ‼️ I...,78


### Create a new Qdrant collection for items

In [15]:
qdrant_client = QdrantClient(url="http://localhost:6333")

In [16]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-items",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [17]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

### Create a new Qdrant collection for reviews

In [19]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-reviews",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [20]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embedding functions

In [21]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]
    
    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1
    
    return all_embeddings

### Embed the text data and add additional fields to the payload of each vector (items)

In [22]:
data_to_embed_items = df_items_sample[["preprocessed_data", "first_large_image", "rating_number", "price", "average_rating", "parent_asin"]].to_dict(orient="records")


In [23]:
data_to_embed_items[0]

{'preprocessed_data': 'Dangbei Mars Pro 4K Projector, 3200 ANSI Lumens DLP Projector with Android 4GB+128G, 2 * 10W HiFi Speakers, Auto Keystone Auto Focus HDR10 Home Theater True 4K 3D Projector: True 4K resolution(3840 x 2160P) and incredible 3200 ANSI Lumens, delivering ultimate color and image with support for 4K, HDR 10, and HLG. Advanced ALPD Technology: Dangbei Mars Pro Projector uses ALPD light source with a lifespan of 30000H, providing a wider color gamut, and more realistic colors. Smart Screen Adjustment: Auto focus, ±40 degrees Auto keystone correction, intelligent obstacle anvoidance and intelligent screen alignment. No space constraints, it can quickly adjust settings itself. Dual 10W Speakers, Dolby Audio&dts HD: Dolby Digital and DTS Studio Sound technology is well applied to Mars Pro, provides the ultimate shocking sound for home theater. 4G+128G ROM, Android OS: Dangbei customizable UI carries thousands of apps. The interface is cleaner, apps launch faster, and playb

In [24]:
text_to_embed_items = [data["preprocessed_data"] for data in data_to_embed_items]

In [25]:
embeddings_items = get_embeddings_batch(text_to_embed_items)

In [26]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings_items, data_to_embed_items):
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "text": data["preprocessed_data"],
                "first_large_image": data["first_large_image"],
                "average_rating": data["average_rating"],
                "rating_number": data["rating_number"],
                "price": data["price"],
                "parent_asin": data["parent_asin"],
            }
        )
    )
    i += 1

In [27]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-items",
    wait=True,
    points=pointstructs
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embed the text data and add additional fields to the payload of each vector (reviews)

In [28]:
data_to_embed_reviews = df_reviews_sample[["preprocessed_data", "parent_asin"]].to_dict(orient="records")

In [29]:
text_to_embed_reviews = [data["preprocessed_data"] for data in data_to_embed_reviews]

In [30]:
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews)

Processed 100 of 3899
Processed 200 of 3899
Processed 300 of 3899
Processed 400 of 3899
Processed 500 of 3899
Processed 600 of 3899
Processed 700 of 3899
Processed 800 of 3899
Processed 900 of 3899
Processed 1000 of 3899
Processed 1100 of 3899
Processed 1200 of 3899
Processed 1300 of 3899
Processed 1400 of 3899
Processed 1500 of 3899
Processed 1600 of 3899
Processed 1700 of 3899
Processed 1800 of 3899
Processed 1900 of 3899
Processed 2000 of 3899
Processed 2100 of 3899
Processed 2200 of 3899
Processed 2300 of 3899
Processed 2400 of 3899
Processed 2500 of 3899
Processed 2600 of 3899
Processed 2700 of 3899
Processed 2800 of 3899
Processed 2900 of 3899
Processed 3000 of 3899
Processed 3100 of 3899
Processed 3200 of 3899
Processed 3300 of 3899
Processed 3400 of 3899
Processed 3500 of 3899
Processed 3600 of 3899
Processed 3700 of 3899
Processed 3800 of 3899
Processed 3900 of 3899


In [31]:
len(embeddings_reviews)

3899

In [32]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings_reviews, data_to_embed_reviews):
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "text": data["preprocessed_data"],
                "parent_asin": data["parent_asin"],
            }
        )
    )
    i += 1

In [33]:
batch_size_qdrant = 100
counter = 1
for i in range(0, len(pointstructs), batch_size_qdrant):
    batch = pointstructs[i:i + batch_size_qdrant]
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-02-reviews",
        wait=True,
        points=batch
    )
    print(f"Processed {counter * batch_size_qdrant} of {len(pointstructs)}")
    counter += 1

Processed 100 of 3899
Processed 200 of 3899
Processed 300 of 3899
Processed 400 of 3899
Processed 500 of 3899
Processed 600 of 3899
Processed 700 of 3899
Processed 800 of 3899
Processed 900 of 3899
Processed 1000 of 3899
Processed 1100 of 3899
Processed 1200 of 3899
Processed 1300 of 3899
Processed 1400 of 3899
Processed 1500 of 3899
Processed 1600 of 3899
Processed 1700 of 3899
Processed 1800 of 3899
Processed 1900 of 3899
Processed 2000 of 3899
Processed 2100 of 3899
Processed 2200 of 3899
Processed 2300 of 3899
Processed 2400 of 3899
Processed 2500 of 3899
Processed 2600 of 3899
Processed 2700 of 3899
Processed 2800 of 3899
Processed 2900 of 3899
Processed 3000 of 3899
Processed 3100 of 3899
Processed 3200 of 3899
Processed 3300 of 3899
Processed 3400 of 3899
Processed 3500 of 3899
Processed 3600 of 3899
Processed 3700 of 3899
Processed 3800 of 3899
Processed 3900 of 3899


In [34]:
len(embeddings_reviews)

3899

### Perform hybrid search and perform rrf rank fusion on the retrieved results

In [35]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-items",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=20
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(text=query)
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [36]:
result = retrieve_data("earphones")

In [37]:
result.points

[ScoredPoint(id=27, version=4, score=0.75, payload={'text': 'Wireless Earbuds Headphones Bluetooth 5.3, 50H Playtime Over Ear Buds with Noise Cancelling Mic, LED Display, Stereo Bass Bluetooth Earbuds with Earhooks, IP7 Waterproof Earphones for Sports Workout Superior Clear Call and Immersive Stereo Sound Wireless Earbuds, DETACHABLE EARHOOK FOR SPORTS ONE BUTTON CONTROL DESIGN', 'first_large_image': 'https://m.media-amazon.com/images/I/51mrgqFpwpL._AC_.jpg', 'average_rating': 4.9, 'rating_number': 139, 'price': 59.99, 'parent_asin': 'B0C85LNB75'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=7, version=4, score=0.6666667, payload={'text': 'WeurGhy Wireless Earbuds, Bluetooth 5.1 Headphones with HD Microphone, Deep Bass in Ear Sports Earphones with LED Display, 80 Hours of Playtime, IPX7 Waterproof Earbuds for Workout Running Powerful Deep Bass and Clear Calls: Wireless earbuds has four powerful built-in microphones to effectively remove ambient noise and make calls 

In [38]:
parent_asins = []
for data in result.points:
    parent_asins.append(data.payload["parent_asin"])

In [39]:
parent_asins

['B0C85LNB75', 'B09NZNP8Z3', 'B0B658CT5S', 'B09W5DB49K', 'B0BRMV85ZK']

### A function to run search agains reviews on a prefiltered set of product IDs

In [40]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-reviews",
        prefetch=[
            Prefetch(
                query=query_embedding,
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="parent_asin",
                            match=MatchAny(
                                any=parent_asins
                            )
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    return results

In [41]:
reviews = retrieve_prefiltered_reviews_data("negative review", ['B0C85LNB75', 'B09NZNP8Z3', 'B0B658CT5S', 'B09W5DB49K', 'B0BRMV85ZK'], k=5)

In [42]:
reviews.points

[ScoredPoint(id=508, version=7, score=0.5, payload={'text': 'Poor Not worth the purchase', 'parent_asin': 'B0BRMV85ZK'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3544, version=37, score=0.33333334, payload={'text': 'look for a better item two issues, the lights for charging or charged on mine are barely visible, the voice and  script  for "device connected, poor English and heavy accented.', 'parent_asin': 'B0BRMV85ZK'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=2662, version=28, score=0.25, payload={'text': 'I don’t write reviews. I am an audiophile. These are great. I just ordered a second pair. I love these.', 'parent_asin': 'B0B658CT5S'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3653, version=38, score=0.2, payload={'text': "Don't by No", 'parent_asin': 'B09NZNP8Z3'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=1252, version=14, score=0.16666667, payload={'text': 'Not in the ear I wanted somethi