# Import Dependencies

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams,
    Distance,
    PayloadSchemaType,
    PointStruct,
    MatchAny,
    FieldCondition,
    Filter,
    Prefetch,
    FusionQuery,
)

import pandas as pd
import numpy as np
import openai
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


## Retrieve item ids

In [4]:
qdrant_client = QdrantClient(url="http://localhost:6333")

In [3]:
dummy_vector = np.zeros(1536).tolist()
COLLECTION = "Amazon-items-collection-01-hybrid-search"

payload = qdrant_client.query_points(
    collection_name=COLLECTION,
    query=dummy_vector,
    using="text-embedding-3-small",
    limit=1000,
    with_payload=[
        "parent_asin",
    ],
    with_vectors=False,
)

In [6]:
parent_asin_list = [item.payload["parent_asin"] for item in payload.points]

In [8]:
len(parent_asin_list)

500

In [9]:
!ls ../../data

Electornics_2022_2023_with_category_ratings_100.jsonl
Electornics_2022_2023_with_category_ratings_100_sample_1000.jsonl
meta_Electronics_2022_2023.jsonl
meta_Electronics_2022_2023_no_category.jsonl
meta_Electronics_2022_2023_no_date.jsonl
meta_Electronics_2022_2023_with_category.jsonl
meta_Electronics_2022_2023_with_category_ratings_100.jsonl
meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl


In [11]:
df_reviews = pd.read_json(
    "../../data/Electornics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)

In [13]:
df_reviews.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5,Perfect!,This is perfect! Thank you so much!!! I absolu...,[],B09992M2LX,B09ZPV8WBV,AHX4XWVVQUKT3FCNWCVASDF4Q56Q,2022-08-05 04:06:39.589,0,True
1,5,3ft mini usb cables,I don't have many things that still use a mini...,[],B09Y94B2NM,B09Y95BMKX,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2022-07-16 16:03:28.714,3,True
2,5,I would buy it again.,Great product. Worked well for what we needed ...,[],B07T55DL33,B0B2JWCMCY,AF5KFHNT3TQJ2GNSE3FCDFQOBICA,2019-12-09 22:35:00.531,0,True
3,5,Great to Have Around,My husband and I were recently working a booth...,[],B09M89JN7B,B0BYYGZHG5,AHV6QCNBJNSGLATP56JAWJ3C4G2A,2022-03-22 01:43:49.342,0,False
4,5,Easy to use,Work as advertised and at a very good price.,[],B07T55DL33,B0B2JWCMCY,AG7WKTZINOFIXMZJYIPKIB7PV7NQ,2019-12-28 06:12:24.960,0,True


In [14]:
df_reviews.shape

(105918, 10)

In [17]:
df_reviews_sample = df_reviews[df_reviews["parent_asin"].isin(parent_asin_list)]

In [18]:
df_reviews_sample.shape

(45948, 10)

## Define Functionsto preprocess data

In [24]:
def preprocess_review_data(row):
    return f"{row['title']} - {row['text']}"

In [20]:
def token_count(row, model="text-embedding-3-small"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(row["preprocessed_data"]))

In [25]:
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(
    preprocess_review_data, axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_review_data, axis=1)


In [29]:
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(
    token_count, axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)


In [30]:
df_reviews_sample.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,preprocessed_data,token_count,preprocessed_data_token_count
0,5,Perfect!,This is perfect! Thank you so much!!! I absolu...,[],B09992M2LX,B09ZPV8WBV,AHX4XWVVQUKT3FCNWCVASDF4Q56Q,2022-08-05 04:06:39.589,0,True,Perfect! - This is perfect! Thank you so much!...,22,22
1,5,3ft mini usb cables,I don't have many things that still use a mini...,[],B09Y94B2NM,B09Y95BMKX,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2022-07-16 16:03:28.714,3,True,3ft mini usb cables - I don't have many things...,115,115
10,5,Great privacy screen!,I've tried a few different privacy screens for...,[],B0B7LGQ836,B0C65RMTNV,AHUPTBY3F3UN2S5H7K5JLP6MAV5Q,2023-01-20 23:08:45.699,0,False,Great privacy screen! - I've tried a few diffe...,90,90
11,1,Didn't work,Doesn't pair with any device.,[],B0B979SXNT,B0BGRL2618,AHHFW36BP4VMQWC6V2NTKIXFAA2A,2023-01-02 01:29:08.702,0,True,Didn't work - Doesn't pair with any device.,12,12
12,4,Sony's Midrange ANC Buds,I have worked a lot with active noise cancelin...,[],B09YL76VSR,B0BJS6CXDN,AFLX66DKF6R3H6OEOC3TIVAYXZIQ,2022-06-26 10:58:20.816,0,False,Sony's Midrange ANC Buds - I have worked a lot...,479,479


In [28]:
df_reviews_sample.shape

(45948, 12)

In [31]:
df_reviews_sample = df_reviews_sample[
    df_reviews_sample["preprocessed_data_token_count"] < 8192
]

In [32]:
df_reviews_sample.shape

(45948, 13)

In [33]:
df_reviews_sample["preprocessed_data_token_count"].sum()

np.int64(2885879)

## Create new data collection for Reviews

In [9]:
COLLECTION_REVIEWS = "Amazon-items-collection-01-reviews"

In [None]:
qdrant_client.create_collection(
    collection_name=COLLECTION_REVIEWS,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

In [37]:
qdrant_client.create_payload_index(
    collection_name=COLLECTION_REVIEWS,
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

## Embedding Functions

In [7]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [39]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]

    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i : i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1

    return all_embeddings

## Embed the text and add additional payload for each reviews vector

In [42]:
data_to_embed = df_reviews_sample[["preprocessed_data", "parent_asin"]].to_dict(
    orient="records"
)

In [43]:
data_to_embed[:10]

[{'preprocessed_data': 'Perfect! - This is perfect! Thank you so much!!! I absolutely love it!!! It’s great quality!!!',
  'parent_asin': 'B09ZPV8WBV'},
 {'preprocessed_data': "3ft mini usb cables - I don't have many things that still use a mini USB charger cable, but I after a while you have to replace the charger cables.  These seem to work well and it was noted in the ad that they are reinforced at the head area to prolong life.  We shall see - time will tell.  I never hesitate to update my reviews should new info seem useful.  I do not accept any discounts or deals that are not available to all shoppers. And my reviews are based purely on my personal experience with each item I review.",
  'parent_asin': 'B09Y95BMKX'},
 {'preprocessed_data': 'Great privacy screen! - I\'ve tried a few different privacy screens for my 2018 13" Macbook Pro and this is my favorite. It provides good privacy and I appreciate that it has a cover for the webcam as well.<br /><br />I also really like that y

In [44]:
text_to_embed_reviews = [item["preprocessed_data"] for item in data_to_embed]
embedding_reviews = get_embeddings_batch(text_to_embed_reviews, batch_size=500)

Processed 500 of 45948
Processed 1000 of 45948
Processed 1500 of 45948
Processed 2000 of 45948
Processed 2500 of 45948
Processed 3000 of 45948
Processed 3500 of 45948
Processed 4000 of 45948
Processed 4500 of 45948
Processed 5000 of 45948
Processed 5500 of 45948
Processed 6000 of 45948
Processed 6500 of 45948
Processed 7000 of 45948
Processed 7500 of 45948
Processed 8000 of 45948
Processed 8500 of 45948
Processed 9000 of 45948
Processed 9500 of 45948
Processed 10000 of 45948
Processed 10500 of 45948
Processed 11000 of 45948
Processed 11500 of 45948
Processed 12000 of 45948
Processed 12500 of 45948
Processed 13000 of 45948
Processed 13500 of 45948
Processed 14000 of 45948
Processed 14500 of 45948
Processed 15000 of 45948
Processed 15500 of 45948
Processed 16000 of 45948
Processed 16500 of 45948
Processed 17000 of 45948
Processed 17500 of 45948
Processed 18000 of 45948
Processed 18500 of 45948
Processed 19000 of 45948
Processed 19500 of 45948
Processed 20000 of 45948
Processed 20500 of 4

In [45]:
pointstructs = []

for i, (embedding, data) in enumerate(zip(embedding_reviews, data_to_embed), 1):
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "parent_asin": data["parent_asin"],
                "text": data["preprocessed_data"],
            },
        )
    )

In [46]:
batch_size_qdrant = 100
counter = 1

for i in range(0, len(text_to_embed_reviews), batch_size_qdrant):
    batch = pointstructs[i : i + batch_size_qdrant]
    qdrant_client.upsert(
        collection_name=COLLECTION_REVIEWS,
        wait=True,
        points=batch,
    )
    print(f"Processed {counter * batch_size_qdrant} of {len(pointstructs)}")
    counter += 1

Processed 100 of 45948
Processed 200 of 45948
Processed 300 of 45948
Processed 400 of 45948
Processed 500 of 45948
Processed 600 of 45948
Processed 700 of 45948
Processed 800 of 45948
Processed 900 of 45948
Processed 1000 of 45948
Processed 1100 of 45948
Processed 1200 of 45948
Processed 1300 of 45948
Processed 1400 of 45948
Processed 1500 of 45948
Processed 1600 of 45948
Processed 1700 of 45948
Processed 1800 of 45948
Processed 1900 of 45948
Processed 2000 of 45948
Processed 2100 of 45948
Processed 2200 of 45948
Processed 2300 of 45948
Processed 2400 of 45948
Processed 2500 of 45948
Processed 2600 of 45948
Processed 2700 of 45948
Processed 2800 of 45948
Processed 2900 of 45948
Processed 3000 of 45948
Processed 3100 of 45948
Processed 3200 of 45948
Processed 3300 of 45948
Processed 3400 of 45948
Processed 3500 of 45948
Processed 3600 of 45948
Processed 3700 of 45948
Processed 3800 of 45948
Processed 3900 of 45948
Processed 4000 of 45948
Processed 4100 of 45948
Processed 4200 of 45948
P

## Function to Run Search against reviews on prefiltered set of product ids

In [5]:
def retrieve_prefiltered_reviews(query: str, parent_asins: list[str], k=5):
    query_embedding = get_embedding(query)
    results = qdrant_client.query_points(
        collection_name=COLLECTION_REVIEWS,
        prefetch=[
            Prefetch(
                query=query_embedding,
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="parent_asin", match=MatchAny(any=parent_asins)
                        )
                    ]
                ),
                limit=20,
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )
    return results

In [10]:
reviews = retrieve_prefiltered_reviews("bad quality", ["B09WCFC5D9"])

In [11]:
reviews

QueryResponse(points=[ScoredPoint(id=16121, version=163, score=0.5, payload={'parent_asin': 'B09WCFC5D9', 'text': 'Not good. - I can’t find the return item area. Not good sound.'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=25017, version=252, score=0.33333334, payload={'parent_asin': 'B09WCFC5D9', 'text': 'Do not buy thes unless you only want them for phone calls. There is zero music quality - They claim These buds to have deep bass, are enhanced, however there is zero bass reproduction. The voices singing or phone call sound horrible. The Bluetooth connect when you aren’t using them. The lid is not easy to open. The only reason I didn’t give one star is because people on the other end of the phone call can hear me great. Yet they do not on this end. But it will work. Music or app are always dropping or stopping. I don’t know why. Do not buy these earbuds.'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=7950, version=81, score=0.25, payload={'pare

In [13]:
[point.payload["text"] for point in reviews.points]

['Not good. - I can’t find the return item area. Not good sound.',
 'Do not buy thes unless you only want them for phone calls. There is zero music quality - They claim These buds to have deep bass, are enhanced, however there is zero bass reproduction. The voices singing or phone call sound horrible. The Bluetooth connect when you aren’t using them. The lid is not easy to open. The only reason I didn’t give one star is because people on the other end of the phone call can hear me great. Yet they do not on this end. But it will work. Music or app are always dropping or stopping. I don’t know why. Do not buy these earbuds.',
 'Looks pretty cool but pretty average sound - Bass is not very strong, very treble biased<br />it has taken my earring off twice now<br /> not really its fault but the magnet it uses to attach itself to the charger is good at that<br />You get a mini RGB light show when you open the case.<br /><br />Update: microphone sucks, will return',
 'Nice for the price - Goo

In [14]:
reviews = retrieve_prefiltered_reviews(
    "bad quality", ["B09WCFC5D9", "B0CF1WM24K"], k=20
)

In [16]:
reviews.points

[ScoredPoint(id=16121, version=163, score=0.5, payload={'parent_asin': 'B09WCFC5D9', 'text': 'Not good. - I can’t find the return item area. Not good sound.'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=25017, version=252, score=0.33333334, payload={'parent_asin': 'B09WCFC5D9', 'text': 'Do not buy thes unless you only want them for phone calls. There is zero music quality - They claim These buds to have deep bass, are enhanced, however there is zero bass reproduction. The voices singing or phone call sound horrible. The Bluetooth connect when you aren’t using them. The lid is not easy to open. The only reason I didn’t give one star is because people on the other end of the phone call can hear me great. Yet they do not on this end. But it will work. Music or app are always dropping or stopping. I don’t know why. Do not buy these earbuds.'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=7950, version=81, score=0.25, payload={'parent_asin': 'B09WCFC5

In [19]:
for review in reviews.points:
    print("-" * 50)
    print(review.payload["parent_asin"])
    print(review.payload["text"])
    print("-" * 50)

--------------------------------------------------
B09WCFC5D9
Not good. - I can’t find the return item area. Not good sound.
--------------------------------------------------
--------------------------------------------------
B09WCFC5D9
Do not buy thes unless you only want them for phone calls. There is zero music quality - They claim These buds to have deep bass, are enhanced, however there is zero bass reproduction. The voices singing or phone call sound horrible. The Bluetooth connect when you aren’t using them. The lid is not easy to open. The only reason I didn’t give one star is because people on the other end of the phone call can hear me great. Yet they do not on this end. But it will work. Music or app are always dropping or stopping. I don’t know why. Do not buy these earbuds.
--------------------------------------------------
--------------------------------------------------
B09WCFC5D9
Looks pretty cool but pretty average sound - Bass is not very strong, very treble biased