In [15]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import pandas as pd


In [20]:
qdrant_client = QdrantClient(
    url="http://localhost:6333"
)

qdrant_client.create_collection(
    collection_name="Amazon-items-collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE))

True

In [3]:
# df_ratings_100 = pd.read_json('C:/CODE/ai_engineering_bootcamp_data/meta_Electronics_2022_2023_with_category_ratings_100.jsonl', lines=True)
df_items = pd.read_json('C:/CODE/ai_engineering_bootcamp_data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl', lines=True)

# df_items_100 = pd.read_json("C:/CODE/ai_engineering_bootcamp_data/Electronics_2022_2023_with_category_ratings_100.jsonl", lines=True)
# df_items_sample_1000 = pd.read_json("C:/CODE/ai_engineering_bootcamp_data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

### Concatenate title and featues

In [4]:
def preprocess_data(row):
    return f"{row['title']} {''.join(row['features'])}"

In [5]:
df_items["preprocessed_data"] = df_items.apply(preprocess_data, axis=1)

In [6]:
df_items.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,preprocessed_data
0,Sports & Outdoors,Lowrance Hook2-4x Bullet Skimmer Ce Row One Size,4.2,314,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Lowrance,"[Electronics, Car & Vehicle Electronics, Marin...",{'Item Package Dimensions L x W x H': '10.63 x...,B0773K75DM,,,,Lowrance Hook2-4x Bullet Skimmer Ce Row One Size
1,Cell Phones & Accessories,"Maxjoy Airpod Pro 2 Case 2022 Crystal Clear, A...",4.2,145,[【Perfect Compatibility】: This protective case...,[],10.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'AIRSPO Clear AirPods Pro 2 Case', ...",Maxjoy,"[Electronics, Headphones, Earbuds & Accessorie...",{'Package Dimensions': '3.58 x 3.15 x 1.3 inch...,B0BJ6986CY,,,,"Maxjoy Airpod Pro 2 Case 2022 Crystal Clear, A..."


### Sample 50 items 

In [7]:
df_sample = df_items.sample(50, random_state=42)

In [8]:
import openai

### Embedding function

In [9]:
from dotenv import load_dotenv
import os

load_dotenv()
openai.api_key = os.getenv("OPEN_AI_KEY")

In [11]:
def get_embeddings(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

### Embed data

In [12]:
get_embeddings("Sample text")

[0.01777815632522106,
 0.010169903747737408,
 0.01451090257614851,
 -0.012041288428008556,
 -0.023929184302687645,
 -0.05565069243311882,
 0.010814150795340538,
 -0.01089851651340723,
 0.02748788334429264,
 0.000663900631479919,
 0.03313271701335907,
 -0.03972858190536499,
 -0.05488373339176178,
 0.014894383028149605,
 0.05393270030617714,
 0.04656987637281418,
 -0.012869606725871563,
 0.005272857379168272,
 -0.02572387270629406,
 0.05405541509389877,
 0.015891432762145996,
 0.022395262494683266,
 -0.004199111834168434,
 0.015270194038748741,
 0.01696517877280712,
 -0.0365380235016346,
 -0.026644226163625717,
 -0.004923889879137278,
 0.07197162508964539,
 -0.04862533137202263,
 -0.004747489001601934,
 -0.04672326520085335,
 0.0002330842544324696,
 -0.03212032839655876,
 -0.015707362443208694,
 0.07908902317285538,
 0.039176370948553085,
 -0.012900284491479397,
 -0.0038424748927354813,
 -0.05985363945364952,
 0.013728602789342403,
 -0.025892604142427444,
 0.028622986748814583,
 0.037795

In [17]:
data_to_emded = df_sample["preprocessed_data"].tolist()
pointstructs = []
for i, data in enumerate(data_to_emded):
    embedding = get_embeddings(data)
    pointstructs.append(
        PointStruct(
        id=i,
        vector=embedding,
        payload={"text": data}
    )
)

In [18]:
pointstructs

[PointStruct(id=0, vector=[0.019847752526402473, 0.01610613986849785, -0.0022094871383160353, 0.01123559195548296, -0.05178048834204674, -0.07272492349147797, -0.01927790977060795, 0.007364957127720118, 0.01028943620622158, 0.009681962430477142, -0.020084291696548462, 0.027589023113250732, -0.04928607866168022, -0.02197660319507122, 0.008348743431270123, 0.02666437067091465, -0.02311628870666027, 0.007477850653231144, -0.04251246899366379, 0.00533018633723259, 0.014643900096416473, 0.06519868969917297, 0.03214776888489723, -0.0005009649321436882, -0.028212623670697212, -0.028212623670697212, 0.005703810136765242, 0.031803712248802185, 0.04105022922158241, -0.003276599571108818, 0.02769654057919979, -0.03352399542927742, 0.006418802309781313, -0.013998794369399548, -0.007235935889184475, 0.01777266338467598, 0.0038437549956142902, -0.007155297789722681, 0.023933423683047295, -0.014310595579445362, 0.027115944772958755, 0.043351106345653534, 0.0339970737695694, -0.05724238231778145, 0.00

### Write embedded data to Qdrant

In [21]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection",
    wait=True,
    points=pointstructs
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

### Function for data retrieval

In [22]:
def retrieve_data(query):
    query_embedding = get_embeddings(query)
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection",
        query=query_embedding,
        limit=10
    )
    return results

In [23]:
retrieve_data("What earphones can I get?").points

[ScoredPoint(id=11, version=0, score=0.5330224, payload={'text': 'Wireless Earbuds Bluetooth 5.3 Headphones with 4-Mics Clear Call and ENC Noise Cancelling, Bluetooth Earbuds Touch Control Stereo Sound with LED Display, Waterproof Running Headphones for Workout Enjoy Bluetooth 5.3 and Fast Auto Pairing: Latest Bluetooth 5.3 technology achieves lower latency and stronger anti-interference. MD058A true bluetooth headphone offer audio delays much lower than 65ms. Feature reduces the number of disconnections of the ear buds in complex environments(places with many people, such as subways and buses). Adopting hall switch, after the first connection, it will connect with your device as soon as the charging case opens.4-Mic and ENC Call Noise Reduction: Each headset has 2 built-in microphones, equipped with Environmental Noise Reduction (ENC) technology that can suppress 80% of the interfering background noise in the calling environment, intelligently enhances human voicethrough the four micr