In [5]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from dotenv import load_dotenv

# Load data

In [6]:
# Load environment variables from the .env file. A later cell reads
# PRODUCT_DATA_PATH from os.environ (presumably defined in that file — confirm).
load_dotenv()

True

In [7]:
# Get (or create) the Spark session, then load the product table from the
# parquet location named by the PRODUCT_DATA_PATH environment variable.
spark = (
    SparkSession.builder
    .appName("read_data")
    .getOrCreate()
)
product_data_path = os.environ["PRODUCT_DATA_PATH"]
data = spark.read.parquet(product_data_path)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/26 20:24:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [8]:
# Inspect the column names and types of the loaded product table.
data.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: string (nullable = true)
 |-- class: string (nullable = true)
 |-- subclass: string (nullable = true)
 |-- product_description: string (nullable = true)


# Initialize vectorstore

In [9]:
from vectordb import SimilarProductVectorDB

# Settings for the similar-product vector store.
vectorstore_settings = {
    "collection_name": "cosmetics_similar_product_db",
    "distance_function": "cosine",
    # Each query returns at most the 10 most similar products.
    "n_query_result": 10,
}
vectorstore = SimilarProductVectorDB(**vectorstore_settings)

In [10]:
# Extract documents and metadata from the Spark table into a pandas frame.
#
# The whole preparation is one chain that ends in `.assign`, which returns a
# new frame. The previous version added the column to a frame obtained by
# slicing and boolean filtering, which can trigger pandas'
# SettingWithCopyWarning.
df = (
    data.toPandas()
    # TODO: presumably a temporary cap for quick experiments — confirm before
    # indexing the full table.
    .head(200)
    # TODO: rows without a brand are dropped for now.
    .dropna(subset=["brand"])
    # TODO: placeholder stock level, a random integer in [0, 100] per product.
    # No seed is set here, so values differ between runs.
    .assign(stock_level=lambda frame: np.random.randint(0, 101, size=len(frame)))
)

                                                                                

In [11]:
# Preview the first rows of the prepared frame, including the added stock_level column.
df.head()

Unnamed: 0,product_id,product_name,brand,category,class,subclass,product_description,stock_level
0,BKK023,Prisme Libre Loose Powder Set – 5 Satin Blanc,GIVENCHY BEAUTY,Makeup,Face,Powder,Givenchy draws inspiration from the delicacy o...,2
1,BKU919,Artist Liquid Matte – 301 Rust,MAKE UP FOR EVER,Makeup,Lip,Lip Gloss,MAKE UP FOR EVERs Artist Liquid Matte is a lon...,77
2,AIJ788,Loubilaque Lip Lacquer – Goldissima,CHRISTIAN LOUBOUTIN,Makeup,Lip,Lip Gloss,Drawing inspiration from Christian Louboutins ...,79
3,AIK109,Brow Sculptor – Taupe,TOM FORD,Makeup,Eye,Eyebrow,A perfectly groomed and shaped brow is the mos...,97
4,AIK118,Skin Long–Wear Weightless Compact Foundation S...,BOBBI BROWN,Makeup,Face,Foundation,Bobbi Browns Skin Long-Wear Weightless Compact...,79


# Add to vectordb
Simply use each product's description as the feature and pass it into the vectorstore.

In [12]:
# Split the frame into the three parallel inputs passed to the vectorstore:
# product ids, description texts, and one metadata dict per product.
metadata_columns = ["product_name", "brand", "category", "class", "subclass", "stock_level"]
product_ids = df["product_id"].tolist()
product_desc = df["product_description"].tolist()
product_metadatas = df[metadata_columns].to_dict(orient="records")

vectorstore.add_documents(
    product_ids=product_ids,
    documents=product_desc,
    metadatas=product_metadatas,
)

# Query

In [13]:
# small function for visualizing the result
def visualize_result(query_result):
    """Print a headline and display a similarity query result as a table.

    Parameters
    ----------
    query_result : dict
        Mapping with "ids", "distances" and "metadatas" keys. Each value is a
        list whose first element holds the per-hit entries for the query
        (ids, distances, and metadata dicts respectively). Each metadata dict
        must provide "product_name", "class", "subclass", "stock_level" and
        "brand".

    Notes
    -----
    The first hit is treated as the queried product itself, which is why the
    headline reports one fewer item than the number of hits. The input is not
    modified: each table row is built from a copy of the metadata dict.
    """
    hit_ids = query_result["ids"][0]
    hit_distances = query_result["distances"][0]
    hit_metadatas = query_result["metadatas"][0]
    query_id = hit_ids[0]

    # Merge into fresh dicts so the caller's raw result stays untouched.
    rows = [
        {**metadata, "product_id": _id, "distance": distance}
        for metadata, _id, distance in zip(hit_metadatas, hit_ids, hit_distances)
    ]
    result_df = pd.DataFrame(rows)
    result_df = result_df[["product_id", "product_name", "class", "subclass", "stock_level", "brand", "distance"]]

    # Count from the argument, not from a notebook-level `result` variable.
    print(f"For Product {query_id}, the top {len(hit_ids) - 1} most similar items are:")
    display(result_df)

In [14]:
# Pick one product id at random to use as the query. No seed is set in the
# visible cells, so the chosen id differs between runs.
random_product_id = np.random.choice(product_ids, 1)[0]
random_product_id

'AJK466'

In [15]:
# Query the vectorstore for products similar to the chosen one. The raw result
# holds parallel 'ids', 'distances' and 'metadatas' lists.
result = vectorstore.query_with_product_id(random_product_id)
result

{'ids': [['AJK466',
   'AJH310',
   'AJE072',
   'AIW587',
   'AJE069',
   'AJL162',
   'AJH480',
   'AJL158',
   'AJB964',
   'AJI576']],
 'distances': [[-1.1920928955078125e-07,
   0.420322060585022,
   0.4324754476547241,
   0.45584380626678467,
   0.4642549753189087,
   0.466641366481781,
   0.4721042513847351,
   0.48723477125167847,
   0.5467393398284912,
   0.5687351822853088]],
 'metadatas': [[{'brand': 'DIOR BEAUTY',
    'category': 'Fragrance',
    'class': 'Perfume',
    'product_name': "J'Adore Body Mist 100ml",
    'stock_level': 59,
    'subclass': 'Spray'},
   {'brand': 'DIPTYQUE',
    'category': 'Fragrance',
    'class': 'Perfume',
    'product_name': 'Tempo Eau de Parfum 75ml',
    'stock_level': 18,
    'subclass': 'Spray'},
   {'brand': 'SERGE LUTENS',
    'category': 'Fragrance',
    'class': 'Perfume',
    'product_name': 'La Religieuse 50ml',
    'stock_level': 18,
    'subclass': 'Spray'},
   {'brand': 'TOM FORD',
    'category': 'Fragrance',
    'class': 'Perfu

In [16]:
# Show the raw query result as a table.
visualize_result(result)

For Product AJK466, the top 9 most similar items are:


Unnamed: 0,product_id,product_name,class,subclass,stock_level,brand,distance
0,AJK466,J'Adore Body Mist 100ml,Perfume,Spray,59,DIOR BEAUTY,-1.192093e-07
1,AJH310,Tempo Eau de Parfum 75ml,Perfume,Spray,18,DIPTYQUE,0.4203221
2,AJE072,La Religieuse 50ml,Perfume,Spray,18,SERGE LUTENS,0.4324754
3,AIW587,Eau de Soleil Blanc 50ml,Perfume,Spray,100,TOM FORD,0.4558438
4,AJE069,L'orpheline 50ml,Perfume,Spray,27,SERGE LUTENS,0.464255
5,AJL162,Tropical Cherimoya Cologne 30ml,Perfume,Spray,55,JO MALONE LONDON,0.4666414
6,AJH480,Iris Rebelle Cologne Absolute 30ml,Perfume,Spray,1,ATELIER COLOGNE,0.4721043
7,AJL158,Plum Blossom Cologne 100ml,Perfume,Spray,55,JO MALONE LONDON,0.4872348
8,AJB964,Aqua Vitae Eau de Toilette 70ml,Perfume,Spray,26,MAISON FRANCIS KURKDJIAN,0.5467393
9,AJI576,Aqua Allegoria Passiflora 75ml,Perfume,Spray,74,GUERLAIN,0.5687352
