In [1]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from dotenv import load_dotenv

from vectordb import SimilarProductVectorDB

load_dotenv()

True

# Load data

In [2]:
# read product data
spark = SparkSession.builder.appName("read_data").getOrCreate()
# data cannot be provided according to company policy
# please load ur data here to try!
data = spark.read.parquet(os.environ["PRODUCT_DATA_PATH"])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/02 14:50:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
data.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: string (nullable = true)
 |-- class: string (nullable = true)
 |-- subclass: string (nullable = true)
 |-- product_description: string (nullable = true)


# Initialize vectorstore

In [4]:
vectorstore = SimilarProductVectorDB(
    collection_name="cosmetics_similar_product_db",
    distance_function="ip",
    n_query_result=11,  # show at most top 10 most similar product in result + first result should be itself
)

In [5]:
# extract documents and metadata from table
df = data.toPandas()
# adding random stock level
df["stock_level"] = np.random.randint(0, 101, size=len(df))

                                                                                

In [6]:
df.head()

Unnamed: 0,product_id,product_name,brand,category,class,subclass,product_description,stock_level
0,KLK169,Luminous Deep Hydration Lifting Mask,TATCHA,Skincare,Masks & Treatments,Sheet Mask,Boosting moisture levels up to 200% in 15 minu...,8
1,LGY307,Jour d'Hermès Absolu Eau de Parfum 50ml,HERMÈS,Fragrance,Perfume,Spray,I wanted to express the essence of femininity ...,60
2,LGY220,Jour d'Hermès Absolu Eau de Parfum 85ml,HERMÈS,Fragrance,Perfume,Spray,I wanted to express the essence of femininity ...,44
3,LKK287,Love In White Spray 75ml,CREED,Fragrance,Perfume,Spray,"Inspired by a love for sailing, this Millésime...",5
4,AKY054,Neroli Portofino Eau de Parfum 100ml,TOM FORD,Fragrance,Perfume,Spray,Vibrant. Sparkling. Transportive.<br>To TOM FO...,89


# Add to vectorstore
Simply use product description as the features and pass into the vectorstore.

In [7]:
# put document into vectordb
product_ids = list(df["product_id"])
product_desc = list(df["product_description"])
product_metadatas = df[["product_name", "brand", "category", "class", "subclass", "stock_level"]].to_dict(
    orient='records')

vectorstore.add_documents(
    documents=product_desc,
    metadatas=product_metadatas,
    product_ids=product_ids,
)

# Query

In [8]:
# small function for visualizing the result
def visualize_result(query_result):
    result_dict = query_result["metadatas"][0]
    query_id = query_result["ids"][0][0]
    for i, (_id, distance) in enumerate(zip(query_result["ids"][0], query_result["distances"][0])):
        result_dict[i]["product_id"] = _id
        result_dict[i]["distance"] = round(distance, 3)
    result_df = pd.DataFrame(result_dict)
    result_df = result_df[["product_id", "product_name", "class", "subclass", "stock_level", "brand", "distance"]]
    print(f"For Product {query_id}, the top {len(query_result['ids'][0]) - 1} most similar items are:")
    display(result_df)

In [15]:
random_product_id = np.random.choice(product_ids, 1)[0]
result = vectorstore.query_with_product_id(random_product_id)
visualize_result(result)

For Product AKW568, the top 10 most similar items are:


Unnamed: 0,product_id,product_name,class,subclass,stock_level,brand,distance
0,AKW568,Sugar Chocolate Hydrating Lip Balm Limited Edi...,Lip Care,Lip Balm,76,FRESH,0.07
1,AKW565,Sugar Lemon Hydrating Lip Balm Limited Edition,Lip Care,Lip Balm,53,FRESH,0.076
2,AKW569,Sugar Coconut Hydrating Lip Balm,Lip Care,Lip Balm,28,FRESH,0.089
3,AJB605,Sugar Lip Caramel Hydrating Balm,Lip Care,Lip Balm,96,FRESH,0.164
4,ALE646,Sugar Lip Wonder Drops Advanced Therapy,Lip Care,Lip Treatment,37,FRESH,0.247
5,AKW563,Sugar Dream Lip Treatment Advanced Therapy,Lip Care,Lip Treatment,89,FRESH,0.272
6,AJB603,Sugar Cream Lip Treatment – Baby,Lip Care,Lip Treatment,43,FRESH,0.273
7,AJB609,Sugar Cream Lip Treatment – Gilt,Lip Care,Lip Treatment,65,FRESH,0.276
8,AJB602,Sugar Cream Lip Treatment – Pearl,Lip Care,Lip Treatment,4,FRESH,0.278
9,ALB647,Rose Petal Lip Balm 4.4g,Lip Care,Lip Balm,71,SUBTLE ENERGIES,0.298


# Use Extracted Features
Use extracted features as the features and pass the embeddings into the vectorstore.

In [16]:
# load data
data = spark.read.parquet(os.environ["PERFUME_PRODUCT_DATA_PATH"])
data.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: string (nullable = true)
 |-- class: string (nullable = true)
 |-- subclass: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)


In [17]:
# another vectorstore for features
perfume_vectorstore = SimilarProductVectorDB(
    collection_name="perfume_similar_product_db",
    distance_function="ip",
    n_query_result=11, 
)

In [18]:
# extract documents and metadata from table
df = data.toPandas()
# adding random stock level
df["stock_level"] = np.random.randint(0, 101, size=len(df))
df.head()

Unnamed: 0,product_id,product_name,brand,category,class,subclass,product_description,features,stock_level
0,YBB493,Acqua Di Gioia Eau de Parfum 100ml,GIORGIO ARMANI BEAUTY,Fragrance,Perfume,Spray,"Perfect for the spring and summer, Giorgio Arm...","[lmr cedarwood heart, brown sugar, labdanum]",92
1,YQX798,My Burberry Eau De Parfum 90ml,BURBERRY BEAUTY,Fragrance,Perfume,Spray,<li>90ml</li><li>Contemporary floral scent</li...,"[sweet pea, bergamot fuse, geranium leaf, gold...",83
2,OYK369,Rose Amazone Eau de Toilette 100ml,HERMÈS,Fragrance,Perfume,Spray,A new Amazone who is so much more contemporary...,"[citrus fruits, currant, raspberry]",73
3,JIK820,Arancia di Capri Eau de Toilette 150ml,ACQUA DI PARMA,Fragrance,Perfume,Spray,Reminisce on the summer vacations at Capri wit...,"[orange, mandarin, lemon, petitgrain, cardamon...",33
4,EOI621,Blu Mediterraneo Fico Di Amalfi Eau de Toilett...,ACQUA DI PARMA,Fragrance,Perfume,Spray,"A unique combination of fig nectar, jasmine an...","[bergamot, lemon, grapefruit, citron, fig nect...",33


In [19]:
# embed the lists of words
from sklearn.preprocessing import MultiLabelBinarizer

# extract the complete vocabulary of unique words
vocabulary = set(word for sublist in df["features"] for word in sublist)

# initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=list(vocabulary))

# fit and transform the 'features' column
word_freq_vectors = mlb.fit_transform(df['features'])
words = mlb.classes_

# normalize the vector
normalized_word_freq_array = word_freq_vectors / np.linalg.norm(word_freq_vectors, axis=1, keepdims=True)

In [20]:
# put embedding into vectordb
product_ids = list(df["product_id"])
product_metadatas = df[["product_name", "brand", "category", "class", "subclass", "stock_level"]].to_dict(
    orient='records')

perfume_vectorstore.add_embeddings(
    embeddings=normalized_word_freq_array.tolist(),
    metadatas=product_metadatas,
    product_ids=product_ids,
)

In [21]:
visualize_result(perfume_vectorstore.query_with_product_id(np.random.choice(product_ids, 1)[0]))

For Product AYG279, the top 10 most similar items are:


Unnamed: 0,product_id,product_name,class,subclass,stock_level,brand,distance
0,AYG279,Baies Scented Oval,Room and Car Fragrances,Room and Car Fragrances,35,DIPTYQUE,0.0
1,BLE882,Perfumed Car Diffuser with Baies Insert,Room and Car Fragrances,Room and Car Fragrances,78,DIPTYQUE,0.5
2,AHO287,RED ROSES SCENT SURROUND™ DIFFUSER 165ML,Room and Car Fragrances,Room and Car Fragrances,37,JO MALONE LONDON,0.592
3,EEV060,Fir & Rose Home Spray 110ml,Room and Car Fragrances,Room and Car Fragrances,95,HANDHANDHAND,0.646
4,BYY042,SPEARMINT SCENT DIFFUSER 100ML,Room and Car Fragrances,Room and Car Fragrances,2,HANDHANDHAND,1.0
5,EEV058,Black Tea Home Spray 110ml,Room and Car Fragrances,Room and Car Fragrances,99,HANDHANDHAND,1.0
6,BYY053,PEAR SCENT DIFFUSER 100ML,Room and Car Fragrances,Room and Car Fragrances,61,HANDHANDHAND,1.0
7,BXV769,INSIEME ROOM DIFFUSER 180ML,Room and Car Fragrances,Room and Car Fragrances,7,ACQUA DI PARMA,1.0
8,EAA181,ORANGE BLOSSOM SCENT SURROUND™ DIFFUSER 165ML,Room and Car Fragrances,Room and Car Fragrances,13,JO MALONE LONDON,1.0
9,EBO572,10Minutes Incense Matches – Lavender,Room and Car Fragrances,Room and Car Fragrances,19,HIBI,1.0
