# Semantic Search

Improvement on the previous notebook. It includes:
* Embedding and Retrieval: Uses SentenceTransformer for fast embedding-based similarity search (basic retrieval) to select top-k candidates based on semantic similarity.
* Re-ranking with CrossEncoder: Adds a second level of complexity by using a CrossEncoder model to re-rank the retrieved candidates based on query-context relevance.
* Boosting Scores: Introduces additional score boosts for candidates whose store names or titles include keywords from the query.



In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Run from the repository root so the local `aux` package is importable and the
# relative data path used further down ("search/...") resolves.
# The guard keeps this cell idempotent: an unconditional chdir would climb one
# more directory every time the cell is re-run in the same kernel.
if not os.path.isdir("aux"):
    os.chdir("../")
from aux.text_pre_processing import (
    combine_text_features,
    pre_process_text,
    pre_process_query,
)

# Show up to 150 characters of long text columns when frames are displayed.
pd.set_option("display.max_colwidth", 150)
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Load Data

In [3]:
# Load the `raw_meta_All_Beauty` configuration (split "full") of the
# Amazon-Reviews-2023 dataset and convert it to a pandas DataFrame.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally — keep it only while that repository is trusted.
dataset_items = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)
df = dataset_items.to_pandas()

In [4]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",4.8,10,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'], 'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg'...","{'title': [], 'url': [], 'user_id': []}",Howard Products,[],"{""Package Dimensions"": ""7.1 x 5.5 x 3 inches; 2.38 Pounds"", ""UPC"": ""617390882781""}",B01CUPMQZE,,,
1,All Beauty,"Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.",4.5,3,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71g1lP0pMbL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/81OqvR94isL._SL1500_.jpg'], 'larg...","{'title': [], 'url': [], 'user_id': []}",Yes To,[],"{""Item Form"": ""Powder"", ""Skin Type"": ""Acne Prone"", ""Brand"": ""Yes To"", ""Age Range (Description)"": ""Adult"", ""Unit Count"": ""10 Fl Oz"", ""Is Discontinu...",B076WQZGPM,,,
2,All Beauty,Eye Patch Black Adult with Tie Band (6 Per Pack),4.4,26,[],[],,"{'hi_res': [None, None], 'large': ['https://m.media-amazon.com/images/I/31bz+uqzWCL.jpg', 'https://m.media-amazon.com/images/I/31bz+uqzWCL.jpg'], ...","{'title': [], 'url': [], 'user_id': []}",Levine Health Products,[],"{""Manufacturer"": ""Levine Health Products""}",B000B658RI,,,
3,All Beauty,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4D Imitation Eyebrow Tattoos, 4D Hair-like Authentic Eyebrows Waterproof Long Lasting for Woman & Man...",3.1,102,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71GJhXQGvyL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/61NS1lONhzL._SL1001_.jpg', 'https...","{'title': [], 'url': [], 'user_id': []}",Cherioll,[],"{""Brand"": ""Cherioll"", ""Item Form"": ""Powder"", ""Finish Type"": ""Natural"", ""Product Benefits"": ""Long Lasting"", ""Skin Type"": ""All"", ""Package Dimensions...",B088FKY3VD,,,
4,All Beauty,Precision Plunger Bars for Cartridge Grips – 93mm – Bag of 10 Plungers,4.3,7,"[Material: 304 Stainless Steel; Brass tip, Lengths Available: 88mm, 93mm, 98mm, Accepts cartridge needles with vice style tattoo machines, Works p...","[The Precision Plunger Bars are designed to work seamlessly with the Precision Disposable 1. 25"" Contoured Soft Cartridge Grips and the Precision ...",,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/31TgqAZ8kQL.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/31TgqAZ8kQL._S...","{'title': [], 'url': [], 'user_id': []}",Precision,[],"{""UPC"": ""644287689178""}",B07NGFDN6G,,,


In [5]:
df.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
      dtype='object')

## Pre process

In [7]:
# Item text: merge title, features and description into one field per row,
# then clean that field into the column used for retrieval and re-ranking.
df["title_description_features"] = df.apply(
    combine_text_features,
    axis=1,
    col1="title",
    col2="features",
    col3="description",
)
df = pre_process_text(
    df,
    input_col="title_description_features",
    output_col="product_title_description_feature_pre_processed",
)

# Store name: missing values become the literal "unknown" before cleaning.
df["store"] = df["store"].fillna("unknown")
df = pre_process_text(df, input_col="store", output_col="store_pre_processed")

## Filter by minimum amount of words

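* Some of the shorter descriptions are of poor quality, so items whose combined title/features/description text has fewer than 5 words are dropped.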

In [8]:
# Whitespace-separated word count of the combined (not yet pre-processed) item text.
df["num_words"] = df["title_description_features"].str.split().str.len()

In [9]:
# Fraction of items that have at least 5 words in their combined text.
len(df.loc[df["num_words"] >= 5]) / len(df)

0.9855671018740563

In [10]:
# Keep items with at least 5 words and renumber rows from 0.
# NOTE(review): retrieval later indexes `df` by position, so the precomputed
# embeddings presumably were generated from this same filtered frame — confirm.
df = df[df["num_words"].ge(5)].reset_index(drop=True)

In [11]:
df.product_title_description_feature_pre_processed.head()

0                                                                                                                 howard lc leather conditioner ounce pack
1                                      yes tomato detoxifying charcoal cleanser pack charcoal powder tomato fruit extract gingko biloba leaf extract fl oz
2                                                                                                                  eye patch black adult tie band per pack
3              tattoo eyebrow sticker waterproof eyebrow imitation eyebrow tattoo hairlike authentic eyebrow waterproof long lasting woman man makeup tool
4    precision plunger bar cartridge grip mm bag plunger precision plunger bar designed work seamlessly precision disposable contoured soft cartridge g...
Name: product_title_description_feature_pre_processed, dtype: object

In [12]:
df.shape

(110965, 20)

In [13]:
# Model

In [14]:
# Two-stage setup: the bi-encoder embeds the query for the similarity search,
# the cross-encoder then scores (query, item text) pairs for re-ranking.
bi_encoder = SentenceTransformer("all-MiniLM-L6-v2")  # fast retrieval
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # accurate ranking

In [15]:
# Precomputed item embeddings loaded from disk
# (presumably produced with the bi-encoder above — TODO confirm).
document_embeddings = np.load("search/product_embeddings.npy")

# Retrieval maps similarity positions back to `df` with `iloc`, so the matrix
# must have exactly one row per row of the filtered frame. Fail loudly here
# instead of silently returning the wrong products later.
assert document_embeddings.shape[0] == len(df), (
    f"embeddings have {document_embeddings.shape[0]} rows but df has {len(df)}; "
    "regenerate the embeddings after changing the filtering above"
)
document_embeddings.shape

(110965, 384)

In [16]:
# Features

In [17]:
def basic_retrieve(query, df, document_embeddings, retrieval_k=100):
    """Return the ``retrieval_k`` items most similar to ``query`` by cosine similarity.

    The query is cleaned with ``pre_process_query``, embedded with the
    module-level ``bi_encoder`` and compared against every row of
    ``document_embeddings``.

    Args:
        query: Raw search string.
        df: Item frame. Row ``i`` must describe the item embedded in
            ``document_embeddings[i]``, because candidates are picked by
            position (``iloc``).
        document_embeddings: One embedding per row of ``df``.
        retrieval_k: Maximum number of items to return.

    Returns:
        A copy of the selected rows with an added ``final_score`` column
        (the cosine similarity), sorted by that column in descending order.
        The result is empty when ``retrieval_k`` is not positive or ``df``
        has no rows.

    Raises:
        ValueError: If ``df`` and ``document_embeddings`` differ in length.
    """
    if len(df) != len(document_embeddings):
        raise ValueError(
            f"df has {len(df)} rows but document_embeddings has "
            f"{len(document_embeddings)}; they must be aligned row by row"
        )
    # `scores[-0:]` would select *every* row, so handle non-positive k explicitly.
    if retrieval_k <= 0 or len(df) == 0:
        return df.iloc[:0].assign(final_score=np.array([], dtype=float))

    query = pre_process_query(query)
    query_embedding = bi_encoder.encode(query)
    similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

    # argsort is ascending, so the best matches are the last `retrieval_k` positions.
    top_k_indices = np.argsort(similarity_scores)[-retrieval_k:]
    candidates = df.iloc[top_k_indices].copy()

    candidates["final_score"] = similarity_scores[top_k_indices]
    return candidates.sort_values("final_score", ascending=False)

In [26]:
def _keyword_match_mask(texts, terms):
    """Return a boolean array marking the texts that contain any of ``terms`` as a substring."""
    # Missing values become "" so they simply never match instead of raising.
    lowered = texts.fillna("").astype(str).str.lower()
    return np.array(
        [any(term in text for term in terms) for text in lowered], dtype=bool
    )


def retrieve_and_rerank(
    query,
    df,
    document_embeddings,
    retrieval_k=100,
    use_title_keyword_match=False,
    use_store=False,
    title_boost=3.0,
    store_boost=3.0,
):
    """Retrieve candidates by embedding similarity, then re-rank them with the cross-encoder.

    Steps:
        1. Clean the query with ``pre_process_query``, embed it with the
           module-level ``bi_encoder`` and keep the ``retrieval_k`` rows with
           the highest cosine similarity.
        2. Score every (query, item text) pair with the module-level
           ``cross_encoder``. With ``use_store`` the item text is suffixed
           with ``" [Brand: <store>]"``.
        3. Optionally add fixed boosts to candidates whose title / store
           contains at least one query term.

    Args:
        query: Raw search string.
        df: Item frame with the columns
            ``product_title_description_feature_pre_processed``, ``title`` and
            ``store``. Row ``i`` must describe the item embedded in
            ``document_embeddings[i]`` (candidates are picked with ``iloc``).
        document_embeddings: One embedding per row of ``df``.
        retrieval_k: Number of candidates passed on to the cross-encoder.
        use_title_keyword_match: Add ``title_boost`` when the title contains
            any query term.
        use_store: Include the store in the cross-encoder text and add
            ``store_boost`` when the store name contains any query term.
        title_boost: Score added for a title keyword match.
        store_boost: Score added for a store keyword match.

    Returns:
        A copy of the candidate rows with an added ``final_score`` column
        (cross-encoder score plus boosts), sorted by it in descending order.
        The result is empty when ``retrieval_k`` is not positive or ``df``
        has no rows.

    Raises:
        ValueError: If ``df`` and ``document_embeddings`` differ in length.
    """
    if len(df) != len(document_embeddings):
        raise ValueError(
            f"df has {len(df)} rows but document_embeddings has "
            f"{len(document_embeddings)}; they must be aligned row by row"
        )
    # `scores[-0:]` would select *every* row and push the whole catalogue
    # through the cross-encoder, so handle non-positive k explicitly.
    if retrieval_k <= 0 or len(df) == 0:
        return df.iloc[:0].assign(final_score=np.array([], dtype=float))

    # 1. semantic similarities
    query = pre_process_query(query)
    query_embedding = bi_encoder.encode(query)
    similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

    # argsort is ascending, so the best matches are the last `retrieval_k` positions.
    top_k_indices = np.argsort(similarity_scores)[-retrieval_k:]
    candidates = df.iloc[top_k_indices].copy()

    # 2. rerank (optionally with store info appended to the item text)
    texts = candidates["product_title_description_feature_pre_processed"]
    if use_store:
        pairs = [
            [query, f"{text} [Brand: {store}]"]
            for text, store in zip(texts, candidates["store"])
        ]
    else:
        pairs = [[query, text] for text in texts]

    # np.array copies, so the boosts below never touch the model's own output.
    final_scores = np.array(cross_encoder.predict(pairs))

    # 3. boosts after reranking
    # NOTE: matching is a substring test on *any* single query term, so generic
    # terms in the query boost most candidates alike.
    query_terms = query.lower().split()

    if use_title_keyword_match:
        final_scores[_keyword_match_mask(candidates["title"], query_terms)] += title_boost

    if use_store:
        final_scores[_keyword_match_mask(candidates["store"], query_terms)] += store_boost

    candidates["final_score"] = final_scores
    return candidates.sort_values("final_score", ascending=False)

In [21]:
# Baseline: bi-encoder cosine similarity only, no re-ranking.
query = "cerave spf moisturizer"
res = basic_retrieve(query=query, df=df, document_embeddings=document_embeddings)
res.loc[:, ["title", "store", "final_score"]].head(10)

Unnamed: 0,title,store,final_score
80371,Revitalizing Light-Weight Moisturizer SPF 15,BB Lifestyle,0.721638
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,0.708719
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,0.705689
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,0.703798
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,0.700271
28803,Face Cream Moisturizer Olive Oil & Honey for Women SPF-20 50ml,Health & Beauty Dead Sea Minerals,0.699831
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",LUXSURE,0.690701
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",CeraVe,0.686267
99815,Sofina Jenne Moisturizing Whitening UV Cut Emulsion SPF50+/PA++++/30ml,Sofina,0.680897
57652,"Céla Skin Care Essential Balm, Coconut/Bergamot Oil Moisturizer for Dry Skin",Céla,0.679205


In [22]:
# Cross-encoder re-ranking only: no keyword boost, store not used.
query = "cerave spf moisturizer"
res = retrieve_and_rerank(
    query=query,
    df=df,
    document_embeddings=document_embeddings,
    use_title_keyword_match=False,
    use_store=False,
)
res.loc[:, ["title", "store", "final_score"]].head(10)

Unnamed: 0,title,store,final_score
10143,CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce,CeraVe,7.079282
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",LUXSURE,4.998802
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,4.595958
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,4.507841
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,4.495101
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,4.4488
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",CeraVe,4.172186
107879,CeraVe Hydrating Oil Cleanser 236ml,CeraVe,2.804509
20208,"Cetaphil Moisturizing Cream 20 oz, 2 Pack",Cetaphil,2.115604
68802,Clinique 'Moisture Surge' Tinted Moisturizer SPF 15 Provides a No-Makeup Look and Feel (Shade 02),unknown,2.050982


In [23]:
# Re-ranking plus the title keyword boost; store still not used.
query = "cerave spf moisturizer"
res = retrieve_and_rerank(
    query=query,
    df=df,
    document_embeddings=document_embeddings,
    use_title_keyword_match=True,
    use_store=False,
)
res.loc[:, ["title", "store", "final_score"]].head(10)

Unnamed: 0,title,store,final_score
10143,CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce,CeraVe,10.079282
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",LUXSURE,7.998802
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.595958
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.507841
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.495101
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.4488
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",CeraVe,7.172186
107879,CeraVe Hydrating Oil Cleanser 236ml,CeraVe,5.804509
68802,Clinique 'Moisture Surge' Tinted Moisturizer SPF 15 Provides a No-Makeup Look and Feel (Shade 02),unknown,5.050982
78162,Murad Oil and Pore Control Mattifier Broad Spectrum SPF 45 | PA++++ | Travel Size - 0.8 Fl Oz - Moisturizer for Mattifying Facial Skin,Murad,4.963304


In [24]:
# Re-ranking with the store appended to the item text, plus title and store boosts.
query = "cerave spf moisturizer"
res = retrieve_and_rerank(
    query=query,
    df=df,
    document_embeddings=document_embeddings,
    use_title_keyword_match=True,
    use_store=True,
)
res.loc[:, ["title", "store", "final_score"]].head(10)

Unnamed: 0,title,store,final_score
10143,CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce,CeraVe,13.063652
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",CeraVe,10.687904
107879,CeraVe Hydrating Oil Cleanser 236ml,CeraVe,9.319592
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",LUXSURE,8.185555
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.732594
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.629389
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.623445
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.560297
68802,Clinique 'Moisture Surge' Tinted Moisturizer SPF 15 Provides a No-Makeup Look and Feel (Shade 02),unknown,5.124448
78162,Murad Oil and Pore Control Mattifier Broad Spectrum SPF 45 | PA++++ | Travel Size - 0.8 Fl Oz - Moisturizer for Mattifying Facial Skin,Murad,4.921179


In [25]:
# Best configuration so far: title keyword boost together with store-aware
# re-ranking and the store boost (same settings as the previous cell).
query = "cerave spf moisturizer"
res = retrieve_and_rerank(
    query=query,
    df=df,
    document_embeddings=document_embeddings,
    use_title_keyword_match=True,
    use_store=True,
)
res.loc[:, ["title", "store", "final_score"]].head(10)

Unnamed: 0,title,store,final_score
10143,CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce,CeraVe,13.063652
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",CeraVe,10.687904
107879,CeraVe Hydrating Oil Cleanser 236ml,CeraVe,9.319592
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",LUXSURE,8.185555
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.732594
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.629389
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.623445
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",C'est Moi,7.560297
68802,Clinique 'Moisture Surge' Tinted Moisturizer SPF 15 Provides a No-Makeup Look and Feel (Shade 02),unknown,5.124448
78162,Murad Oil and Pore Control Mattifier Broad Spectrum SPF 45 | PA++++ | Travel Size - 0.8 Fl Oz - Moisturizer for Mattifying Facial Skin,Murad,4.921179
