# Basic Search Techniques

Experiments with basic search techniques on the Amazon Reviews dataset (products):
- **Inverted Index**: Build an inverted index to map words to the documents they appear in.
- **TF-IDF Search**: Use TF-IDF to compute term importance and document similarity.
- **Sentence Transformers**: Utilize pre-trained embeddings from `SentenceTransformers` to compute semantic similarity and rank documents based on the query.


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

from collections import defaultdict
from datasets import load_dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

os.chdir("../")
from aux.text_pre_processing import (
    combine_text_features,
    pre_process_text,
    pre_process_query,
)

pd.set_option("display.max_colwidth", 150)

# Load Data

In [2]:
dataset_items = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)
df = dataset_items.to_pandas()

In [3]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",4.8,10,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'], 'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg'...","{'title': [], 'url': [], 'user_id': []}",Howard Products,[],"{""Package Dimensions"": ""7.1 x 5.5 x 3 inches; 2.38 Pounds"", ""UPC"": ""617390882781""}",B01CUPMQZE,,,
1,All Beauty,"Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.",4.5,3,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71g1lP0pMbL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/81OqvR94isL._SL1500_.jpg'], 'larg...","{'title': [], 'url': [], 'user_id': []}",Yes To,[],"{""Item Form"": ""Powder"", ""Skin Type"": ""Acne Prone"", ""Brand"": ""Yes To"", ""Age Range (Description)"": ""Adult"", ""Unit Count"": ""10 Fl Oz"", ""Is Discontinu...",B076WQZGPM,,,
2,All Beauty,Eye Patch Black Adult with Tie Band (6 Per Pack),4.4,26,[],[],,"{'hi_res': [None, None], 'large': ['https://m.media-amazon.com/images/I/31bz+uqzWCL.jpg', 'https://m.media-amazon.com/images/I/31bz+uqzWCL.jpg'], ...","{'title': [], 'url': [], 'user_id': []}",Levine Health Products,[],"{""Manufacturer"": ""Levine Health Products""}",B000B658RI,,,
3,All Beauty,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4D Imitation Eyebrow Tattoos, 4D Hair-like Authentic Eyebrows Waterproof Long Lasting for Woman & Man...",3.1,102,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71GJhXQGvyL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/61NS1lONhzL._SL1001_.jpg', 'https...","{'title': [], 'url': [], 'user_id': []}",Cherioll,[],"{""Brand"": ""Cherioll"", ""Item Form"": ""Powder"", ""Finish Type"": ""Natural"", ""Product Benefits"": ""Long Lasting"", ""Skin Type"": ""All"", ""Package Dimensions...",B088FKY3VD,,,
4,All Beauty,Precision Plunger Bars for Cartridge Grips – 93mm – Bag of 10 Plungers,4.3,7,"[Material: 304 Stainless Steel; Brass tip, Lengths Available: 88mm, 93mm, 98mm, Accepts cartridge needles with vice style tattoo machines, Works p...","[The Precision Plunger Bars are designed to work seamlessly with the Precision Disposable 1. 25"" Contoured Soft Cartridge Grips and the Precision ...",,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/31TgqAZ8kQL.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/31TgqAZ8kQL._S...","{'title': [], 'url': [], 'user_id': []}",Precision,[],"{""UPC"": ""644287689178""}",B07NGFDN6G,,,


In [4]:
df.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [5]:
# Pre process

In [6]:
# item features
df["title_description_features"] = df.apply(
    lambda row: combine_text_features(
        row, col1="title", col2="features", col3="description"
    ),
    axis=1,
)
df = pre_process_text(
    df,
    input_col="title_description_features",
    output_col="product_title_description_feature_pre_processed",
)

##  Filter by minimum ammount of words

* some of the shorter descriptions are bad

In [10]:
df["num_words"] = df["title_description_features"].str.split().str.len()

In [11]:
df[df.num_words >= 5].shape[0] / df.shape[0]

1.0

In [9]:
df = df.loc[df.num_words >= 5].reset_index(drop=True)

In [12]:
df.product_title_description_feature_pre_processed.head()

0                                                                                                                 howard lc leather conditioner ounce pack
1                                      yes tomato detoxifying charcoal cleanser pack charcoal powder tomato fruit extract gingko biloba leaf extract fl oz
2                                                                                                                  eye patch black adult tie band per pack
3              tattoo eyebrow sticker waterproof eyebrow imitation eyebrow tattoo hairlike authentic eyebrow waterproof long lasting woman man makeup tool
4    precision plunger bar cartridge grip mm bag plunger precision plunger bar designed work seamlessly precision disposable contoured soft cartridge g...
Name: product_title_description_feature_pre_processed, dtype: object

# 1. Index Based

* which words are in each document

In [13]:
def build_inverted_index(df, column):
    inverted_index = defaultdict(set)
    for idx, text in df[column].items():
        tokens = text.split()
        for token in tokens:
            inverted_index[token].add(idx)
    return {k: list(v) for k, v in inverted_index.items()}

In [14]:
inverted_index = build_inverted_index(
    df, "product_title_description_feature_pre_processed"
)

In [15]:
for key in sorted(inverted_index)[:50]:
    print(f"{key}: {inverted_index[key]}")

aa: [52225, 27652, 77320, 79379, 23580, 96798, 31775, 88095, 47654, 82475, 23598, 27697, 45617, 58961, 72786, 13396, 18516, 51797, 48735, 45667, 77932, 53869, 68208, 46707, 27766, 51830, 99960, 70266, 4219, 89722, 85117, 11906, 21123, 30339, 51330, 109706, 108178, 108182, 23707, 47771, 95395, 94887, 56488, 44201, 9386, 104108, 53433, 75454, 78527, 88772, 56517, 13511, 22215, 51408, 6866, 78040, 79069, 76002, 101608, 105206, 97017, 61184, 38149, 75017, 18700, 53012, 47898, 98586, 85281, 39202, 51495, 92458, 10028, 1326, 53551, 6964, 57142, 107322, 16198, 68935, 47434, 39758, 44879, 96592, 64850, 83796, 11608, 67931, 55144, 67457, 2951, 5004, 38796, 77712, 26002, 72097, 41896, 81851, 13757, 19393, 25031, 465, 27091, 64472, 17886, 33766, 41960, 110568, 68077, 103406, 89072, 92658, 74227, 2550]
aaa: [27652, 15367, 98314, 26636, 54287, 38416, 44562, 27174, 69671, 31785, 49714, 51775, 104009, 19018, 46156, 93264, 8282, 97373, 19042, 43621, 102014, 19584, 10393, 96413, 63655, 689, 33462, 1056

In [16]:
def search(query, inverted_index, df):
    query = pre_process_query(query)
    query_tokens = query.split()
    matching_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            matching_docs.update(inverted_index[token])
    return df.loc[list(matching_docs)]

In [19]:
query = "cerave spf"

In [20]:
res = search(query, inverted_index, df)
res.title

88064                                                     Christian Dior Makeup Diorskin Sculpt Line Smoothing Lifting Makeup Spf20 # 030 Medium Beige 30Ml/1Oz
79876                                                                SoftLips Pearl Tint Lip Protectant SPF 15! Lip Balm Moisturizer and Conditioner! (24 Pack)
10245                                                                                          Amorepacific - IOPE Whitegen Skin Luminous Special Kit (4 Items)
92164                                                                                                                           MAC Studio Fix Fluid SPF15 NC47
110598                                                                                                      MAC Studio Stick Foundation SPF 15 - NW47 9g/0.31oz
                                                                                  ...                                                                          
45041     Nakeup Face Waterking Cover Cu

# 2. Tf-IDF

* statistical measure that evaluates how important a term is to a document in relation to the entire dataset.
    * TF (Term Frequency): How often the term appears in a document.
    * IDF (Inverse Document Frequency): How rare the term is across all documents.
* Results seem much better

In [25]:
def query_tfidf(query, vectorizer, df):
    # query vector
    query_vector = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
    df["similarity_score"] = similarity_scores[0]
    res = df.copy(deep=True).sort_values(by="similarity_score", ascending=False)
    return res

In [26]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    df["product_title_description_feature_pre_processed"]
)

In [29]:
query = "cerave moisturizer spf"
sorted_df = query_tfidf(query, vectorizer, df)

In [30]:
sorted_df[["title", "similarity_score"]].head()

Unnamed: 0,title,similarity_score
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",0.481708
80371,Revitalizing Light-Weight Moisturizer SPF 15,0.399087
83064,"Cerave Moisturizing Cream With Pump For Normal To Dry Skin Value Pack of 2 x 19 Oz (Total 38 Oz)""",0.383651
14825,Collagen Firming Face Cream Moisturizer SPF-20 50ml,0.351494
93345,"Simple Moisturizer SPF15, Protecting Light 4.2 Ounce, pack of 3",0.345158


# 3. Sentence Transformers

In [31]:
from sentence_transformers import SentenceTransformer

2025-01-17 15:45:59.406853: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-17 15:45:59.558990: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737128759.622956   13560 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737128759.644493   13560 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-17 15:45:59.791409: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [32]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [33]:
embeddings = model.encode(
    df["product_title_description_feature_pre_processed"].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1734 [00:00<?, ?it/s]

In [37]:
np.save("search/product_embeddings.npy", embeddings)

In [38]:
def query_sentence_transformer(query, model, df):
    query_embedding = model.encode(query)

    similarity_scores = cosine_similarity([query_embedding], embeddings)[0]

    # Add similarity scores to DataFrame
    df["similarity_score_sentence_transformers"] = similarity_scores
    return df

In [39]:
query = "cerave moisturizer spf"
res = query_sentence_transformer(query, model, df)

In [42]:
res.sort_values(by="similarity_score_sentence_transformers", ascending=False)[
    ["title", "similarity_score_sentence_transformers"]
].head(10)

Unnamed: 0,title,similarity_score_sentence_transformers
80371,Revitalizing Light-Weight Moisturizer SPF 15,0.746062
28803,Face Cream Moisturizer Olive Oil & Honey for Women SPF-20 50ml,0.713684
62664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",0.712769
95664,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",0.709647
38057,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",0.709056
4509,"C'est Moi Tinted Moisturizing Lotion SPF 30 | Gentle, Hydrating Formula Provides Broad Spectrum SPF Protection with Sheer, Buildable Coverage, Fra...",0.704436
31074,"Moisturizing Sunscreen Lotion, SPF 30",0.700573
31957,"Cerave Sunscreen Bundle SPF 50 | Contains Mineral Sunscreen for Face SPF 50, 2.5 Ounce, and Mineral Body Sunscreen SPF 50, 5 Ounce 1 ea",0.697636
88825,"e.l.f. Beauty Shield SPF 50 Skin Shielding Moisturizer, 1.69 fl oz",0.694835
109741,DeVita Solar Body Moisturizer SPF 30+,0.687732
