In [None]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.1


In [None]:
import pandas as pd
import numpy as np
import torch
import time
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Load the precomputed product embeddings, one DataFrame per encoder.
# NOTE(security): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted pipeline.
DATA_DIR = '/content'  # single place to change when the files live elsewhere
embeddings_sentence_transformer = pd.read_pickle(f'{DATA_DIR}/sentence_transformer_embeddings.pkl')
embeddings_bert = pd.read_pickle(f'{DATA_DIR}/bert_embeddings.pkl')

# Fail fast if a frame lacks the columns the search functions read.
for _label, _frame in (
    ('sentence_transformer_embeddings', embeddings_sentence_transformer),
    ('bert_embeddings', embeddings_bert),
):
    _missing = {'processed_title', 'feature_embedding'} - set(_frame.columns)
    assert not _missing, f"{_label} is missing columns: {sorted(_missing)}"

# Preview only the first rows instead of rendering the whole frame.
embeddings_bert.head()

Unnamed: 0,link,price,actual_price,ratings,color,processed_title,feature_embedding
0,https://www.daraz.com.np/products/dell-vostro-...,55999.0,55999.0,13.0,Black,dell vostro 3520 i3 12th gen 16gb ram 512gb ss...,"[[-1.0659682, -0.44252846, 0.28969133, -0.0586..."
1,https://www.daraz.com.np/products/apple-macboo...,109900.0,139900.0,76.0,Space Grey,apple macbook air 13inch m1 256gb oliz store,"[[-0.8023374, -0.16204228, 0.045940593, 0.1096..."
2,https://www.daraz.com.np/products/dell-vostro-...,68999.0,68999.0,22.0,Black,dell vostro 3520 i5 12th gen 16gb ram 512gb ss...,"[[-0.65138435, -0.4231668, 0.11806483, -0.2789..."
3,https://www.daraz.com.np/products/dell-vostro-...,64000.0,64000.0,8.0,Black,dell vostro 3520 i5 12th gen 8gb ram 256gb ssd...,"[[-0.67667896, -0.4205639, 0.12135259, -0.2486..."
4,https://www.daraz.com.np/products/acer-nitro-v...,137999.0,137999.0,2.0,Black,acer nitro v 15 i7 13th gen 13620h 16gb ddr5 5...,"[[-0.61645573, -0.32771653, 0.48033115, 0.1337..."
...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,75990.0,88000.0,0.0,Silver,asus x515 i5 11th gen 8gb ram 512gb ssd 2gb nv...,"[[-1.0215707, -0.63885677, 0.11740286, -0.0237..."
529,https://www.daraz.com.np/products/asus-vivoboo...,89990.0,110000.0,0.0,Black,asus vivobook 16 f1605va intel core i5 13th ge...,"[[-0.7567022, -0.38317725, 0.121513866, -0.136..."
530,https://www.daraz.com.np/products/lenovo-ideap...,52000.0,52000.0,0.0,Brown,lenovo ideapad 3 amd ryzen 5300u processor 4gb...,"[[-0.76590127, -0.55628425, 0.2714341, -0.1968..."
531,https://www.daraz.com.np/products/lenovo-ideap...,55000.0,55000.0,0.0,Grey,lenovo ideapad slim 3 windows 11 156 inch hd i...,"[[-0.94413733, -0.60363483, 0.29676333, -0.145..."


In [None]:
# Query-expansion table: each lowercase key maps to the extra terms that
# map_synonyms appends to a query when the key is matched.
synonym_dict = dict(
    laptop=["notebook", "portable computer", "ultrabook", "chromebook"],
    microsoft=["Microsoft", "Surface", "Surface Pro", "Surface Book", "Surface Laptop"],
    msi=["MSI", "GF", "GL", "Prestige", "Modern"],
    intel=["Intel Core", "Pentium", "Celeron"],
    amd=["AMD Ryzen", "Athlon"],
    ram=["RAM", "Memory", "DDR5"],
    color=["Black", "Silver", "Grey", "Space Grey", "Matte Black", "Blue", "White", "Gold"],
    battery_life=["battery backup", "extended battery"],
    touchscreen=["touch display", "interactive screen", "multi-touch", "fingerprint reader"],
    graphics=["Integrated Graphics", "Discrete Graphics"],
    operating_system=["Windows", "macOS", "Linux", "Chrome OS", "Ubuntu", "Fedora"],
    connectivity=["Wi-Fi", "Bluetooth", "USB-C", "Thunderbolt", "HDMI", "Ethernet", "4G LTE", "5G"],
    audio=["Dolby Audio", "stereo speakers", "Bang & Olufsen", "Harman Kardon"],
    build=["metal body", "plastic body", "aluminum chassis", "carbon fiber"],
    weight=["lightweight", "portable", "thin and light", "ultra-light"],
)

In [None]:
 # Create a mapping function for the synonyms
def map_synonyms(query, synonyms=None):
    """Expand a query by appending synonyms after each matching term.

    The query is lowercased and split on whitespace. Every token is kept in
    the output exactly as split; lookups are done on the token with leading
    and trailing punctuation removed, so "laptop," still matches "laptop".
    Adjacent token pairs are also looked up joined by an underscore, which is
    how two-word keys such as "battery_life" are spelled in the table.

    Args:
        query: Free-text search query.
        synonyms: Optional mapping of lowercase key -> list of extra terms.
            Defaults to the module-level ``synonym_dict``.

    Returns:
        The expanded query as a single space-joined string ("" for an empty
        or whitespace-only query).
    """
    table = synonym_dict if synonyms is None else synonyms
    edge_punctuation = ".,;:!?\"'()[]{}"

    tokens = query.lower().split()
    lookup_keys = [token.strip(edge_punctuation) for token in tokens]

    expanded_query = []
    for position, token in enumerate(tokens):
        # Always keep the user's own token first.
        expanded_query.append(token)

        key = lookup_keys[position]
        if key in table:
            expanded_query.extend(table[key])

        # Two-word concepts: "battery life" -> key "battery_life".
        if position > 0:
            phrase_key = f"{lookup_keys[position - 1]}_{key}"
            if phrase_key in table:
                expanded_query.extend(table[phrase_key])

    return ' '.join(expanded_query)


In [None]:
# Semantic search over precomputed Sentence-Transformer embeddings
def semantic_search_sentence_transformer(data, query, top_n=10, model=None):
    """Rank the rows of ``data`` by cosine similarity to ``query``.

    Args:
        data: DataFrame with a 'processed_title' column and a
            'feature_embedding' column holding one vector per row. Each
            vector is flattened, so both (dim,) and (1, dim) entries work.
        query: Text to search for.
        top_n: Number of best-scoring rows to return.
        model: Optional already-loaded encoder exposing ``encode``. When
            omitted, 'bert-base-nli-mean-tokens' is loaded on every call;
            pass a model to avoid that cost when searching repeatedly.

    Returns:
        A new DataFrame with columns ['processed_title', 'similarity'],
        sorted by descending similarity and limited to ``top_n`` rows.
        ``data`` itself is not modified.
    """
    if model is None:
        model = SentenceTransformer('bert-base-nli-mean-tokens')

    # Ask for a NumPy array rather than a torch tensor: scikit-learn cannot
    # consume a tensor that lives on a GPU.
    query_embedding = np.asarray(model.encode(query, convert_to_numpy=True)).reshape(1, -1)

    # Flatten every stored embedding to 1-D so the stacked input is 2-D.
    product_embeddings = [
        np.asarray(embedding).reshape(-1)
        for embedding in data['feature_embedding'].tolist()
    ]
    if not product_embeddings:
        # Nothing to rank: return an empty result with the expected columns.
        return data.assign(similarity=np.empty(0))[['processed_title', 'similarity']]

    similarities = cosine_similarity(query_embedding, product_embeddings)

    # assign() returns a copy, leaving the caller's DataFrame untouched.
    scored = data.assign(similarity=similarities[0])
    results = scored.sort_values(by='similarity', ascending=False).head(top_n)

    return results[['processed_title', 'similarity']]

In [None]:
# Semantic search over precomputed BERT embeddings
def semantic_search_bert(data, query, top_n=10, tokenizer=None, model=None):
    """Rank the rows of ``data`` by cosine similarity to a BERT query vector.

    The query is first expanded with ``map_synonyms``, then encoded; the
    embedding of the first ([CLS]) token is used as the query vector.

    Args:
        data: DataFrame with a 'processed_title' column and a
            'feature_embedding' column holding one vector per row. Each
            vector is flattened, so both (dim,) and (1, dim) entries work.
        query: Text to search for.
        top_n: Number of best-scoring rows to return.
        tokenizer: Optional already-loaded tokenizer. When omitted, the
            'bert-base-uncased' tokenizer is loaded on every call.
        model: Optional already-loaded model. When omitted,
            'bert-base-uncased' is loaded on every call.

    Returns:
        A new DataFrame with columns ['processed_title', 'similarity'],
        sorted by descending similarity and limited to ``top_n`` rows, or
        None if any step fails (the error is printed). ``data`` itself is
        not modified.
    """
    try:
        # Only load the pretrained weights when the caller did not supply them.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        if model is None:
            model = BertModel.from_pretrained('bert-base-uncased')

        # Map synonyms in the query
        expanded_query = map_synonyms(query)

        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Generate BERT embedding for the query
        encoded_input = tokenizer(expanded_query, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)

        # First-token ([CLS]) embedding, flattened to a 1-D NumPy vector.
        query_embedding = model_output.last_hidden_state[:, 0, :].reshape(-1).cpu().numpy()

        # Flatten every stored embedding to 1-D so the stacked input is 2-D.
        product_embeddings = [
            np.asarray(embedding).reshape(-1)
            for embedding in data['feature_embedding'].tolist()
        ]

        # Calculate cosine similarity between the query embedding and product embeddings
        similarities = cosine_similarity([query_embedding], product_embeddings)

        # Calculate and print inference time
        inference_time = time.perf_counter() - start_time
        print(f"Inference time for BERT: {inference_time:.2f} seconds")

        # assign() returns a copy, leaving the caller's DataFrame untouched.
        scored = data.assign(similarity=similarities[0])

        # Sort by similarity and return top_n results
        results = scored.sort_values(by='similarity', ascending=False).head(top_n)

        return results[['processed_title', 'similarity']]

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error during semantic search: {e}")
        return None

In [None]:
# def cluster_product_embeddings(data, num_clusters=5):
#     try:
#         # Extract product embeddings (convert to correct shape)
#         product_embeddings = [embedding.squeeze() for embedding in data['feature_embedding'].tolist()]

#         # Perform KMeans clustering
#         kmeans = KMeans(n_clusters=num_clusters, random_state=0)
#         cluster_labels = kmeans.fit_predict(product_embeddings)

#         # Add cluster labels to the DataFrame
#         data['cluster'] = cluster_labels

#         return data, kmeans

#     except Exception as e:
#         print(f"Error during clustering: {e}")
#         return None, None

In [None]:
# def semantic_search_bert_with_clustering(data, query, kmeans, top_n=10):
#     try:
#         # Load pre-trained BERT model and tokenizer
#         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         model = BertModel.from_pretrained('bert-base-uncased')

#         # Map synonyms in the query
#         expanded_query = map_synonyms(query)

#         # Generate BERT embedding for the query
#         encoded_input = tokenizer(expanded_query, padding=True, truncation=True, return_tensors='pt')
#         with torch.no_grad():
#             model_output = model(**encoded_input)

#         # Extract the CLS token's embedding as the query embedding
#         query_embedding = model_output.last_hidden_state[:, 0, :].squeeze().numpy()  # 2D array

#         # Predict the cluster of the query using the KMeans model
#         query_cluster = kmeans.predict([query_embedding])[0]

#         # Filter products to the ones in the same cluster
#         clustered_data = data[data['cluster'] == query_cluster]

#         if clustered_data.empty:
#             print(f"No products found in cluster {query_cluster}. Expanding to all products.")
#             clustered_data = data  # Fallback to all products if cluster is empty

#         # Ensure the product embeddings are in the correct shape (list of 1D vectors)
#         product_embeddings = [embedding.squeeze() for embedding in clustered_data['feature_embedding'].tolist()]

#         # Calculate cosine similarity between the query embedding and product embeddings
#         similarities = cosine_similarity([query_embedding], product_embeddings)

#         # Add similarity scores to the filtered DataFrame
#         clustered_data['similarity'] = similarities[0]

#         # Sort by similarity and return top_n results
#         results = clustered_data.sort_values(by='similarity', ascending=False).head(top_n)

#         return results[['combined_features', 'similarity']]

#     except Exception as e:
#         print(f"Error during semantic search with clustering: {e}")
#         return None

In [None]:
# Run the same query through both search back-ends and print each ranking.
# The raw (lowercased) query is passed to both functions; semantic_search_bert
# applies synonym expansion internally.
if embeddings_sentence_transformer is not None and embeddings_bert is not None:
    query = "dell with 16 gb ram and intel processor".lower()

    search_runs = (
        ("Sentence Transformer Results", semantic_search_sentence_transformer, embeddings_sentence_transformer),
        ("BERT Results", semantic_search_bert, embeddings_bert),
    )
    for heading, search_fn, embeddings in search_runs:
        search_results = search_fn(embeddings, query)
        # A search function may return None on failure; skip printing then.
        if search_results is not None:
            print(heading)
            print(search_results)



Sentence Transformer Results
                                       processed_title  similarity
275  asus vivobook s16 intel core ultra 9 185h 16gb...    0.887254
117  lenovo s540 intel i5 10 gen ram 16 gb storage ...    0.875414
146  dell inspiron 14 plus 7440 2024 intel ultra 7 ...    0.872831
69   dell vostro 3520 core i5 12th gen 16gb ram512 ...    0.865127
29   dell vostro 3520 core i5 12th gen 16gb ram512 ...    0.865127
221  dell vostro 3520 i7 12th gen 16gb ram 512gb ss...    0.864429
82   lenovo ideapad 1 intel celeron processor 4 gb ...    0.858376
17   dell latitude e5400 i58th gen 8 gb ram 256 ssd...    0.857212
57   dell latitude e5400 i58th gen 8 gb ram 256 ssd...    0.857212
0    dell vostro 3520 i3 12th gen 16gb ram 512gb ss...    0.848467




Inference time for BERT: 0.14 seconds
BERT Results
                                       processed_title  similarity
279  dell latitude 5310 intel core i5 10th gen 1031...    0.948444
322  dell latitude 13inch 5320 model 11th generatio...    0.942574
292  dell 5470 laptop intel celeron processor 8 gb ...    0.941823
82   lenovo ideapad 1 intel celeron processor 4 gb ...    0.939942
117  lenovo s540 intel i5 10 gen ram 16 gb storage ...    0.938776
525      dell latitude 3440 i5 1235u 8gb ram 512gb ssd    0.938668
17   dell latitude e5400 i58th gen 8 gb ram 256 ssd...    0.937593
57   dell latitude e5400 i58th gen 8 gb ram 256 ssd...    0.937593
40   dell vostro 3520 i3 12th gen 16gb ram 512gb ss...    0.936513
225  dell vostro 3520 i3 12th gen 16gb ram 512gb ss...    0.936513
