In [None]:
# Use the %pip magic so the install targets this kernel's environment, and pin
# the release this notebook was last run with so re-runs are reproducible.
%pip install -q sentence_transformers==3.1.1

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.1


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

In [None]:
# Load preprocessed data.
# NOTE(review): the first CSV column appears to be a row index written by an
# earlier `to_csv` call (it surfaces as "Unnamed: 0" otherwise) — confirm.
# Reading it as the index keeps it out of the feature columns.
preprocessed_product_data = pd.read_csv('/content/preprocessed_product_data.csv', index_col=0)

# Preview only the first rows rather than rendering the whole frame.
preprocessed_product_data.head()

Unnamed: 0,link,price,actual_price,ratings,color,Brand,Model,Processor,RAM,Storage
0,https://www.daraz.com.np/products/dell-vostro-...,55999.0,55999.0,13.0,Black,Dell,Vostro 3520,i3 12th Gen,16GB RAM,512GB SSD
1,https://www.daraz.com.np/products/apple-macboo...,109900.0,139900.0,76.0,Space Grey,Apple,MacBook Air 13,M1,Unknown,Unknown
2,https://www.daraz.com.np/products/dell-vostro-...,68999.0,68999.0,22.0,Black,Dell,Vostro 3520,i5 12th Gen,16GB RAM,512GB SSD
3,https://www.daraz.com.np/products/dell-vostro-...,64000.0,64000.0,8.0,Black,Dell,Vostro 3520,i5 12th Gen,8GB RAM,256GB SSD
4,https://www.daraz.com.np/products/acer-nitro-v...,137999.0,137999.0,2.0,Black,Acer,Nitro V 15,i7 13th Gen,Unknown,512GB SSD
...,...,...,...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,75990.0,88000.0,0.0,Silver,Asus,X515,i5 11th Gen,8GB RAM,Unknown
529,https://www.daraz.com.np/products/asus-vivoboo...,89990.0,110000.0,0.0,Black,ASUS,VivoBook 16 F1605VA Intel Core,i5 13th Gen,8GB RAM,512GB SSD
530,https://www.daraz.com.np/products/lenovo-ideap...,52000.0,52000.0,0.0,Brown,Lenovo,Ideapad 3,AMD Ryzen,4GB RAM,Unknown
531,https://www.daraz.com.np/products/lenovo-ideap...,55000.0,55000.0,0.0,Grey,Lenovo,IdeaPad slim,Unknown,Unknown,Unknown


In [None]:
# Function to generate embeddings using Sentence Transformer
def generate_sentence_transformer_embeddings(data, model_name='bert-base-nli-mean-tokens'):
    """Embed each row's spec text with a Sentence Transformer model.

    The Brand, Model, Processor, RAM and Storage values of every row are
    joined with single spaces into a ``combined_features`` string, and each
    string is encoded (with ``convert_to_tensor=True``) into a
    ``feature_embedding`` entry.

    Args:
        data: DataFrame holding the five feature columns named above. It is
            never modified; all new columns are written to a copy.
        model_name: Model name or path handed to ``SentenceTransformer``.
            Defaults to the checkpoint this function previously hard-coded.

    Returns:
        A copy of ``data`` with ``combined_features`` and
        ``feature_embedding`` columns added. If any step raises, the error is
        printed and the original ``data`` is returned unchanged.
    """
    feature_columns = ['Brand', 'Model', 'Processor', 'RAM', 'Storage']
    try:
        # Load the Sentence Transformer model
        model = SentenceTransformer(model_name)

        # Work on a copy so the caller's frame (and any other result derived
        # from it) does not have its columns added or overwritten.
        embedded = data.copy()

        # Join the feature values per row. Missing values are skipped and
        # everything else is stringified, so a NaN or numeric cell no longer
        # makes the join raise.
        embedded['combined_features'] = [
            ' '.join(str(value) for value in row if pd.notna(value))
            for row in embedded[feature_columns].itertuples(index=False, name=None)
        ]

        # Encode one string at a time so every row stores its own result.
        embedded['feature_embedding'] = embedded['combined_features'].apply(
            lambda text: model.encode(text, convert_to_tensor=True)
        )

        print("Embeddings generated!")
        return embedded
    except Exception as e:
        # Best-effort: report the failure and hand back the untouched input.
        print(f"Error generating embeddings: {e}")
        return data

In [None]:
# Function to generate embeddings using BERT

def generate_bert_embeddings(data, model_name='bert-base-uncased'):
    """Embed each row's spec text with the CLS vector of a BERT model.

    The Brand, Model, Processor, RAM and Storage values of every row are
    joined with single spaces into a ``combined_features`` string. Each
    string is tokenized and run through the model on its own, and the hidden
    state at token position 0 is stored as a NumPy array in
    ``feature_embedding``.

    Args:
        data: DataFrame holding the five feature columns named above. It is
            never modified; all new columns are written to a copy.
        model_name: Checkpoint name or path handed to both
            ``BertTokenizer.from_pretrained`` and
            ``BertModel.from_pretrained``. Defaults to the checkpoint this
            function previously hard-coded.

    Returns:
        A copy of ``data`` with ``combined_features`` and
        ``feature_embedding`` columns added. If any step raises, the error is
        printed and the original ``data`` is returned unchanged.
    """
    feature_columns = ['Brand', 'Model', 'Processor', 'RAM', 'Storage']
    try:
        # Load pre-trained BERT model and tokenizer
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)

        # Work on a copy so the caller's frame (and any other result derived
        # from it) does not have its columns added or overwritten.
        embedded = data.copy()

        # Join the feature values per row. Missing values are skipped and
        # everything else is stringified, so a NaN or numeric cell no longer
        # makes the join raise.
        embedded['combined_features'] = [
            ' '.join(str(value) for value in row if pd.notna(value))
            for row in embedded[feature_columns].itertuples(index=False, name=None)
        ]

        # Generate embeddings for each row in 'combined_features'
        all_embeddings = []
        for text in embedded['combined_features']:
            encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
            # Inference only: skip gradient tracking.
            with torch.no_grad():
                model_output = model(**encoded_input)
            # Use the CLS token (position 0) as the sentence embedding. The
            # slice keeps the leading batch axis, so each stored array is 2-D
            # with a single row, exactly as before.
            all_embeddings.append(model_output.last_hidden_state[:, 0, :].numpy())

        # Store embeddings in the DataFrame
        embedded['feature_embedding'] = all_embeddings

        print("BERT embeddings generated!")
        return embedded

    except Exception as e:
        # Best-effort: report the failure and hand back the untouched input.
        print(f"Error generating embeddings: {e}")
        return data

In [None]:
# Function to save embeddings
def save_embeddings(data, file_path):
    """Pickle ``data`` to ``file_path`` and print the outcome.

    Args:
        data: Object exposing ``to_pickle`` (the notebook passes DataFrames).
        file_path: Destination handed straight to ``data.to_pickle``.

    Returns:
        None. A confirmation line is printed on success; if pickling raises,
        the exception is reported on stdout instead of propagating.
    """
    try:
        data.to_pickle(file_path)
        outcome = f"Embeddings saved to {file_path}"
    except Exception as err:
        outcome = f"Error saving embeddings: {err}"
    print(outcome)

In [None]:
if preprocessed_product_data is not None:
    # Hand each generator its own copy of the loaded data so the two result
    # frames can never alias one another (or the loaded frame) and end up
    # sharing a single `feature_embedding` column.

    # Generate embeddings using Sentence Transformer
    sentence_transformer_data = generate_sentence_transformer_embeddings(preprocessed_product_data.copy())
    save_embeddings(sentence_transformer_data, 'sentence_transformer_embeddings.pkl')

    # Generate embeddings using Bert
    bert_data = generate_bert_embeddings(preprocessed_product_data.copy())
    save_embeddings(bert_data, 'bert_embeddings.pkl')





Embeddings generated!
Embeddings saved to sentence_transformer_embeddings.pkl


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT embeddings generated!
Embeddings saved to bert_embeddings.pkl
