In [None]:
pip install sentence_transformers



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

In [None]:
# Read the preprocessed product table; the bare expression on the last line
# renders the frame as this cell's output.
PREPROCESSED_CSV_PATH = '/content/preprocessed_product_data.csv'
preprocessed_product_data = pd.read_csv(PREPROCESSED_CSV_PATH)
preprocessed_product_data

Unnamed: 0,link,price,actual_price,ratings,color,processed_title
0,https://www.daraz.com.np/products/dell-vostro-...,55999.0,55999.0,13.0,Black,dell vostro 3520 i3 12th gen 16gb ram 512gb ss...
1,https://www.daraz.com.np/products/apple-macboo...,109900.0,139900.0,76.0,Space Grey,apple macbook air 13inch m1 256gb oliz store
2,https://www.daraz.com.np/products/dell-vostro-...,68999.0,68999.0,22.0,Black,dell vostro 3520 i5 12th gen 16gb ram 512gb ss...
3,https://www.daraz.com.np/products/dell-vostro-...,64000.0,64000.0,8.0,Black,dell vostro 3520 i5 12th gen 8gb ram 256gb ssd...
4,https://www.daraz.com.np/products/acer-nitro-v...,137999.0,137999.0,2.0,Black,acer nitro v 15 i7 13th gen 13620h 16gb ddr5 5...
...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,75990.0,88000.0,0.0,Silver,asus x515 i5 11th gen 8gb ram 512gb ssd 2gb nv...
529,https://www.daraz.com.np/products/asus-vivoboo...,89990.0,110000.0,0.0,Black,asus vivobook 16 f1605va intel core i5 13th ge...
530,https://www.daraz.com.np/products/lenovo-ideap...,52000.0,52000.0,0.0,Brown,lenovo ideapad 3 amd ryzen 5300u processor 4gb...
531,https://www.daraz.com.np/products/lenovo-ideap...,55000.0,55000.0,0.0,Grey,lenovo ideapad slim 3 windows 11 156 inch hd i...


In [None]:
# Function to generate embeddings using Sentence Transformer
def generate_sentence_transformer_embeddings(data):
    """Return a copy of ``data`` with Sentence Transformer title embeddings.

    Every value of the ``processed_title`` column is encoded with the
    ``bert-base-nli-mean-tokens`` model and the resulting vectors are stored,
    one per row, in a new ``feature_embedding`` column.

    Args:
        data: DataFrame that contains a ``processed_title`` column.

    Returns:
        A new DataFrame with the extra ``feature_embedding`` column. The input
        frame is never modified. If anything fails (model load, missing
        column, encoding error) the error is printed and the original,
        unmodified ``data`` is returned instead, so callers can detect failure
        by the absence of ``feature_embedding``.
    """
    try:
        # Load the Sentence Transformer model
        model = SentenceTransformer('bert-base-nli-mean-tokens')

        # Encode all titles in one call so the model can batch them, rather
        # than invoking it once per row.
        titles = data['processed_title'].tolist()
        embeddings = model.encode(titles) if titles else []

        # Write to a copy: the caller's frame may be reused for another
        # embedding run, which would otherwise overwrite this column.
        result = data.copy()
        # list(...) splits the batch result into one vector per row.
        result['feature_embedding'] = list(embeddings)

        print("Embeddings generated!")
        return result
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return data

In [None]:
# Function to generate embeddings using BERT

def generate_bert_embeddings(data):
    """Return a copy of ``data`` with BERT [CLS] embeddings of each title.

    Each value of the ``processed_title`` column is tokenized and passed
    through ``bert-base-uncased`` on its own; the hidden state at token
    position 0 of the last layer is kept as the embedding and stored in a new
    ``feature_embedding`` column.

    Args:
        data: DataFrame that contains a ``processed_title`` column.

    Returns:
        A new DataFrame with the extra ``feature_embedding`` column. The input
        frame is never modified. If anything fails (model load, missing
        column, tokenization error) the error is printed and the original,
        unmodified ``data`` is returned instead, so callers can detect failure
        by the absence of ``feature_embedding``.
    """
    try:
        # Load pre-trained BERT model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        # Generate one embedding per row of 'processed_title'
        all_embeddings = []
        for text in data['processed_title']:
            encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
            # Inference only: no gradients are needed.
            with torch.no_grad():
                model_output = model(**encoded_input)
            # Use the CLS token's embedding as the sentence embedding.
            # The slice keeps the leading batch axis, so each stored array is
            # 2-D (one row per input text), not a flat vector.
            embedding = model_output.last_hidden_state[:, 0, :].numpy()
            all_embeddings.append(embedding)

        # Write to a copy: the caller's frame may already carry (or later
        # receive) a 'feature_embedding' column from another embedding run.
        result = data.copy()
        result['feature_embedding'] = all_embeddings

        print("BERT embeddings generated!")
        return result

    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return data

In [None]:
# Function to save embeddings
def save_embeddings(data, file_path):
    """Pickle ``data`` to ``file_path`` and report the outcome on stdout.

    Args:
        data: Object exposing ``to_pickle`` (e.g. a DataFrame with embeddings).
        file_path: Destination passed straight to ``data.to_pickle``.

    Returns:
        None. Failures are not raised; they are printed instead.
    """
    try:
        data.to_pickle(file_path)
        outcome = f"Embeddings saved to {file_path}"
    except Exception as err:
        outcome = f"Error saving embeddings: {err}"
    print(outcome)

In [None]:
if preprocessed_product_data is not None:
    # Each generator receives its own copy of the preprocessed frame. Sharing
    # one frame would make both results alias the same object, so the second
    # run would overwrite the first run's 'feature_embedding' column, and a
    # failed second run would save the first run's vectors under the wrong name.

    # Generate embeddings using Sentence Transformer
    sentence_transformer_data = generate_sentence_transformer_embeddings(preprocessed_product_data.copy())
    # The generators report failure by returning the frame without the column.
    if 'feature_embedding' in sentence_transformer_data.columns:
        save_embeddings(sentence_transformer_data, 'sentence_transformer_embeddings.pkl')
    else:
        print("Skipping save: Sentence Transformer embeddings were not generated.")

    # Generate embeddings using Bert
    bert_data = generate_bert_embeddings(preprocessed_product_data.copy())
    if 'feature_embedding' in bert_data.columns:
        save_embeddings(bert_data, 'bert_embeddings.pkl')
    else:
        print("Skipping save: BERT embeddings were not generated.")



Embeddings generated!
Embeddings saved to sentence_transformer_embeddings.pkl
BERT embeddings generated!
Embeddings saved to bert_embeddings.pkl
