In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
from sentence_transformers import SentenceTransformer




In [None]:
class SemanticSearch:
    """Embedding-based document search built on a SentenceTransformer encoder."""

    def __init__(self, documents):
        """Load the encoder and embed the whole corpus once.

        Args:
            documents: Collection of document strings; kept on the instance so
                hits can be returned as text.
        """
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.documents = documents
        self.document_embeddings = self.model.encode(documents)

    def retriver(self, query, topk=5):
        """Return the ``topk`` documents scoring highest against ``query``.

        Args:
            query: Text to encode and compare with the stored embeddings.
            topk: Maximum number of documents to return.

        Returns:
            List of documents ordered from highest to lowest score.
        """
        query_vector = self.model.encode(query)
        # NOTE(review): this is a raw dot product; nothing here normalizes the
        # embeddings, so it is not cosine similarity -- confirm that is intended.
        similarity = np.dot(self.document_embeddings, query_vector.T)
        best_first = np.argsort(similarity)[::-1]
        selected = best_first[:topk]
        return [self.documents[position] for position in selected]

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be saved with the
# file and leak when the notebook is shared or committed. Read it from the
# HF_TOKEN environment variable instead, and fall back to the interactive
# prompt that login() shows when it is called without a token.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

class SemanticRetriever:
    """Rank documents against a query using SentenceTransformer embeddings."""

    def __init__(self, documents):
        """Load the encoder and embed the corpus once.

        Args:
            documents: Collection of document strings to index.
        """
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        # Encode as a tensor, then move it to the CPU so retrieve() can
        # convert it to NumPy.
        self.document_embeddings = self.model.encode(documents, convert_to_tensor=True)
        if torch.is_tensor(self.document_embeddings):
            self.document_embeddings = self.document_embeddings.cpu()

    def retrieve(self, query, top_k=5):
        """Return indices of the ``top_k`` best-scoring documents, best first.

        Args:
            query: Text to encode and score against the stored embeddings.
            top_k: Maximum number of indices to return. Zero or a negative
                value yields an empty result.

        Returns:
            1-D NumPy integer array of document indices, ordered from the
            highest score to the lowest.
        """
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        if torch.is_tensor(query_embedding):
            query_embedding = query_embedding.cpu()

        # NOTE(review): this is a raw dot product; nothing here normalizes the
        # embeddings, so it is not cosine similarity -- confirm that is intended.
        scores = np.dot(self.document_embeddings.numpy(), query_embedding.numpy().T)

        # Reverse first, then truncate. Slicing the ascending order with
        # [-top_k:] turns into [-0:] when top_k == 0 and would return every
        # document instead of none.
        limit = max(top_k, 0)
        top_indices = np.argsort(scores, axis=0)[::-1][:limit]
        return top_indices.flatten()  # Flatten in case the scores were 2-D

class BM25Retriever:
    """Rank documents against a query with a BM25Okapi index."""

    def __init__(self, documents):
        """Tokenize the corpus and build the BM25 index.

        Args:
            documents: Iterable of document strings.
        """
        # Tokens come from splitting on single spaces only, so case and
        # attached punctuation are kept as part of each token.
        tokenized_documents = [doc.split(" ") for doc in documents]
        self.bm25 = BM25Okapi(tokenized_documents)

    def retrieve(self, query, top_k=5):
        """Return indices of the ``top_k`` best-scoring documents, best first.

        Args:
            query: Query string; tokenized the same way as the corpus.
            top_k: Maximum number of indices to return. Zero or a negative
                value yields an empty result.

        Returns:
            1-D NumPy integer array of document indices, ordered from the
            highest BM25 score to the lowest.
        """
        tokenized_query = query.split(" ")
        scores = self.bm25.get_scores(tokenized_query)

        # Reverse first, then truncate. Slicing the ascending order with
        # [-top_k:] turns into [-0:] when top_k == 0 and would return every
        # document instead of none.
        limit = max(top_k, 0)
        return np.argsort(scores)[::-1][:limit]

class SearchEngine:
    """Front end that answers queries with either a semantic or a BM25 retriever."""

    def __init__(self, documents):
        """Build both retrievers over ``documents`` and keep the corpus for lookup.

        Args:
            documents: Indexable collection of document strings.
        """
        self.semantic_retriever = SemanticRetriever(documents)
        self.bm25_retriever = BM25Retriever(documents)
        self.documents = documents

    def search(self, query, retriever_type='semantic', top_k=5):
        """Return the documents the chosen retriever ranks highest for ``query``.

        Args:
            query: Query text passed through to the retriever.
            retriever_type: Either ``'semantic'`` or ``'bm25'``.
            top_k: Maximum number of documents requested from the retriever.

        Returns:
            List of documents in the order the retriever returned their indices.

        Raises:
            ValueError: If ``retriever_type`` is not one of the supported names.
        """
        # Pick the backend first so the lookup below has a single exit path.
        if retriever_type == 'semantic':
            backend = self.semantic_retriever
        elif retriever_type == 'bm25':
            backend = self.bm25_retriever
        else:
            raise ValueError("Invalid retriever type. Choose 'semantic' or 'bm25'.")

        ranked_positions = backend.retrieve(query, top_k)
        return [self.documents[position] for position in ranked_positions]

# Demo: index a three-sentence corpus and query it through each backend.
documents = [
    "Semantic retrieval uses deep learning models.",
    "BM25 is a ranking function used in information retrieval.",
    "Both methods have their own advantages and disadvantages.",
]

# A single engine builds both the embedding retriever and the BM25 retriever.
search_engine = SearchEngine(documents)

# The default top_k (5) is larger than this corpus, so each call returns all
# three documents, ordered by score.
results = search_engine.search(query="deep learning", retriever_type='semantic')
print("Semantic Retrieval Results:", results)

results = search_engine.search(query="ranking function", retriever_type='bm25')
print("BM25 Retrieval Results:", results)

Semantic Retrieval Results: ['Semantic retrieval uses deep learning models.', 'BM25 is a ranking function used in information retrieval.', 'Both methods have their own advantages and disadvantages.']
BM25 Retrieval Results: ['BM25 is a ranking function used in information retrieval.', 'Both methods have their own advantages and disadvantages.', 'Semantic retrieval uses deep learning models.']
