In [None]:
# Load secrets from a local .env file (via python-dotenv) and make sure the
# keys this notebook relies on are visible in the process environment.
# `os` and `warnings` are imported here so the cell runs on a fresh kernel
# instead of depending on an import made in a later cell.
import os
import warnings

from dotenv import load_dotenv

load_dotenv()

for env_name in ("HF_TOKEN", "PINECONE_API_KEY"):
    env_value = os.getenv(env_name)
    if env_value is None:
        # Assigning None to os.environ raises TypeError, so report the
        # missing key instead of crashing with an unrelated-looking error.
        warnings.warn(f"{env_name} is not set; add it to your .env file or environment.")
    else:
        os.environ[env_name] = env_value


In [19]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [None]:
import os

# Prefer the Pinecone key from the environment (e.g. loaded from .env) so no
# real credential has to be pasted into the notebook. The placeholder remains
# only as a fallback when the variable is unset or empty.
api_key = os.getenv("PINECONE_API_KEY") or "your_api_key_from_Pinecone"

In [25]:
import os
from pinecone import Pinecone , ServerlessSpec
# Name of the Pinecone index that backs the hybrid-search retriever.
index_name = "hybrid-search-langchain-pinecore"

# Pinecone client, authenticated with the `api_key` defined earlier in the notebook.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so re-running this cell is safe.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        spec=serverless_spec,
        metric="dotproduct",  # sparse values are supported only for dotproduct
        dimension=384,  # dimension of the dense vectors
    )


In [26]:
# Get a handle to the index by name; `pc` and `index_name` come from the previous cell.
index = pc.Index(index_name)
# Bare expression so the notebook displays the handle's repr.
index

<pinecone.db_data.index.Index at 0x746f0c829a30>

In [27]:
# Dense vector embeddings (the sparse encoder is set up in a later cell)

import os


from langchain_huggingface import HuggingFaceEmbeddings

# NOTE(review): the index is created with dimension=384; presumably this model
# emits 384-dimensional vectors — confirm before swapping in another model.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Bare expression so the notebook displays the embedder's configuration.
embedding

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [28]:
# Sparse encoder (BM25) for the sparse side of the hybrid search

from pinecone_text.sparse import BM25Encoder

# Start from the library's default encoder; a later cell fits it on the sample sentences.
bm25_encodeer = BM25Encoder.default()
# Bare expression so the notebook displays the encoder's repr.
bm25_encodeer

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x746f5089f620>

In [29]:
# Small sample corpus used to fit the BM25 encoder.
sentences = [
    "In 2023, I visited Benglore",
    "In 2024, I visited Kerala",
    "In 2025, I visited Mumbai"
]

# Fit the BM25 encoder on the sample sentences.
bm25_encodeer.fit(sentences)

# Store the fitted values in a .json file.
bm25_encodeer.dump("bm25_values.json")

# Load the stored values into a fresh BM25Encoder, replacing the in-memory one.
bm25_encodeer = BM25Encoder().load("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 4953.90it/s]


In [30]:
# Combine the dense embedder, the BM25 sparse encoder and the Pinecone index
# into a single hybrid-search retriever (no other options are passed).
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embedding,
    sparse_encoder=bm25_encodeer,
)


In [31]:
# Bare expression so the notebook displays the retriever's configuration.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x746f2705fb90>, index=<pinecone.db_data.index.Index object at 0x746f0c829a30>)

In [32]:
# Index the same sentences the BM25 encoder was fitted on. Reusing `sentences`
# (rather than a second copy of the literal list) keeps the fitted corpus and
# the indexed corpus from drifting apart.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:02<00:00,  2.00s/it]


In [36]:
# Run a query through the hybrid retriever; the result is a list of Document
# objects whose metadata carries a relevance score.
retriever.invoke("City name I visited in 2024")

[Document(metadata={'score': 0.340681016}, page_content='In 2024, I visited Kerala'),
 Document(metadata={'score': 0.293765306}, page_content='In 2023, I visited Benglore'),
 Document(metadata={'score': 0.280019522}, page_content='In 2025, I visited Mumbai')]

In [34]:
# Display the retriever again (same configuration repr as the earlier display cell).
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x746f2705fb90>, index=<pinecone.db_data.index.Index object at 0x746f0c829a30>)