In [3]:
pip install pinecone-client pinecone-text pinecone-notebooks langchain_community

Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain_community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting SQLAlchemy<3.0.0,>=1.4.0 (from langchain_community)
  Downloading sqlalchemy-2.0.45-py3-none-any.whl.metadata (9.5 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.13.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.10.1 (from langchain_community)
  Downloading pydantic_settings-2.12.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.3-py3-none-any.whl.metadata (9.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp<4.0.0,>=3.8.3->lang

In [5]:
pip install langchain-huggingface pinecone


Collecting pinecone
  Downloading pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Collecting pinecone-plugin-assistant<4.0.0,>=3.0.1 (from pinecone)
  Downloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl.metadata (30 kB)
Collecting packaging>=20.9 (from huggingface-hub<1.0.0,>=0.33.4->langchain-huggingface)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-8.0.0-py3-none-any.whl (745 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.9/745.9 kB[0m [31m2.5 MB/s[0m  [33m0:00:00[0m36m-:--:--[0m
[?25hDownloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl (280 kB)
Downloading packaging-24.2-py3-none-any.whl (65 kB)
Installing collected packages: packaging, pinecone-plugin-assistant, pinecone
[2K  Attempting uninstall: packaging
[2K    Found existing installation: packaging 25.0
[2K    Uninstalling packaging-25.0:
[2K      Successfully uninstalled packaging-25.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

from dotenv import load_dotenv
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec

# Copy variables from a local .env file into the process environment. The call
# stays ahead of the langchain_huggingface import, matching the original
# ordering, in case that package reads environment variables at import time.
load_dotenv()

from langchain_huggingface import (
    ChatHuggingFace,
    HuggingFaceEndpoint,
    HuggingFaceEndpointEmbeddings,
)

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [2]:
# Later cells refer to the Pinecone index by this name.
index_name = "hybrid-search-langchain-pinecone"

# Credentials are looked up in the process environment; either value is None
# when its variable is not set.
pinecone_api_key = os.environ.get("PINE_CONE_API_KEY")
hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN")

# Pinecone client. The variable shares its name with the package, so later
# cells that write `pinecone.<method>` are calling this client object.
pinecone = Pinecone(api_key=pinecone_api_key)

# Create the serverless index only when it is missing, so re-running this cell
# does not attempt a second create.
index_is_missing = not pinecone.has_index(index_name)
if index_is_missing:
    pinecone.create_index(
        name=index_name,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        metric="dotproduct",  # metric chosen because the index also stores sparse values
        dimension=384,  # presumably the embedding model's output size; confirm against the model
    )

In [3]:
# Handle for the index identified by `index_name`.
index = pinecone.Index(name=index_name)

# Dense embedding model reached through the Hugging Face endpoint integration.
# The variable keeps its original spelling because later cells reference it.
embaddings_model = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=hugging_face_token,
    repo_id="BAAI/bge-small-en",
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from pinecone_text.sparse import BM25Encoder

# `default()` is a static factory on BM25Encoder, so it is called on the class;
# instantiating BM25Encoder first only builds an encoder that is thrown away.
# It returns an encoder loaded with pinecone-text's published default parameters.
sparse_encoder = BM25Encoder.default()

In [5]:
# Small corpus the sparse encoder is fitted on.
sentences = [
    "LangChain is a framework for developing applications powered by language models.",
    "Pinecone is a vector database that makes it easy to add semantic search capabilities to your applications.",
    "Hybrid search combines both dense and sparse retrieval methods to improve search accuracy and relevance."
]

# File that holds the fitted encoder values between the dump and the load.
bm25_values_path = "bm25_values.json"

# Fit the BM25 encoder on the corpus above (nothing is written to Pinecone here).
sparse_encoder.fit(sentences)

# Persist the fitted values to disk ...
sparse_encoder.dump(bm25_values_path)

# ... and rebuild the encoder from that file, replacing the in-memory one.
sparse_encoder = BM25Encoder().load(bm25_values_path)

100%|██████████| 3/3 [00:00<00:00, 284.47it/s]


In [6]:
# Hybrid retriever wired to the Pinecone index handle, the sparse (BM25)
# encoder and the dense embedding model.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=sparse_encoder,
    embeddings=embaddings_model,
)


In [9]:
# First batch: the same corpus the sparse encoder was fitted on. Reusing
# `sentences` (defined in the fitting cell) instead of repeating the literal
# keeps the fitted corpus and the indexed corpus from drifting apart.
retriever.add_texts(sentences)

# Second batch: additional texts that were not part of the fitting corpus.
retriever.add_texts([
    "LangChain enables developers to build applications that can understand and generate human-like text.",
    "Pinecone provides a fully managed service for storing and querying vector embeddings at scale.",
    "By leveraging both dense embeddings and sparse representations, hybrid search can deliver more relevant results."
])

100%|██████████| 1/1 [00:07<00:00,  7.30s/it]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


In [10]:
# Run one query through the hybrid retriever; the returned documents are
# displayed as the cell's output.
question = "What is LangChain?"
retriever.invoke(question)

[Document(metadata={'score': 0.720260859}, page_content='LangChain is a framework for developing applications powered by language models.'),
 Document(metadata={'score': 0.687145412}, page_content='LangChain enables developers to build applications that can understand and generate human-like text.'),
 Document(metadata={'score': 0.390558243}, page_content='Hybrid search combines both dense and sparse retrieval methods to improve search accuracy and relevance.'),
 Document(metadata={'score': 0.386444092}, page_content='Pinecone provides a fully managed service for storing and querying vector embeddings at scale.')]