<a target="_blank" href="https://colab.research.google.com/github/deepset-ai/haystack-core-integrations/blob/main/integrations/pinecone/examples/pinecone_documentstore_example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" width="200" alt="Open In Colab"/>
</a>

In [None]:
# Install the Pinecone integration, Haystack will come as a dependency
# Install also some optional dependencies needed for Markdown conversion and text embedding
!pip install -U pinecone-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"

In [None]:
# Download some markdown files to index.
# The repository is cloned into the `neural-search-pills` directory; the indexing
# cell reads its Markdown files from `neural-search-pills/pills/`.
!git clone https://github.com/anakin87/neural-search-pills

Cloning into 'neural-search-pills'...
remote: Enumerating objects: 190, done.
remote: Counting objects: 100% (190/190), done.
remote: Compressing objects: 100% (136/136), done.
remote: Total 190 (delta 97), reused 130 (delta 51), pack-reused 0
Receiving objects: 100% (190/190), 1.38 MiB | 21.77 MiB/s, done.
Resolving deltas: 100% (97/97), done.


In [None]:
# Create the indexing Pipeline and index some documents

import glob
import os
from getpass import getpass

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter

from pinecone_haystack import PineconeDocumentStore
from pinecone_haystack.dense_retriever import PineconeDenseRetriever


# Collect the Markdown files from the cloned repository. Sorting makes the
# indexing order reproducible, because glob returns matches in arbitrary order.
file_paths = sorted(glob.glob("neural-search-pills/pills/*.md"))
if not file_paths:
    # Fail early with a clear message instead of silently indexing nothing.
    raise FileNotFoundError(
        "No Markdown files found in 'neural-search-pills/pills/'. Run the git clone cell first."
    )

# Never hardcode credentials in a notebook: read the key from the environment
# and fall back to an interactive prompt that does not echo the input.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")

# `dimension` has to match the size of the vectors produced by the embedder
# (the retrieved documents in this notebook report embeddings of size 768).
document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    environment="gcp-starter",
    index="default",
    namespace="default",
    dimension=768,
)

# Indexing Pipeline: convert Markdown -> split into 2-sentence chunks -> embed -> write to Pinecone.
indexing = Pipeline()
indexing.add_component("converter", MarkdownToDocument())
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "splitter")
indexing.connect("splitter", "embedder")
indexing.connect("embedder", "writer")

# Last expression of the cell, so the run summary is displayed as the cell output.
indexing.run({"converter": {"sources": file_paths}})

Converting markdown files to Documents: 100%|██████████| 14/14 [00:00<00:00, 163.72it/s]


Batches:   0%|          | 0/5 [00:00<?, ?it/s]



Upserted vectors:   0%|          | 0/130 [00:00<?, ?it/s]

{'writer': {'documents_written': 130}}

In [None]:
# Build the retrieval Pipeline (question embedder -> Pinecone retriever) and run a sample query

querying = Pipeline()
querying.add_component("embedder", SentenceTransformersTextEmbedder())
querying.add_component(
    "retriever",
    PineconeDenseRetriever(document_store=document_store, top_k=3),
)
querying.connect("embedder", "retriever")

# The question is fed to the embedder, which is connected to the retriever.
results = querying.run({"embedder": {"text": "What is Question Answering?"}})

# Print every retrieved document, each followed by a ten-dash separator line.
separator = "-" * 10
for doc in results["retriever"]["documents"]:
    print(doc, separator, sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document(id=c0e30c24615acd02ae5a5ddb82d7fd8f9dc0f3e9a37e31fc22850d5ed9b953f5, content: 'Combining document retrieval and machine comprehension for Question Answering

A Question Answering ...', meta: {'file_path': 'neural-search-pills/pills/machine-reading-at-scale.md', 'source_id': '289dfdab5e71750a6fe9220f1a8adb3dad1d1985377e0632735acfc36613c310'}, score: 0.553784668, embedding: vector of size 768)
----------
Document(id=e343a663d9f9a1e8aee56c313b4bf5fcfa201fb890b669fa8aabea428270aa53, content: 'Extractive Question Answering Evaluation

A Reader 📖 is a model that performs Extractive Question An...', meta: {'file_path': 'neural-search-pills/pills/extractive-qa-evaluation.md', 'source_id': '18d3947b572918509a4e0624179e0e70e9ebbffbfd3c9bd03ae1e0d5145c9959'}, score: 0.521423876, embedding: vector of size 768)
----------
Document(id=ba2af8037ab206430aaa8c0eb401319e004063c3c41a085562cd16739b637d50, content: '

Instead, a Question Answering system expects a question asked in a natural langu