# PDF with `PGVector`


In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

## `PGVector` on its own: indexing the PDF


In [2]:
import re

from haystack import Pipeline, component, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PDFMinerToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.utils import Secret
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder


from typing import List, Dict, Any

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
@component
class CleanBinaryData:
    """
    Strip non-printable-ASCII characters from document text and collapse whitespace.

    Every character outside the printable ASCII range (0x20-0x7E) is replaced
    by a space, then each run of whitespace is collapsed into a single space.
    Note that this removes control bytes as well as every non-ASCII character,
    including accented letters.
    """
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Clean the ``content`` of each document.

        The documents are modified in place; the same objects are returned.
        Documents whose ``content`` is ``None`` are passed through unchanged.

        :param documents: Documents whose text should be cleaned.
        :returns: A dictionary with the key ``documents`` holding the cleaned documents.
        """
        docs: List[Document] = []
        for doc in documents:
            # A document may carry no text at all; re.sub would raise TypeError on None.
            if doc.content is not None:
                printable_only = re.sub(r'[^\x20-\x7E]', ' ', doc.content)
                doc.content = re.sub(r'\s+', ' ', printable_only)
            docs.append(doc)
        return {"documents": docs}

In [4]:
# Embedder used at indexing time; the store's embedding_dimension below must
# match the size of the vectors this model produces.
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# The connection string is read from the PG_CONN_STR environment variable so
# that database credentials never live in the notebook. Set it before starting
# the kernel, e.g.:
#   export PG_CONN_STR="postgresql://USER:PASSWORD@HOST:5432/DBNAME"
# recreate_table=True: the indexing run starts from a freshly created table.
document_store = PgvectorDocumentStore(
    embedding_dimension=384,
    vector_function="cosine_similarity",
    recreate_table=True,
    search_strategy="hnsw",
    connection_string=Secret.from_env_var("PG_CONN_STR")
)

In [None]:
# Indexing pipeline: convert the PDF to text, clean it, split it into
# overlapping sentence windows, strip non-ASCII characters, embed, and write
# the result to the document store.
pipeline = Pipeline()

indexing_stages = [
    ("converter", PDFMinerToDocument()),
    ("cleaner", DocumentCleaner()),
    ("cleaner2", CleanBinaryData()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=20, split_overlap=15)),
    ("embedder", doc_embedder),
    ("writer", DocumentWriter(document_store=document_store)),
]
for stage_name, stage in indexing_stages:
    pipeline.add_component(stage_name, stage)

# Wiring order differs from registration order: cleaner2 runs after the splitter.
indexing_edges = [
    ("converter", "cleaner"),
    ("cleaner", "splitter"),
    ("splitter", "cleaner2"),
    ("cleaner2", "embedder"),
    ("embedder", "writer"),
]
for sender, receiver in indexing_edges:
    pipeline.connect(sender, receiver)

# Keep the pipeline as the cell's last expression so its summary is displayed.
pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7cdfb4be5f30>
🚅 Components
  - converter: PDFMinerToDocument
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - converter.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> splitter.documents (List[Document])
  - splitter.documents -> embedder.documents (List[Document])
  - embedder.documents -> writer.documents (List[Document])

In [6]:
# Run the indexing pipeline on the source PDF; the converter is the only
# component that needs external input.
file_names = ["vi-vks.pdf"]
indexing_inputs = {"converter": {"sources": file_names}}
pipeline.run(indexing_inputs)

Batches: 100%|██████████| 25/25 [00:22<00:00,  1.10it/s]


DocumentStoreError: Could not write documents to PgvectorDocumentStore. 
You can find the SQL query and the parameters in the debug logs.

# Retrieve

In [7]:
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever

# Re-open the existing table for querying (recreate_table=False keeps the
# indexed documents). The connection string comes from the PG_CONN_STR
# environment variable so that database credentials never live in the
# notebook, e.g.:
#   export PG_CONN_STR="postgresql://USER:PASSWORD@HOST:5432/DBNAME"
document_store = PgvectorDocumentStore(
    embedding_dimension=384,
    vector_function="cosine_similarity",
    recreate_table=False,
    search_strategy="hnsw",
    connection_string=Secret.from_env_var("PG_CONN_STR")
)


# Query pipeline: embed the query text with the same model used for indexing,
# then retrieve the nearest documents from the store.
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(truncate_dim=384, model="sentence-transformers/all-MiniLM-L6-v2"))
query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "VKS"

result = query_pipeline.run({"text_embedder": {"text": query}})


Batches: 100%|██████████| 1/1 [00:00<00:00, 89.04it/s]
HNSW index already exists and won't be recreated. If you want to recreate it, pass 'hnsw_recreate_index_if_exists=True' to the Document Store constructor


In [8]:
result

{'retriever': {'documents': [Document(id=0a1913f96cfc98f4b6988362e636c0de86bf4cf5614e0866b1e867389a9e0408, content: 'vn/vserver/block- store/snapshot/overview B c 2 Ch n Activate Snapshot Service. V d : 277 C i t VNGC...', meta: {'split_id': 619, 'file_path': 'vi-vks.pdf', 'source_id': 'de28511a614fbb9b5c43ed3dab62a0ffc098170490430149a54b1c96e12be71b', 'page_number': 276, '_split_overlap': [{'range': [295, 1502], 'doc_id': '0460a64f86f0c3116124931fa304741018d72f09b99954d66f341f54804331b7'}, {'range': [0, 1512], 'doc_id': '810694f116851996e63dfc4149ab15672d7e1392e98babd6806005a914eac78a'}], 'split_idx_start': 271557}, score: 0.3311329483985901, embedding: vector of size 384),
   Document(id=9e74d4e411e854b03bf7093e61e6ea5a1711124fe90ee68c9eb553cb37f4c736, content: 'console.vngcloud.vn/vserver/block- store/snapshot/overview B c 2 Ch n Activate Snapshot Service. V d...', meta: {'split_id': 212, 'file_path': 'vi-vks.pdf', 'source_id': 'de28511a614fbb9b5c43ed3dab62a0ffc098170490430149a54b1c