# PDF with `PGVector`


In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## On its own


In [17]:
import re

from haystack import Pipeline, component, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PDFMinerToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.utils import Secret
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder


from typing import List, Dict, Any

In [18]:
@component
class CleanBinaryData:
    """
    Normalize document text to printable ASCII separated by single spaces.

    Every character outside the printable ASCII range (0x20-0x7E) -- control
    characters, newlines, tabs and any non-ASCII text -- is replaced by a
    space, and each run of whitespace is then collapsed into one space.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Clean the ``content`` of each document in place.

        :param documents: Documents whose ``content`` should be normalized.
        :returns: ``{"documents": [...]}`` holding the same ``Document``
            objects, in the same order, with ``content`` rewritten. Documents
            whose ``content`` is ``None`` are passed through unchanged.
        """
        docs: List[Document] = []
        for doc in documents:
            # Document.content may be None (no text payload); re.sub would
            # raise TypeError on it, so such documents are left as they are.
            if doc.content is not None:
                printable_only = re.sub(r'[^\x20-\x7E]', ' ', doc.content)
                doc.content = re.sub(r'\s+', ' ', printable_only)
            docs.append(doc)
        return {"documents": docs}

In [19]:
# Embedder used at indexing time. Its output size must match
# `embedding_dimension` of the document store below (384).
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# The connection string is read from the PG_CONN_STR environment variable so
# that database credentials never live in the notebook. Set it before starting
# the kernel, e.g.
#   export PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
# NOTE(review): credentials were previously committed in this cell; they
# should be rotated.
document_store = PgvectorDocumentStore(
    embedding_dimension=384,
    vector_function="cosine_similarity",
    # Start from an empty table on every run of this cell.
    recreate_table=True,
    search_strategy="hnsw",
    connection_string=Secret.from_env_var("PG_CONN_STR"),
)

In [20]:
# Ingestion pipeline: PDF -> cleaned text -> sentence chunks -> embeddings -> pgvector.
pipeline = Pipeline()

# Register the stages; the order here is the order shown in the pipeline repr.
for stage_name, stage in [
    ("converter", PDFMinerToDocument()),
    ("cleaner", DocumentCleaner()),
    ("cleaner2", CleanBinaryData()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=20, split_overlap=15)),
    ("embedder", doc_embedder),
    ("writer", DocumentWriter(document_store=document_store)),
]:
    pipeline.add_component(stage_name, stage)

# Wire the stages into a single chain. Note that `cleaner2` runs after the
# splitter, i.e. on the individual chunks rather than on the whole document.
for upstream, downstream in [
    ("converter", "cleaner"),
    ("cleaner", "splitter"),
    ("splitter", "cleaner2"),
    ("cleaner2", "embedder"),
    ("embedder", "writer"),
]:
    pipeline.connect(upstream, downstream)

# Show the assembled pipeline as the cell output.
pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x78714af6f1c0>
🚅 Components
  - converter: PDFMinerToDocument
  - cleaner: DocumentCleaner
  - cleaner2: CleanBinaryData
  - splitter: DocumentSplitter
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - converter.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> splitter.documents (List[Document])
  - cleaner2.documents -> embedder.documents (List[Document])
  - splitter.documents -> cleaner2.documents (List[Document])
  - embedder.documents -> writer.documents (List[Document])

In [21]:
# PDF files to ingest; each entry is handed to the converter as a source.
file_names = ["sample.pdf"]
# Run the full ingestion chain; the cell output reports how many documents
# the writer stored.
pipeline.run({"converter": {"sources": file_names}})

Batches: 100%|██████████| 22/22 [00:20<00:00,  1.06it/s]


{'writer': {'documents_written': 687}}

# Retrieve

In [22]:
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever

# Re-open the table populated by the indexing pipeline. `recreate_table=False`
# keeps the stored documents. The connection string is read from the
# PG_CONN_STR environment variable so that database credentials never live in
# the notebook, e.g.
#   export PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
document_store = PgvectorDocumentStore(
    embedding_dimension=384,
    vector_function="cosine_similarity",
    recreate_table=False,
    search_strategy="hnsw",
    connection_string=Secret.from_env_var("PG_CONN_STR"),
)


# Query pipeline: embed the query text, then look up the nearest stored
# documents. The text embedder uses the same model as the document embedder
# so query and document vectors are comparable.
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(truncate_dim=384, model="sentence-transformers/all-MiniLM-L6-v2"))
query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "VKS"

result = query_pipeline.run({"text_embedder": {"text": query}})


Batches: 100%|██████████| 1/1 [00:00<00:00, 92.95it/s]
HNSW index already exists and won't be recreated. If you want to recreate it, pass 'hnsw_recreate_index_if_exists=True' to the Document Store constructor


In [23]:
# Display the retriever output: the matched documents with their similarity scores.
result

{'retriever': {'documents': [Document(id=314ec1b95adceda8a555a248d7d0075fdf48ab807c77707fb23d15c76383c206, content: 'ThuyVT2 1 VKS VKS VNGCloud Kubernetes Service) is a managed service on VNGCloud that helps you simpl...', meta: {'split_id': 0, 'file_path': 'sample.pdf', 'source_id': 'ce307010c1253c7d9f718347fb7d6fcf15e389396daee9ba25f87680a22ef7c5', 'page_number': 1, '_split_overlap': [{'range': [0, 1981], 'doc_id': '409ac8359def34bbf908fc5227ada892aaec849aa2957d23c7108b04854f85c8'}], 'split_idx_start': 0}, score: 0.5519827604293823, embedding: vector of size 384),
   Document(id=9cb4836ba44b6aec2cef5318e82947ff1cb58a6cdaeddf42d61a887bc7435fd3, content: ' Event History : VKS will display the history of events that occur when users work with the Cluster ...', meta: {'split_id': 33, 'file_path': 'sample.pdf', 'source_id': 'ce307010c1253c7d9f718347fb7d6fcf15e389396daee9ba25f87680a22ef7c5', 'page_number': 19, '_split_overlap': [{'range': [796, 2705], 'doc_id': '6a9286f2416994e6d09c91aee11