# Cache

Run a PostgreSQL database (using the `ankane/pgvector` image) in a Docker container:
  ```bash
  docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=postgres ankane/pgvector
  ```

In [1]:
# Automatically reload edited modules before each cell executes (useful while developing).
%load_ext autoreload
%autoreload 2

## On its own

In [2]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.caching import CacheChecker
from haystack.utils import Secret

In [3]:
# Fresh in-memory document store; nothing is written to it here, so it starts out empty.
my_doc_store = InMemoryDocumentStore()

In [4]:
# URL-based caching: each item is looked up via the "url" cache field.
url_items = [
    "https://example.com/resource",
    "https://another_example.com/other_resources",
]
cache_checker = CacheChecker(document_store=my_doc_store, cache_field="url")
cache_check_results = cache_checker.run(items=url_items)

In [5]:
# Hits: Documents found in the cache, i.e. those whose metadata 'url' is one of the requested URLs.
print(cache_check_results["hits"])
# Misses: requested URLs that were not found in the cache, e.g. ["https://example.com/resource"].
# Nothing has been written to the store in this notebook so far, so expect every URL to be a miss.
print(cache_check_results["misses"])

[]
['https://example.com/resource', 'https://another_example.com/other_resources']


In [6]:
# Caching keyed on a custom identifier stored under "metadata_field".
identifier_items = ["12345", "ABCDE"]
cache_checker = CacheChecker(document_store=my_doc_store, cache_field="metadata_field")
cache_check_results = cache_checker.run(items=identifier_items)

In [7]:
# Hits: Documents found in the cache, i.e. those whose metadata 'metadata_field' is one of the requested values.
print(cache_check_results["hits"])
# Misses: requested values that were not found in the cache, e.g. ["ABCDE"].
# Nothing has been written to the store in this notebook so far, so expect every value to be a miss.
print(cache_check_results["misses"])

[]
['12345', 'ABCDE']


## In a pipeline

In [8]:
from haystack import Pipeline
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.caching import CacheChecker
from haystack.document_stores.in_memory import InMemoryDocumentStore

In [9]:
# Indexing pipeline: only cache misses (file paths with no matching Document in the
# store) are converted, cleaned, split and written.
document_store = InMemoryDocumentStore()
pipeline = Pipeline()

# (name, component) pairs, registered in processing order.
components = [
    ("cache_checker", CacheChecker(document_store=document_store, cache_field="file_path")),
    ("text_file_converter", TextFileToDocument()),
    ("cleaner", DocumentCleaner()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30)),
    ("writer", DocumentWriter(document_store=document_store)),
]
for component_name, component in components:
    pipeline.add_component(instance=component, name=component_name)

# (sender output, receiver input) pairs describing the data flow.
connections = [
    ("cache_checker.misses", "text_file_converter.sources"),
    ("text_file_converter.documents", "cleaner.documents"),
    ("cleaner.documents", "splitter.documents"),
    ("splitter.documents", "writer.documents"),
]
for sender, receiver in connections:
    pipeline.connect(sender, receiver)

# Bare expression so the cell still displays the pipeline summary.
pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7bc0aff3ef90>
🚅 Components
  - cache_checker: CacheChecker
  - text_file_converter: TextFileToDocument
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
  - writer: DocumentWriter
🛤️ Connections
  - cache_checker.misses -> text_file_converter.sources (List)
  - text_file_converter.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> splitter.documents (List[Document])
  - splitter.documents -> writer.documents (List[Document])

In [10]:
# Save a diagram of the pipeline graph to "pipeline.png" (path is relative to the working directory).
pipeline.draw("pipeline.png")

In [11]:
# First run: pass a single file path to the cache checker. The file is not in the
# document store yet, so it is reported as a miss and flows through the converter,
# cleaner, splitter and writer.
result = pipeline.run({"cache_checker": {"items": ["code_of_conduct_1.txt"]}})
print(result)

{'cache_checker': {'hits': []}, 'writer': {'documents_written': 1}}


In [12]:
# Second run with the same input: the file was already processed, so the cache checker
# returns the stored Document as a hit and the writer has nothing new to write.
result = pipeline.run({"cache_checker": {"items": ["code_of_conduct_1.txt"]}})
print(result)

{'cache_checker': {'hits': [Document(id=090bc39394bb7294026a00230d15a9da39d9eb432a65fe6134bad58a96389109, content: 'My name is Cuong, I am a software engineer.', meta: {'file_path': 'code_of_conduct_1.txt', 'source_id': '8a71ffffa4d1c4114c5cd8e459f1d7bf193440d187dccd80f2d1de3c74112677', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []})]}, 'writer': {'documents_written': 0}}
