# Using Haystack and the Astra DB integration to build a simple Star Wars RAG

Author: [Carter Rabasa](https://github.com/crtr0)

Blog Post: [datastax.com/blog/using-genai-to-find-a-needle-with-haystack-and-astra-db](https://www.datastax.com/blog/using-genai-to-find-a-needle-with-haystack-and-astra-db)

## Setup

In [None]:
!pip install astra-haystack sentence-transformers

## Loading data using the Astra DB DocumentStore

In [None]:
import logging
import os
from getpass import getpass 
from haystack import Pipeline
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.astra import AstraDocumentStore

# Collect the Astra DB credentials. A value that is already present in the
# environment is reused, so the cell can be re-run (or executed where the
# variables were configured beforehand) without prompting again and without
# overwriting that configuration.
for env_var, prompt_text in (
    ("ASTRA_DB_API_ENDPOINT", "Astra DB API Endpoint:"),
    ("ASTRA_DB_APPLICATION_TOKEN", "Astra DB Application Token:"),
):
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass(prompt_text)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Pipeline components: fetch the page, convert the HTML into documents and
# split those documents into chunks of 50 words.
fetcher = LinkContentFetcher()

converter = HTMLToDocument()

splitter = DocumentSplitter(split_by="word", split_length=50)

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# NOTE(review): embedding_dimension is expected to correspond to the size of
# the vectors produced by `embedding_model_name`; 384 is the value this
# notebook pairs with it -- confirm if the model is changed.
document_store = AstraDocumentStore(
    duplicates_policy=DuplicatePolicy.SKIP,
    embedding_dimension=384,
    collection_name="starwars"
)

index_pipeline = Pipeline()

index_pipeline.add_component(
    instance=SentenceTransformersDocumentEmbedder(model=embedding_model_name),
    name="embedder",
)

index_pipeline.add_component(instance=fetcher, name="fetcher")

index_pipeline.add_component(instance=converter, name="converter")

index_pipeline.add_component(instance=splitter, name="splitter")

# The writer uses the same DuplicatePolicy.SKIP setting as the document store.
index_pipeline.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")

# Wire the components: fetcher -> converter -> splitter -> embedder -> writer.
index_pipeline.connect("fetcher.streams", "converter.sources")

index_pipeline.connect("converter.documents", "splitter.documents")

index_pipeline.connect("splitter.documents", "embedder.documents")

index_pipeline.connect("embedder.documents", "writer.documents")

# Index the Star Wars Wikipedia article.
index_pipeline.run(data={"fetcher": {"urls": ["https://en.wikipedia.org/wiki/Star_Wars"]}})

# Report how many documents the store holds after indexing.
print(document_store.count_documents())

## Running a simple RAG pipeline on the stored data

In [None]:
import os
from getpass import getpass 
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.document_stores.astra import AstraDocumentStore
from haystack_integrations.components.retrievers.astra import AstraEmbeddingRetriever

# Collect the credentials this cell needs. A value that is already present in
# the environment (for example, entered in an earlier cell of this session) is
# reused instead of being prompted for a second time and overwritten.
for env_var, prompt_text in (
    ("ASTRA_DB_API_ENDPOINT", "Astra DB API Endpoint:"),
    ("ASTRA_DB_APPLICATION_TOKEN", "Astra DB Application Token:"),
    ("OPENAI_API_KEY", "OpenAI API Key:"),
):
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass(prompt_text)

# NOTE(review): presumably set to avoid tokenizers parallelism warnings -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Jinja template rendered by the PromptBuilder: it lists the content of every
# retrieved document, followed by the user's question.
prompt_template = """
                Given these documents, answer the question.
                Documents:
                {% for doc in documents %}
                    {{ doc.content }}
                {% endfor %}
                Question: {{question}}
                Answer:
                """

# Uses the "starwars" collection with an embedding dimension of 384.
document_store = AstraDocumentStore(
    embedding_dimension=384,
    collection_name="starwars"
)

rag_pipeline = Pipeline()
rag_pipeline.add_component(
    instance=SentenceTransformersTextEmbedder(model=embedding_model_name),
    name="embedder",
)
rag_pipeline.add_component(instance=AstraEmbeddingRetriever(document_store=document_store), name="retriever")
rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
rag_pipeline.add_component(instance=OpenAIGenerator(), name="llm")
rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")

# Wire the components: the embedder feeds the retriever, whose output goes to
# both the prompt builder and the answer builder; the prompt goes to the LLM,
# and the LLM's replies and metadata go to the answer builder.
rag_pipeline.connect("embedder", "retriever")
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("llm.meta", "answer_builder.meta")
rag_pipeline.connect("retriever", "answer_builder.documents")

# Run the pipeline
question = "Who is Luke's sister?"
result = rag_pipeline.run(
    {
        "embedder": {"text": question},
        "retriever": {"top_k": 2},
        "prompt_builder": {"question": question},
        "answer_builder": {"query": question},
    }
)

print(result)