# Extract information from any website using custom components in a RAG pipeline
*Notebook by [Bhavik Jikadara](https://www.linkedin.com/in/bhavikjikadara/)*

I'm going to build a Retrieval-Augmented Generation (RAG) pipeline that extracts data from a company's sitemap.

### Install dependencies

In [None]:
!pip install -q haystack-ai transformers accelerate sentence_transformers

In [None]:
from typing import List
from haystack import component, Pipeline, Document
from langchain_community.document_loaders.sitemap import SitemapLoader
from IPython.display import Image

# fixes a bug with asyncio and jupyter
import nest_asyncio
nest_asyncio.apply()

# Creating Indexing Pipeline
from typing import List
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Creating RAG Pipeline
import os
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Creating Custom Components
- `FetchDataFromWebsite`: A component that extracts information from any website via its sitemap.
- `PageContent`: Returns the text of each page loaded from the sitemap.

In [None]:
@component
class FetchDataFromWebsite:
    """
    Load the pages referenced by a sitemap.

    The sitemap is read with ``SitemapLoader`` and the result of its
    ``load()`` call is passed on unchanged.
    """

    @component.output_types(sitemap_document=List[Document])
    def run(self, sitemap: str):
        """
        Fetch the pages listed in ``sitemap``.

        :param sitemap: Location of the sitemap; handed to ``SitemapLoader``
            as ``web_path``.
        :returns: A dict with the single key ``"sitemap_document"`` holding
            the loaded documents.
        """
        # NOTE(review): the loader presumably yields LangChain documents
        # (text stored in `.page_content`) rather than Haystack `Document`s,
        # even though the output type is declared with Haystack's class.
        # Confirm before relying on Haystack-specific attributes downstream.
        return {"sitemap_document": SitemapLoader(web_path=sitemap).load()}
    
@component
class PageContent:
    """
    Collect the text of every loaded page.
    """

    @component.output_types(text=List[str])
    def run(self, documents: List[Document]):
        """
        Extract ``page_content`` from each document.

        :param documents: Loaded page objects; each one must expose a
            ``page_content`` attribute.
        :returns: A dict with the single key ``"text"`` holding one string
            per input document, in the same order. An empty input yields an
            empty list.
        """
        # Iterate the documents directly instead of indexing via range(len(...)).
        return {"text": [doc.page_content for doc in documents]}

## Dataset

In [None]:
# Pipeline that turns a sitemap location into a list of page texts.
text_pipeline = Pipeline()

# Register the two custom components under the names used when running the pipeline.
text_pipeline.add_component("sitemap_fetch_data", FetchDataFromWebsite())
text_pipeline.add_component("page_content", PageContent())

# Feed the loaded sitemap documents into the text extractor.
text_pipeline.connect("sitemap_fetch_data.sitemap_document", "page_content.documents")

In [None]:
# Run the extraction pipeline; the sitemap location is the only input it needs.
result = text_pipeline.run(
    data={"sitemap_fetch_data": {"sitemap": "https://www.enactsoft.com/sitemap_index.xml"}}
)

In [None]:
# Page texts produced by the "page_content" component.
enactsoft_data = result["page_content"]["text"]

In [None]:
# Wrap each page text in a Document so it can be passed to the indexing pipeline.
all_documents = [Document(content=page_text) for page_text in enactsoft_data]

## Creating Indexing Pipeline

In [None]:
# In-memory store; the same instance is later handed to the retriever.
document_store = InMemoryDocumentStore()

# Embedder for the documents being indexed.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)

# Writer targeting the store above, configured with the SKIP duplicate policy.
document_writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.SKIP,
)

In [None]:
# Indexing pipeline: embed the documents, then write them to the store.
indexing = Pipeline()

indexing.add_component("document_embedder", document_embedder)
indexing.add_component("document_writer", document_writer)

# Embedded documents flow from the embedder into the writer.
indexing.connect("document_embedder.documents", "document_writer.documents")

In [None]:
# Push every page document through the indexing pipeline.
indexing.run(data={"document_embedder": {"documents": all_documents}})

In [None]:
# Save the indexing pipeline graph as a PNG, then display that file inline.
indexing.draw("indexing.png")
Image(filename="indexing.png")

## Creating RAG Pipeline

In [None]:
# Prompt template: loops over `documents` to list each document's content as
# context, then appends the `question` and leaves the answer open.
template = """
You have to answer the following question based on the given context information only.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

In [None]:
# Query-time pipeline: one component each for embedding the query, retrieving
# documents, building the prompt, generating text, and building the answer.
rag_pipeline = Pipeline()

# Same embedding model as the one used for the documents at indexing time.
rag_pipeline.add_component(
    "query_embedder",
    SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
# Retrieve the top 3 documents from the shared in-memory store.
rag_pipeline.add_component(
    "retriever",
    InMemoryEmbeddingRetriever(document_store=document_store, top_k=3),
)
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
# NOTE(review): this expects an Ollama server reachable on localhost:11434
# serving the "llama3" model -- confirm it is running before executing.
rag_pipeline.add_component(
    "generator",
    OllamaGenerator(
        model="llama3",
        url="http://localhost:11434/api/generate",
        timeout=600,
    ),
)
rag_pipeline.add_component("answer_builder", AnswerBuilder())

In [None]:
# Query embedding -> retriever.
rag_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
# Retrieved documents -> prompt context.
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
# Rendered prompt -> generator.
rag_pipeline.connect("prompt_builder.prompt", "generator.prompt")
# Generator output (replies and metadata) -> answer builder.
rag_pipeline.connect("generator.replies", "answer_builder.replies")
rag_pipeline.connect("generator.meta", "answer_builder.meta")
# Retrieved documents are also passed to the answer builder.
rag_pipeline.connect("retriever.documents", "answer_builder.documents")

In [None]:
# Save the RAG pipeline graph as a PNG, then display that file inline.
rag_pipeline.draw("RAG.png")
Image(filename="RAG.png")

## Ask a Question

In [16]:
question = "What is EnactSoft?"

# The same question is supplied to three components: as the text to embed,
# as the prompt's `question` variable, and as the answer builder's query.
response = rag_pipeline.run(
    data={
        "query_embedder": {"text": question},
        "prompt_builder": {"question": question},
        "answer_builder": {"query": question},
    }
)

In [None]:
# Display the answers collected under the "answer_builder" component's output.
response["answer_builder"]["answers"]