In [1]:
import os
import json

In [2]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.INFO)
logging.getLogger("haystack").setLevel(logging.INFO)

## RAG with Agent Pipeline

In [3]:
from haystack import Document, Pipeline, component
from haystack.components.others import Multiplexer
from haystack.components.routers import ConditionalRouter
from haystack.components.embedders import OpenAITextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever

In [4]:
@component
class ReturnDocumentsFromRetriever:
    @component.output_types(documents=list[dict])
    def run(self, docs: list[Document]):
        return {"documents": [{"id": doc.id, **doc.meta} for doc in docs]}

@component
class PredictHousePrice:
    @component.output_types(prediction=float, features=dict)
    def run(self, replies: list[str]):
        try:
            features = json.loads(replies[0])
            print(features)
            return {
                "prediction": 1000000,
                "features": {
                    "land_area": features.get("land_area", 0),
                    "house_size": features.get("house_size", 0),
                    "bedrooms": features.get("bedrooms", 0),
                    "bathrooms": features.get("bathrooms", 0),
                }
            }
        except:
            return -1

### Primer for Conditional Router

In [5]:
prompt_for_router_system = """
You are a clever AI agent that can answer questions related to house/real estate price prediction and recommendation.
You will be given a question about house/real estate price prediction and recommendation, and you need to pick the best answer from the following options.
If the user wanted to predict house/real estate price (by using keywords such as predict, perkiraan, berapa, prediksi, or other relevant keywords) and also provides a number of house/real estate specifications such as land area, house area, number of bedrooms, and number of bathrooms, you must answer with 'PREDICTION'.
If the user did not intend to predict house/real estate price or did not provide at least one information above, you should answer with 'DATABASE_SEARCH'.
All other questions should be answered with 'DATABASE_SEARCH' too.
Never answer other than 'PREDICTION' or 'DATABASE_SEARCH'.

Example:
Question: What is the price of a house with 3 bedrooms, 2 bathrooms, 1000 sqft land, and 800 sqft house?
Answer: PREDICTION

Question: Predict the price of a house with 3 bedrooms and a lawn.
Answer: PREDICTION

Question: Recommend me a house with 3 bedrooms, 2 bathrooms, and a swimming pool.
Answer: DATABASE_SEARCH

Question: Show me a house with 2 bedrooms and a garage.
Answer: DATABASE_SEARCH
"""

prompt_for_router = """
Question: {{ query }}
Answer:
"""

prompt_for_prediction_system = """
You are a clever AI agent that can answer questions related to house/real estate price prediction.
You will be given a number of house/real estate specifications such as land area, house area, number of bedrooms, and number of bathrooms, then you must extract those information as JSON.
The user may provide the information in Bahasa Indonesia.

The output schema is:
{
  "land_area": float,   // area of the land in meter squared/luas tanah
  "house_area": float,  // area of the house in meter squared/luas bangunan
  "bedrooms": int,      // number of bedrooms/kamar tidur
  "bathrooms": int      // number of bathrooms/kamar mandi
}

If the user did not provide at least one information above, you should answer with empty JSON such as '{}'.
All other questions should be answered with '{}' too.
Never answer other than the JSON schema or '{}'.
"""

prompt_for_prediction = """
Question: {{ question }}
Answer:
"""

prompt_for_prediction_result = """
You just predicted the price of a house with the following specifications:
Land area: {{ features.land_area }} meter squared
House area: {{ features.house_area }} meter squared
Bedrooms: {{ features.bedrooms }}
Bathrooms: {{ features.bathrooms }}

The predicted price is: IDR {{ prediction }}.

Paraphrase the information above in a complete sentence. Include the specificationa and predicted price in the sentence.
Format the price in Indonesian Rupiah (IDR) with period as the thousand separator and two decimal places.
Always answer in Bahasa Indonesia.
"""

prompt_for_rag = """
You are an assistant for house recommendation/suggestion tasks. You will be given a few documents about property listing along with it's price, address, and specifications.
Give a summary about the house specs and address if you have a match. Do not return the result as lists, but as a paragraph. 
You can suggest more than one house based on the context. If you don't know the answer, just say that you don't know. 
Answer using the same language as the question. Use five sentences maximum and keep the answer concise.

Context:
###
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
###

Question: {{ question }}

Answer:
"""

### Build Pipeline

In [6]:
routes = [
    {
        "condition": "{{'PREDICTION' in replies[0]}}",
        "output": "{{query}}",
        "output_name": "features",
        "output_type": str,
    },
    {
        "condition": "{{'PREDICTION' not in replies[0]}}",
        "output": "{{query}}",
        "output_name": "question",
        "output_type": str,
    }
]

In [7]:
# router
router = ConditionalRouter(routes)
router_prompt = PromptBuilder(prompt_for_router)
router_llm = OpenAIGenerator(model="gpt-3.5-turbo", system_prompt=prompt_for_router_system)

# extraction of input features
prediction_prompt = PromptBuilder(prompt_for_prediction)
prediction_llm = OpenAIGenerator(model="gpt-3.5-turbo", system_prompt=prompt_for_prediction_system, generation_kwargs={"response_format": { "type": "json_object" }})
prediction_component = PredictHousePrice()

# prediction result
prediction_result_prompt = PromptBuilder(prompt_for_prediction_result)
prediction_result_llm = OpenAIGenerator(model="gpt-3.5-turbo")

# RAG
document_store = QdrantDocumentStore(url="localhost:6333", index="houses_haystack", embedding_dim=1536, hnsw_config={"m": 16, "ef_construct": 100}, return_embedding=True, wait_result_from_api=True)
rag_embedder = OpenAITextEmbedder()
rag_retriever = QdrantEmbeddingRetriever(document_store=document_store)
rag_prompt = PromptBuilder(template=prompt_for_rag)
rag_llm = OpenAIGenerator(model="gpt-3.5-turbo")
rag_doc_returner = ReturnDocumentsFromRetriever()

# create pipeline
pipeline = Pipeline()

# router phase
pipeline.add_component("router_prompt", router_prompt)
pipeline.add_component("router_llm", router_llm)
pipeline.add_component("router", router)


# if the route is PREDICTION
pipeline.add_component("prediction_prompt", prediction_prompt)
pipeline.add_component("prediction_llm", prediction_llm)
pipeline.add_component("prediction_component", prediction_component)
pipeline.add_component("prediction_prompt_for_result", prediction_result_prompt)
pipeline.add_component("prediction_result_llm", prediction_result_llm)

# if the route is DATABASE_SEARCH
pipeline.add_component("rag_embedder", rag_embedder)
pipeline.add_component("rag_retriever", rag_retriever)
pipeline.add_component("rag_prompt", rag_prompt)
pipeline.add_component("rag_llm", rag_llm)
pipeline.add_component("rag_doc_returner", rag_doc_returner)


# connect the components
pipeline.connect("router_prompt", "router_llm")
pipeline.connect("router_llm.replies", "router.replies")

pipeline.connect("router.features", "prediction_prompt")
pipeline.connect("prediction_prompt", "prediction_llm")
pipeline.connect("prediction_llm", "prediction_component")
pipeline.connect("prediction_component.prediction", "prediction_prompt_for_result.prediction")
pipeline.connect("prediction_component.features", "prediction_prompt_for_result.features")
pipeline.connect("prediction_prompt_for_result", "prediction_result_llm")

pipeline.connect("router.question", "rag_embedder.text")
pipeline.connect("router.question", "rag_prompt.question")
pipeline.connect("rag_embedder.embedding", "rag_retriever.query_embedding")
pipeline.connect("rag_retriever", "rag_prompt.documents")
pipeline.connect("rag_retriever", "rag_doc_returner")
pipeline.connect("rag_prompt", "rag_llm")

INFO - httpx -  HTTP Request: GET http://localhost:6333/collections/houses_haystack "HTTP/1.1 200 OK"


<haystack.pipeline.Pipeline at 0x7f081f738f50>

In [8]:
pipeline.draw("pipeline.png")
pipeline.draw("pipeline.txt", engine="mermaid-text")

### Execute Pipeline

In [9]:
# query = "What is the price of a house with 3 bedrooms, 2 bathrooms, 1000 sqft land, and 800 sqft house?"
# query = "berapa harga rumah dengan 10 kamar tidur, 5 kamar mandi, 1000 meter persegi tanah, dan 800 meter persegi bangunan?"
query = "rekomendasi rumah dengan 3 kamar tidur, 2 kamar mandi, dan kolam renang"

results = pipeline.run(
    {
        "router": {"query": query},
        "router_prompt": {"query": query},
    },
    debug=True
)

print(json.dumps(results, indent=2))
# results

INFO - httpx -  HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO - httpx -  HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'land_area': 1000.0, 'house_area': 800.0, 'bedrooms': 10, 'bathrooms': 5}


INFO - httpx -  HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{
  "router_llm": {
    "meta": [
      {
        "model": "gpt-3.5-turbo-0125",
        "index": 0,
        "finish_reason": "stop",
        "usage": {
          "completion_tokens": 4,
          "prompt_tokens": 345,
          "total_tokens": 349
        }
      }
    ]
  },
  "prediction_llm": {
    "meta": [
      {
        "model": "gpt-3.5-turbo-0125",
        "index": 0,
        "finish_reason": "stop",
        "usage": {
          "completion_tokens": 40,
          "prompt_tokens": 250,
          "total_tokens": 290
        }
      }
    ]
  },
  "prediction_result_llm": {
    "replies": [
      "Anda baru saja memprediksi harga sebuah rumah dengan spesifikasi berikut: Luas tanah 1000,0 meter persegi, luas rumah belum diketahui, 10 kamar tidur, dan 5 kamar mandi. Harga yang diprediksi adalah Rp 1.000.000,00."
    ],
    "meta": [
      {
        "model": "gpt-3.5-turbo-0125",
        "index": 0,
        "finish_reason": "stop",
        "usage": {
          "completion_tokens": 