In [1]:
import os
import re
import json

In [72]:
import logging

# Keep general logging quiet (WARNING) while surfacing INFO messages emitted
# under the "haystack" logger.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)

## Data Loading

In [2]:
import pandas as pd

In [3]:
# Load the curated house-listing table and print its column/dtype summary.
df = pd.read_parquet(path="../dataset/curated/marts_llm_houses.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10 non-null     object 
 1   district              10 non-null     object 
 2   city                  10 non-null     object 
 3   description           10 non-null     object 
 4   url                   10 non-null     object 
 5   price                 10 non-null     float64
 6   carport               10 non-null     int32  
 7   dapur                 10 non-null     int32  
 8   daya_listrik          10 non-null     float32
 9   facility_ac           10 non-null     float64
 10  facility_keamanan     10 non-null     float64
 11  facility_laundry      10 non-null     float64
 12  facility_masjid       10 non-null     float64
 13  house_mat_bata_hebel  10 non-null     float64
 14  house_mat_bata_merah  10 non-null     float64
 15  jumlah_lantai         10 n

In [4]:
# Show the first two rows as a quick look at the loaded columns.
df.head(2)

Unnamed: 0,id,district,city,description,url,price,carport,dapur,daya_listrik,facility_ac,...,lebar_jalan,luas_bangunan,luas_tanah,ruang_makan,ruang_tamu,tag_cash_bertahap,tag_komplek,tag_kpr,tag_perumahan,tahun_dibangun
0,hos15767028,Sentul City,Bogor,Dekat Fasilitas Bisnis dan Hutan Hijau Sejuk M...,https://www.rumah123.com/properti/bogor/hos157...,850.0,1,1,2200.0,2.0,...,3.0,60.0,90.0,1,1,0.0,1.0,1.0,1.0,0
1,hos16035504,Bojong Gede,Bogor,Luas Tanah : 72\nLuas Bangunan : 40\nKamar Tid...,https://www.rumah123.com/properti/bogor/hos160...,563.0,1,1,1300.0,2.0,...,3.0,40.0,72.0,0,0,1.0,0.0,1.0,0.0,2023


## Preprocessor

In [5]:
from jinja2 import Environment, FileSystemLoader, select_autoescape

In [6]:
def norm_description(s: str) -> str:
    """Flatten a free-text listing description into one ASCII line.

    Steps, applied in order:
      1. drop every non-ASCII character (this is what removes emojis);
      2. turn each newline into ". ";
      3. collapse every run of whitespace into a single space;
      4. remove a whitespace character that sits directly before one of
         ``? . ! : "`` when that punctuation is followed by whitespace or the
         end of the string;
      5. collapse a run of ``? . ! "`` characters down to its first character.

    Args:
        s: Raw description text.

    Returns:
        The normalised description.
    """
    # After this round-trip the text is pure ASCII, so no separate regex pass
    # for non-ASCII characters is needed.
    text = s.encode('ascii', 'ignore').decode('ascii')

    # Each original line becomes its own "sentence".
    text = '. '.join(text.split('\n'))

    # (pattern, replacement) pairs; order matters because later rules clean up
    # artefacts produced by earlier ones (e.g. ". ." -> ".." -> ".").
    cleanup_rules = (
        (r'\s+', ' '),
        (r'\s([?.!:"](?:\s|$))', r'\1'),
        (r'([?.!"])([?.!"])+', r'\1'),
    )
    for pattern, replacement in cleanup_rules:
        text = re.sub(pattern, replacement, text)

    return text

def norm_facilities(tp) -> str:
    """List the facilities whose value on ``tp`` is greater than zero.

    Args:
        tp: Object exposing the attributes ``facility_ac``,
            ``facility_keamanan``, ``facility_laundry``, ``facility_masjid``,
            ``ruang_makan`` and ``ruang_tamu``.

    Returns:
        The matching labels joined by ", " in the fixed order below, or
        "tidak disebutkan" when none of the values is positive.
    """
    # (attribute on the row, label shown in the rendered document)
    labelled_attrs = (
        ("facility_ac", "AC"),
        ("facility_keamanan", "keamanan/satpam"),
        ("facility_laundry", "laundry"),
        ("facility_masjid", "masjid"),
        ("ruang_makan", "ruang makan"),
        ("ruang_tamu", "ruang tamu"),
    )
    available = [label for attr, label in labelled_attrs if getattr(tp, attr) > 0]

    if not available:
        return "tidak disebutkan"
    return ", ".join(available)

def norm_house_mat(tp) -> str:
    """Describe the building materials flagged on ``tp``.

    Args:
        tp: Object exposing ``house_mat_bata_hebel`` and
            ``house_mat_bata_merah``.

    Returns:
        "bata hebel" and/or "bata merah" joined by ", " for each value that is
        greater than zero, or "tidak disebutkan" when neither is.
    """
    materials = []
    if tp.house_mat_bata_hebel > 0:
        materials.append("bata hebel")
    if tp.house_mat_bata_merah > 0:
        materials.append("bata merah")

    return ", ".join(materials) if materials else "tidak disebutkan"

def norm_tag(tp) -> str:
    """Render the listing tags whose value on ``tp`` is greater than zero.

    Args:
        tp: Object exposing ``tag_cash_bertahap``, ``tag_komplek``,
            ``tag_kpr`` and ``tag_perumahan``.

    Returns:
        The matching tag labels joined by ", " (in the order of the mapping
        below), or "tidak disebutkan" when no tag value is positive.
    """
    # Insertion order of this mapping fixes the order of the output.
    label_by_attr = {
        "tag_cash_bertahap": "cash bertahap",
        "tag_komplek": "komplek",
        "tag_kpr": "KPR",
        "tag_perumahan": "perumahan",
    }
    active = [label for attr, label in label_by_attr.items() if getattr(tp, attr) > 0]

    return ", ".join(active) or "tidak disebutkan"

def norm_scalar(s: float | int, suffix: str = '', default_value: str = 'tidak disebutkan') -> str:
    if s == 0:
        return default_value
    
    return f"{s}{suffix}"

def num_max(x, y):
    """Return the larger of ``x`` and ``y``; ``y`` is returned when they compare equal."""
    return x if x > y else y

In [7]:
# Templates are loaded from the sibling "templates" directory.
fs_loader = FileSystemLoader("../templates")
env = Environment(
    loader=fs_loader,
    autoescape=select_autoescape(),
)

# Register the normalisation helpers as template filters under their own names.
env.filters.update(
    norm_description=norm_description,
    norm_facilities=norm_facilities,
    norm_house_mat=norm_house_mat,
    norm_tag=norm_tag,
    norm_scalar=norm_scalar,
    num_max=num_max,
)

In [8]:
# Render the first listing through the original template to eyeball the output.
row_sample = next(iter(df.itertuples()))
preview_template = env.get_template("document.jinja2")
preview_template.render(row=row_sample)

'Harga: Rp850.000.000.0\nAlamat: Sentul City, Bogor, Indonesia\nCarport: 1\nDapur: 1\nDaya listrik: 2200 watt\nJumlah lantai: 1\nKamar mandi: 1\nKamar tidur: 2\nKamar pembantu: 1\nLebar jalan: 3 cars\nLuas tanah: 90.0 m^2\nLuas bangunan: 60.0 m^2\nTahun dibangun: tidak disebutkan\nFasilitas: AC, keamanan/satpam, laundry, ruang makan, ruang tamu\nBahan bangunan: bata merah\nTag: komplek, KPR, perumahan\nDeskripsi: Dekat Fasilitas Bisnis dan Hutan Hijau Sejuk Menjadi Kelebihan Dari Rumah Ini'

In [9]:
# Render the same first listing through the v2 template for comparison.
row_sample = next(iter(df.itertuples()))
preview_template = env.get_template("document_v2.jinja2")
preview_template.render(row=row_sample)

'-----\nKode rumah: hos15767028\nDijual rumah dengan harga Rp850.000.000 yang beralamat di Sentul City, Bogor, Indonesia.\nLuas tanah 90.0 meter persegi dengan luas bangunan 60.0 meter persegi.\nRumah terdiri atas 1 lantai dengan 2 kamar tidur, 1 kamar mandi, dan 1 kamar pembantu.\nFasilitas yang tersedia adalah 1 carport, jalan muat 3 mobil, 1 dapur, listrik 2200 VA.\n\nBahan bangunan: bata merah\nFasilitas: AC, keamanan/satpam, laundry, ruang makan, ruang tamu\nTag: komplek, KPR, perumahan\n\n\nDeskripsi tambahan:\nDekat Fasilitas Bisnis dan Hutan Hijau Sejuk Menjadi Kelebihan Dari Rumah Ini\n\n-----'

## Embedding

In [24]:
from haystack import Pipeline, Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


In [25]:
# Turn every listing row into a Document: the rendered v2 template is the
# content, and a few identifying columns are carried along as metadata.
document_template = env.get_template("document_v2.jinja2")
documents_raw = [
    Document(
        id=row.id,
        content=document_template.render(row=row),
        meta={
            "id": row.id,
            "price": row.price,
            "district": row.district,
            "city": row.city,
            "url": row.url,
        },
    )
    for row in df.itertuples()
]

In [27]:
# Vector store backing both indexing and retrieval.
document_store = QdrantDocumentStore(
    url="localhost:6333",
    index="houses_haystack",
    embedding_dim=1536,
    hnsw_config={"m": 16, "ef_construct": 100},
    return_embedding=True,
    wait_result_from_api=True,
)

# Pipeline stages: split -> embed -> write to the store.
splitter = DocumentSplitter(split_by="passage", split_length=5, split_overlap=1)
document_embedder = OpenAIDocumentEmbedder()
writer = DocumentWriter(document_store=document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("split", splitter)
indexing_pipeline.add_component("embedder", document_embedder)
indexing_pipeline.add_component("store", writer)

indexing_pipeline.connect("split", "embedder")
indexing_pipeline.connect("embedder", "store")

# Kept as the last expression so the run summary is shown as the cell output.
indexing_pipeline.run({"split": {"documents": documents_raw}})

Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
100it [00:00, 1952.61it/s]            


{'embed': {'meta': {'model': 'text-embedding-ada-002',
   'usage': {'prompt_tokens': 3581, 'total_tokens': 3581}}},
 'store': {'documents_written': 10}}

In [76]:
# Export a diagram of the indexing pipeline to index_pipeline.txt using the
# "mermaid-text" engine.
indexing_pipeline.draw("index_pipeline.txt", engine="mermaid-text")

## RAG

In [29]:
from haystack.components.embedders import OpenAITextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever

In [77]:
from haystack import component

@component
class ReturnDocumentsFromRetriever:
    """Pipeline component that reduces documents to plain dictionaries."""

    @component.output_types(documents=list[dict])
    def run(self, docs: list[Document]):
        """Summarise each document as its id merged with its metadata.

        Args:
            docs: Documents to summarise.

        Returns:
            ``{"documents": [...]}`` with one dict per input document, holding
            ``"id"`` plus every entry of the document's ``meta``. A metadata
            key named ``"id"`` takes precedence over the document id.
        """
        summaries = []
        for doc in docs:
            summaries.append({"id": doc.id, **doc.meta})
        return {"documents": summaries}

In [78]:
# Jinja prompt for the RAG step. The fragments are joined by implicit string
# concatenation, so every sentence must carry its own trailing space; without
# it two instructions are glued together ("...the question.Use five...").
# Template variables: `documents` (each item's `.content` is inserted) and `question`.
rag_prompt_template = (
    "You are an assistant for house recommendation/suggestion tasks. "
    "You will be given a few documents about property listing along with it's price, address, and specifications. "
    "Give a summary about the house specs and address if you have a match. "
    "Do not return the result as lists, but as a paragraph. "
    "You can suggest more than one house based on the context. "
    "If you don't know the answer, just say that you don't know. "
    "Answer using the same language as the question. "
    "Use five sentences maximum and keep the answer concise.\n\n"
    "Context:\n"
    "###\n"
    "{% for doc in documents %}"
    "{{ doc.content }}"
    "{% endfor %}"
    "###\n\n"
    "Question: {{question}}\n"
    "Answer:"
)

rag_pipeline = Pipeline()

# Components, registered in this order under these names.
for component_name, component_instance in [
    ("embedder", OpenAITextEmbedder()),
    ("retriever", QdrantEmbeddingRetriever(document_store=document_store)),
    ("rag_prompt", PromptBuilder(template=rag_prompt_template)),
    ("llm", OpenAIGenerator(model="gpt-3.5-turbo")),
    ("return_docs", ReturnDocumentsFromRetriever()),
]:
    rag_pipeline.add_component(component_name, component_instance)

# The retriever feeds both the prompt builder and the component that reports
# which documents were used.
for sender, receiver in [
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever", "rag_prompt.documents"),
    ("retriever", "return_docs"),
]:
    rag_pipeline.connect(sender, receiver)

# Kept as the last expression so the cell still displays the value returned by connect().
rag_pipeline.connect("rag_prompt", "llm")

<haystack.pipeline.Pipeline at 0x7f28a8a95fd0>

In [79]:
# Export a diagram of the RAG pipeline to rag_pipeline.txt using the
# "mermaid-text" engine.
rag_pipeline.draw("rag_pipeline.txt", engine="mermaid-text")

In [80]:
question = "Rumah dengan setidaknya 2 kamar tidur"

# The same question goes to the query embedder (for retrieval) and to the
# prompt builder (for the final prompt).
pipeline_inputs = {
    "embedder": {"text": question},
    "rag_prompt": {"question": question},
}
results = rag_pipeline.run(pipeline_inputs, debug=True)

print(json.dumps(results, indent=2))

{
  "embedder": {
    "meta": {
      "model": "text-embedding-ada-002",
      "usage": {
        "prompt_tokens": 13,
        "total_tokens": 13
      }
    }
  },
  "return_docs": {
    "document_ids": [
      {
        "id": "hos12335745",
        "city": "Bogor",
        "district": "Bojongsari",
        "price": 2900.0,
        "source_id": "hos12335745",
        "url": "https://www.rumah123.com/properti/bogor/hos12335745/"
      },
      {
        "id": "hos15952986",
        "city": "Bogor",
        "district": "Parung",
        "price": 1950.0,
        "source_id": "hos15952986",
        "url": "https://www.rumah123.com/properti/bogor/hos15952986/"
      },
      {
        "id": "hos15767028",
        "city": "Bogor",
        "district": "Sentul City",
        "price": 850.0,
        "source_id": "hos15767028",
        "url": "https://www.rumah123.com/properti/bogor/hos15767028/"
      },
      {
        "id": "hos15530924",
        "city": "Bogor",
        "district": "Semplak

In [71]:
# Print the pipeline's serialized definition for inspection. Note that this
# cell only prints it; nothing is written to a file here.
print(rag_pipeline.dumps())

components:
  embedder:
    init_parameters:
      api_key:
        env_vars:
        - OPENAI_API_KEY
        strict: true
        type: env_var
      dimensions: null
      model: text-embedding-ada-002
      organization: null
      prefix: ''
      suffix: ''
    type: haystack.components.embedders.openai_text_embedder.OpenAITextEmbedder
  llm:
    init_parameters:
      api_base_url: null
      api_key:
        env_vars:
        - OPENAI_API_KEY
        strict: true
        type: env_var
      generation_kwargs: {}
      model: gpt-3.5-turbo
      streaming_callback: null
      system_prompt: null
    type: haystack.components.generators.openai.OpenAIGenerator
  rag_prompt:
    init_parameters:
      template: 'You are an assistant for house recommendation/suggestion tasks. You
        will be given a few documents about property listing along with it''s price,
        address, and specifications. Give a summary about the house specs and address
        if you have a match. Do not