In [50]:
import os
import re
import json

## Data Loading

In [2]:
import pandas as pd

In [3]:
df = pd.read_parquet("../dataset/curated/marts_llm_houses.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10 non-null     object 
 1   district              10 non-null     object 
 2   city                  10 non-null     object 
 3   description           10 non-null     object 
 4   url                   10 non-null     object 
 5   price                 10 non-null     float64
 6   carport               10 non-null     int32  
 7   dapur                 10 non-null     int32  
 8   daya_listrik          10 non-null     float32
 9   facility_ac           10 non-null     float64
 10  facility_keamanan     10 non-null     float64
 11  facility_laundry      10 non-null     float64
 12  facility_masjid       10 non-null     float64
 13  house_mat_bata_hebel  10 non-null     float64
 14  house_mat_bata_merah  10 non-null     float64
 15  jumlah_lantai         10 n

In [4]:
df.head(2)

Unnamed: 0,id,district,city,description,url,price,carport,dapur,daya_listrik,facility_ac,...,lebar_jalan,luas_bangunan,luas_tanah,ruang_makan,ruang_tamu,tag_cash_bertahap,tag_komplek,tag_kpr,tag_perumahan,tahun_dibangun
0,hos15767028,Sentul City,Bogor,Dekat Fasilitas Bisnis dan Hutan Hijau Sejuk M...,https://www.rumah123.com/properti/bogor/hos157...,850.0,1,1,2200.0,2.0,...,3.0,60.0,90.0,1,1,0.0,1.0,1.0,1.0,0
1,hos16035504,Bojong Gede,Bogor,Luas Tanah : 72\nLuas Bangunan : 40\nKamar Tid...,https://www.rumah123.com/properti/bogor/hos160...,563.0,1,1,1300.0,2.0,...,3.0,40.0,72.0,0,0,1.0,0.0,1.0,0.0,2023


## Preprocessor

In [5]:
from jinja2 import Environment, FileSystemLoader, select_autoescape

In [63]:
def norm_description(s: str) -> str:
      # remove emojis
    s = s.encode('ascii', 'ignore').decode('ascii')
    
    # remove non-ascii characters
    s = re.sub(r'[^\x00-\x7F]+', '', s)

    # convert newlines to full stops
    s = s.replace('\n', '. ')

    # remove multiple spaces
    s = re.sub(r'\s+', ' ', s)

    # remove space before punctuation
    s = re.sub(r'\s([?.!:"](?:\s|$))', r'\1', s)

    # remove double punctuation
    s = re.sub(r'([?.!"])([?.!"])+', r'\1', s)
    
    return s

def norm_facilities(tp) -> str:
    """Describe the facilities of a listing row as a comma-separated string.

    Args:
        tp: Row-like object exposing ``facility_ac``, ``facility_keamanan``,
            ``facility_laundry``, ``facility_masjid``, ``ruang_makan`` and
            ``ruang_tamu`` as numbers.

    Returns:
        The labels of every attribute whose value is greater than zero,
        joined with ``", "``; ``"tidak disebutkan"`` when none qualifies.
    """
    # (value, label) pairs, kept in the order the labels must appear.
    candidates = (
        (tp.facility_ac, "AC"),
        (tp.facility_keamanan, "keamanan/satpam"),
        (tp.facility_laundry, "laundry"),
        (tp.facility_masjid, "masjid"),
        (tp.ruang_makan, "ruang makan"),
        (tp.ruang_tamu, "ruang tamu"),
    )
    available = [label for value, label in candidates if value > 0]

    if not available:
        return "tidak disebutkan"

    return ", ".join(available)

def norm_house_mat(tp) -> str:
    """Describe the building materials of a listing row.

    Args:
        tp: Row-like object exposing ``house_mat_bata_hebel`` and
            ``house_mat_bata_merah`` as numbers.

    Returns:
        The labels of every material whose value is greater than zero,
        joined with ``", "``; ``"tidak disebutkan"`` when none qualifies.
    """
    # (value, label) pairs, kept in the order the labels must appear.
    candidates = (
        (tp.house_mat_bata_hebel, "bata hebel"),
        (tp.house_mat_bata_merah, "bata merah"),
    )
    materials = [label for value, label in candidates if value > 0]

    if not materials:
        return "tidak disebutkan"

    return ", ".join(materials)

def norm_tag(tp) -> str:
    """Describe the tags of a listing row as a comma-separated string.

    Args:
        tp: Row-like object exposing ``tag_cash_bertahap``, ``tag_komplek``,
            ``tag_kpr`` and ``tag_perumahan`` as numbers.

    Returns:
        The labels of every tag whose value is greater than zero, joined
        with ``", "``; ``"tidak disebutkan"`` when none qualifies.
    """
    # (value, label) pairs, kept in the order the labels must appear.
    candidates = (
        (tp.tag_cash_bertahap, "cash bertahap"),
        (tp.tag_komplek, "komplek"),
        (tp.tag_kpr, "KPR"),
        (tp.tag_perumahan, "perumahan"),
    )
    tags = [label for value, label in candidates if value > 0]

    if not tags:
        return "tidak disebutkan"

    return ", ".join(tags)

def norm_scalar(s: float | int, suffix: str = '', default_value: str = 'tidak disebutkan') -> str:
    if s == 0:
        return default_value
    
    return f"{s}{suffix}"

def num_max(x, y):
    """Return the larger of ``x`` and ``y``; ``y`` is returned on a tie."""
    return x if x > y else y

In [66]:
# Jinja environment that loads the document templates from ../templates.
fs_loader = FileSystemLoader("../templates")
env = Environment(loader=fs_loader, autoescape=select_autoescape())

# Expose the normalisation helpers to the templates as filters, each under
# the same name as the Python function.
env.filters.update({
    'norm_description': norm_description,
    'norm_facilities': norm_facilities,
    'norm_house_mat': norm_house_mat,
    'norm_tag': norm_tag,
    'norm_scalar': norm_scalar,
    'num_max': num_max,
})

In [67]:
row_sample = next(df.itertuples())
env.get_template("document.jinja2").render(row=row_sample)

'Harga: Rp850.000.000.0\nAlamat: Sentul City, Bogor, Indonesia\nCarport: 1\nDapur: 1\nDaya listrik: 2200 watt\nJumlah lantai: 1\nKamar mandi: 1\nKamar tidur: 2\nKamar pembantu: 1\nLebar jalan: 3 cars\nLuas tanah: 90.0 m^2\nLuas bangunan: 60.0 m^2\nTahun dibangun: tidak disebutkan\nFasilitas: AC, keamanan/satpam, laundry, ruang makan, ruang tamu\nBahan bangunan: bata merah\nTag: complex, mortgage, housing estate\nDeskripsi: Dekat Fasilitas Bisnis dan Hutan Hijau Sejuk Menjadi Kelebihan Dari Rumah Ini'

In [73]:
row_sample = next(df.itertuples())
env.get_template("document_v2.jinja2").render(row=row_sample)

'Dijual rumah dengan harga Rp850.000.000 yang beralamat di Sentul City, Bogor, Indonesia.\nLuas tanah 90.0 meter persegi dengan luas bangunan 60.0 meter persegi .\nRumah terdiri atas 1 lantai dengan 2 kamar tidur, 1 kamar mandi, dan 1 kamar pembantu.\nFasilitas yang tersedia adalah 1 carport, jalan muat 3 mobil, 1 dapur, listrik 2200 VA.\n\nFasilitas: AC, keamanan/satpam, laundry, ruang makan, ruang tamu\n\nBahan bangunan: bata merah\n\nTag: complex, mortgage, housing estate\n\nDeskripsi tambahan:\nDekat Fasilitas Bisnis dan Hutan Hijau Sejuk Menjadi Kelebihan Dari Rumah Ini'

## Embedding

In [74]:
from langchain.globals import set_debug
set_debug(True)

In [75]:
from langchain_community.callbacks.manager import get_openai_callback

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.qdrant import Qdrant

In [100]:
# Render every listing row through the v2 template and wrap the text in a
# Document that carries the fields needed to trace it back to the listing.
document_template = env.get_template("document_v2.jinja2")
documents_raw = []
for row in df.itertuples():
    contents = document_template.render(row=row)
    metadata = {
        "id": row.id,
        "price": row.price,
        "district": row.district,
        "city": row.city,
        "url": row.url,
    }
    documents_raw.append(Document(page_content=contents, metadata=metadata))

In [89]:
# Split the rendered listing documents into chunks before embedding them.
# chunk_size=800 / chunk_overlap=100 are measured with the splitter's length
# function (presumably characters by default — confirm against the langchain
# docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
documents = text_splitter.split_documents(documents_raw)

In [90]:
documents

[Document(page_content='-----\nKode rumah: hos15767028\nDijual rumah dengan harga Rp850.000.000 yang beralamat di Sentul City, Bogor, Indonesia.\nLuas tanah 90.0 meter persegi dengan luas bangunan 60.0 meter persegi .\nRumah terdiri atas 1 lantai dengan 2 kamar tidur, 1 kamar mandi, dan 1 kamar pembantu.\nFasilitas yang tersedia adalah 1 carport, jalan muat 3 mobil, 1 dapur, listrik 2200 VA.\n\nFasilitas: AC, keamanan/satpam, laundry, ruang makan, ruang tamu\n\nBahan bangunan: bata merah\n\nTag: complex, mortgage, housing estate\n\nDeskripsi tambahan:\nDekat Fasilitas Bisnis dan Hutan Hijau Sejuk Menjadi Kelebihan Dari Rumah Ini\n-----', metadata={'id': 'hos15767028', 'price': 850.0, 'district': 'Sentul City', 'city': 'Bogor', 'url': 'https://www.rumah123.com/properti/bogor/hos15767028/'}),
 Document(page_content='-----\nKode rumah: hos16035504\nDijual rumah dengan harga Rp563.000.000 yang beralamat di Bojong Gede, Bogor, Indonesia.\nLuas tanah 72.0 meter persegi dengan luas bangunan 4

In [91]:
# Embed the chunks with OpenAI and store them in the "houses" collection of
# the Qdrant instance at localhost:6334 (gRPC preferred).
# NOTE(review): the recorded output of this cell reports 0 tokens and $0.0,
# so the callback does not appear to account for embedding requests — do not
# rely on it for the ingestion cost.
with get_openai_callback() as cb:
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    qdrant = Qdrant.from_documents(documents, embeddings, url="localhost:6334", prefer_grpc=True, collection_name="houses")

    print(cb)

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0


## RAG

In [81]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.prompts import PromptTemplate

In [82]:
qdrant.similarity_search("House with price below 100 million IDR")

[Document(page_content='Dijual rumah dengan harga Rp2.900.000.000 yang beralamat di Bojongsari, Bogor, Indonesia.\nLuas tanah 144.0 meter persegi dengan luas bangunan 180.0 meter persegi .\nRumah terdiri atas 1 lantai dengan 4 kamar tidur, 4 kamar mandi.\nFasilitas yang tersedia adalah 2 carport, jalan muat 2 mobil, 1 dapur, listrik 3300 VA.\n\nFasilitas: AC, keamanan/satpam\n\nBahan bangunan: bata merah\n\nTag: installments, mortgage\n\nDeskripsi tambahan:\nExclusive house \nRumah mewah 3 lantai seperti di resort dengan view anak danau dan memiliki 30 fasilitas dalam cluster.\nLuas tanah : 144 sqm\nLuas banguna : 180,4sqm\nKamar tidur : 4\nKamar mandi : 4\nCarport : 2\nBalcony\nSmartdoor', metadata={'district': 'Bojongsari', 'id': 'hos12335745', 'price': 2900.0, 'url': 'https://www.rumah123.com/properti/bogor/hos12335745/', 'city': 'Bogor', '_id': '036316a7-cd42-4343-8f32-7d8030a8a5c2', '_collection_name': 'houses'}),
 Document(page_content='Dijual rumah dengan harga Rp563.000.000 yan

In [92]:
# Chat model that writes the answers, and a retriever view over the Qdrant
# vector store (as_retriever() is called without arguments, so the library's
# default search settings apply).
llm = ChatOpenAI(model="gpt-3.5-turbo")
retriever = qdrant.as_retriever()

In [93]:
# Prompt for the RAG chain. It has two placeholders: {context}, the retrieved
# listing texts (fenced between "###" lines), and {question}, the user query.
# NOTE(review): "it's" in the second sentence should read "its"; the string
# is left unchanged here because it is sent to the model verbatim.
tmpl = (
    "You are an assistant for house recommendation/suggestion tasks. "
    "You will be given a few documents about property listing along with it's price, address, and specifications. "
    "Give a summary about the house specs and address if you have a match. "
    "Do not return the result as lists, but as a paragraph. "
    "You can suggest more than one house based on the context. "
    "If you don't know the answer, just say that you don't know. "
    "Use five sentences maximum and keep the answer concise.\n\n"
    "Context:\n"
    "###\n"
    "{context}\n"
    "###\n\n"
    "Question: {question}\n"
    "Answer:"
)

prompt = PromptTemplate.from_template(tmpl)

In [94]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents joined by ``"\\n\\n"``.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

# Answering sub-chain: takes a dict holding "context" (the retrieved
# documents) and "question", replaces "context" with the text produced by
# format_docs, fills the prompt, calls the chat model and parses the reply
# into a plain string.
rag_chain_from_docs = (
  RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
  | prompt
  | llm
  | StrOutputParser()
)

# Entry chain: feeds the input query to the retriever ("context") and passes
# it through unchanged ("question"), then adds an "answer" key computed by
# the sub-chain above, so the result keeps the source documents next to the
# generated answer.
rag_chain_with_source = RunnableParallel({"context": retriever, "question": RunnablePassthrough() }) \
    .assign(answer=rag_chain_from_docs)

In [95]:
# Run the RAG chain on a sample query and print both the result dict and the
# usage collected by the OpenAI callback. set_debug(True) was enabled
# earlier in the notebook, which is why a step-by-step chain trace is logged
# as well.
with get_openai_callback() as cb:
  result = rag_chain_with_source.invoke("Rumah dengan setidaknya 2 kamar tidur")
  print(result)

  print(cb)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Rumah dengan setidaknya 2 kamar tidur"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "Rumah dengan setidaknya 2 kamar tidur"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 4:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Rumah dengan setidaknya 2 kamar tidur"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 4:chain:RunnablePassthrough] [1ms] Exiting Chain run with output:
[0m{
  "output": "Rumah dengan setidaknya 2 kamar tidur"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question>] [418ms] Exiting Chain run with output:
[0m[outputs]
[32;1m[1;3m[chain/start][0m [1

## RAG with Agent

In [109]:
from langchain_core.tools import tool
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.prompts import SystemMessagePromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate, MessagesPlaceholder, PromptTemplate

In [107]:
# Toy tool used to try out the agent tooling: returns the product of its two
# integer arguments.
# NOTE(review): langchain's @tool is documented to derive the tool's
# description from the docstring — confirm before rewording it, since the
# text may be what the model sees.
@tool
def multiply(first_int: int, second_int: int) -> int:
    """Multiply two integers together."""
    return first_int * second_int

In [108]:
agent_prompt = PromptTemplate.from

ImportError: Could not import langchainhub, please install with `pip install langchainhub`.

In [105]:
multiply.invoke({"first_int": 4, "second_int": 5})

[32;1m[1;3m[tool/start][0m [1m[1:tool:multiply] Entering Tool run with input:
[0m"{'first_int': 4, 'second_int': 5}"
[36;1m[1;3m[tool/end][0m [1m[1:tool:multiply] [6ms] Exiting Tool run with output:
[0m"20"


20