# HomeMatch

## Libraries and setup

In [15]:
import os
import json

from pydantic import BaseModel, Field
from typing import List
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document as LangchainDocument
from langchain.chains import (
    create_history_aware_retriever,
    create_retrieval_chain,
)
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv

In [16]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast when the key is missing. The previous str(os.environ.get(...)) form
# stored the literal string "None" as the key, which only surfaced later as an
# opaque authentication error on the first API call.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Point OpenAI clients at the Vocareum-hosted endpoint.
# NOTE(review): presumably langchain_openai reads OPENAI_API_BASE from the
# environment when the clients are constructed - confirm for the installed version.
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

In [17]:
# Chat model shared by listing generation and the retrieval chain.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

# Embedding model handed to the Chroma vector store.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

## Generate listings

Pydantic models containing the listing descriptions

In [18]:
# Schema for a single property. Every field is required (`...` as the default).
# NOTE(review): this model is nested in the schema passed to
# `llm.with_structured_output`; presumably the class docstring and the Field
# descriptions are forwarded to the model as schema text, so treat any rewording
# of them as a prompt change - confirm.
class RealEstateDescription(BaseModel):
    """Information of a property"""
    # Short structured fields.
    neighborhood: str = Field(..., description="Name of the neighborhood")
    price: int = Field(..., description="Listing price in USD")
    bedrooms: int = Field(..., description="Number of bedrooms")
    bathrooms: int = Field(..., description="Number of bathrooms")
    house_size: int = Field(..., description="Size of the house in square feet")
    # Free-text fields.
    description: str = Field(..., description="Detailed description of about 100 words of the property")
    neighborhood_description: str = Field(..., description="Description of the neighborhood")

# Top-level container requested from the model: one object holding every
# generated property under the `listing` key. The same key is used when the
# saved JSON is read back (`listing_data["listing"]`).
class RealEstateListing(BaseModel):
    """A collection of real estate listings"""
    # Required list of RealEstateDescription entries.
    listing: List[RealEstateDescription] = Field(..., description="List of real estate property descriptions")

In [None]:
# Wrap the chat model so its reply is parsed into a RealEstateListing instance.
structured_llm = llm.with_structured_output(
    RealEstateListing,
    method='function_calling',
    strict=True,
)

# Single request asking for the whole synthetic data set at once.
prompt = "Generate a real estate listing of 30 different properties"
listing = structured_llm.invoke(prompt)

As I need to include a file that contains my synthetically generated real estate listings, I'm dumping the structured output of the llm invocation

In [None]:
# Persist the generated listings so later runs can reload them instead of
# calling the model again.
with open('listing.json', 'w') as listing_file:
    listing_payload = listing.model_dump()
    json.dump(listing_payload, listing_file)

Loading the file (this way I can save some money and time not generating the listings every time)

In [19]:
# Read the previously saved listings back into a plain dict.
with open('listing.json') as listing_file:
    listing_data = json.loads(listing_file.read())

Storing the listings as LangchainDocuments, which can then be stored in the Chroma db. The whole listing is used as the page content, and the short structured fields (everything except the two free-text descriptions) are also added as metadata, in case I want to perform a keyword search on a specific field.

In [20]:

# Long free-text fields: embedded as part of page_content but left out of the
# metadata, which keeps only the short structured fields.
TEXT_ONLY_FIELDS = ("description", "neighborhood_description")

documents = []
for item in listing_data["listing"]:
    # page_content carries every field of the listing as "key: value" lines.
    formatted_string = "\n".join(f"{key}: {value}" for key, value in item.items())
    # Build a filtered copy instead of deleting keys from `item`: deleting
    # mutated `listing_data` in place, so re-running this cell raised KeyError
    # and the loaded data no longer matched the file.
    metadata = {key: value for key, value in item.items() if key not in TEXT_ONLY_FIELDS}
    documents.append(LangchainDocument(page_content=formatted_string, metadata=metadata))

Adding documents to a chroma db collection (reset first to ensure we're loading just this set of new documents)

In [21]:
# Chroma collection persisted under ./db, configured with the "cosine" HNSW space.
db = Chroma(
    collection_name="homematch",
    persist_directory="db",
    embedding_function=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

# Start from an empty collection so only the current set of documents is stored.
db.reset_collection()
db.add_documents(documents)

['4cc0d51b-e426-4093-aeda-86dc4f23ca5e',
 '2488637d-15ee-4ee3-962d-385f9d979785',
 '752d6bbd-df88-4e0b-a27c-d66f2bb00d5f',
 'a3f5fed3-a6f0-43f1-b565-78ff94dcfa13',
 'b433cfbb-429f-4f38-8e3c-cf51bdf34aa3',
 'a0d3bb21-c0fc-41fe-af1f-607d5f12a633',
 '0adc6377-56d8-436f-afa9-c7ecac2a5e11',
 'fb7f33ce-59f4-4732-b4d8-1d61fb5b576f',
 'be7f0e56-64c0-4f7a-8ea7-1cf7b3f4a244',
 'cc2abc12-78a2-4565-9abd-480ff7fe4b71',
 '5e4303f9-012f-447b-880b-5611a0c0afc8',
 'bce26aac-9037-49b5-be71-42cd53427936',
 'd4226175-01f5-4686-82d2-78b0682e503b',
 '69328c08-43ee-4811-b33c-d0be9124cb86',
 'e825843e-617f-42f1-8989-716679b96f2b',
 '09c6c53c-5a20-4863-86da-c0c5cc7382d3',
 'fba1588d-3038-4e05-9aef-66589053e172',
 '26f2d416-3fd6-4471-95be-b6f6342d42fd',
 '36f0dae8-f77d-4586-a6bf-273e30c4dcad',
 'f54f6147-cdf3-4d92-86c1-20edc3043e10',
 'b449c405-4417-42d2-b872-2e25715bcd99',
 '8dd39cf9-f2ce-47fe-b435-f0d72fba1191',
 '3def128b-08f2-4832-b816-2e6c93f62631',
 '8b8bd509-e97d-4380-bc6c-05127931c3cf',
 '65d4ad6a-74cf-

## Buyer preferences

Adding questions and answers to an in-memory chat history. Also creating a chat history store keyed by session_id, and storing those messages under a particular session_id.
I added answers_option1 and answers_option2 to test the solution with different input preferences.

In [22]:
# Questions the assistant "asks" the buyer. Each entry needs its own trailing
# comma: without it Python silently concatenates adjacent string literals, which
# previously merged the first two questions and shifted every answer by one.
questions = [
    "How big do you want your house to be?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?",
]
answers_option1 = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]
answers_option2 = [
    "A cheap house, with the minimum amount of bedrooms and bathrooms.",
    "Low price, quiet neighborhood, next to the beach.",
    "Minimum amenities for lower price.",
    "Easy access to train station.",
    "I don't mind meanwhile it's a quiet place and it's cheap."
]

# Answer set loaded into the chat history; switch to answers_option1 to test
# the other buyer profile.
selected_answers = answers_option2

# Every question must be paired with exactly one answer; catch drift between
# the lists here instead of silently dropping or misaligning entries.
if len(selected_answers) != len(questions):
    raise ValueError(
        f"Expected {len(questions)} answers, got {len(selected_answers)}"
    )

# Seed the conversation: an opening instruction followed by alternating
# assistant question / user answer messages.
history = InMemoryChatMessageHistory()
history.add_user_message(f"""You are AI that will recommend user a property based on their answers to personal questions. Ask user {len(questions)} questions""")
for question, answer in zip(questions, selected_answers):
    history.add_ai_message(question)
    history.add_user_message(answer)

# Session store: maps a session id to its chat history.
store = {}
session_id = "property-recommendation-session"
store[session_id] = history

def get_session_history(session_id: str) -> InMemoryChatMessageHistory:
    """Return the chat history stored for ``session_id``.

    An empty history is created and registered in the module-level ``store``
    the first time an unknown session id is requested.
    """
    try:
        return store[session_id]
    except KeyError:
        store[session_id] = InMemoryChatMessageHistory()
        return store[session_id]


Using create_history_aware_retriever to use the user preferences (chat history) and user question to create a new question to search in the RAG pipeline later on.

In [23]:
def create_context_retriever(llm, retriever):
    """Build a history-aware retriever.

    The LLM and the prompt defined here turn the chat history plus the latest
    user input into a standalone search query; that query is then handed to
    ``retriever``. Calling the returned runnable yields a list of documents.
    """
    # Same instruction text as before, assembled from its sentence fragments.
    rewrite_instructions = " ".join(
        (
            "Given a chat history and the latest user question",
            "which might reference context in the chat history,",
            "formulate a standalone question which can be understood",
            "without the chat history. Do NOT answer the question,",
            "just reformulate it if needed and otherwise return it as is.",
        )
    )
    rewrite_prompt = ChatPromptTemplate.from_messages([
        ("system", rewrite_instructions),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    return create_history_aware_retriever(llm, retriever, rewrite_prompt)

Creating a chain that can deal with the documents passed from the retriever, using create_stuff_documents_chain. The prompt for this chain is where I
implicitly augment the descriptions of the retrieved properties.

In [24]:
def create_qa_chain(llm):
    """Create the "stuff documents" answering chain.

    The chain receives a list of documents and places them in the ``{context}``
    slot of the system prompt (``document_variable_name="context"``), followed
    by the chat history and the latest user input.

    Args:
        llm: Chat model that produces the answer.

    Returns:
        The runnable built by ``create_stuff_documents_chain``.
    """
    # The persona previously read "real state agent"; the domain term is "real estate".
    system_prompt = (
"""
You are an expert real estate agent. Provide accurate 
answer in a proper formatted manner. Do not provide unverified 
or fabricated information. Use the following 
context and chat history to answer the user's question.:\n\n{context}
""")

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    qa_chain = create_stuff_documents_chain(
        llm,
        qa_prompt,
        document_variable_name="context"
    )
    return qa_chain

Plugging in the create_history_aware_retriever and create_stuff_documents_chain using create_retrieval_chain to put all together.
Creating then a chain using that as a basis, that also takes into consideration the chat history.

In [25]:
def create_chain(llm, retriever):
    """Assemble the conversational retrieval chain.

    Two parts are combined: a retrieval step that produces a list of documents,
    and an answering step that makes that list available to the model. The
    result is wrapped so the chat history is looked up per session via
    ``get_session_history``.
    """
    rag_chain = create_retrieval_chain(
        create_context_retriever(llm, retriever),
        create_qa_chain(llm),
    )
    return RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        history_messages_key="chat_history",
        input_messages_key="input",
        output_messages_key="answer",
    )

Function for invoking the chain and returning the response

In [29]:
def invoke_chain(conversational_rag_chain: RunnableWithMessageHistory, question: str, chat_session: str) -> str:
    """Run the chain for one question and return the answer text.

    Fixed output instructions are appended to ``question`` before the chain is
    invoked under the session id ``chat_session``. The returned value is the
    result's "answer" entry, falling back to "output" and then to an empty string.
    """
    output_instructions = """
Output will consist on some text along with a list of properties. 
For each retrieved listing from the context, rephrase both the description
and neighborhood description, keeping all the other property fields intact, tailoring it 
to resonate with the buyer’s specific preferences. This involves 
subtly emphasizing aspects of the property that align with what the 
buyer is looking for. Ensure that the rephrasing process enhances 
the appeal of the listing without altering factual information.
Don't create false information if there's no information coming from the context.
"""
    payload = {"input": "\n\n".join((question, output_instructions))}
    run_config = {"configurable": {"session_id": chat_session}}
    chain_output = conversational_rag_chain.invoke(payload, config=run_config)

    fallback = chain_output.get("output", "")
    return chain_output.get("answer", fallback)

# Instantiating everything

Instantiating the conversational RAG chain, using the Chroma db as the retriever, configured to perform a similarity search and return a maximum of 5 results.

In [30]:
# Similarity search over the Chroma collection, capped at the 5 closest documents.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
conversational_rag_chain = create_chain(llm, retriever)

In [32]:
# Ask for recommendations within the session that holds the buyer's answers.
response = invoke_chain(
    conversational_rag_chain,
    "Recommend me a list of properties that align with my preferences.",
    session_id,
)

In [33]:
# print() rather than a bare expression, so the line breaks in the answer are
# rendered as text instead of appearing as "\n" in the string repr.
print(response)

Based on your preferences for a low-priced property in a quiet neighborhood near the beach with easy access to a train station, here are some tailored recommendations:

1. **Property 1:**
   - Price: $750,000
   - Bedrooms: 2
   - Bathrooms: 2
   - House Size: 1700 sqft
   - Description: Cozy beachfront condo with direct beach access. Relax in the living room with serene ocean views. Stylish kitchen with quartz countertops. Master suite offers ocean vistas and a spacious walk-in closet.
   - Neighborhood Description: Tranquil beachfront location, perfect for those seeking a peaceful escape by the sea, with convenient proximity to the train station for easy commuting.

2. **Property 2:**
   - Price: $900,000
   - Bedrooms: 3
   - Bathrooms: 2
   - House Size: 1800 sqft
   - Description: Charming beachfront property with breathtaking ocean views. Open living area with high ceilings for a spacious ambiance. Modern kitchen with a breakfast bar. Master suite features a walk-in closet and a 