# 🏠 HomeMatch: Personalized Real Estate Listings using Gen AI
## Project developed for Future Homes Realty.
## Goal: Create personalized real estate listings using LLMs and vector databases.
### Tools: OpenAI / LangChain, Vector Databases, Python.

In [None]:
#!pip install pandas chromadb pydantic-settings langchain langchain-community langchain-openai openai tiktoken

In [None]:
!pip install --upgrade --force-reinstall pandas chromadb pydantic-settings langchain langchain-community langchain-core langchain-openai openai tiktoken

In [17]:
# --- Package Installation and Imports ---
import sys
import pandas as pd
import re
import os
from typing import List, Dict
# LangChain components
from langchain_community.document_loaders import DataFrameLoader
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser

In [18]:
import getpass
import os

# Credentials must never be committed with the notebook. Use the key already
# present in the environment, and only prompt (without echoing) when it is
# missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Default to the Vocareum proxy endpoint, but respect an endpoint that was
# already configured outside the notebook.
os.environ.setdefault("OPENAI_API_BASE", "https://openai.vocareum.com/v1")

#### Initialize the LLM

In [19]:
# Chat model shared by listing generation and the retrieval chain.
completion_model_name = "gpt-3.5-turbo"
temperature = 0.0

# Collect the constructor arguments in one place so the configuration is easy
# to read and tweak.
llm_settings = {
    "model_name": completion_model_name,
    "temperature": temperature,
    "max_tokens": 2000,
}
llm = ChatOpenAI(**llm_settings)

#### Create function to generate listings and helper to parse data

In [20]:
# One listing in the "Label: value" template requested from the LLM.
# - The neighborhood name is confined to a single line, and the description
#   may not run past the start of another listing, so a malformed listing can
#   never be absorbed into a field of the listing that follows it.
# - Leading whitespace before labels, trailing whitespace after values, an
#   optional "$", fractional bathrooms ("2.5") and "sq ft" spellings are
#   tolerated because the model does not always follow the template exactly.
# - re.DOTALL is kept so both descriptions may span several lines.
_LISTING_PATTERN = re.compile(
    r"Neighborhood:[ \t]*([^\n]*)\n"
    r"\s*Price:\s*\$?\s*([\d,]+)[ \t\r]*\n"
    r"\s*Bedrooms:\s*(\d+)[ \t\r]*\n"
    r"\s*Bathrooms:\s*(\d+(?:\.\d+)?)[ \t\r]*\n"
    r"\s*House Size:\s*([\d,]+)\s*sq\.?\s*ft\.?[ \t\r]*\n"
    r"\s*Description:\s*((?:(?!\n\s*Neighborhood:).)*?)\n"
    r"\s*Neighborhood Description:\s*(.*?)(?=\n\s*Neighborhood:|\s*\Z)",
    re.DOTALL,
)


def parse_generated_listings(raw_listings_text: str) -> List[Dict]:
    """
    Parse raw LLM output containing zero or more listings into dictionaries.

    Args:
        raw_listings_text: Text made of listing blocks in the
            "Neighborhood / Price / Bedrooms / Bathrooms / House Size /
            Description / Neighborhood Description" template.

    Returns:
        One dict per listing that matches the template, in order of
        appearance. 'Price' and 'House Size' are digit strings with the
        thousands separators removed, 'Bedrooms' is an int, and 'Bathrooms'
        is an int (or a float when the text has a fractional value such as
        "2.5"). Blocks that do not match the template are skipped; empty or
        missing input yields an empty list.
    """
    if not raw_listings_text:
        return []

    parsed_listings = []
    for match in _LISTING_PATTERN.finditer(raw_listings_text):
        (neighborhood, price, bedrooms, bathrooms,
         house_size, description, neighborhood_description) = match.groups()
        try:
            listing = {
                'Neighborhood': neighborhood.strip(),
                'Price': price.replace(',', ''),
                'Bedrooms': int(bedrooms),
                # Keep whole numbers as int so existing consumers see the
                # same type as before; only "2.5"-style values become float.
                'Bathrooms': float(bathrooms) if '.' in bathrooms else int(bathrooms),
                'House Size': house_size.replace(',', ''),
                'Description': description.strip(),
                'Neighborhood Description': neighborhood_description.strip(),
            }
        except ValueError as e:
            # Best effort: report the offending block and keep going.
            print(f"Error parsing a listing: {e}")
            print(f"Problematic text block might be: {match.group(0)}")
            continue
        parsed_listings.append(listing)

    return parsed_listings

def generate_real_estate_listings(
    llm: ChatOpenAI, num_to_generate: int, batch_size: int = 5
) -> pd.DataFrame:
    """
    Generate synthetic real estate listings with an LLM.

    The model is asked for `batch_size` listings at a time and each response
    is parsed with `parse_generated_listings`. Requests are repeated until
    `num_to_generate` listings have been collected or the attempt budget
    (three attempts per required batch) is exhausted.

    Args:
        llm: Chat model used to produce the listings.
        num_to_generate: Number of listings wanted.
        batch_size: Listings requested per LLM call. Smaller batches can
            sometimes improve model compliance with the template.

    Returns:
        A DataFrame with at most `num_to_generate` rows, one per parsed
        listing. It may hold fewer rows (or be empty) when the attempt
        budget runs out first.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    listing_template = """
    Please generate {batch_size} creative and diverse real estate listings.
    YOU MUST FOLLOW THIS TEMPLATE EXACTLY FOR EACH LISTING.
    DO NOT ADD ANY NUMBERS, BULLET POINTS, OR INTRODUCTORY TEXT.
    START THE RESPONSE DIRECTLY WITH "Neighborhood:". Separate each listing with two newlines. Make the <DESCR>
    a detailed unique description of the home. Make <NDESCR> a unique description of the neighborhood the home
    is located within.

    Neighborhood: <NEIGHBORHOOD>
    Price: $<PRICE>
    Bedrooms: <BEDROOMS>
    Bathrooms: <BATHROOMS>
    House Size: <SIZE> sqft
    Description: <DESCR>
    Neighborhood Description: <NDESCR>
    """
    prompt = PromptTemplate(input_variables=["batch_size"], template=listing_template)
    chain = prompt | llm | StrOutputParser()

    all_listings = []
    # Safety break: three attempts per batch that is needed. The batch count
    # is rounded UP so a request smaller than one batch still gets attempts
    # (floor division would give a budget of zero and never call the model).
    batches_needed = -(-max(num_to_generate, 0) // batch_size)
    max_attempts = batches_needed * 3
    attempts = 0

    print(f"Generating {num_to_generate} listings...")

    while len(all_listings) < num_to_generate and attempts < max_attempts:
        print(f"--- Attempt {attempts + 1} ---")
        print(f"Need {num_to_generate - len(all_listings)} more listings. Requesting a batch of {batch_size}.")

        try:
            raw_output = chain.invoke({"batch_size": batch_size})
            parsed_batch = parse_generated_listings(raw_output)
        except Exception as e:
            # Deliberate best effort: a failed call only costs one attempt.
            print(f"\nAn unexpected error occurred during generation: {e}")
        else:
            if parsed_batch:
                all_listings.extend(parsed_batch)
                print(f"Successfully parsed {len(parsed_batch)} listings. Total now: {len(all_listings)}")
            else:
                print("The model returned no valid listings in this attempt.")

        attempts += 1

    # Only report the budget as exhausted when the target was actually missed.
    if len(all_listings) < num_to_generate:
        print("\nReached maximum attempts. Continuing with generated listings.")

    print("\n--- Generation Complete ---")
    print(f"Total listings generated: {len(all_listings)}")
    # Return the exact number requested, in case we overshot
    return pd.DataFrame(all_listings[:max(num_to_generate, 0)])

# Confirmation message: shows in the cell output that the helper functions above were defined.
print("Helper functions defined.")

Helper functions defined.


#### Generate and Load Formatted Data

In [21]:
# Ask the LLM for 20 synthetic listings.
df = generate_real_estate_listings(llm, 20)


def _format_listing(row):
    """Render one listing row as the labelled text block used as document content."""
    return (
        f"Neighborhood: {row.get('Neighborhood','')}\n"
        f"Price: ${row.get('Price','')}\n"
        f"Bedrooms: {row.get('Bedrooms','')}\n"
        f"Bathrooms: {row.get('Bathrooms','')}\n"
        f"House Size: {row.get('House Size','')}\n"
        f"Description: {row.get('Description','')}\n"
        f"Neighborhood Description: {row.get('Neighborhood Description','')}"
    )


if df.empty:
    print("\nNo listings were generated. Please check for errors in the previous cell.")
else:
    print(df.columns.tolist())

    # One rich, descriptive text column per listing for the retriever.
    df['combined_features'] = df.apply(_format_listing, axis=1)

    # Turn each row into a document whose page content is the combined text.
    loader = DataFrameLoader(df, page_content_column="combined_features")
    split_listings = loader.load()

    if split_listings:
        print("\n--- Sample of a Generated and Formatted Document ---")
        print(split_listings[0].page_content)

Generating 20 listings...
--- Attempt 1 ---
Need 20 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 3
--- Attempt 2 ---
Need 17 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 6
--- Attempt 3 ---
Need 14 more listings. Requesting a batch of 5.
Successfully parsed 2 listings. Total now: 8
--- Attempt 4 ---
Need 12 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 11
--- Attempt 5 ---
Need 9 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 14
--- Attempt 6 ---
Need 6 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 17
--- Attempt 7 ---
Need 3 more listings. Requesting a batch of 5.
Successfully parsed 3 listings. Total now: 20

--- Generation Complete ---
Total listings generated: 20
['Neighborhood', 'Price', 'Bedrooms', 'Bathrooms', 'House Size', 'Description', 'Neighborhood Description']

--- Sample of a Generated and

#### Init embeddings and populate vector db

In [22]:
# Embed the listing documents and index them in a Chroma vector store.
# Bail out early when the documents from the previous cell are missing or empty.
if 'split_listings' not in locals() or not split_listings:
    print("\n'split_listings' not found. Please run the previous cells successfully.")
else:
    embeddings = OpenAIEmbeddings()

    # Client settings with anonymized telemetry switched off.
    client_settings = Settings(anonymized_telemetry=False)

    db = Chroma.from_documents(
        split_listings,
        embeddings,
        client_settings=client_settings,
    )

    # Retriever over the vector store, configured with k=5.
    retriever = db.as_retriever(search_kwargs={"k": 5})

    print("\nVector store created and populated with dynamically generated listings.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



Vector store created and populated with dynamically generated listings.


#### Verify Retrieval

In [23]:
# Sanity check: run one sample query straight through the retriever and print
# every document it returns.
if 'retriever' not in locals():
    print("\nRetriever not initialized. Please run the previous cells.")
else:
    print("--- Manually Inspecting Retrieved Docs ---")
    query_for_retrieval = "A suburban home around 1100-1300 sq ft with a pool, large kitchen, and backyard."
    relevant_docs = retriever.invoke(query_for_retrieval)

    print(f"Retriever found {len(relevant_docs)} documents for the query:")
    for position, doc in enumerate(relevant_docs, start=1):
        print(f"\n--- Document {position} ---")
        print(doc.page_content)
    print("\n--- End of Manual Inspection ---\n")

--- Manually Inspecting Retrieved Docs ---
Retriever found 5 documents for the query:

--- Document 1 ---
Neighborhood: Suburban Sanctuary
Price: $800000
Bedrooms: 5
Bathrooms: 4
House Size: 3500
Description: This spacious suburban home offers a peaceful retreat with a large backyard, swimming pool, and outdoor kitchen. The interior features a gourmet kitchen, formal dining room, and a cozy family room with a fireplace. The master suite includes a sitting area, walk-in closet, and a luxurious spa-like bathroom.
Neighborhood Description: The Suburban Sanctuary neighborhood is a family-friendly community with top-rated schools, parks, and recreational facilities. Residents can enjoy a quiet suburban lifestyle while still being close to shopping centers and restaurants.

--- Document 2 ---
Neighborhood: Suburban Retreat
Price: $900000
Bedrooms: 5
Bathrooms: 4
House Size: 4000
Description: This spacious family home is situated on a large wooded lot, offering privacy and tranquility. The in

#### Build user preference interface with dynamic Q/A

In [24]:
# Questions put to the buyer, in the order they are asked.
questions = [
    "How big do you want your house to be (in sq ft)?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?",
]

# Answers are stored positionally: personal_answers[i] answers questions[i].
personal_answers = []

print("\n--- Let's Find Your Dream Home ---")
# Prompt for each question in turn and record the reply as it is typed.
personal_answers.extend(input(f"{prompt_text} ") for prompt_text in questions)

print("\nThank you for your answers!")


--- Let's Find Your Dream Home ---
How big do you want your house to be (in sq ft)? 1200
What are 3 most important things for you in choosing this property? safety, close to schools, backyard
Which amenities would you like? a pool
Which transportation options are important to you? none
How urban do you want your neighborhood to be? suburban

Thank you for your answers!


#### Create the ConversationalRetrievalChain

In [25]:
# Wire the LLM, the retriever and a chat-history buffer into one Q&A chain.
if 'retriever' not in locals():
    print("\nRetriever not initialized. Cannot create the chain.")
else:
    # Plain buffer memory for the chat history; it is stored under
    # 'chat_history' and records the chain's 'answer' output.
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        output_key='answer',
        return_messages=True,
    )

    # Chain for question answering over the retrieved listing documents,
    # also returning the source documents it used.
    conversation_with_context = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        return_source_documents=True,
        verbose=True,
    )

    print("\nConversationalRetrievalChain created and ready.")




ConversationalRetrievalChain created and ready.


####  Engage in Conversation

In [26]:
# Ask the chain for a personalized recommendation based on the buyer's answers.
if 'conversation_with_context' in locals():
    print("\n--- AI Recommendation ---")

    # Pair every answer with the question it belongs to. A bare list of
    # answers (e.g. "1200", "none") gives the model and the retriever no way
    # to tell what each value refers to.
    preference_lines = "\n".join(
        f"- {question} {answer}"
        for question, answer in zip(questions, personal_answers)
    )

    # Detailed query that carries the preferences, so they guide the initial
    # document retrieval as well as the final answer.
    final_query = (
        f"Given my preferences:\n{preference_lines}\n"
        "Find a home for me from your database and describe it. "
        "Within the output show the home details and personalize/augment both the "
        "Description and the Neighborhood Description properties "
        "of the search results, without altering the factual data."
    )

    # Use .invoke(); calling the chain object directly is deprecated in LangChain.
    # The chain takes a dictionary and returns a dictionary.
    result = conversation_with_context.invoke({"question": final_query})

    print("\n\n--- Final Answer ---")
    print(result['answer'])

    print("\n\n--- Source Documents Used by the AI ---")
    for i, doc in enumerate(result['source_documents']):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content)
else:
    print("\nConversation chain not ready. Please run all previous cells.")



--- AI Recommendation ---


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Neighborhood: Suburban Sanctuary
Price: $800000
Bedrooms: 5
Bathrooms: 4
House Size: 3500
Description: This spacious suburban home offers a peaceful retreat with a large backyard, swimming pool, and outdoor kitchen. The interior features a gourmet kitchen, formal dining room, and a cozy family room with a fireplace. The master suite includes a sitting area, walk-in closet, and a luxurious spa-like bathroom.
Neighborhood Description: The Suburban Sanctuary neighborhood is a family-friendly community with top-rated schools, parks, and recreational facilities. Residents can enjoy a quiet suburban lifestyle while still being close to shopping center