# Step 1: Managing Libraries

In [4]:
# Install the third-party packages this notebook needs, via IPython shell escapes.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones shown in the captured output below — consider pinning.
!pip install langchain_community
!pip install chromadb
!pip install tiktoken

Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain_community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain_community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [6]:
import os

# Mount Google Drive so the project folder (including its .env file) is reachable.
from google.colab import drive
drive.mount("/content/drive")

from dotenv import load_dotenv

# Pull the variables defined in the project's .env file into the process environment,
# then read the OpenAI key from there.
load_dotenv("/content/drive/MyDrive/Empathia/.env")
openapi_key = os.environ.get("OPENAI_API_KEY")

# Only warn when the key is missing or empty; the notebook keeps running.
if openapi_key is None or openapi_key == "":
    print("API Key not found in .env file.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 2: Generating Real Estate Listings

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd
import json



# One-shot prompt used to generate synthetic listings.
# - {num_listings} is the only template variable (it is the sole entry in the
#   PromptTemplate's input_variables further down).
# - The example listing is wrapped in doubled braces ({{ ... }}); presumably these are
#   escapes so the template formatter emits literal braces instead of treating the
#   JSON example as a placeholder — confirm against the PromptTemplate format in use.
ONE_SHOT_PROMPT=    """
        You are a real estate listing generator. Generate {num_listings} unique real estate listings.
        Each listing should have the following details in JSON format:
        Neighborhood, Price, Bedrooms, Bathrooms, House Size, Description, and Neighborhood Description.
        Here's a example of a listing:

        {{
          "Neighborhood": "Green Oaks",
          "Price": "$800,000",
          "Bedrooms": 3,
          "Bathrooms": 2,
          "House Size": "2,000 sqft",
          "Description": "Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.",
          "Neighborhood Description": "Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze."
        }}
        Ensure all listings are creative, realistic, and vary in features like location, price, and house type.
        Finally, pack each listing in one list of json objects.
        """


# Function to save JSON data to a CSV file
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV and hand back the tabular form.

    Args:
        data: Any input accepted by ``pandas.DataFrame`` (here, a list of
            listing dictionaries parsed from the LLM response).
        filename: Destination path of the CSV file.

    Returns:
        The ``pandas.DataFrame`` built from ``data``.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(filename, index=False)  # the row index is not written to the file
    print(f"Data saved to {filename}")
    return frame


# Wrap the one-shot prompt; `num_listings` is its only variable.
prompt_template = PromptTemplate(
    input_variables=["num_listings"],
    template=ONE_SHOT_PROMPT,
)

# Initialize the LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7, openai_api_key=openapi_key)

# Create the chain
chain = LLMChain(llm=llm, prompt=prompt_template)

# Number of listings to generate
num_listings = 10

# Run the chain to generate listings
response = chain.run(num_listings=num_listings)

# Chat models often wrap JSON in a Markdown code fence (```json ... ```), which
# json.loads rejects. Strip such a fence before parsing.
response_text = response.strip()
if response_text.startswith("```") and response_text.endswith("```"):
    response_text = response_text[3:-3].strip()
    if response_text[:4].lower() == "json":
        response_text = response_text[4:].strip()

# Parse the response into JSON
try:
    listings = json.loads(response_text)
except json.JSONDecodeError:
    print("Failed to parse response as JSON. Check the LLM output.")
    listings = []

# Save listings to a CSV file. When nothing was parsed, skip the write so a failed
# generation does not overwrite a previously saved, valid CSV with an empty one.
if listings:
    df = save_to_csv(listings, "/content/drive/MyDrive/Empathia/real_estate_listings.csv")
else:
    print("No listings were parsed; the existing CSV (if any) was left untouched.")
    df = pd.DataFrame()
df


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data saved to /content/drive/MyDrive/Empathia/real_estate_listings.csv


Unnamed: 0,Neighborhood,Price,Bedrooms,Bathrooms,House Size,Description,Neighborhood Description
0,Sunset Heights,"$650,000",4,3,"2,500 sqft",Step into luxury living in Sunset Heights with...,Sunset Heights is known for its upscale vibe a...
1,Oakwood Estates,"$1,200,000",5,4,"3,800 sqft",Welcome to your dream home in Oakwood Estates!...,Oakwood Estates is a prestigious enclave known...
2,Lakeview Terrace,"$500,000",3,2,"1,800 sqft",Nestled in the tranquil neighborhood of Lakevi...,Lakeview Terrace is a hidden gem known for its...
3,Mountain Ridge,"$750,000",4,3,"2,300 sqft",Discover mountain living at its finest in this...,Mountain Ridge is a nature lover's paradise wi...
4,Harbor Pointe,"$900,000",4,3,"2,700 sqft",Live the waterfront lifestyle in Harbor Pointe...,Harbor Pointe is a boater's paradise with easy...
5,Maple Grove,"$550,000",3,2,"1,600 sqft","Welcome home to Maple Grove, a charming neighb...",Maple Grove is a family-friendly neighborhood ...
6,Pinecrest Heights,"$700,000",4,3,"2,400 sqft",Experience modern living in Pinecrest Heights ...,Pinecrest Heights is a trendy neighborhood kno...
7,Riverfront Estates,"$1,500,000",5,4,"4,000 sqft",Indulge in luxury living in Riverfront Estates...,Riverfront Estates is an exclusive waterfront ...
8,Valley View,"$600,000",3,2,"2,000 sqft",Escape to the peaceful neighborhood of Valley ...,Valley View is a hidden gem known for its quie...
9,Meadowbrook Heights,"$850,000",4,3,"2,800 sqft",Discover the beauty of Meadowbrook Heights wit...,Meadowbrook Heights is a peaceful neighborhood...


# Step 3: Storing Listings in a Vector Database

In [10]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


def create_vector_store(csv_path):
    """Load the listings CSV, chunk it, embed it and store it in a persisted Chroma index.

    Args:
        csv_path: Path of the CSV file holding the generated listings.

    Returns:
        The Chroma vector store built from the chunked listing documents.
    """
    column_names = [
        'Neighborhood', 'Price', 'Bedrooms', 'Bathrooms',
        'House Size', 'Description', 'Neighborhood Description',
    ]
    csv_loader = CSVLoader(
        file_path=csv_path,
        csv_args={'delimiter': ',', 'quotechar': '"', 'fieldnames': column_names},
    )

    # The first loaded document is discarded; presumably it is the CSV header row,
    # which is read as data because explicit fieldnames are supplied — TODO confirm.
    listing_docs = csv_loader.load()[1:]

    # Split on newlines into chunks of at most 1000 characters with 200 overlap.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = splitter.split_documents(listing_docs)

    # Embed the chunks with OpenAI and persist the index in the project folder.
    return Chroma.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(openai_api_key=openapi_key),
        persist_directory="/content/drive/MyDrive/Empathia/chromadb_vs",
    )

# Function to perform MMR search
def search_listings_using_mmr(vectorstore, query, k=2, fetch_k=5, lambda_mmr=0.5):
    """Retrieve listings for ``query`` using Maximal Marginal Relevance (MMR).

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        query: Natural-language search text.
        k: Number of documents to return.
        fetch_k: Number of candidates fetched before MMR re-ranking.
        lambda_mmr: MMR trade-off value, forwarded to the retriever under the
            ``lambda_mult`` key.

    Returns:
        The documents returned by the retriever for ``query``.
    """
    # The MMR search expects the trade-off under "lambda_mult"; the previous
    # "lambda_mmr" key was not a recognised option, so the argument had no effect.
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": k,
            "fetch_k": fetch_k,
            "lambda_mult": lambda_mmr,
        },
    )

    return retriever.get_relevant_documents(query)

# Build the vector store from the generated listings CSV.
vectorstore = create_vector_store("/content/drive/MyDrive/Empathia/real_estate_listings.csv")

# Smoke test: run one MMR query against the fresh index.
search_query = "Find luxury homes with modern amenities"
search_results = search_listings_using_mmr(vectorstore, search_query)

# Show each hit under a numbered heading (numbering starts at 1).
for rank, hit in enumerate(search_results, start=1):
    print(f"\nResult {rank}:", hit.page_content, sep="\n")



Result 1:
Neighborhood: Riverfront Estates
Price: $1,500,000
Bedrooms: 5
Bathrooms: 4
House Size: 4,000 sqft
Description: Indulge in luxury living in Riverfront Estates with this exquisite 5-bedroom, 4-bathroom home situated on the banks of the river. The grand foyer leads to a formal living room, a gourmet kitchen with a breakfast nook, and a master suite with a private balcony overlooking the water. Relax in the landscaped backyard with a pool, spa, and outdoor kitchen, perfect for entertaining in style.
Neighborhood Description: Riverfront Estates is an exclusive waterfront community known for its upscale homes and picturesque views. Residents can enjoy boating, fishing, and water activities right from their backyard. With easy access to downtown amenities and a serene riverfront setting, Riverfront Estates offers the ultimate in luxury waterfront living.

Result 2:
Neighborhood: Sunset Heights
Price: $650,000
Bedrooms: 4
Bathrooms: 3
House Size: 2,500 sqft
Description: Step into l

# Step 4: Building the User Preference Interface

In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

# Questions asked to the buyer. Answers are matched to questions by position
# (collect_preferences zips the two sequences), so the order here matters.
PREFERENCE_QUESTIONS = [
    "How big do you want your house to be?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?"
]

# Canned answers used when preferences are not collected interactively;
# one entry per question, in the same order as PREFERENCE_QUESTIONS.
EXAMPLE_ANSWERS = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]

# Prompt template for parsing preferences.
# {qa_pairs} is the only placeholder; it receives the formatted QUESTION/ANSWER text.
# The prompt asks the model for a JSON object with the seven fields listed below;
# the reply is later parsed with json.loads.
PREFERENCE_PARSER_TEMPLATE = """
You are an expert real estate preference analyzer. Given a set of buyer responses to preference questions,
create a structured analysis that can be used for property matching.

Questions and Answers:
{qa_pairs}

Please analyze these preferences and create a structured JSON output with the following fields:
- size_requirements: Details about desired house size, bedrooms, etc.
- location_preferences: Neighborhood characteristics and location requirements
- must_have_features: List of essential features or deal-breakers
- nice_to_have_features: List of preferred but not essential features
- lifestyle_factors: Environmental and community aspects important to the buyer
- transportation_needs: Transportation and accessibility requirements
- price_sensitivity: Any mentions or implications about budget (if mentioned)

Ensure the output is in valid JSON format and captures both explicit and implicit preferences from the answers.
"""

class PreferenceCollector:
    """Collect buyer answers and convert them into structured preferences via an LLM.

    The collector asks the questions in ``PREFERENCE_QUESTIONS`` (or falls back to
    ``EXAMPLE_ANSWERS``), formats the question/answer pairs into a prompt, and
    parses the model's JSON reply into a Python object.
    """

    def __init__(self, openai_api_key):
        """Create the chat model, the parser prompt and the chain that joins them.

        Args:
            openai_api_key: API key passed to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.2,
            openai_api_key=openai_api_key
        )

        self.parser_prompt = PromptTemplate(
            input_variables=["qa_pairs"],
            template=PREFERENCE_PARSER_TEMPLATE
        )

        self.chain = LLMChain(
            llm=self.llm,
            prompt=self.parser_prompt
        )

    def collect_preferences(self, interactive=False):
        """Collect buyer preferences either interactively or using example data.

        Args:
            interactive: When true, prompt on stdin once per question; otherwise
                use ``EXAMPLE_ANSWERS``.

        Returns:
            A dict mapping each question to its answer (paired by position).
        """
        if interactive:
            answers = [
                input(f"{question}\nYour answer: ")
                for question in PREFERENCE_QUESTIONS
            ]
        else:
            answers = EXAMPLE_ANSWERS

        return dict(zip(PREFERENCE_QUESTIONS, answers))

    def format_qa_pairs(self, preferences_dict):
        """Format Q&A pairs for the LLM prompt.

        Args:
            preferences_dict: Mapping of question text to answer text.

        Returns:
            One ``QUESTION: ...`` / ``ANSWER: ...`` block per entry, each followed
            by a blank line; an empty string for an empty mapping.
        """
        return "".join(
            f"QUESTION: {question}\nANSWER: {answer}\n\n"
            for question, answer in preferences_dict.items()
        )

    @staticmethod
    def _strip_code_fence(text):
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        cleaned = text.strip()
        if cleaned.startswith("```") and cleaned.endswith("```"):
            cleaned = cleaned[3:-3].strip()
            # Drop the optional language tag that follows the opening fence.
            if cleaned[:4].lower() == "json":
                cleaned = cleaned[4:].strip()
        return cleaned

    def parse_preferences(self, preferences_dict):
        """Parse raw preferences into structured format using LLM.

        Args:
            preferences_dict: Mapping of question text to answer text.

        Returns:
            The object decoded from the model's JSON reply, or ``None`` when the
            call fails or the reply is not valid JSON (a message is printed).
        """
        qa_pairs = self.format_qa_pairs(preferences_dict)

        try:
            # Get structured analysis from LLM
            response = self.chain.run(qa_pairs=qa_pairs)

            # Chat models often wrap JSON in ```json ... ``` fences, which
            # json.loads rejects; remove the fence before decoding.
            return json.loads(self._strip_code_fence(response))

        except json.JSONDecodeError as e:
            print(f"Error parsing LLM response as JSON: {e}")
            return None
        except Exception as e:
            # Best effort: report any other failure and signal it with None.
            print(f"Error processing preferences: {e}")
            return None

# Set up the collector with the OpenAI key loaded earlier.
collector = PreferenceCollector(openai_api_key=openapi_key)

# Gather the answers (canned examples, no stdin prompts) and let the LLM structure them.
raw_preferences = collector.collect_preferences(interactive=False)
structured_preferences = collector.parse_preferences(raw_preferences)

# Report the outcome: pretty-printed JSON on success, a short notice otherwise.
if not structured_preferences:
    print("Failed to process preferences")
else:
    print("\nStructured Preferences:")
    print(json.dumps(structured_preferences, indent=2))



Structured Preferences:
{
  "size_requirements": {
    "house_size": "Comfortable",
    "bedrooms": 3,
    "kitchen": "Spacious",
    "living_room": "Cozy"
  },
  "location_preferences": {
    "neighborhood": "Quiet",
    "schools": "Good local schools",
    "shopping": "Convenient options",
    "urban_level": "Suburban tranquility with access to urban amenities"
  },
  "must_have_features": [
    "Backyard for gardening",
    "Two-car garage",
    "Modern, energy-efficient heating system"
  ],
  "nice_to_have_features": [],
  "lifestyle_factors": [],
  "transportation_needs": {
    "bus_line": "Easy access to a reliable bus line",
    "highway": "Proximity to a major highway",
    "bike_friendly_roads": true
  },
  "price_sensitivity": null
}


# Step 5: Searching Based on Preferences

In [15]:
from typing import Dict, List
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

# Prompt template for converting preferences to search query.
# {structured_preferences} is the only placeholder; it receives the structured
# preferences serialized as indented JSON. The model is asked to answer with a
# single natural-language search string.
SEARCH_QUERY_TEMPLATE = """
You are an expert real estate search specialist. Convert these structured buyer preferences into a detailed search query
that captures the essence of what the buyer is looking for.

Buyer Preferences:
{structured_preferences}

Create a natural language search query that:
1. Emphasizes the most important requirements
2. Includes both explicit and implicit preferences
3. Maintains proper context and relationships between features

Format the output as a single, detailed search string.
"""

class HomeMatchSearchEngine:
    """Semantic listing search driven by structured buyer preferences.

    The engine turns the preferences into a natural-language query with an LLM,
    runs an MMR search against the vector store, and can ask the LLM to explain
    why a given listing fits the preferences.
    """

    def __init__(self, vectorstore, openai_api_key):
        """Store the vector store and build the query-generation chain.

        Args:
            vectorstore: Vector store exposing ``as_retriever``.
            openai_api_key: API key passed to ``ChatOpenAI``.
        """
        self.vectorstore = vectorstore
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.0,
            openai_api_key=openai_api_key
        )

        # Initialize query generator chain
        self.query_generator = LLMChain(
            llm=self.llm,
            prompt=PromptTemplate(
                input_variables=["structured_preferences"],
                template=SEARCH_QUERY_TEMPLATE
            )
        )

    def generate_search_query(self, structured_preferences: Dict) -> str:
        """Convert structured preferences into an optimized search query.

        Args:
            structured_preferences: Parsed buyer preferences (JSON-serializable).

        Returns:
            The LLM-generated query with surrounding whitespace removed.
        """
        preferences_str = json.dumps(structured_preferences, indent=2)
        search_query = self.query_generator.run(structured_preferences=preferences_str)
        return search_query.strip()

    def search(self, structured_preferences: Dict,
               num_results: int = 3,
               diversity_factor: float = 0.7) -> List:
        """
        Perform semantic search using structured preferences

        Args:
            structured_preferences: Parsed buyer preferences
            num_results: Number of results to return
            diversity_factor: Balance between relevance (0.0) and diversity (1.0)

        Returns:
            Up to ``num_results`` matching documents, or an empty list when the
            search fails (the error is printed).
        """
        try:
            # Generate optimized search query
            search_query = self.generate_search_query(structured_preferences)

            # MMR's lambda_mult runs the opposite way to diversity_factor
            # (0 = maximum diversity, 1 = minimum diversity), so invert the value
            # to honour the documented meaning of diversity_factor.
            retriever = self.vectorstore.as_retriever(
                search_type="mmr",
                search_kwargs={
                    "k": num_results,
                    "fetch_k": num_results * 2,  # candidate pool for MMR re-ranking
                    "lambda_mult": 1.0 - diversity_factor
                }
            )

            results = retriever.get_relevant_documents(search_query)

            # Limit to requested number of results
            return results[:num_results]

        except Exception as e:
            # Best effort: report the failure and return no matches.
            print(f"Error during search: {e}")
            return []

    def explain_match(self, listing: str, preferences: Dict) -> str:
        """Generate an explanation of why this listing matches the preferences.

        Args:
            listing: Listing text (any other JSON-serializable value is dumped
                to indented JSON).
            preferences: Structured buyer preferences.

        Returns:
            The LLM's explanation text.
        """
        explanation_prompt = PromptTemplate(
            input_variables=["listing", "preferences"],
            template="""
            Explain why this property matches the buyer's preferences:

            Property Details:
            {listing}

            Buyer Preferences:
            {preferences}

            Provide a concise explanation focusing on the key matching points:
            """
        )

        # A plain-text listing is passed through unchanged: json.dumps on a string
        # would wrap it in quotes and escape its newlines into one long line.
        listing_text = listing if isinstance(listing, str) else json.dumps(listing, indent=2)

        explanation_chain = LLMChain(llm=self.llm, prompt=explanation_prompt)
        return explanation_chain.run(
            listing=listing_text,
            preferences=json.dumps(preferences, indent=2)
        )


# Build the search engine on top of the populated vector store.
search_engine = HomeMatchSearchEngine(vectorstore, openai_api_key=openapi_key)

# Retrieve up to five candidate listings for the parsed buyer preferences.
matches = search_engine.search(structured_preferences, num_results=5, diversity_factor=0.7)

# Print each match, then the LLM-written explanation, then a separator line.
separator = "-" * 80
for position, match in enumerate(matches, start=1):
    print(f"\nMatch {position}:")
    print(match.page_content)

    explanation = search_engine.explain_match(match.page_content, structured_preferences)
    print("\nExplanation of this match:")
    print(explanation)
    print(separator)



Match 1:
Neighborhood: Valley View
Price: $600,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft
Description: Escape to the peaceful neighborhood of Valley View with this 3-bedroom, 2-bathroom home nestled on a quiet cul-de-sac. The cozy living room features a fireplace, the updated kitchen boasts quartz countertops, and the backyard offers a tranquil retreat with a deck and mature trees. Enjoy the serenity and charm of Valley View living.
Neighborhood Description: Valley View is a hidden gem known for its quiet streets and scenic views of the valley. Residents can enjoy hiking trails, parks, and local shops in this close-knit community. With a strong sense of community and a peaceful atmosphere, Valley View offers a relaxed and laid-back lifestyle.

Explanation of this match:
This property in Valley View matches the buyer's preferences as it meets their criteria for a quiet neighborhood with good local schools and convenient shopping options. The house itself has 3 bedrooms, a cozy