# Step 1: Managing Libraries

In [16]:
!pip install langchain_community
!pip install chromadb
!pip install tiktoken
!pip install load_dotenv



In [68]:
import os
from dotenv import load_dotenv


# Load variables from a local .env file into the environment before reading the key.
load_dotenv()
# NOTE: the variable is spelled `openapi_key`; later cells reference it under exactly this name.
openapi_key = os.getenv("OPENAI_API_KEY")
# Only a warning is printed when the key is missing; execution continues.
if not  openapi_key:
  print("API Key not found in .env file.")

# Step 2: Generating Real Estate Listings

**Rubric:** Synthetic Data Generation

In [69]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd
import json



# One-shot prompt used to generate synthetic listings.
# `{num_listings}` is the only placeholder; the doubled braces around the example
# listing are escaped literal braces so the template formatter leaves them alone.
ONE_SHOT_PROMPT=    """
        You are a real estate listing agent. Generate {num_listings} unique real estate listings.
        Each listing should have the following details in JSON format:
        Neighborhood, Price, Bedrooms (integer), Bathrooms(integer), House Size, Description, and Neighborhood Description.
        Here's a example of a listing:

        {{
          "Neighborhood": "Green Oaks",
          "Price": "$800,000",
          "Bedrooms": 3,
          "Bathrooms": 2,
          "House Size": "2,000 sqft",
          "Description": "Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.",
          "Neighborhood Description": "Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze."
        }}
        Ensure all listings are creative, realistic, and vary in features like location, price, and house type.
        Finally, pack each listing in one list of json objects.
        """


# Persist generated listings to disk and hand back their tabular form
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV and return the DataFrame built from it.

    Args:
        data: Anything ``pd.DataFrame`` accepts (here: a list of listing dicts).
        filename: Destination path of the CSV file.

    Returns:
        The ``pandas.DataFrame`` constructed from ``data``.
    """
    listings_frame = pd.DataFrame(data)

    # index=False keeps the positional row index out of the written file
    listings_frame.to_csv(filename, index=False)
    print("Data saved to {}".format(filename))
    return listings_frame


# Wrap the one-shot prompt; `num_listings` is its only input variable.
prompt_template = PromptTemplate(
    input_variables=["num_listings"],
    template=(
        ONE_SHOT_PROMPT
    )
)

# Initialize the LLM (temperature 0.7, i.e. non-deterministic output between runs)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7, openai_api_key=openapi_key)

# Create the chain that feeds the prompt to the LLM
chain = LLMChain(llm=llm, prompt=prompt_template)

# Number of listings to generate
num_listings = 10

# Run the chain to generate listings; `response` is the raw text returned by the chain
response = chain.run(num_listings=num_listings)

# Parse the response into JSON.
# NOTE(review): on a parse failure `listings` becomes [] and the CSV below is still
# written from that empty list, so downstream cells would then see an empty file.
try:
    listings = json.loads(response)
except json.JSONDecodeError:
    print("Failed to parse response as JSON. Check the LLM output.")
    listings = []

# Save listings to a CSV file and display the resulting DataFrame as the cell output
df=save_to_csv(listings, "./real_estate_listings.csv")
df


Data saved to ./real_estate_listings.csv


Unnamed: 0,Neighborhood,Price,Bedrooms,Bathrooms,House Size,Description,Neighborhood Description
0,Sunset Heights,"$650,000",4,3,"2,500 sqft",Enjoy breathtaking sunsets from this stunning ...,Sunset Heights is a vibrant neighborhood known...
1,Lakeview Estates,"$1,200,000",5,4,"3,800 sqft",Experience luxury living in this exquisite 5-b...,Lakeview Estates is a prestigious gated commun...
2,Forest Hills,"$500,000",3,2,"1,800 sqft",Nestled in the tranquil neighborhood of Forest...,Forest Hills is a hidden gem known for its lus...
3,Oceanfront Paradise,"$2,500,000",6,5,"4,500 sqft",Live the ultimate beachfront lifestyle in this...,Oceanfront Paradise is a prestigious coastal c...
4,Mountain View Heights,"$800,000",4,3,"2,200 sqft",Escape to the serene mountains in this 4-bedro...,Mountain View Heights offers a peaceful mounta...
5,Riverfront Retreat,"$900,000",3,2,"2,300 sqft",Experience waterfront living in this charming ...,Riverfront Retreat is a tranquil waterfront co...
6,Downtown Loft District,"$700,000",2,2,"1,600 sqft",Live in the heart of the city in this trendy 2...,Downtown Loft District is a vibrant urban neig...
7,Golf Course Estates,"$1,000,000",4,3,"2,800 sqft","Live in luxury in this 4-bedroom, 3-bathroom h...",Golf Course Estates is a prestigious gated com...
8,Historic District,"$600,000",3,2,"2,000 sqft","Step back in time in this charming 3-bedroom, ...",Historic District is a picturesque neighborhoo...
9,Beachside Bungalow,"$400,000",2,1,"1,200 sqft",Live the beach lifestyle in this cozy 2-bedroo...,Beachside Bungalow is a laid-back beach commun...


# Step 3: Storing Listings in a Vector Database

**Rubric:** Semantic Search | Creating a Vector Database and Storing Listings

In [70]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


def create_vector_store(csv_path, persist_directory="./chroma.db",
                        chunk_size=1000, chunk_overlap=200, api_key=None):
    """Load the listings CSV, split it into chunks, embed them and store them in Chroma.

    Args:
        csv_path: Path of the listings CSV; its first row is expected to be a header.
        persist_directory: Directory handed to Chroma for persistence.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between consecutive chunks.
        api_key: OpenAI API key for the embedding model. When ``None`` the
            notebook-level ``openapi_key`` is used, as before.

    Returns:
        The Chroma vector store built from the split documents.
    """
    listing_columns = [
        'Neighborhood', 'Price', 'Bedrooms', 'Bathrooms',
        'House Size', 'Description', 'Neighborhood Description',
    ]

    # Initialize CSV loader with explicit column names
    loader = CSVLoader(
        file_path=csv_path,
        csv_args={
            'delimiter': ',',
            'quotechar': '"',
            'fieldnames': listing_columns
        }
    )

    # Because the column names are supplied explicitly, the file's own header row
    # is loaded as an ordinary document; drop that first document.
    documents = loader.load()[1:]

    # Split on line breaks so chunk boundaries fall between "Field: value" lines
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    split_docs = text_splitter.split_documents(documents)

    # Fall back to the notebook-level key only when the caller did not pass one
    embeddings = OpenAIEmbeddings(
        openai_api_key=openapi_key if api_key is None else api_key
    )

    # Create and persist vector store
    # NOTE(review): presumably re-running against an existing persist_directory adds the
    # same listings again - verify, and point at a fresh directory if duplicates appear.
    vectorstore = Chroma.from_documents(
        documents=split_docs,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    return vectorstore

# Retrieve listings for a query using Maximal Marginal Relevance (MMR)
def search_listings_using_mmr(vectorstore, query, k=2, fetch_k=5, lambda_mmr=0.5):
    """Run an MMR search against ``vectorstore`` and return the matching documents.

    Args:
        vectorstore: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        query: Free-text search query.
        k: Value forwarded as the retriever's ``k`` search option.
        fetch_k: Value forwarded as the retriever's ``fetch_k`` search option.
        lambda_mmr: Value forwarded as the retriever's ``lambda_mult`` search option.

    Returns:
        Whatever the retriever's ``get_relevant_documents(query)`` returns.
    """
    mmr_options = {
        "k": k,
        "fetch_k": fetch_k,
        "lambda_mult": lambda_mmr,
    }
    mmr_retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs=mmr_options,
    )
    return mmr_retriever.get_relevant_documents(query)

# Build the vector store from the CSV written by the generation step;
# `vectorstore` is reused by later cells.
vectorstore = create_vector_store("./real_estate_listings.csv")

# Testing: run one MMR query with the function's default search settings
search_query = "Find luxury homes with modern amenities"
search_results = search_listings_using_mmr(vectorstore, search_query)

# Print search results, numbered from 1
for i, doc in enumerate(search_results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content)



Result 1:
Neighborhood: Lakeview Estates
Price: $1,200,000
Bedrooms: 5
Bathrooms: 4
House Size: 3,800 sqft
Description: Experience luxury living in this exquisite 5-bedroom, 4-bathroom home in Lakeview Estates. The grand foyer welcomes you into the elegant living spaces, featuring high ceilings and designer finishes. The gourmet kitchen is a chef's dream, with top-of-the-line appliances and a breakfast nook. Retreat to the private backyard with a lush garden and serene lake views.
Neighborhood Description: Lakeview Estates is a prestigious gated community known for its upscale amenities and waterfront properties. Residents enjoy exclusive access to a private clubhouse, tennis courts, and walking trails. With top-rated schools and upscale shopping nearby, Lakeview Estates offers the epitome of luxury living.

Result 2:
Neighborhood: Downtown Loft District
Price: $700,000
Bedrooms: 2
Bathrooms: 2
House Size: 1,600 sqft
Description: Live in the heart of the city in this trendy 2-bedroom,

# Step 4: Building Buyer Preferences



In [71]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

# Questions asked to the buyer; their order defines how answers are paired with them.
PREFERENCE_QUESTIONS = [
    "How big do you want your house to be?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?"
]

# Example answers, one per question above (same order); used when not collecting interactively.
EXAMPLE_ANSWERS = [
    "A spacious five-bedroom house with a large backyard and a finished basement.",
    "Safety, a great view, and nearby entertainment options.",
    "A home office, a patio for outdoor dining, and built-in smart home features.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]


# Prompt that asks the LLM to turn the Q&A text into a JSON object with fixed top-level fields.
# `{qa_pairs}` is the only placeholder.
PREFERENCE_PARSER_TEMPLATE = """
You are an expert real estate preference analyzer. Given a set of buyer responses to preference questions,
create a structured analysis that can be used for property matching.

Questions and Answers:
{qa_pairs}

Please analyze these preferences and create a structured JSON output with the following fields:
- size_requirements: Details about desired house size, bedrooms, etc.
- location_preferences: Neighborhood characteristics and location requirements
- must_have_features: List of essential features or deal-breakers
- nice_to_have_features: List of preferred but not essential features
- lifestyle_factors: Environmental and community aspects important to the buyer
- transportation_needs: Transportation and accessibility requirements
- price_sensitivity: Any mentions or implications about budget (if mentioned)

Ensure the output is in valid JSON format and captures both explicit and implicit preferences from the answers.
"""

class PreferenceCollector:
    """Collect a buyer's answers and convert them into structured preferences via an LLM.

    Typical flow: ``collect_preferences`` -> ``parse_preferences``.
    """

    def __init__(self, openai_api_key):
        """Create the chat model, the parser prompt and the chain that joins them.

        Args:
            openai_api_key: API key forwarded to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.2,
            openai_api_key=openai_api_key
        )

        self.parser_prompt = PromptTemplate(
            input_variables=["qa_pairs"],
            template=PREFERENCE_PARSER_TEMPLATE
        )

        self.chain = LLMChain(
            llm=self.llm,
            prompt=self.parser_prompt
        )

    def collect_preferences(self, interactive=False):
        """Collect buyer preferences either interactively or using example data.

        Args:
            interactive: When true, prompt on stdin for each question; otherwise
                use ``EXAMPLE_ANSWERS``.

        Returns:
            Dict mapping each question in ``PREFERENCE_QUESTIONS`` to its answer.
        """
        if interactive:
            answers = [
                input(f"{question}\nYour answer: ")
                for question in PREFERENCE_QUESTIONS
            ]
        else:
            answers = list(EXAMPLE_ANSWERS)

        return dict(zip(PREFERENCE_QUESTIONS, answers))

    def format_qa_pairs(self, preferences_dict):
        """Format Q&A pairs for the LLM prompt.

        Returns:
            One ``QUESTION: ...`` / ``ANSWER: ...`` block per entry, each followed
            by a blank line; an empty string for an empty dict.
        """
        return "".join(
            f"QUESTION: {question}\nANSWER: {answer}\n\n"
            for question, answer in preferences_dict.items()
        )

    @staticmethod
    def _strip_code_fence(text):
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line ("```" or "```json") ...
            _, _, cleaned = cleaned.partition("\n")
            cleaned = cleaned.rstrip()
            # ... and the closing fence when present.
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return cleaned.strip()

    def parse_preferences(self, preferences_dict):
        """Parse raw preferences into structured format using LLM.

        Args:
            preferences_dict: Mapping of question -> answer.

        Returns:
            The decoded JSON value from the LLM reply, or ``None`` when the call
            fails or the reply is not valid JSON (an error message is printed).
        """
        qa_pairs = self.format_qa_pairs(preferences_dict)

        try:
            # Get structured analysis from LLM
            response = self.chain.run(qa_pairs=qa_pairs)

            # Chat models frequently wrap JSON in a ```json fence, which json.loads
            # rejects; remove the fence before decoding.
            return json.loads(self._strip_code_fence(response))

        except json.JSONDecodeError as e:
            print(f"Error parsing LLM response as JSON: {e}")
            return None
        except Exception as e:
            print(f"Error processing preferences: {e}")
            return None

# Initialize preference collector with the key loaded from the environment
collector = PreferenceCollector(openai_api_key=openapi_key)

# Collect preferences (non-interactive: uses the example answers)
raw_preferences = collector.collect_preferences(interactive=False)
# Parse preferences into structured format; this is None when the LLM call or JSON parsing failed
structured_preferences = collector.parse_preferences(raw_preferences)

# `structured_preferences` is consumed by the search and personalization cells below.
if structured_preferences:
  print("\nStructured Preferences:")
  print(json.dumps(structured_preferences, indent=2))
else:
  print("Failed to process preferences")



Structured Preferences:
{
  "size_requirements": {
    "house_size": "Spacious",
    "bedrooms": 5,
    "backyard": "Large",
    "basement": "Finished"
  },
  "location_preferences": {
    "safety": true,
    "great_view": true,
    "entertainment_options": true,
    "urban_access": "Balanced"
  },
  "must_have_features": [
    "Home office",
    "Outdoor dining patio",
    "Built-in smart home features"
  ],
  "nice_to_have_features": [],
  "lifestyle_factors": [],
  "transportation_needs": {
    "bus_line": "Easy access to reliable bus line",
    "highway": "Proximity to major highway",
    "bike_friendly_roads": true
  },
  "price_sensitivity": null
}


# Step 5: Searching Based on Preferences

**Rubric:** Semantic Search | Listings Based on Buyer Preferences

In [72]:
from typing import Dict, List
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

class HomeMatchSearchEngine:
    """Turn structured buyer preferences into an MMR vector search and explain matches."""

    def __init__(self, vectorstore, openai_api_key):
        """Set up the LLM plus the query-generation and explanation chains.

        Args:
            vectorstore: Vector store exposing ``as_retriever``.
            openai_api_key: API key forwarded to ``ChatOpenAI``.
        """
        self.vectorstore = vectorstore
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.0,
            openai_api_key=openai_api_key
        )

        # Improved search query template
        self.query_template = PromptTemplate(
            input_variables=["preferences"],
            template="""
            Create a detailed property search query based on these preferences:
            {preferences}

            Focus on:
            1. Primary requirements (bedrooms, bathrooms, price range)
            2. Location preferences and neighborhood characteristics
            3. Must-have features and amenities
            4. Lifestyle and environmental factors

            Return only the search query as a detailed but concise paragraph.
            """
        )

        self.chain = LLMChain(llm=self.llm, prompt=self.query_template)

        # Built once here instead of on every explain_match call.
        self.explanation_template = PromptTemplate(
            input_variables=["listing", "preferences"],
            template="""
            Explain why this property matches the buyer's preferences:

            Property Details:
            {listing}

            Buyer Preferences:
            {preferences}

            ACTION: Provide a concise explanation focusing on the key matching points
            """
        )
        self.explanation_chain = LLMChain(llm=self.llm, prompt=self.explanation_template)

    def semantic_search(self,
                       structured_preferences: Dict,
                       num_results: int = 3,
                       mmr_lambda: float = 0.5) -> List:
        """
        Perform semantic search with MMR

        Args:
            structured_preferences: Dict of buyer preferences
            num_results: Number of results to return
            mmr_lambda: Trade-off between max diversity (0.0) and minimum diversity (1.0)

        Returns:
            List of retrieved documents; an empty list when no preferences were
            supplied or when the search raised (the error is printed).
        """
        # Upstream parsing yields None on failure; searching on "null" would only
        # produce an arbitrary query, so stop early instead of calling the LLM.
        if not structured_preferences:
            print("Search error: no buyer preferences provided")
            return []

        try:
            # Generate search query from preferences
            search_query = self.chain.run(preferences=json.dumps(structured_preferences))

            # Configure MMR retriever
            retriever = self.vectorstore.as_retriever(
                search_type="mmr",
                search_kwargs={
                    "k": num_results,
                    "fetch_k": num_results * 2,  # Fetch more candidates for diversity
                    "lambda_mult": mmr_lambda
                }
            )
            # Perform search
            return retriever.get_relevant_documents(search_query)

        except Exception as e:
            print(f"Search error: {e}")
            return []

    def explain_match(self, listing: str, preferences: Dict) -> str:
        """Generate explanation for why listing matches preferences.

        Args:
            listing: Listing text (or any JSON-serializable listing object).
            preferences: Structured buyer preferences.

        Returns:
            The explanation text produced by the LLM chain.
        """
        # A plain-text listing goes into the prompt verbatim; JSON-encoding a string
        # would only add quotes and turn its line breaks into literal "\n" sequences.
        listing_text = listing if isinstance(listing, str) else json.dumps(listing)
        return self.explanation_chain.run(
            listing=listing_text,
            preferences=json.dumps(preferences)
        )

# Initialize search engine on the vector store built earlier
search_engine = HomeMatchSearchEngine(vectorstore, openai_api_key=openapi_key)

# Perform semantic search; `results` is reused by the personalization and fact-check cells
results = search_engine.semantic_search(
        structured_preferences=structured_preferences,
        num_results=3,
        mmr_lambda=0.5
    )
# For every hit: print the listing text, then an LLM-written explanation of the match
for doc in results:
  print(doc.page_content)
  print("\n")
  print("The Why of this match: ")
  print(search_engine.explain_match(doc.page_content, structured_preferences))

  print("-" * 80)



Neighborhood: Sunset Heights
Price: $650,000
Bedrooms: 4
Bathrooms: 3
House Size: 2,500 sqft
Description: Enjoy breathtaking sunsets from this stunning 4-bedroom, 3-bathroom home in Sunset Heights. The spacious open floor plan is perfect for entertaining, with a gourmet kitchen and expansive living room. Relax in the luxurious master suite with a spa-like bathroom and walk-in closet. Step outside to the backyard oasis with a pool and outdoor kitchen, perfect for summer gatherings.
Neighborhood Description: Sunset Heights is a vibrant neighborhood known for its trendy restaurants, boutique shops, and art galleries. Take a leisurely stroll along Sunset Boulevard or join a yoga class at the local studio. With easy access to hiking trails and parks, outdoor enthusiasts will love living in Sunset Heights.


The Why of this match: 
This property matches the buyer's preferences because it has a spacious house size with 4 bedrooms, a large backyard with a pool and outdoor kitchen, and is locat

# Step 6: Augmented Response Generation

**Rubric:** LLM Augmentation: For each retrieved listing, use the LLM to augment the description, tailoring it to resonate with the buyer's specific preferences. This involves subtly emphasizing aspects of the property that align with what the buyer is looking for.

In [73]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from typing import Dict, List
import json

class ListingPersonalizer:
    """Rewrite a listing so it emphasizes what a given buyer cares about."""

    # Listing text labels that differ from the spec keys requested from the LLM.
    _FIELD_ALIASES = {"house size": "size"}

    def __init__(self, openai_api_key):
        """Create the chat model, the personalization prompt and the chain.

        Args:
            openai_api_key: API key forwarded to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.3,  # Lower temperature for more factual outputs
            openai_api_key=openai_api_key
        )

        # Prompt for personalizing descriptions
        self.personalization_template = PromptTemplate(
            input_variables=["listing", "preferences"],
            template="""
            You are an expert real estate agent personalizing a property listing.

            ORIGINAL LISTING:
            {listing}

            BUYER PREFERENCES:
            {preferences}

            TASK:
            Rewrite the listing description to highlight features that match the buyer's preferences.

            REQUIREMENTS:
            1. Maintain absolute factual accuracy - DO NOT ADD or remove any property features
            2. Preserve all numerical data (price, size, rooms, etc.)
            3. Emphasize aspects that align with buyer preferences
            4. Add contextual details about how features meet the buyer's needs
            5. Keep the tone professional and authentic

            Return the response in this JSON format:
            {{
                "personalized_description": "The enhanced property description",
                "highlighted_features": ["3-5 key features that match preferences"],
                "lifestyle_match": "Brief statement about property-lifestyle fit",
                "original_specs": {{
                    "price": "original price",
                    "bedrooms": "original bedroom count",
                    "bathrooms": "original bathroom count",
                    "size": "original size",
                    "location": "original location"
                }}
            }}
            """
        )

        self.chain = LLMChain(
            llm=self.llm,
            prompt=self.personalization_template
        )

    @staticmethod
    def _strip_code_fence(text):
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line ("```" or "```json") and the closing fence.
            _, _, cleaned = cleaned.partition("\n")
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _listing_facts(cls, listing) -> Dict:
        """Normalize a listing (dict or "Label: value" text) into lower-cased fact keys."""
        if isinstance(listing, dict):
            pairs = list(listing.items())
        else:
            # Split only on the first colon so values containing ':' stay intact.
            pairs = [
                line.split(":", 1)
                for line in str(listing).splitlines()
                if ":" in line
            ]

        facts = {}
        for label, value in pairs:
            key = str(label).strip().lower()
            facts[cls._FIELD_ALIASES.get(key, key)] = value
        return facts

    def personalize_listing(self, listing: str, buyer_preferences: Dict) -> Dict:
        """
        Create a personalized version of the listing based on buyer preferences
        while maintaining factual integrity.

        Args:
            listing: Original listing text.
            buyer_preferences: Structured buyer preferences (JSON-serializable).

        Returns:
            The decoded JSON reply from the LLM. If the call or the decoding fails,
            the error is printed and the original ``listing`` is returned unchanged.
        """
        try:
            # Generate personalized content
            response = self.chain.run(
                listing=listing,
                preferences=json.dumps(buyer_preferences)
            )

            # Chat models often wrap JSON in a ```json fence; remove it before decoding.
            return json.loads(self._strip_code_fence(response))

        except Exception as e:
            print(f"Error personalizing listing: {e}")
            # Best-effort fallback: hand back the untouched original.
            return listing

    def verify_facts(self, personalized_specs: Dict, original_listing: "str | Dict") -> bool:
        """
        Verify that no factual information has been altered

        Args:
            personalized_specs: Spec dict with ``price``/``bedrooms``/``bathrooms``/``size`` keys
                (e.g. the ``original_specs`` part of a personalized listing).
            original_listing: The original listing, either as "Label: value" text
                or as a dict of facts.

        Returns:
            ``True`` when every fact present on both sides agrees.

        Raises:
            ValueError: When a fact present on both sides differs.
        """
        original_facts = self._listing_facts(original_listing)
        for fact in ('price', 'bedrooms', 'bathrooms', 'size'):
            orig_value = str(original_facts.get(fact, '')).strip().lower()
            pers_value = str(personalized_specs.get(fact, '')).strip().lower()
            # Facts missing on either side are skipped rather than treated as mismatches.
            if orig_value and pers_value and orig_value != pers_value:
                raise ValueError(f"Factual discrepancy found in {fact}")
        return True




# Initialize personalizer
personalizer = ListingPersonalizer(openapi_key)


# One entry per document in `results`, in the same order (the fact-check cell indexes into it).
augmented_response=[]
# Generate personalized listing
for doc in results:
    # personalize_listing returns the parsed LLM reply, or the original text if it failed,
    # so entries may be either a dict or a str.
    augmentation = personalizer.personalize_listing(
       doc.page_content,
       structured_preferences
    )
    augmented_response.append(augmentation)
    print("Original Listing:")
    print(doc.page_content)
    print("\n")

    print("Augmented Listing:")
    print(augmentation)
    print("-" * 80)





Original Listing:
Neighborhood: Sunset Heights
Price: $650,000
Bedrooms: 4
Bathrooms: 3
House Size: 2,500 sqft
Description: Enjoy breathtaking sunsets from this stunning 4-bedroom, 3-bathroom home in Sunset Heights. The spacious open floor plan is perfect for entertaining, with a gourmet kitchen and expansive living room. Relax in the luxurious master suite with a spa-like bathroom and walk-in closet. Step outside to the backyard oasis with a pool and outdoor kitchen, perfect for summer gatherings.
Neighborhood Description: Sunset Heights is a vibrant neighborhood known for its trendy restaurants, boutique shops, and art galleries. Take a leisurely stroll along Sunset Boulevard or join a yoga class at the local studio. With easy access to hiking trails and parks, outdoor enthusiasts will love living in Sunset Heights.


Augmented Listing:
{'personalized_description': 'Experience luxury living in this spacious 4-bedroom, 3-bathroom home located in the vibrant Sunset Heights neighborhood

# Step 7: Personalizing Listing Descriptions
**Rubric:** Maintaining Factual Integrity: Ensure that the augmentation process enhances the appeal of the listing without altering factual information.

In [74]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from typing import Dict, Tuple
import json

class FactualIntegrityChecker:
    """Ask an LLM to compare a personalized listing against its original and judge the result."""

    # Minimum LLM-reported accuracy score (0-100) for a personalization to be accepted.
    MIN_ACCURACY_SCORE = 85

    def __init__(self, openai_api_key):
        """Create the chat model, the fact-check prompt and the chain.

        Args:
            openai_api_key: API key forwarded to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.0,  # Zero temperature for maximum consistency
            openai_api_key=openai_api_key
        )

        # Prompt for fact checking
        self.fact_check_template = PromptTemplate(
            input_variables=["original", "personalized"],
            template="""
            You are a precise fact-checking system for real estate listings. Compare the original listing with its personalized version.

            ORIGINAL LISTING:
            {original}

            PERSONALIZED LISTING:
            {personalized}

            TASK:
            1. Compare all factual information between the listings
            2. Check for any discrepancies or alterations in:
               - Price
               - Number of bedrooms/bathrooms
               - Square footage
               - Location/neighborhood
               - Property features and amenities
               - Year built (if mentioned)
               - Property type
               - Any other numerical or factual data

            Return the analysis in this JSON format:
            {{
                "factual_accuracy_score": <score between 0-100>,
                "discrepancies": [
                    {{
                        "field": "field_name",
                        "original": "original_value",
                        "personalized": "personalized_value",
                        "severity": "high/medium/low"
                    }}
                ],
                "verified_facts": [
                    "list of correctly maintained facts"
                ],
                "overall_assessment": "brief assessment of factual integrity",
                "is_acceptable": true/false
            }}
            IF severity is LOW then this is acceptable.
            """
        )

        self.chain = LLMChain(
            llm=self.llm,
            prompt=self.fact_check_template
        )

    @staticmethod
    def _strip_code_fence(text):
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line ("```" or "```json") and the closing fence.
            _, _, cleaned = cleaned.partition("\n")
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return cleaned.strip()

    def verify_listing(self, original_listing: str, personalized_listing: Dict) -> Tuple[bool, Dict]:
        """
        Verify factual integrity between original and personalized listings

        Args:
            original_listing: Original listing text.
            personalized_listing: Personalized listing; a dict is JSON-encoded for the
                prompt, any other value is passed through as-is.

        Returns: (is_acceptable, detailed_analysis)
            ``is_acceptable`` is computed locally from the analysis (the LLM's own
            ``is_acceptable`` field is not used). On any error the result is
            ``(False, {...})`` with an ``"error"`` entry describing the failure.
        """
        try:

            personalized_str = (json.dumps(personalized_listing)
                              if isinstance(personalized_listing, dict)
                              else personalized_listing)

            # Run factual Integrity
            analysis_str = self.chain.run(
                original=original_listing,
                personalized=personalized_str
            )

            # Chat models often wrap JSON in a ```json fence; remove it before decoding.
            analysis = json.loads(self._strip_code_fence(analysis_str))

            # Acceptable when score >= MIN_ACCURACY_SCORE and there is no high-severity
            # discrepancy; severity is compared case-insensitively ("High" == "high").
            has_high_severity = any(
                str(d['severity']).strip().lower() == 'high'
                for d in analysis['discrepancies']
            )
            is_acceptable = (
                float(analysis['factual_accuracy_score']) >= self.MIN_ACCURACY_SCORE
                and not has_high_severity
            )

            return is_acceptable, analysis

        except Exception as e:
            print(f"Error during fact verification: {e}")
            return False, {
                "error": str(e),
                "factual_accuracy_score": 0,
                "discrepancies": [],
                "is_acceptable": False
            }


# Fact-check every personalized listing against the document it was generated from.
factual_checker = FactualIntegrityChecker(openapi_key)
# `augmented_response` was filled in the same order as `results`, so index i pairs them up.
for i,doc in enumerate(results):
    is_acceptable, analysis = factual_checker.verify_listing(
       doc.page_content,
       augmented_response[i]
    )
    # Rejected listings print the full analysis; accepted ones print the personalized listing.
    if not is_acceptable:
        print("Warning: Factual discrepancies detected!")
        print(analysis)
    else:
        print(f"Personalization is acceptable {augmented_response[i]} ")
    print("-" * 80)



Personalization is acceptable {'personalized_description': 'Experience luxury living in this spacious 4-bedroom, 3-bathroom home located in the vibrant Sunset Heights neighborhood. This home boasts a gourmet kitchen, expansive living room, and a luxurious master suite with a spa-like bathroom and walk-in closet. Step outside to your backyard oasis featuring a large pool and outdoor kitchen, perfect for entertaining guests and enjoying summer gatherings.', 'highlighted_features': ['Spacious 4-bedroom layout', 'Large backyard with pool and outdoor kitchen', 'Luxurious master suite with walk-in closet'], 'lifestyle_match': 'This property offers a perfect blend of luxury living and outdoor entertainment, ideal for those seeking a spacious home with ample outdoor space for gatherings and relaxation.', 'original_specs': {'price': '$650,000', 'bedrooms': '4', 'bathrooms': '3', 'size': '2,500 sqft', 'location': 'Sunset Heights'}} 
---------------------------------------------------------------