In [3]:
# Create the agent workflow
def create_property_price_agent():
    """Build the three-stage price-fetching graph and return it compiled.

    The stages run strictly in sequence: scrape_websites -> extract_prices ->
    calculate_total -> END. State flowing between them is an AgentState.
    """
    graph = StateGraph(AgentState)

    # Stages listed in execution order; the wiring below is derived from it.
    pipeline = [
        ("scrape_websites", scrape_websites_node),
        ("extract_prices", extract_prices_node),
        ("calculate_total", calculate_total_node),
    ]
    for stage_name, stage_fn in pipeline:
        graph.add_node(stage_name, stage_fn)

    stage_names = [stage_name for stage_name, _ in pipeline]
    graph.set_entry_point(stage_names[0])

    # Link every stage to its successor; the final stage links to END.
    for source, target in zip(stage_names, stage_names[1:] + [END]):
        graph.add_edge(source, target)

    return graph.compile()

# Create the agent
# Module-level compiled graph; fetch_property_prices() reads this name when invoked.
# NOTE: this cell needs the import cell (StateGraph, END) plus the AgentState and
# node-function cells to have been executed first. On a fresh kernel it fails with
# the NameError recorded in this cell's output.
property_price_agent = create_property_price_agent()

NameError: name 'StateGraph' is not defined

# Property Price Fetcher Agent

This notebook implements an agent that fetches property prices from websites by:
1. Accepting an address prompt
2. Scraping HTML from property websites
3. Extracting specific div content
4. Using an LLM to parse prices from the HTML
5. Summing up the total property prices

In [None]:
# Import required libraries
import os
from typing import TypedDict, List, Dict, Any
import requests
from bs4 import BeautifulSoup
import json
from langgraph.graph import StateGraph, END
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
import re
from dotenv import load_dotenv

# Load environment variables from .env file
# (the LLM initialisation cell expects OPENAI_API_KEY to be supplied this way)
load_dotenv()

print("Libraries imported successfully!")

In [None]:
# Define the state for our agent
class AgentState(TypedDict):
    """State dictionary handed from node to node in the property price graph."""
    # Address the price lookup is for; interpolated into log messages and the LLM prompt.
    address: str
    # Before scraping: caller-supplied dicts with "url" and optional "div_selector".
    # After scraping: the result dicts from scrape_website, i.e. "url", "selector",
    # "content", "status" and, on failure, "error".
    websites: List[Dict[str, Any]]
    # Every price extracted across all successfully scraped sites.
    extracted_prices: List[float]
    # Sum of extracted_prices (0 when none were found).
    total_price: float
    # Human-readable progress log appended to by each node.
    messages: List[str]

In [None]:
# Initialize the LLM using OpenAI (API key loaded from .env)
try:
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    print("LLM initialized successfully using OpenAI!")
except Exception as e:
    # Always (re)bind the name: otherwise a failed run leaves `llm` undefined, or
    # worse, silently keeps a stale client from an earlier execution of this cell.
    llm = None
    print(f"Error initializing LLM: {e}")
    print("Please ensure OPENAI_API_KEY is set in your .env file")

In [None]:
# Function to scrape HTML content from websites
def scrape_website(url: str, div_selector: str = None) -> Dict[str, Any]:
    """
    Download a page and return its HTML, optionally narrowed to one element.

    Args:
        url: Address of the page to fetch.
        div_selector: Optional CSS selector (for example ``#listing-price``,
            ``.price``, ``div.card span`` or ``[data-testid='x']``). When given,
            only the first matching element is returned; when omitted, the
            whole parsed document is returned.

    Returns:
        A dict with keys "url", "selector", "content" and "status".
        On success "status" is "success" and "content" is the HTML string.
        On any failure (request/HTTP error, invalid selector, or a selector
        that matches nothing) "status" is "error", "content" is None and an
        extra "error" key describes the problem. This function never raises.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        if div_selector:
            # Hand every selector to the CSS engine. The previous '#'/'.' prefix
            # shortcuts treated the rest of the string as a literal id/class name,
            # so compound selectors such as ".card.price" or "#main .price"
            # could never match.
            element = soup.select_one(div_selector)
            if element is None:
                # Report a miss as a failure instead of returning a placeholder
                # sentence as "content", which downstream code would pass to the
                # LLM as if it were page HTML.
                raise LookupError(f"No element found with selector: {div_selector}")
            content = str(element)
        else:
            # Return full HTML if no selector specified
            content = str(soup)

        return {
            "url": url,
            "selector": div_selector,
            "content": content,
            "status": "success"
        }

    except Exception as e:
        # Deliberately broad: callers rely on getting an error record back
        # rather than having to wrap every scrape in their own try/except.
        return {
            "url": url,
            "selector": div_selector,
            "content": None,
            "status": "error",
            "error": str(e)
        }

In [None]:
# Function to extract prices from HTML using LLM
def extract_prices_with_llm(html_content: str, address: str) -> List[float]:
    """
    Ask the module-level ``llm`` to pull property prices out of an HTML snippet.

    Args:
        html_content: HTML to inspect. Only the first 3000 characters are sent
            to the model. Empty or None input short-circuits to [].
        address: Property address, included in the prompt for context.

    Returns:
        The extracted prices as floats. Returns [] when the input is empty,
        when the model reports no prices, or when the LLM call or parsing fails
        (the error is printed, never raised).
    """
    if not html_content:
        return []

    system_prompt = """You are a property price extraction expert. Your task is to extract property prices from HTML content.
    
    Rules:
    1. Look for price patterns like $XXX,XXX or £XXX,XXX or €XXX,XXX
    2. Extract ONLY property prices, not other prices like fees or taxes
    3. Return prices as a JSON array of numbers (without currency symbols or commas)
    4. If no prices found, return empty array []
    5. Focus on the main property listing price
    """

    # Cap the HTML here, in code. The explanatory comment used to sit inside the
    # f-string and was therefore sent to the model as part of the prompt.
    html_snippet = html_content[:3000]

    user_prompt = f"""Extract property prices for the address: {address}
    
    From the following HTML content:
    {html_snippet}
    
    Return ONLY a JSON array of price numbers, e.g., [450000, 525000]
    """

    # Fallback heuristic only: bare numbers at or below this are assumed not to
    # be property prices (fees, bedroom counts, years, ...).
    min_plausible_price = 10000

    try:
        response = llm.invoke([
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ])

        content = response.content

        # Preferred path: a JSON array of numbers, decimals allowed.
        json_match = re.search(r'\[[\d.,\s]*\]', content)
        if json_match:
            try:
                return [float(price) for price in json.loads(json_match.group())]
            except (ValueError, TypeError):
                # Bracketed but not valid JSON (e.g. "[1,180,000]"); fall through
                # to the looser scan instead of giving up.
                pass

        # Fallback: scan for standalone numbers, accepting thousands separators.
        numbers = re.findall(r'\d+(?:,\d{3})*(?:\.\d+)?', content)
        prices = []
        for num in numbers:
            clean_num = float(num.replace(',', ''))
            if clean_num > min_plausible_price:
                prices.append(clean_num)
        return prices

    except Exception as e:
        print(f"Error extracting prices: {e}")
        return []

In [None]:
# Define the agent nodes

def scrape_websites_node(state: AgentState) -> AgentState:
    """Graph node: scrape every configured website and log the outcome.

    Reads ``state["websites"]`` (dicts with "url" and optional "div_selector")
    and returns a new state in which "websites" holds the scrape_website()
    result for each entry, with one log line per site appended to "messages".
    The incoming state dict and its lists are left unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    messages.append(f"Scraping websites for address: {state['address']}")

    scraped_data = []
    for website in state["websites"]:
        result = scrape_website(website["url"], website.get("div_selector"))
        scraped_data.append(result)

        if result["status"] == "success":
            messages.append(f"Successfully scraped {website['url']}")
        else:
            messages.append(f"Failed to scrape {website['url']}: {result.get('error', 'Unknown error')}")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "websites": scraped_data, "messages": messages}

def extract_prices_node(state: AgentState) -> AgentState:
    """Graph node: run LLM price extraction over every successfully scraped site.

    Sites whose "status" is not "success", or whose "content" is empty, are
    skipped without a log line. Returns a new state whose "extracted_prices"
    is the concatenation of all prices found and whose "messages" has one
    line per processed site. The incoming state is left unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    all_prices = []

    for website in state["websites"]:
        if website["status"] == "success" and website["content"]:
            prices = extract_prices_with_llm(website["content"], state["address"])
            all_prices.extend(prices)

            if prices:
                messages.append(f"Found {len(prices)} price(s) from {website['url']}: {prices}")
            else:
                messages.append(f"No prices found from {website['url']}")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "extracted_prices": all_prices, "messages": messages}

def calculate_total_node(state: AgentState) -> AgentState:
    """Graph node: sum the extracted prices and log the total and the average.

    Returns a new state with "total_price" set to the sum of
    ``state["extracted_prices"]`` (0.0 when the list is empty) and the
    summary lines appended to "messages". The incoming state is left
    unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    prices = state["extracted_prices"]

    if prices:
        # float() keeps the value consistent with the declared float field.
        total = float(sum(prices))
        messages.append(f"Total property price: ${total:,.2f}")
        messages.append(f"Average price: ${total/len(prices):,.2f}")
    else:
        total = 0.0
        messages.append("No prices were found to calculate total")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "total_price": total, "messages": messages}

In [None]:
# Main function to run the agent
def fetch_property_prices(address: str, websites: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Run the compiled agent for one address and summarise what it found.

    Args:
        address: Property address being looked up.
        websites: One dict per site, each with a 'url' and, optionally, a
            'div_selector' narrowing the page to a single element.

    Returns:
        Dict with keys "address", "total_price", "individual_prices",
        "num_prices_found" and "messages".
    """
    # Seed state for the graph; the nodes fill in the empty fields.
    final_state = property_price_agent.invoke(
        dict(
            address=address,
            websites=websites,
            extracted_prices=[],
            total_price=0.0,
            messages=[],
        )
    )

    found_prices = final_state["extracted_prices"]
    return {
        "address": address,
        "total_price": final_state["total_price"],
        "individual_prices": found_prices,
        "num_prices_found": len(found_prices),
        "messages": final_state["messages"],
    }

In [None]:
# Example: Using the Agent for "74 hilda rd baulkham hills nsw"
# Domain.com.au may block direct scraping, so this shows the configuration

address = "74 hilda rd baulkham hills nsw"
websites = [
    {
        "url": "https://www.domain.com.au/property-profile/74-hilda-road-baulkham-hills-nsw-2153",
        "div_selector": "[data-testid='estimate-card']"
    }
]

print(f"Configuration for: {address}")
print(f"Website: {websites[0]['url']}")
print(f"Target element: {websites[0]['div_selector']}")

# Example usage (Domain.com.au may block scraping due to anti-bot measures)
# Uncomment to try:
# result = fetch_property_prices(address, websites)
# print("\n=== Results ===")
# print(f"Address: {result['address']}")
# print(f"Total Price: ${result['total_price']:,.2f}")
# print(f"Individual prices: {result['individual_prices']}")
# for msg in result['messages']:
#     print(f"- {msg}")

# Demonstration with mock Domain.com.au data
# ("\n" is a real newline here; the earlier "\\n" printed a literal backslash-n)
print("\n" + "="*40)
print("MOCK DEMONSTRATION")
print("="*40)

mock_domain_html = """
<div data-testid="estimate-card" class="estimate-card">
    <h3>Property Estimate</h3>
    <div class="price-range">
        <span class="estimate-low">$1,180,000</span>
        <span class="estimate-high">$1,290,000</span>
    </div>
    <div class="confidence">High confidence estimate</div>
</div>
"""

# Test with mock data (this calls the real LLM, so the LLM cell must have succeeded)
prices = extract_prices_with_llm(mock_domain_html, address)
print(f"Mock extraction results: {prices}")
if prices:
    print(f"Price range: ${min(prices):,.2f} - ${max(prices):,.2f}")
    print(f"Average estimate: ${sum(prices)/len(prices):,.2f}")

print("\nNote: Many property websites use anti-scraping measures.")
print("For production use, consider using official APIs or web scraping services.")

## How to Use This Agent

1. **Environment Setup:**
   - The agent reads the OpenAI API key from your `.env` file
   - Make sure your `.env` file contains: `OPENAI_API_KEY=your-key-here`

2. **Prepare website configurations:**
   - Provide URLs of property listing websites
   - Specify div selectors for price elements (optional)
   - Use CSS selectors like `.class-name`, `#id-name`, or `div.specific-class`

3. **Run the agent:**
   ```python
   websites = [
       {"url": "https://property-site.com/...", "div_selector": ".price"},
       {"url": "https://another-site.com/...", "div_selector": "#listing-price"}
   ]
   
   result = fetch_property_prices("123 Main St, City, State", websites)
   ```

4. **The agent will:**
   - Scrape each website
   - Extract HTML content from specified divs
   - Use LLM to identify and parse property prices
   - Sum all found prices
   - Return detailed results with individual prices and total

In [4]:
# First, let's run all the code cells in order to set up the notebook

# Import required libraries
import os
from typing import TypedDict, List, Dict, Any
import requests
from bs4 import BeautifulSoup
import json
from langgraph.graph import StateGraph, END
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage
import re

# Define the state for our agent
class AgentState(TypedDict):
    """State dictionary handed from node to node in the property price graph."""
    # Address the price lookup is for; interpolated into log messages and the LLM prompt.
    address: str
    # Before scraping: caller-supplied dicts with "url" and optional "div_selector".
    # After scraping: the result dicts from scrape_website, i.e. "url", "selector",
    # "content", "status" and, on failure, "error".
    websites: List[Dict[str, Any]]
    # Every price extracted across all successfully scraped sites.
    extracted_prices: List[float]
    # Sum of extracted_prices (0 when none were found).
    total_price: float
    # Human-readable progress log appended to by each node.
    messages: List[str]

# Initialize the LLM (you'll need to set your API key)
# NOTE: the recorded run of this cell stopped on the constructor below with
# DefaultCredentialsError, i.e. no Google credentials were found. Presumably
# exporting GOOGLE_API_KEY in the environment (rather than hard-coding it here)
# resolves that; confirm against the langchain_google_genai docs.
# os.environ["GOOGLE_API_KEY"] = "your-api-key-here"
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

# Function to scrape HTML content from websites
def scrape_website(url: str, div_selector: str = None) -> Dict[str, Any]:
    """
    Download a page and return its HTML, optionally narrowed to one element.

    Args:
        url: Address of the page to fetch.
        div_selector: Optional CSS selector (for example ``#listing-price``,
            ``.price``, ``div.card span`` or ``[data-testid='x']``). When given,
            only the first matching element is returned; when omitted, the
            whole parsed document is returned.

    Returns:
        A dict with keys "url", "selector", "content" and "status".
        On success "status" is "success" and "content" is the HTML string.
        On any failure (request/HTTP error, invalid selector, or a selector
        that matches nothing) "status" is "error", "content" is None and an
        extra "error" key describes the problem. This function never raises.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        if div_selector:
            # Hand every selector to the CSS engine. The previous '#'/'.' prefix
            # shortcuts treated the rest of the string as a literal id/class name,
            # so compound selectors such as ".card.price" or "#main .price"
            # could never match.
            element = soup.select_one(div_selector)
            if element is None:
                # Report a miss as a failure instead of returning a placeholder
                # sentence as "content", which downstream code would pass to the
                # LLM as if it were page HTML.
                raise LookupError(f"No element found with selector: {div_selector}")
            content = str(element)
        else:
            # Return full HTML if no selector specified
            content = str(soup)

        return {
            "url": url,
            "selector": div_selector,
            "content": content,
            "status": "success"
        }

    except Exception as e:
        # Deliberately broad: callers rely on getting an error record back
        # rather than having to wrap every scrape in their own try/except.
        return {
            "url": url,
            "selector": div_selector,
            "content": None,
            "status": "error",
            "error": str(e)
        }

# Function to extract prices from HTML using LLM
def extract_prices_with_llm(html_content: str, address: str) -> List[float]:
    """
    Ask the module-level ``llm`` to pull property prices out of an HTML snippet.

    Args:
        html_content: HTML to inspect. Only the first 3000 characters are sent
            to the model. Empty or None input short-circuits to [].
        address: Property address, included in the prompt for context.

    Returns:
        The extracted prices as floats. Returns [] when the input is empty,
        when the model reports no prices, or when the LLM call or parsing fails
        (the error is printed, never raised).
    """
    if not html_content:
        return []

    system_prompt = """You are a property price extraction expert. Your task is to extract property prices from HTML content.
    
    Rules:
    1. Look for price patterns like $XXX,XXX or £XXX,XXX or €XXX,XXX
    2. Extract ONLY property prices, not other prices like fees or taxes
    3. Return prices as a JSON array of numbers (without currency symbols or commas)
    4. If no prices found, return empty array []
    5. Focus on the main property listing price
    """

    # Cap the HTML here, in code. The explanatory comment used to sit inside the
    # f-string and was therefore sent to the model as part of the prompt.
    html_snippet = html_content[:3000]

    user_prompt = f"""Extract property prices for the address: {address}
    
    From the following HTML content:
    {html_snippet}
    
    Return ONLY a JSON array of price numbers, e.g., [450000, 525000]
    """

    # Fallback heuristic only: bare numbers at or below this are assumed not to
    # be property prices (fees, bedroom counts, years, ...).
    min_plausible_price = 10000

    try:
        response = llm.invoke([
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ])

        content = response.content

        # Preferred path: a JSON array of numbers, decimals allowed.
        json_match = re.search(r'\[[\d.,\s]*\]', content)
        if json_match:
            try:
                return [float(price) for price in json.loads(json_match.group())]
            except (ValueError, TypeError):
                # Bracketed but not valid JSON (e.g. "[1,180,000]"); fall through
                # to the looser scan instead of giving up.
                pass

        # Fallback: scan for standalone numbers, accepting thousands separators.
        numbers = re.findall(r'\d+(?:,\d{3})*(?:\.\d+)?', content)
        prices = []
        for num in numbers:
            clean_num = float(num.replace(',', ''))
            if clean_num > min_plausible_price:
                prices.append(clean_num)
        return prices

    except Exception as e:
        print(f"Error extracting prices: {e}")
        return []

# Define the agent nodes

def scrape_websites_node(state: AgentState) -> AgentState:
    """Graph node: scrape every configured website and log the outcome.

    Reads ``state["websites"]`` (dicts with "url" and optional "div_selector")
    and returns a new state in which "websites" holds the scrape_website()
    result for each entry, with one log line per site appended to "messages".
    The incoming state dict and its lists are left unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    messages.append(f"Scraping websites for address: {state['address']}")

    scraped_data = []
    for website in state["websites"]:
        result = scrape_website(website["url"], website.get("div_selector"))
        scraped_data.append(result)

        if result["status"] == "success":
            messages.append(f"Successfully scraped {website['url']}")
        else:
            messages.append(f"Failed to scrape {website['url']}: {result.get('error', 'Unknown error')}")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "websites": scraped_data, "messages": messages}

def extract_prices_node(state: AgentState) -> AgentState:
    """Graph node: run LLM price extraction over every successfully scraped site.

    Sites whose "status" is not "success", or whose "content" is empty, are
    skipped without a log line. Returns a new state whose "extracted_prices"
    is the concatenation of all prices found and whose "messages" has one
    line per processed site. The incoming state is left unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    all_prices = []

    for website in state["websites"]:
        if website["status"] == "success" and website["content"]:
            prices = extract_prices_with_llm(website["content"], state["address"])
            all_prices.extend(prices)

            if prices:
                messages.append(f"Found {len(prices)} price(s) from {website['url']}: {prices}")
            else:
                messages.append(f"No prices found from {website['url']}")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "extracted_prices": all_prices, "messages": messages}

def calculate_total_node(state: AgentState) -> AgentState:
    """Graph node: sum the extracted prices and log the total and the average.

    Returns a new state with "total_price" set to the sum of
    ``state["extracted_prices"]`` (0.0 when the list is empty) and the
    summary lines appended to "messages". The incoming state is left
    unmodified.
    """
    # Copy so the caller's list is not mutated as a side effect.
    messages = list(state.get("messages", []))
    prices = state["extracted_prices"]

    if prices:
        # float() keeps the value consistent with the declared float field.
        total = float(sum(prices))
        messages.append(f"Total property price: ${total:,.2f}")
        messages.append(f"Average price: ${total/len(prices):,.2f}")
    else:
        total = 0.0
        messages.append("No prices were found to calculate total")

    # Return a fresh dict carrying all keys rather than editing the input in place.
    return {**state, "total_price": total, "messages": messages}

# Create the agent workflow
def create_property_price_agent():
    """Build the three-stage price-fetching graph and return it compiled.

    The stages run strictly in sequence: scrape_websites -> extract_prices ->
    calculate_total -> END. State flowing between them is an AgentState.
    """
    graph = StateGraph(AgentState)

    # Stages listed in execution order; the wiring below is derived from it.
    pipeline = [
        ("scrape_websites", scrape_websites_node),
        ("extract_prices", extract_prices_node),
        ("calculate_total", calculate_total_node),
    ]
    for stage_name, stage_fn in pipeline:
        graph.add_node(stage_name, stage_fn)

    stage_names = [stage_name for stage_name, _ in pipeline]
    graph.set_entry_point(stage_names[0])

    # Link every stage to its successor; the final stage links to END.
    for source, target in zip(stage_names, stage_names[1:] + [END]):
        graph.add_edge(source, target)

    return graph.compile()

# Create the agent
# Built once when the cell runs; fetch_property_prices() below reads this
# module-level name each time it is called.
property_price_agent = create_property_price_agent()

# Main function to run the agent
def fetch_property_prices(address: str, websites: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Run the compiled agent for one address and summarise what it found.

    Args:
        address: Property address being looked up.
        websites: One dict per site, each with a 'url' and, optionally, a
            'div_selector' narrowing the page to a single element.

    Returns:
        Dict with keys "address", "total_price", "individual_prices",
        "num_prices_found" and "messages".
    """
    # Seed state for the graph; the nodes fill in the empty fields.
    final_state = property_price_agent.invoke(
        dict(
            address=address,
            websites=websites,
            extracted_prices=[],
            total_price=0.0,
            messages=[],
        )
    )

    found_prices = final_state["extracted_prices"]
    return {
        "address": address,
        "total_price": final_state["total_price"],
        "individual_prices": found_prices,
        "num_prices_found": len(found_prices),
        "messages": final_state["messages"],
    }

# Test that everything is set up correctly
# Both "is not None" checks are True whenever the names exist, so the useful
# signal is simply that execution reached this point without an exception.
print("Property Price Fetcher Agent initialized successfully!")
print(f"Agent created: {property_price_agent is not None}")
print(f"fetch_property_prices function ready: {fetch_property_prices is not None}")



DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.