In [None]:
import os
import re
import json
from pathlib import Path
import pandas as pd
import datetime as dt
from dataclasses import dataclass
from tqdm import tqdm
from labnotes.scraping_tools.utils import *
from labnotes.scraping_tools.llm.openai_utils import query_llm_sync

In [2]:
def clean_alphanumeric(text):
    """Return ``text`` with everything except ASCII letters and digits removed."""
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub('', text)

def is_dir_empty(path):
    """Return True when the directory at ``path`` contains no entries."""
    for _entry in Path(path).iterdir():
        # A single entry is enough to know the directory is not empty.
        return False
    return True


# Directory that receives the exported CSV files. It defaults to the original
# location, but can be redirected with the AUCTION_DATA_PATH environment
# variable so the notebook also runs on machines where that user-specific
# absolute path does not exist.
DATA_PATH = Path(os.environ.get("AUCTION_DATA_PATH", "/Users/giovannidoni/giovanni-data"))
# Create the directory (and any missing parents) up front; no-op if it exists.
DATA_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
@dataclass
class AuctionPage:
    """Result of scraping one search-results page (built by ``scrape_links``)."""
    # Link entries kept by scrape_links: items from get_links whose first element
    # is truthy and contains '/aste/' and whose second element is longer than 50.
    listings: list
    # Pagination entries as returned by find_page_links; get_data reads
    # next_page["url"] from each, so presumably dicts with a "url" key — TODO confirm.
    next_pages: list

@dataclass
class AuctionDataPage:
    listings_url: list
    description: list
    perizia_url: str | None
    data: dict
    raw_text: str


def scrape_links(url, driver):
    """Collect listing links and pagination links for one results page.

    Args:
        url: Address of the results page, forwarded to ``get_links``.
        driver: Browser driver shared by all helper calls.

    Returns:
        AuctionPage holding the filtered link entries and whatever
        ``find_page_links`` reports for the same driver.
    """
    # NOTE(review): the wait runs before get_links is handed the URL; whether
    # get_links navigates/waits on its own is not visible here — confirm.
    wait_for_page_content(driver)

    kept = []
    for entry in get_links(url, driver=driver):
        href = entry[0]
        if not href:
            continue
        # Keep entries with a long second element whose link targets an auction.
        if len(entry[1]) > 50 and '/aste/' in href:
            kept.append(entry)

    return AuctionPage(listings=kept, next_pages=find_page_links(driver))

In [4]:
# Province slug interpolated into the search URL and into the output CSV filename.
provincia = "treviso"
# provincia = "belluno"

In [5]:
# Results page for the selected province; used as the starting point by get_data(url).
url = f"https://www.asteannunci.it/aste-immobiliari/case/{provincia}"
# NOTE(review): this non-headless driver is only referenced by the commented-out
# exploratory cell; get_data() opens its own driver via get_driver(headless=True),
# and no quit() for this one is visible in the notebook — confirm it is still needed.
driver = uc_make_driver(headless=False)

2025-12-05 18:59:45 - INFO - patching driver executable /Users/giovannidoni/Library/Application Support/undetected_chromedriver/undetected_chromedriver


In [6]:
# results = scrape_links(url, driver)
# listings = results.listings

In [None]:
def enrich_listing(listings, driver=None):
    """Build an ``AuctionDataPage`` for every scraped listing.

    For each listing, the detail page is queried for links matching the
    'perizia' filter, its cleaned text is fetched, and that text is passed to
    ``extract_property_data``.

    Args:
        listings: Iterable of indexable entries; index 0 is used as the detail
            page URL and index 1 is stored as the description.
        driver: Browser driver forwarded to the scraping helpers.

    Returns:
        List of ``AuctionDataPage`` objects, one per input listing, in order.
    """
    def _enrich(entry):
        # Link lookup happens before the text fetch, on the same URL.
        perizia_links = get_links(entry[0], driver=driver, apply_filter='perizia')
        text = get_clean_page_text(driver, entry[0])
        return AuctionDataPage(
            entry[0],
            entry[1],
            # First element of the first matching link, when any link was found.
            perizia_links[0][0] if perizia_links else None,
            extract_property_data(text),
            text,
        )

    return [_enrich(entry) for entry in listings]

def get_data(base_url):
    """Scrape every results page reachable from ``base_url`` and enrich each listing.

    The first page is scraped to obtain its listings and its pagination links;
    each pagination link is then scraped in turn, and all collected listings
    are passed to ``enrich_listing`` while the browser is still open.

    Args:
        base_url: URL of the first results page.

    Returns:
        The list returned by ``enrich_listing`` for all collected listings.

    Note:
        Pagination entries are read as ``next_page["url"]``, so they are
        assumed to be mappings with a "url" key — confirm against
        ``find_page_links``.
    """
    driver = get_driver(headless=True)
    try:
        first_page = scrape_links(base_url, driver)
        # Copy so that extending the list does not mutate the first page's result.
        listings = list(first_page.listings)
        # Keep the first page's pagination in its own name: the progress
        # denominator must not change as further pages are scraped.
        page_links = first_page.next_pages
        print(f"Found {len(page_links)} pages of results on first page.")

        # enumerate() supplies the counter; iterating the entries directly would
        # try to unpack each entry into (i, next_page) instead of counting.
        for i, next_page in enumerate(page_links):
            print(f"Scraping page {i+1}/{len(page_links)}")
            listings += scrape_links(next_page["url"], driver).listings

        # Enrichment still needs the live browser, so it runs before quitting.
        return enrich_listing(listings, driver)
    finally:
        # Always release the browser, even when scraping or enrichment fails.
        driver.quit()


# Prompt template for extract_property_data; {property_text} is filled with the
# cleaned page text. Price is requested as a bare number because PROPERTY_SCHEMA
# types it as a number and the export later sorts on it.
PROPERTY_EXTRACTION_PROMPT = """You are an expert real estate data extractor. 

Extract structured information from the following property listing text.

Extract the following fields:
- title: The property title/heading
- property_type: Type of property (apartment, house, villa, commercial, land, etc.)
- location: Full address or location (city, region, street if available)
- comune: Municipality name, if available
- price: Current price or starting bid as a plain number (no currency symbol or thousands separators)
- original_price: Original/appraised price if mentioned
- area_sqm: Property area in square meters
- condition: Property condition (good, needs renovation, new, etc.)
- occupancy_status: Occupied, vacant, debtor occupied, etc.
- auction_date: Auction date if mentioned
- auction_type: Type of auction (judicial, administrative, etc.)
- description: Brief description of the property
- features: List of notable features (balcony, parking, garden, etc.)
- cadastral_data: Cadastral information if available
- lot_number: Auction lot number if applicable

If a field is not found or not applicable, use null.

Translate all information to Italian.

Property listing text:
{property_text}
"""

# JSON schema passed as the structured-output format in extract_property_data.
# Every field listed in PROPERTY_EXTRACTION_PROMPT must appear here: with
# "additionalProperties": False, a field the prompt asks for but the schema
# omits (previously original_price and occupancy_status) has no valid place in
# the response.
PROPERTY_SCHEMA = {
    "type": "object",
    "properties": {
        "title": {"type": ["string", "null"]},
        "property_type": {"type": ["string", "null"]},
        "location": {"type": ["string", "null"]},
        "comune": {"type": ["string", "null"]},
        "price": {"type": ["number", "null"]},
        "original_price": {"type": ["number", "null"]},
        "area_sqm": {"type": ["number", "null"]},
        "condition": {"type": ["string", "null"]},
        "occupancy_status": {"type": ["string", "null"]},
        "auction_date": {"type": ["string", "null"]},
        "auction_type": {"type": ["string", "null"]},
        "description": {"type": ["string", "null"]},
        "features": {
            "type": "array",
            "items": {"type": ["string", "null"]}
        },
        "cadastral_data": {"type": ["string", "null"]},
        "lot_number": {"type": ["string", "null"]}
    },
    "required": ["title", "property_type", "location", "comune", "price"],
    "additionalProperties": False
}

def extract_property_data(page_text, model="gpt-4o-mini"):
    """Ask the LLM to turn a listing's page text into structured property data.

    Args:
        page_text: Text of the listing page; substituted into
            ``PROPERTY_EXTRACTION_PROMPT``.
        model: Model name forwarded to ``query_llm_sync``.

    Returns:
        Whatever ``query_llm_sync`` returns for the request.
    """
    system_message = {
        "role": "system",
        "content": "You are a real estate data extraction expert. Always respond with valid JSON.",
    }
    user_message = {
        "role": "user",
        "content": PROPERTY_EXTRACTION_PROMPT.format(property_text=page_text),
    }

    # Structured-output request constrained by PROPERTY_SCHEMA (non-strict mode).
    json_schema_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "property_extraction",
            "strict": False,
            "schema": PROPERTY_SCHEMA,
        },
    }

    return query_llm_sync(
        model=model,
        messages=[system_message, user_message],
        temperature=1,
        max_tokens=5000,
        response_format=json_schema_format,
    )

def extract_df(listings):
    """Flatten enriched listings into a single DataFrame.

    The ``description`` and ``raw_text`` columns are discarded, and the
    per-listing ``data`` mapping is expanded into its own columns, placed
    after the remaining listing columns.

    Args:
        listings: Records accepted by ``pd.DataFrame`` that provide
            ``description``, ``raw_text`` and ``data`` columns.

    Returns:
        DataFrame with the remaining listing columns followed by the
        normalized ``data`` fields.
    """
    frame = pd.DataFrame(listings).drop(columns=["description", "raw_text"])
    # Detach the nested payload so only the flat listing columns remain in `frame`.
    payload = frame.pop("data")
    return pd.concat([frame, pd.json_normalize(payload)], axis=1)

In [None]:
# Run the full scrape starting from `url`. Enrichment calls extract_property_data
# for every listing, i.e. one LLM request per listing (see the logged API calls below).
all_listings = get_data(url)

2025-12-05 18:59:48 - INFO - Get LATEST chromedriver version for google-chrome
2025-12-05 18:59:48 - INFO - Get LATEST chromedriver version for google-chrome
2025-12-05 18:59:48 - INFO - Driver [/Users/giovannidoni/.wdm/drivers/chromedriver/mac64/143.0.7499.40/chromedriver-mac-arm64/chromedriver] found in cache
Found 5 pages of results on first page.
2025-12-05 19:00:25 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-05 19:00:25 - INFO - Usage: {'completion_tokens': 309, 'prompt_tokens': 2425, 'total_tokens': 2734, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}
2025-12-05 19:00:25 - INFO - Query cost: $0.000549
2025-12-05 19:00:32 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-05 19:00:32 - INFO - Usage: {'completion_tokens': 285, 'p

In [None]:
# Flatten the enriched listings into one table, expanding the extracted `data` fields.
df = extract_df(all_listings)

In [None]:
# Export cheapest-first. DataFrame.sort_values has no `descending` keyword
# (passing it raises TypeError); ascending=True expresses the intended order.
df.sort_values(by="price", ascending=True).to_csv(
    DATA_PATH / f"auction_listings_{provincia}_{str(dt.datetime.today().date())}.csv",
    index=False,
)