In [1]:
import json
from pydantic import BaseModel, ValidationError
from typing import Literal
from openai import OpenAI
import dotenv
import os

In [2]:
# Pull the variables defined in the project's .env file into the environment
dotenv.load_dotenv()

# OpenAI credential taken from the environment (None when the variable is unset)
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")

In [3]:
# API client that extract_features uses to send chat-completion requests
client = OpenAI(api_key=OPEN_AI_API_KEY)

In [4]:
# Load the JSON schema that extract_features passes to the API as response format.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default codec.
with open("response.schema.json", "r", encoding="utf-8") as f:
    schema = json.load(f)

In [5]:
# Display the Python type of the loaded schema (the recorded output is `dict`)
type(schema)

dict

In [6]:
# Accepted energy-label values; "Unknown" is the fallback the prompt asks for
EnergyEfficiencyRating = Literal[
    "A+++", "A++", "A+", "A", "B", "C", "D", "E", "F", "Unknown"
]


class ResponseSchema(BaseModel):
    """Structured result of the feature extraction for one listing.

    Field names and their order mirror the feature list in the
    ``extract_features`` prompt. No field declares a default, so a payload
    that omits any of them fails validation.
    """

    # Amenities and equipment
    hasSwimmingPool: bool
    hasBalcony: bool
    hasGarage: bool
    hasParking: bool
    hasAirConditioning: bool
    hasClotheslineSpace: bool
    hasBuiltInWardrobes: bool
    hasDishwasher: bool
    hasWashingMachine: bool
    hasDryer: bool
    hasLift: bool

    # Furnishing, house rules and accessibility
    isFurnished: bool
    isPetFriendly: bool
    isSmokingAllowed: bool
    isWheelchairAccessible: bool

    # Garden, fireplace, alarm and views
    hasGarden: bool
    hasFireplace: bool
    hasAlarmSystem: bool
    hasCityViews: bool
    hasWaterViews: bool

    # Positioning and contract type
    isLuxury: bool
    isTemporaryContract: bool
    isLongTermContract: bool
    isStudentFriendly: bool

    # Energy label
    energyEfficiencyRating: EnergyEfficiencyRating

    # Kitchen and household appliances
    hasHomeAppliances: bool
    hasOven: bool
    hasMicrowave: bool
    hasRefrigerator: bool
    hasStove: bool

    # Extra spaces, included services and building facilities
    hasTerrace: bool
    hasStorageRoom: bool
    hasInternetIncluded: bool
    hasCableTVIncluded: bool
    hasSmartHomeFeatures: bool
    hasCCTV: bool
    hasGym: bool

    # Condition and finish
    isNewlyRenovated: bool
    hasModernFinish: bool

In [7]:
def extract_features(listing_text: str, *, model: str = "gpt-4o", temperature: float = 0.0):
    """Ask the chat model to fill in the listing feature schema for one listing.

    Uses the module-level ``client`` to send the request and the module-level
    ``schema`` as the ``json_schema`` response format.

    Args:
        listing_text: Free-text description of the property.
        model: Chat model to query; defaults to "gpt-4o".
        temperature: Sampling temperature. Defaults to 0.0 because this is an
            extraction task: the same listing should produce the same flags on
            every run, which sampling at a high temperature does not encourage.

    Returns:
        The raw chat-completion response object. The JSON payload is read by
        the caller from ``response.choices[0].message.content``.

    Raises:
        ValueError: If ``listing_text`` is None, empty or whitespace only.
    """
    # Reject blank input up front: sending it would spend an API call only to
    # get back a record where every feature is the "not mentioned" fallback.
    if listing_text is None or not str(listing_text).strip():
        raise ValueError("listing_text must be a non-empty string")

    system_msg = {
        "role": "system",
        "content": (
            "You are an assistant that extracts real estate listing features from text. "
            "Read the provided text and output a JSON object that exactly follows the schema. "
            "Do not include any extra explanation or text."
        ),
    }

    # The prompt text (including its indentation) is kept exactly as before so
    # the request sent to the model is unchanged.
    user_msg = {
        "role": "user",
        "content": f"""
        Given the following real estate listing text, please determine the values for each of these features. 
        If a feature is not clearly mentioned, assume it is false for booleans or "Unknown" for energyEfficiencyRating.

        - Does it mention a pool? (hasSwimmingPool)
        - Does it mention a balcony? (hasBalcony)
        - Does it mention a garage? (hasGarage)
        - Does it mention a parking space? (hasParking)
        - Does it mention air conditioning? (hasAirConditioning)
        - Does it mention a clothesline space? (hasClotheslineSpace)
        - Does it mention built-in wardrobes? (hasBuiltInWardrobes)
        - Does it mention a dishwasher? (hasDishwasher)
        - Does it mention a washing machine? (hasWashingMachine)
        - Does it mention a dryer? (hasDryer)
        - Does it mention an elevator/lift? (hasLift)
        - Is the property furnished? (isFurnished)
        - Does it allow pets? (isPetFriendly)
        - Does it allow smoking? (isSmokingAllowed)
        - Is it wheelchair accessible? (isWheelchairAccessible)
        - Does it mention a garden? (hasGarden)
        - Does it mention a fireplace? (hasFireplace)
        - Does it mention an alarm system? (hasAlarmSystem)
        - Does it have city views? (hasCityViews)
        - Does it have water views? (hasWaterViews)
        - Is it a luxury listing? (isLuxury)
        - Is it a temporary (fixed) contract? (isTemporaryContract)
        - Is it a long-term contract? (isLongTermContract)
        - Is it student-friendly? (isStudentFriendly)
        - What is the energy efficiency rating? (energyEfficiencyRating)
        - Does it mention home appliances in general? (hasHomeAppliances)
        - Does it mention an oven? (hasOven)
        - Does it mention a microwave? (hasMicrowave)
        - Does it mention a refrigerator? (hasRefrigerator)
        - Does it mention a stove? (hasStove)
        - Does it mention a terrace? (hasTerrace)
        - Does it mention a storage room? (hasStorageRoom)
        - Is internet included? (hasInternetIncluded)
        - Is cable TV included? (hasCableTVIncluded)
        - Does it have smart home features? (hasSmartHomeFeatures)
        - Does it mention CCTV? (hasCCTV)
        - Does it have a gym? (hasGym)
        - Is it newly renovated? (isNewlyRenovated)
        - Does it have modern finishes? (hasModernFinish)

        Text:
        \"{listing_text}\"
        """,
    }

    response = client.chat.completions.create(
        model=model,
        messages=[system_msg, user_msg],
        response_format={"type": "json_schema", "json_schema": schema},
        temperature=temperature,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response

In [8]:
# Example usage: a Portuguese-language listing description that is passed to
# extract_features as sample input.
example_description = "Encantador Penthouse no coração do Chiado, recentemente renovado e com uma área bruta de 162 m², este apartamento oferece um espaço amplo e moderno, ideal para quem procura conforto e sofisticação.  A sala de 45m² com cozinha aberta de 16m² cria um ambiente acolhedor e perfeito para receber amigos e familiares.  O terraço de 12m² é um verdadeiro refúgio, proporcionando momentos de relaxamento com uma vista deslumbrante.  Com quatro quartos, dois deles em mezzanine, e duas casas de banho completas (uma com duche e outra com banheira), este imóvel é perfeito para famílias ou para quem precisa de espaço extra. Localizado no 4º andar de um edifício com elevador, o apartamento beneficia de uma excelente orientação solar (Sul e Oeste), garantindo muita luz natural durante todo o dia. Equipado com ar condicionado e aquecimento elétrico, o conforto está assegurado em todas as estações do ano. Além disso, há a possibilidade de arrendamento a curto e médio prazo: Outubro-Maio: 8.500€/mês Junho e Setembro: 10.000€/mês Julho e Agosto: 12.000€/mês."

In [9]:
# Run the extraction on the sample listing; this sends a chat-completion request
response = extract_features(example_description)

In [11]:
# Text of the first returned choice; it is validated as JSON further down
first_choice = response.choices[0]
content = first_choice.message.content

In [13]:
# Validate the model's JSON reply against ResponseSchema.
# `solution` is reset first so that a failed validation leaves it as None
# rather than unbound or holding a stale value from an earlier run of this cell.
solution = None
try:
    # Parse and validate the response content
    solution = ResponseSchema.model_validate_json(content)
except ValidationError as e:
    # Report the structured error details instead of raising
    print(e.json())

In [14]:
# Decode the raw reply text into a plain Python object for display.
# NOTE(review): this parses `content` directly, independently of the
# ResponseSchema validation result — confirm that is intended.
data = json.loads(content)

In [19]:
import pandas as pd

# Show the extracted features as a one-row table (row label 0, one column per
# feature). Building the frame straight from the dict avoids the
# dict -> JSON string -> read_json round trip; passing a literal JSON string to
# read_json is deprecated in recent pandas releases.
pd.DataFrame([data])

  pd.read_json(json.dumps(data), orient="index").T


Unnamed: 0,hasSwimmingPool,hasBalcony,hasGarage,hasParking,hasAirConditioning,hasClotheslineSpace,hasBuiltInWardrobes,hasDishwasher,hasWashingMachine,hasDryer,...,hasStove,hasTerrace,hasStorageRoom,hasInternetIncluded,hasCableTVIncluded,hasSmartHomeFeatures,hasCCTV,hasGym,isNewlyRenovated,hasModernFinish
0,False,False,False,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,True
