In [None]:
from pydantic import BaseModel, ValidationError
from typing import Literal
from openai import OpenAI
import dotenv
import sys
import os
import pandas as pd
import time

import json
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Show every column when a DataFrame is rendered in the notebook.
# `pd.set_option` is the public entry point; `pd.pandas` only resolved because
# the pandas package happens to expose itself under its own name.
pd.set_option("display.max_columns", None)
# Show every row when a DataFrame is rendered in the notebook
pd.set_option("display.max_rows", None)

In [None]:
# Read the local .env file so its entries become environment variables
dotenv.load_dotenv()

# OpenAI API key taken from the environment (None when the variable is not set)
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")

In [None]:
class ResponseSchema(BaseModel):
    # Pydantic model describing the JSON object the extractor expects back from
    # the model for one listing. OpenAITextFeatureExtractor.extract_features
    # validates the raw response text against it with `model_validate_json`.
    # Every field is required (none declares a default), and the field names
    # are the same identifiers listed in parentheses in the extraction prompt.

    # --- Amenities and equipment mentioned in the listing ---
    hasSwimmingPool: bool
    hasBalcony: bool
    hasGarage: bool
    hasParking: bool
    hasAirConditioning: bool
    hasClotheslineSpace: bool
    hasBuiltInWardrobes: bool
    hasDishwasher: bool
    hasWashingMachine: bool
    hasDryer: bool
    hasLift: bool
    # --- Tenancy conditions and accessibility ---
    isFurnished: bool
    isPetFriendly: bool
    isSmokingAllowed: bool
    isWheelchairAccessible: bool
    # --- Further property characteristics ---
    hasGarden: bool
    hasFireplace: bool
    hasAlarmSystem: bool
    hasCityViews: bool
    hasWaterViews: bool
    isLuxury: bool
    # --- Contract type and target tenants ---
    isTemporaryContract: bool
    isLongTermContract: bool
    isStudentFriendly: bool
    # Only the labels below are accepted; the prompt asks the model to answer
    # "Unknown" when the listing does not state a rating.
    energyEfficiencyRating: Literal[
        "A+++",
        "A++",
        "A+",
        "A",
        "B",
        "C",
        "D",
        "E",
        "F",
        "Unknown",
    ]
    # --- Kitchen and household appliances ---
    hasHomeAppliances: bool
    hasOven: bool
    hasMicrowave: bool
    hasRefrigerator: bool
    hasStove: bool
    # --- Extra spaces, included services and building facilities ---
    hasTerrace: bool
    hasStorageRoom: bool
    hasInternetIncluded: bool
    hasCableTVIncluded: bool
    hasSmartHomeFeatures: bool
    hasCCTV: bool
    hasGym: bool
    # --- Condition and finish ---
    isNewlyRenovated: bool
    hasModernFinish: bool


class OpenAITextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns listing text into feature columns.

    The text of each row is sent to the OpenAI chat completions API together
    with a JSON schema loaded from ``schema_path``. The JSON object returned by
    the model is appended to the input frame as additional columns.

    Parameters
    ----------
    api_key : str
        Key handed to the ``OpenAI`` client.
    schema_path : str, default "response.schema.json"
        Path of the JSON file passed as ``json_schema`` in ``response_format``.
    retry_attempts : int, default 3
        Maximum number of API attempts per listing.
    delay_seconds : int or float, default 2
        Pause between two failed attempts.
    model : str, default "gpt-4o-mini"
        Chat model name sent with every request.
    text_column : str, default "description"
        Column of the input frame that holds the listing text.
    """

    def __init__(
        self,
        api_key,
        schema_path="response.schema.json",
        retry_attempts=3,
        delay_seconds=2,
        model="gpt-4o-mini",
        text_column="description",
    ):
        self.api_key = api_key
        self.client = OpenAI(api_key=self.api_key)
        self.schema_path = schema_path
        self.retry_attempts = retry_attempts
        self.delay_seconds = delay_seconds
        self.model = model
        self.text_column = text_column

        # Explicit encoding so the schema file is read the same way on every platform.
        with open(self.schema_path, "r", encoding="utf-8") as f:
            self.schema = json.load(f)

    @staticmethod
    def _is_missing_text(value):
        """Return True for blank strings and scalar missing values (None/NaN/NA)."""
        if isinstance(value, str):
            return not value.strip()
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def extract_features(self, listing_text):
        """Ask the model for the schema features of a single listing.

        Parameters
        ----------
        listing_text : str
            Free-text description of the listing.

        Returns
        -------
        dict
            The JSON object returned by the model, parsed with ``json.loads``.
            An empty dict is returned when the text is missing or blank, when
            ``retry_attempts`` is not positive, or when every attempt failed.
        """
        # A missing description would otherwise be formatted into the prompt as
        # "nan"/"None" and still cost a request; skip it instead.
        if self._is_missing_text(listing_text):
            return {}

        # The messages stay available as instance attributes, as before, so the
        # last prompt can be inspected after a call.
        self.system_message = {
            "role": "system",
            "content": (
                "You are an assistant that extracts real estate listing features from text. "
                "Read the provided text and output a JSON object that exactly follows the schema. "
                "Do not include any extra explanation or text."
            ),
        }

        self.user_message = {
            "role": "user",
            "content": f"""
            Given the following real estate listing text, please determine the values for each of these features. 
            If a feature is not clearly mentioned, assume it is false for booleans or "Unknown" for energyEfficiencyRating.

            - Does it mention a pool? (hasSwimmingPool)
            - Does it mention a balcony? (hasBalcony)
            - Does it mention a garage? (hasGarage)
            - Does it mention a parking space? (hasParking)
            - Does it mention air conditioning? (hasAirConditioning)
            - Does it mention a clothesline space? (hasClotheslineSpace)
            - Does it mention built-in wardrobes? (hasBuiltInWardrobes)
            - Does it mention a dishwasher? (hasDishwasher)
            - Does it mention a washing machine? (hasWashingMachine)
            - Does it mention a dryer? (hasDryer)
            - Does it mention an elevator/lift? (hasLift)
            - Is the property furnished? (isFurnished)
            - Does it allow pets? (isPetFriendly)
            - Does it allow smoking? (isSmokingAllowed)
            - Is it wheelchair accessible? (isWheelchairAccessible)
            - Does it mention a garden? (hasGarden)
            - Does it mention a fireplace? (hasFireplace)
            - Does it mention an alarm system? (hasAlarmSystem)
            - Does it have city views? (hasCityViews)
            - Does it have water views? (hasWaterViews)
            - Is it a luxury listing? (isLuxury)
            - Is it a temporary (fixed) contract? (isTemporaryContract)
            - Is it a long-term contract? (isLongTermContract)
            - Is it student-friendly? (isStudentFriendly)
            - What is the energy efficiency rating? (energyEfficiencyRating)
            - Does it mention home appliances in general? (hasHomeAppliances)
            - Does it mention an oven? (hasOven)
            - Does it mention a microwave? (hasMicrowave)
            - Does it mention a refrigerator? (hasRefrigerator)
            - Does it mention a stove? (hasStove)
            - Does it mention a terrace? (hasTerrace)
            - Does it mention a storage room? (hasStorageRoom)
            - Is internet included? (hasInternetIncluded)
            - Is cable TV included? (hasCableTVIncluded)
            - Does it have smart home features? (hasSmartHomeFeatures)
            - Does it mention CCTV? (hasCCTV)
            - Does it have a gym? (hasGym)
            - Is it newly renovated? (isNewlyRenovated)
            - Does it have modern finishes? (hasModernFinish)

            Text:
            \"{listing_text}\"
            """,
        }

        for attempt in range(self.retry_attempts):
            try:
                # NOTE(review): temperature=1 keeps sampling non-deterministic;
                # consider a lower value if repeatable extraction matters.
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[self.system_message, self.user_message],
                    response_format={"type": "json_schema", "json_schema": self.schema},
                    temperature=1,
                    max_completion_tokens=2048,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                )

                content = response.choices[0].message.content

                # Validation is advisory: a mismatch is reported but the parsed
                # response is still returned to the caller.
                try:
                    ResponseSchema.model_validate_json(content)
                except ValidationError as e:
                    print(e.json())

                return json.loads(content)
            except Exception as e:
                # Covers API errors as well as unparsable content
                # (json.JSONDecodeError is an Exception subclass).
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < self.retry_attempts - 1:
                    time.sleep(self.delay_seconds)

        # Reached when all attempts failed or when retry_attempts <= 0, so the
        # caller always receives a dict and never None.
        return {}

    def fit(self, X, y=None):
        """No-op required by the transformer interface; returns the instance."""
        return self

    def transform(self, X):
        """Append the extracted features of every row to ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the ``text_column`` column.

        Returns
        -------
        pandas.DataFrame
            ``X`` with the extracted feature columns concatenated on the right.
            Rows whose extraction failed or was skipped hold missing values in
            those columns.

        Raises
        ------
        KeyError
            If ``X`` has no ``text_column`` column. Failing once up front
            replaces one swallowed KeyError per row and a silently unchanged
            result.
        """
        if self.text_column not in X.columns:
            raise KeyError(f"Column {self.text_column!r} not found in input DataFrame")

        results = []
        # Only the text column is needed, so iterate over it directly instead
        # of materialising a Series for every row with iterrows().
        for index, listing_text in X[self.text_column].items():
            try:
                results.append(self.extract_features(listing_text))
            except Exception as e:
                print(f"Skipping row {index} due to error: {e}")
                results.append({})
        extracted_features = pd.DataFrame(results, index=X.index)
        return pd.concat([X, extracted_features], axis=1)

In [None]:
# Absolute path of the parent of the current working directory
# (assumed to be the project root that contains the `scripts` package — TODO confirm)
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make the `scripts` package importable. The membership check keeps re-runs of
# this cell from appending the same entry to sys.path again.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Imported here rather than in the top import cell because it can only resolve
# after sys.path has been extended above.
from scripts.preprocessors import IdealistaDataLoader

data_loader = IdealistaDataLoader(
    read_path="../data/idealista",
    city="lisbon",
    operation="rent",
    date_or_unioned="unioned",
    include_geodata=True,
    index_col="propertyCode",
)

full_df = data_loader.load_data()

In [None]:
# Two-row sample (positional rows 679 and 680) used to try the extractor out
X = full_df.iloc[679:681]

In [None]:
# Display the sample rows before any features are extracted
X

In [None]:
# Build the extractor with the API key read from the environment.
# fit() only returns the instance, so nothing is learned from X here.
open_ai_text_feature_extractor = OpenAITextFeatureExtractor(api_key=OPEN_AI_API_KEY)
open_ai_text_feature_extractor.fit(X)

In [None]:
# Run the extractor on the sample rows (at least one API request per row) and
# append the returned features as new columns.
# NOTE(review): X is overwritten with its own transformed version, so re-running
# this cell feeds the already-enriched frame back into transform().
X = open_ai_text_feature_extractor.transform(X)

In [None]:
# Display the sample rows again, now including the extracted feature columns
X

In [None]:
# Put the extracted `hasParking` flag next to `hasParkingSpace` for comparison.
# NOTE(review): `hasParkingSpace` is not a ResponseSchema field — presumably it
# comes from the loaded listing data or the JSON schema file; confirm it exists.
X[["hasParking", "hasParkingSpace"]]

In [None]:
# Create a fresh extractor instance for the full dataset.
# fit() again only returns the instance.
open_ai_text_feature_extractor = OpenAITextFeatureExtractor(api_key=OPEN_AI_API_KEY)
open_ai_text_feature_extractor.fit(full_df)

In [None]:
# Extract features for every listing in full_df. transform() issues at least one
# API request per row, so expect this cell to take a while on the full dataset.
# NOTE(review): full_df is overwritten with its transformed version, so re-running
# this cell repeats all requests on the already-enriched frame.
full_df = open_ai_text_feature_extractor.transform(full_df)

In [None]:
# Preview the first rows of the enriched dataset
full_df.head()

In [None]:
# (rows, columns) of the enriched dataset
full_df.shape

In [None]:
# Persist the enriched listings. The target folder is created first so the
# export cannot fail on a missing directory after the long extraction run.
output_path = "../data/idealista/cleaned/rent/lisbon/unioned-lisbon-listings-for-rent-with-geodata-openai.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
full_df.to_csv(output_path)