In [1]:
import pandas as pd
import ast
import datetime

In [2]:
def is_valid_date_format(date_string):
    """Tell whether ``date_string`` is a calendar date written as ``YYYY-MM-DD``.

    Args:
        date_string: Value to check. Anything that is not a string (``None``,
            numbers, ...) is treated as invalid rather than raising.

    Returns:
        bool: True when ``datetime.strptime`` accepts the value with the
        ``%Y-%m-%d`` format, False otherwise.
    """
    try:
        datetime.datetime.strptime(date_string, "%Y-%m-%d")
    except (TypeError, ValueError):
        # ValueError: a string that is not a real date in this format.
        # TypeError: a non-string argument, which strptime refuses outright.
        return False
    return True

In [3]:
# Run configuration: together these three values select the input CSV and name
# every CSV written by this notebook.
city = "lisbon"
operation = "rent"
# Either the literal "unioned" or a single snapshot date written as YYYY-MM-DD.
date_or_unioned = "unioned"

In [4]:
# Fail fast on a bad configuration value before any file path is built from it.
if date_or_unioned != "unioned" and not is_valid_date_format(date_or_unioned):
    raise ValueError(
        "date_or_unioned must be 'unioned' or a valid date in the format 'YYYY-MM-DD'"
    )

In [5]:
# Base file name (no extension) shared by the input CSV and by every CSV this notebook writes.
file_name = f"{date_or_unioned}-{city}-listings-for-{operation}"

In [6]:
# Unioned data and single-day raw snapshots live in sibling folders that share
# the same <operation>/<city>/<file_name>.csv layout, so only the folder differs.
read_path = "../data/idealista"
source_folder = "unioned" if date_or_unioned == "unioned" else "raw"

df = pd.read_csv(
    f"{read_path}/{source_folder}/{operation}/{city}/{file_name}.csv",
    index_col="propertyCode",
)

In [None]:
# Preview the first rows of the freshly loaded listings.
df.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Column names available before any column is dropped.
df.columns

In [12]:
# Media flags, references and free-text fields that no later cell of this notebook reads.
unused_columns = [
    'thumbnail',
    'numPhotos',
    'operation',
    'hasVideo',
    'hasPlan',
    'has3DTour',
    'has360',
    'hasStaging',
    'externalReference',
    'detailedType',
    'suggestedTexts',
]
df = df.drop(columns=unused_columns)

In [None]:
# Confirm which columns remain after the drop.
df.columns

In [14]:
# Remove rows that are exact duplicates across all columns; the propertyCode
# index is not part of the comparison.
df = df.drop_duplicates()

In [None]:
# Re-inspect dtypes and non-null counts after removing columns and duplicate rows.
df.info()

In [None]:
# Preview the de-duplicated frame.
df.head()

In [None]:
# Distinct raw values of `floor`, to see which non-numeric labels need mapping.
df.floor.unique()

In [None]:
# Rows whose floor is the label 'en', shown for manual inspection.
df[df.floor == 'en']

In [19]:
# Convert `floor` to a float column.
# The textual labels are translated by exact match: 'bj' becomes 0 and
# 'st' / 'ss' / 'en' become -1 (presumably ground floor, basement,
# semi-basement and mezzanine in Idealista's vocabulary - confirm).
# Numeric strings are parsed as they are, so an already negative floor such as
# '-1' keeps its sign; the previous blanket removal of '-' turned it into 1.
# Working on whole values instead of substrings also makes the cell safe to
# re-run once the column is already float.
floor_label_values = {'bj': 0.0, 'st': -1.0, 'ss': -1.0, 'en': -1.0}
floor_translated = df['floor'].map(
    lambda raw_floor: floor_label_values.get(raw_floor, raw_floor)
)
# pd.to_numeric raises on any label that is not mapped above, so an unexpected
# value is reported instead of being silently mis-parsed.
df['floor'] = pd.to_numeric(floor_translated).astype(float)

In [None]:
# Summary statistics of `floor` after its conversion to float.
df.floor.describe()

In [None]:
# Distinct raw `priceInfo` values; these strings are parsed with ast.literal_eval in the next step.
df.priceInfo.unique()

In [22]:
# Function to parse the price and coalesce the formerPrice
def extract_former_price(price_str):
    """Return the price a listing had before any recorded price drop.

    Args:
        price_str: The ``priceInfo`` cell, normally the string form of a dict
            shaped like ``{'price': {'amount': ..., 'priceDropInfo':
            {'formerPrice': ...}}}``. An already-parsed dict is accepted too.

    Returns:
        The ``formerPrice`` when one is recorded, otherwise the current
        ``amount``. ``None`` when the cell is missing (e.g. NaN) or carries
        neither value.

    Raises:
        ValueError, SyntaxError: If ``price_str`` is a string that is not a
            valid Python literal (propagated from ``ast.literal_eval``).
    """
    if isinstance(price_str, dict):
        price_dict = price_str
    elif isinstance(price_str, str):
        # literal_eval only evaluates literals, so the text cannot run code.
        price_dict = ast.literal_eval(price_str)
    else:
        # Missing cell (pandas passes NaN as a float): nothing to extract.
        return None

    # `or {}` also covers keys that are present but explicitly null, which a
    # plain .get(key, {}) default would let through as None.
    price_info = price_dict.get('price') or {}
    price_drop_info = price_info.get('priceDropInfo') or {}

    former_price = price_drop_info.get('formerPrice')  # Former price, if available
    if former_price is not None:
        return former_price
    # No drop recorded: the former price is simply the current one.
    return price_info.get('amount')

# Derive the pre-drop price and the size of the drop, then discard the raw priceInfo string.
former_prices = df['priceInfo'].apply(extract_former_price)
df['formerPrice'] = former_prices
df['priceChange'] = former_prices - df['price']
df = df.drop(columns='priceInfo')

In [None]:
# Show the derived formerPrice column.
df['formerPrice']

In [None]:
# Preview the frame with formerPrice / priceChange added and priceInfo removed.
df.head()

In [None]:
# Distinct raw `parkingSpace` values before they are expanded into separate columns.
df.parkingSpace.unique()

In [26]:
# Expand the stringified parkingSpace dict into three flat columns. Listings
# without parking data are parsed as an empty dict and so receive the defaults.
parking_details = df['parkingSpace'].fillna('{}').apply(ast.literal_eval)

df['hasParkingSpace'] = parking_details.apply(
    lambda details: details.get('hasParkingSpace', False)
)
df['isParkingSpaceIncludedInPrice'] = parking_details.apply(
    lambda details: details.get('isParkingSpaceIncludedInPrice', False)
)
df['parkingSpacePrice'] = parking_details.apply(
    lambda details: details.get('parkingSpacePrice', 0)
)

# The raw column is no longer needed once its content has been flattened.
df = df.drop(columns=['parkingSpace'])

In [None]:
# Preview the frame with the parking columns expanded.
df.head()

In [None]:
# Sample of rows where hasLift is missing.
df[df.hasLift.isnull()].head()

In [None]:
# Current column list after the price and parking columns were derived.
df.columns

In [30]:
# Keep only listings with a strictly positive price. The explicit copy makes
# df an independent frame, so the column assignments in the following cells
# do not write into a filtered slice (SettingWithCopyWarning in pandas < 3).
df = df[df['price'] > 0].copy()

In [31]:
# Turn the flag columns into real booleans; a missing value counts as False.
flag_columns = (
    "newDevelopment",
    "newDevelopmentFinished",
    "hasLift",
    "isParkingSpaceIncludedInPrice",
)
for flag_column in flag_columns:
    df[flag_column] = df[flag_column].fillna(0).astype(int).astype(bool)

In [32]:
# Names of the Lisbon freguesias (civil parishes) to keep. The list is matched
# against the `municipality` column, so the spelling (accents included) must
# equal the values found in the data exactly.
freguesias_lisboa = [
    "Ajuda",
    "Alcântara",
    "Alvalade",
    "Areeiro",
    "Arroios",
    "Avenidas Novas",
    "Beato",
    "Belém",
    "Benfica",
    "Campo de Ourique",
    "Campolide",
    "Carnide",
    "Estrela",
    "Lumiar",
    "Marvila",
    "Misericórdia",
    "Olivais",
    "Parque das Nações",
    "Penha de França",
    "Santa Clara",
    "Santa Maria Maior",
    "Santo António",
    "São Domingos de Benfica",
    "São Vicente",
]

In [33]:
# Keep only listings located in one of the listed freguesias. The explicit
# copy makes df an independent frame, so the columns added in later cells are
# not written into a filtered slice (SettingWithCopyWarning in pandas < 3).
df = df[df["municipality"].isin(freguesias_lisboa)].copy()

In [None]:
# Sanity check: True only when every listed freguesia still has at least one listing after the filter.
set(df["municipality"].unique()) == set(freguesias_lisboa)

In [35]:
# Persist the cleaned listings, propertyCode index included.
cleaned_csv_path = f"../data/idealista/cleaned/{operation}/{city}/{file_name}.csv"
df.to_csv(cleaned_csv_path, index=True)

In [36]:
import numpy as np

# Derived modelling targets.
# NOTE(review): a listing with size 0 would produce inf here - confirm that
# such rows cannot reach this point.
df["logPrice"] = np.log(df["price"])
df["pricePerSquareMeter"] = df["price"] / df["size"]
# Log of the price per square metre, i.e. log(price / size). Dividing logPrice
# by size (the previous formula) does not give the log of that ratio.
df["logPricePerSquareMeter"] = np.log(df["pricePerSquareMeter"])

In [37]:
# Explanatory columns used for encoding and outlier detection.
features = [
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "municipality",
    "latitude",
    "longitude",
    "status",
    "newDevelopment",
    "hasLift",
    "newDevelopmentFinished",
    "hasParkingSpace",
    "isParkingSpaceIncludedInPrice",
]
# Price expressed four ways (raw / log, total / per square metre). All four are
# kept next to the features in the encoded frame and in the exported CSVs.
targets = [
    "price",
    "logPrice",
    "pricePerSquareMeter",
    "logPricePerSquareMeter",
]

In [None]:
# Object-typed columns among the features and targets are the ones that need one-hot encoding.
modelling_frame = df[features + targets]
categorical_columns = modelling_frame.select_dtypes(include=["object"]).columns

In [39]:
# One-hot encode the categorical columns; the first level of each is dropped
# so the indicator columns of one variable are not perfectly collinear.
modelling_columns = features + targets
df_encoded = pd.get_dummies(
    df[modelling_columns],
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
# Preview the one-hot encoded frame.
df_encoded.head()

In [41]:
# Names of every bool-typed column in the encoded frame.
encoded_dtypes = df_encoded.dtypes
boolean_columns = encoded_dtypes[encoded_dtypes == bool].index

In [42]:
# Represent the boolean columns as integers so the whole frame is numeric.
df_encoded = df_encoded.astype({column: int for column in boolean_columns})

In [None]:
# Check that the former boolean columns now hold integers.
df_encoded[boolean_columns].head()

In [None]:
# Full list of columns in the encoded frame.
df_encoded.columns

In [45]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Standardise every column of the encoded frame (features and targets alike)
# to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(df_encoded)
scaled_features = scaler.transform(df_encoded)

In [46]:
# Initialize the model
# 'contamination' is the share of rows expected to be outliers: it sets the
# decision threshold so that roughly 10% of the rows are labelled -1.
# 'random_state' fixes the random tree construction so reruns give the same labels.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

In [None]:
# Fit the model on the scaled matrix, i.e. every column of df_encoded (targets included).
iso_forest.fit(scaled_features)

In [48]:
# Predict outliers for the same rows the model was fitted on.
# The output is: 1 for "normal" and -1 for "outlier"
outlier_predictions = iso_forest.predict(scaled_features)

In [49]:
# Attach the Isolation Forest labels to df. The array is assigned by position,
# which relies on df_encoded having the same rows in the same order as df.
# NOTE: this column is overwritten with the LOF labels further down.
df['outlier'] = outlier_predictions

In [None]:
# Summary statistics of the rows flagged as outliers (-1).
df[df.outlier == -1][features + targets].describe()

In [None]:
# Summary statistics of the rows kept as normal (1), for comparison with the outliers.
df[df.outlier == 1][features + targets].describe()

In [None]:
# Inspect the listing(s) whose size is exactly 80500.0 - presumably an extreme
# value spotted in the summaries; confirm before relying on it.
df[df["size"] == 80500.0]

In [53]:
# Keep the rows Isolation Forest labelled as normal, restricted to the modelling columns.
iso_forest_inliers = df["outlier"] == 1
df_iso_forest = df.loc[iso_forest_inliers, features + targets]

In [54]:
# Persist the Isolation Forest inliers, propertyCode index included.
iso_forest_csv_path = f"../data/idealista/cleaned/{operation}/{city}/{file_name}-iso-forest.csv"
df_iso_forest.to_csv(iso_forest_csv_path, index=True)

In [55]:
from sklearn.neighbors import LocalOutlierFactor

# Initialize the Local Outlier Factor model
# n_neighbors=20 is the neighbourhood size used for the local density estimate;
# contamination=0.1 is the same expected outlier share given to the Isolation Forest.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

In [56]:
# LOF scores each row by its distance to its nearest neighbours, so it is run
# on the standardised matrix. On the unscaled df_encoded the columns with the
# largest magnitudes (price, size) would dominate every distance and the other
# features would barely count.
# Output: 1 for "normal" and -1 for "outlier".
outlier_predictions = lof.fit_predict(scaled_features)

In [57]:
# Replace the Isolation Forest labels in 'outlier' with the LOF labels
# (assigned by position, so the row order must match the matrix LOF was given).
df['outlier'] = outlier_predictions

In [None]:
# Summary statistics of the rows LOF flagged as outliers (-1).
df[df.outlier == -1][features + targets].describe()

In [None]:
# Summary statistics of the rows LOF kept as normal (1), for comparison with the outliers.
df[df.outlier == 1][features + targets].describe()

In [60]:
# Keep the rows LOF labelled as normal, restricted to the modelling columns.
lof_inliers = df["outlier"] == 1
df_lof = df.loc[lof_inliers, features + targets]

In [61]:
# Persist the LOF inliers, propertyCode index included.
lof_csv_path = f"../data/idealista/cleaned/{operation}/{city}/{file_name}-lof.csv"
df_lof.to_csv(lof_csv_path, index=True)