In [1]:
import pandas as pd
import sys
import os
import datetime

In [2]:
operation = "sale"
city = "lisbon"

read_path = f"../data/idealista/raw/{operation}/{city}"
save_path = f"../data/idealista/unioned/{operation}/{city}"

dataframes = []

# os.listdir() returns entries in arbitrary order. The snapshot file names
# start with a YYYY-MM-DD date, so sorting them appends the frames
# chronologically. This matters because the de-duplication further down
# keeps the *last* occurrence of each listing, which should be the most
# recent snapshot rather than whatever the filesystem happened to list last.
for file in sorted(os.listdir(read_path)):
    # Skip anything that is not a CSV snapshot before touching its name.
    if not file.endswith(".csv"):
        continue

    # The first ten characters of the file name carry the snapshot date.
    file_date = file[:10]

    snapshot = pd.read_csv(f"{read_path}/{file}", index_col='propertyCode')
    snapshot["snapshotDate"] = datetime.datetime.strptime(file_date, "%Y-%m-%d").date()
    dataframes.append(snapshot)

In [None]:
# Resolve the parent of the current working directory (presumably the
# project root that contains the `scripts` package imported further down).
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add it to Python's module search path only if it is not already there, so
# re-running this cell does not pile up duplicate entries in sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

In [None]:
from scripts.preprocessors import IdealistaDataLoader

# Start from an empty list: running this cell discards any frames collected
# by earlier cells.
dataframes = []

# NOTE(review): the operation is hard-coded to "rent" here, while the other
# cells of this notebook build their paths for "sale" -- confirm which one is
# intended before saving the result.
# The same loader configuration is used twice; only `date_or_unioned` differs.
for source in ("unioned", "2025-03-04"):
    data_loader = IdealistaDataLoader(
        read_path=f"../data/idealista",
        city="lisbon",
        operation="rent",
        date_or_unioned=source,
        include_geodata=True,
        include_openai=True,
        index_col="propertyCode",
        directory="cleaned",
    )
    dataframes.append(data_loader.load_data())

In [3]:
# Columns whose combined values are treated as identifying one listing.
listing_fingerprint = [
    "floor",
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "address",
    "description",
]

# Stack all loaded frames into a single table.
df = pd.concat(dataframes)

# Where an index label occurs more than once, retain only its final row.
repeated_index = df.index.duplicated(keep="last")
df = df[~repeated_index]

# Collapse rows that agree on every fingerprint column, again keeping the
# final row of each group.
df = df.drop_duplicates(subset=listing_fingerprint, keep="last")

In [4]:
# Sanity check: after the de-duplication above, each index label should
# appear exactly once (expected to display True).
df.index.is_unique

True

In [5]:
# Output location settings for the unioned listings file.
operation = "sale"
city = "lisbon"

# Directory that receives the file, followed by the full file path.
save_dir = f"../data/idealista/cleaned/{operation}/{city}"
save_path = save_dir + f"/unioned-{city}-listings-for-{operation}.csv"

In [6]:
# Display the resolved output path for a visual check before writing.
save_path

'../data/idealista/cleaned/sale/lisbon/unioned-lisbon-listings-for-sale.csv'

In [7]:
# Make sure the target directory exists first: to_csv does not create missing
# parent directories and would raise an OSError otherwise.
output_dir = os.path.dirname(save_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the unioned DataFrame to a new CSV file, writing the index under the
# "propertyCode" column header.
df.to_csv(save_path, index_label="propertyCode")

In [8]:
# Display the unioned DataFrame (pandas abbreviates the rendering of large
# frames with "..." rows and columns).
df

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,topNewDevelopment,topPlus,parkingSpace,newDevelopmentFinished,highlight,neighborhood,snapshotDate,change,savedAd,notes
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33926227,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,LA5005,27,6,3000000.0,"{'price': {'amount': 3000000.0, 'currencySuffi...",flat,sale,420.0,7,...,False,False,,,,,2025-02-01,,,
33942437,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,109250011,52,8,1150000.0,"{'price': {'amount': 1150000.0, 'currencySuffi...",penthouse,sale,220.0,3,...,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,,,2025-02-01,,,
32241777,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,65151,34,4,635000.0,"{'price': {'amount': 635000.0, 'currencySuffix...",flat,sale,143.0,3,...,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",False,,,2025-02-01,,,
33942410,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,65273,35,11,390000.0,"{'price': {'amount': 390000.0, 'currencySuffix...",flat,sale,74.0,1,...,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",False,,,2025-02-01,,,
33942308,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-011200,6,8,1100000.0,"{'price': {'amount': 1100000.0, 'currencySuffi...",duplex,sale,224.0,4,...,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,{'groupDescription': 'Destaque'},,2025-02-01,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34011188,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,L3/02338,22,3,385000.0,"{'price': {'amount': 385000.0, 'currencySuffix...",flat,sale,65.0,1,...,False,False,,,,,2025-03-05,{},{},[]
34011263,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_1262,19,,885000.0,"{'price': {'amount': 885000.0, 'currencySuffix...",flat,sale,154.0,3,...,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,,,2025-03-05,{},{},[]
34011260,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,L3/02337,18,1,430000.0,"{'price': {'amount': 430000.0, 'currencySuffix...",flat,sale,63.0,2,...,False,False,,,,,2025-03-05,{},{},[]
34011194,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_334,32,2,469000.0,"{'price': {'amount': 469000.0, 'currencySuffix...",flat,sale,100.0,2,...,False,False,,,,,2025-03-05,{},{},[]
