In [1]:
import pandas as pd
import os
import datetime

In [2]:
# Dataset selection: the operation and city pick the input/output folders below.
operation = "rent"
city = "lisbon"

# Folder holding the raw CSV snapshots, and the folder for the unioned result.
read_path = f"../data/idealista/raw/{operation}/{city}"
save_path = f"../data/idealista/unioned/{operation}/{city}"

# One DataFrame per snapshot file, each tagged with the date of its snapshot.
dataframes = []

# os.listdir() returns names in arbitrary order. The file names start with a
# YYYY-MM-DD date, so sorting them loads the snapshots chronologically, which
# the later keep="last" de-duplication relies on to keep the newest data.
for file in sorted(os.listdir(read_path)):
    if not file.endswith(".csv"):
        continue

    # The first 10 characters of the file name are parsed as the snapshot date.
    file_date = file[:10]
    snapshot = pd.read_csv(f"{read_path}/{file}", index_col="propertyCode")
    snapshot["snapshotDate"] = datetime.datetime.strptime(file_date, "%Y-%m-%d").date()
    dataframes.append(snapshot)

In [3]:
# Combine the per-snapshot DataFrames into a single one.
if not dataframes:
    # pd.concat([]) only reports "No objects to concatenate"; name the real cause.
    raise ValueError("No snapshot DataFrames were loaded; nothing to combine")

# Order rows from the oldest to the newest snapshot so that keep="last" below
# always keeps the most recent snapshot of a listing, whatever order the files
# were loaded in. mergesort is stable, so rows within one snapshot keep their
# relative order.
df = pd.concat(dataframes).sort_values("snapshotDate", kind="mergesort")

# Keep only the last (most recent) occurrence of each index value.
df = df.loc[~df.index.duplicated(keep="last")]

# Rows that agree on all of these columns are treated as the same listing;
# only the last (most recent) row of each such group is kept.
listing_identity_columns = [
    "floor",
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "address",
    "description",
]
df = df.loc[~df[listing_identity_columns].duplicated(keep="last")]



In [4]:
# Sanity check after de-duplication: True when no index value appears more than once.
df.index.is_unique

True

In [5]:
# Append the output file name to the output folder. The guard keeps the cell
# safe to re-run: a bare `+=` would append the file name again on every run.
save_file = f"unioned-{city}-listings-for-{operation}.csv"
if not save_path.endswith(f"/{save_file}"):
    save_path += f"/{save_file}"

In [6]:
# Display the full output file path to confirm it before writing.
save_path

'../data/idealista/unioned/rent/lisbon/unioned-lisbon-listings-for-rent.csv'

In [7]:
# Save the unioned DataFrame to a new CSV file.
# Create the output folder first: to_csv() does not create missing directories.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# The index is written under the "propertyCode" header.
df.to_csv(save_path, index_label="propertyCode")

In [8]:
# Display the unioned, de-duplicated DataFrame (pandas truncates the rendering of large frames).
df

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,hasPlan,has3DTour,has360,hasStaging,topNewDevelopment,topPlus,neighborhood,parkingSpace,newDevelopmentFinished,snapshotDate
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33892869,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133030_100125,18,,1500.0,"{'price': {'amount': 1500.0, 'currencySuffix':...",flat,rent,180.0,3,...,False,False,False,False,False,False,,,,2025-01-13
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,"{'price': {'amount': 3950.0, 'currencySuffix':...",chalet,rent,172.0,4,...,False,False,False,False,False,False,São João,,,2025-01-13
33897140,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136663_130125,19,,1000.0,"{'price': {'amount': 1000.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897083,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136661_110125,29,,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897081,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133033_130125,103,1,1400.0,"{'price': {'amount': 1400.0, 'currencySuffix':...",flat,rent,60.0,2,...,True,False,False,False,False,False,,,,2025-01-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33921703,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,726256,15,,5550.0,"{'price': {'amount': 5550.0, 'currencySuffix':...",flat,rent,11.0,2,...,False,False,False,False,False,False,,,,2025-02-01
33921718,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724171,13,,5550.0,"{'price': {'amount': 5550.0, 'currencySuffix':...",flat,rent,11.0,2,...,False,False,False,False,False,False,,,,2025-02-01
33921674,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724003,10,,5850.0,"{'price': {'amount': 5850.0, 'currencySuffix':...",flat,rent,11.0,2,...,False,False,False,False,False,False,,,,2025-02-01
33921644,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724001,11,,5850.0,"{'price': {'amount': 5850.0, 'currencySuffix':...",flat,rent,11.0,2,...,False,False,False,False,False,False,,,,2025-02-01
