In [1]:
import ast
import datetime
import os

import pandas as pd

In [2]:
def is_valid_date_format(date_string):
    """Return True only if ``date_string`` is a real calendar date written as YYYY-MM-DD.

    Args:
        date_string: Candidate value, normally a ``str``.

    Returns:
        bool: ``True`` for a zero-padded date such as ``"2025-01-13"``;
        ``False`` for anything else, including non-string input.
    """
    try:
        parsed = datetime.datetime.strptime(date_string, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: strptime rejects non-str input (e.g. None); treat it as invalid.
        return False
    # strptime also accepts unpadded fields ("2025-1-3"); require the canonical
    # zero-padded spelling so the value really matches YYYY-MM-DD.
    return parsed.date().isoformat() == date_string

In [3]:
# Run parameters: together they select the input CSV and name the output file.
city = "lisbon"
operation = "rent"
# Either the literal "unioned" or a snapshot date written as YYYY-MM-DD
# (checked by the validation cell that follows).
date_or_unioned = "unioned"

In [4]:
# Fail fast unless the parameter is the "unioned" keyword or a YYYY-MM-DD date.
if date_or_unioned != "unioned" and not is_valid_date_format(date_or_unioned):
    raise ValueError(
        "date_or_unioned must be 'unioned' or a valid date in the format 'YYYY-MM-DD'"
    )

In [5]:
# Base name (no extension) shared by the input CSV and the cleaned output CSV.
file_name = f"{date_or_unioned}-{city}-listings-for-{operation}"

In [6]:
read_path = "../data/idealista"

# Unioned files and single-snapshot files live in sibling folders; the rest of
# the path is identical, so only the folder name depends on the parameter.
source_folder = "unioned" if date_or_unioned == "unioned" else "raw"
df = pd.read_csv(
    f"{read_path}/{source_folder}/{operation}/{city}/{file_name}.csv",
    index_col="propertyCode",
)

In [7]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,hasPlan,has3DTour,has360,hasStaging,topNewDevelopment,topPlus,neighborhood,parkingSpace,newDevelopmentFinished,snapshotDate
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33892869,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133030_100125,18,,1500.0,"{'price': {'amount': 1500.0, 'currencySuffix':...",flat,rent,180.0,3,...,False,False,False,False,False,False,,,,2025-01-13
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,"{'price': {'amount': 3950.0, 'currencySuffix':...",chalet,rent,172.0,4,...,False,False,False,False,False,False,São João,,,2025-01-13
33897140,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136663_130125,19,,1000.0,"{'price': {'amount': 1000.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897083,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136661_110125,29,,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897081,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133033_130125,103,1.0,1400.0,"{'price': {'amount': 1400.0, 'currencySuffix':...",flat,rent,60.0,2,...,True,False,False,False,False,False,,,,2025-01-13


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4314 entries, 33892869 to 33921711
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   thumbnail               4311 non-null   object 
 1   externalReference       3705 non-null   object 
 2   numPhotos               4314 non-null   int64  
 3   floor                   2460 non-null   object 
 4   price                   4314 non-null   float64
 5   priceInfo               4314 non-null   object 
 6   propertyType            4314 non-null   object 
 7   operation               4314 non-null   object 
 8   size                    4314 non-null   float64
 9   rooms                   4314 non-null   int64  
 10  bathrooms               4314 non-null   int64  
 11  address                 4314 non-null   object 
 12  province                4314 non-null   object 
 13  municipality            4314 non-null   object 
 14  district                3898 non-n

In [9]:
df.describe()

Unnamed: 0,numPhotos,price,size,rooms,bathrooms,latitude,longitude,distance,priceByArea
count,4314.0,4314.0,4314.0,4314.0,4314.0,4314.0,4314.0,2904.0,4314.0
mean,24.258924,2428.056792,127.44089,2.149513,1.75197,38.733466,-9.183371,12151.682507,31.971488
std,12.036653,2073.849625,1227.375717,1.199018,1.013771,0.108595,0.109707,14722.860557,44.912398
min,0.0,550.0,11.0,0.0,1.0,37.957614,-9.48243,32.0,0.0
25%,16.0,1375.0,60.0,1.0,1.0,38.708098,-9.197977,2581.75,16.0
50%,23.0,1800.0,89.0,2.0,1.0,38.719962,-9.154417,5767.5,21.0
75%,30.0,2600.0,132.0,3.0,2.0,38.744392,-9.135476,19418.25,29.0
max,119.0,25000.0,80500.0,15.0,15.0,39.541925,-8.559313,89903.0,545.0


In [10]:
df.isnull().sum()

thumbnail                    3
externalReference          609
numPhotos                    0
floor                     1854
price                        0
priceInfo                    0
propertyType                 0
operation                    0
size                         0
rooms                        0
bathrooms                    0
address                      0
province                     0
municipality                 0
district                   416
country                      0
latitude                     0
longitude                    0
showAddress                  0
url                          0
distance                  1410
description                  9
hasVideo                     0
status                       2
newDevelopment               0
hasLift                    654
priceByArea                  0
detailedType                 0
suggestedTexts               0
hasPlan                      0
has3DTour                    0
has360                       0
hasStagi

In [11]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'priceInfo', 'propertyType', 'operation', 'size', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'district', 'country',
       'latitude', 'longitude', 'showAddress', 'url', 'distance',
       'description', 'hasVideo', 'status', 'newDevelopment', 'hasLift',
       'priceByArea', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour',
       'has360', 'hasStaging', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'parkingSpace', 'newDevelopmentFinished', 'snapshotDate'],
      dtype='object')

In [12]:
# df = df.drop(columns=['thumbnail', 'numPhotos', 'operation', 'hasVideo', 'hasPlan', 'has3DTour', 'has360', 'hasStaging', 'externalReference', 'detailedType', 'suggestedTexts'])

In [13]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'priceInfo', 'propertyType', 'operation', 'size', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'district', 'country',
       'latitude', 'longitude', 'showAddress', 'url', 'distance',
       'description', 'hasVideo', 'status', 'newDevelopment', 'hasLift',
       'priceByArea', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour',
       'has360', 'hasStaging', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'parkingSpace', 'newDevelopmentFinished', 'snapshotDate'],
      dtype='object')

In [14]:
# Drops rows whose column values are all identical. drop_duplicates ignores the
# index, so propertyCode itself is not part of the comparison.
df = df.drop_duplicates()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4314 entries, 33892869 to 33921711
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   thumbnail               4311 non-null   object 
 1   externalReference       3705 non-null   object 
 2   numPhotos               4314 non-null   int64  
 3   floor                   2460 non-null   object 
 4   price                   4314 non-null   float64
 5   priceInfo               4314 non-null   object 
 6   propertyType            4314 non-null   object 
 7   operation               4314 non-null   object 
 8   size                    4314 non-null   float64
 9   rooms                   4314 non-null   int64  
 10  bathrooms               4314 non-null   int64  
 11  address                 4314 non-null   object 
 12  province                4314 non-null   object 
 13  municipality            4314 non-null   object 
 14  district                3898 non-n

In [16]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,hasPlan,has3DTour,has360,hasStaging,topNewDevelopment,topPlus,neighborhood,parkingSpace,newDevelopmentFinished,snapshotDate
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33892869,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133030_100125,18,,1500.0,"{'price': {'amount': 1500.0, 'currencySuffix':...",flat,rent,180.0,3,...,False,False,False,False,False,False,,,,2025-01-13
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,"{'price': {'amount': 3950.0, 'currencySuffix':...",chalet,rent,172.0,4,...,False,False,False,False,False,False,São João,,,2025-01-13
33897140,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136663_130125,19,,1000.0,"{'price': {'amount': 1000.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897083,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136661_110125,29,,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,60.0,1,...,False,False,False,False,False,False,,,,2025-01-13
33897081,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133033_130125,103,1.0,1400.0,"{'price': {'amount': 1400.0, 'currencySuffix':...",flat,rent,60.0,2,...,True,False,False,False,False,False,,,,2025-01-13


In [17]:
df.floor.unique()

array([nan, '1', 'bj', '3', '8', '2', '4', '7', '5', 'st', '6', '9', '14',
       '10', '17', 'ss', '11', '15', '13', '-1', '16', '12', 'en', '-2'],
      dtype=object)

In [18]:
df.priceInfo.unique()

array(["{'price': {'amount': 1500.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 3950.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1000.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1100.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1400.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 3600.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1200.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 2475.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 2150.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 2250.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1950.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 800.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 700.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 900.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 975.0, 'currencySuffix': '€/mês'}}",

In [19]:
# Function to parse the price and coalesce the formerPrice
def extract_former_price(price_str):
    """Return the listing's former price, falling back to its current price.

    Args:
        price_str: A ``priceInfo`` cell: a string holding a Python-literal dict
            such as ``"{'price': {'amount': 1500.0, ...}}"``. An already-parsed
            dict is accepted as well.

    Returns:
        ``price['priceDropInfo']['formerPrice']`` when present, otherwise
        ``price['amount']``. ``None`` when neither is available or when the
        input is missing (e.g. NaN).

    Raises:
        ValueError, SyntaxError: if a string is given that is not a valid
            Python literal.
    """
    if isinstance(price_str, dict):
        price_dict = price_str
    elif isinstance(price_str, str):
        # literal_eval parses Python literals only; it does not execute code.
        price_dict = ast.literal_eval(price_str)
    else:
        # Missing cell (NaN/None): there is nothing to parse.
        return None
    # `or {}` also covers keys that are present but hold None.
    price_info = price_dict.get('price') or {}
    amount = price_info.get('amount')  # Current price
    price_drop_info = price_info.get('priceDropInfo') or {}
    former_price = price_drop_info.get('formerPrice')  # Former price, if available
    # Coalesce formerPrice with the current price
    return former_price if former_price is not None else amount

# formerPrice falls back to the current price when no price drop is recorded.
df['formerPrice'] = df['priceInfo'].apply(extract_former_price)
# priceChange is former minus current: a positive value means the price went down.
df['priceChange'] = df['formerPrice'] - df['price']
# A listing is "lowered" when it now costs less than it used to. Testing
# priceChange < 0 would flag price increases instead, given the sign above.
df['isPriceLowered'] = df['price'] < df['formerPrice']
df = df.drop(columns=['priceInfo'])

In [20]:
df['formerPrice']

propertyCode
33892869    1500.0
33896888    3950.0
33897140    1000.0
33897083    1100.0
33897081    1400.0
             ...  
33921703    5550.0
33921718    5550.0
33921674    5850.0
33921644    5850.0
33921711    6000.0
Name: formerPrice, Length: 4314, dtype: float64

In [21]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,hasStaging,topNewDevelopment,topPlus,neighborhood,parkingSpace,newDevelopmentFinished,snapshotDate,formerPrice,priceChange,isPriceLowered
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33892869,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133030_100125,18,,1500.0,flat,rent,180.0,3,2,...,False,False,False,,,,2025-01-13,1500.0,0.0,False
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,chalet,rent,172.0,4,3,...,False,False,False,São João,,,2025-01-13,3950.0,0.0,False
33897140,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136663_130125,19,,1000.0,flat,rent,60.0,1,1,...,False,False,False,,,,2025-01-13,1000.0,0.0,False
33897083,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136661_110125,29,,1100.0,flat,rent,60.0,1,1,...,False,False,False,,,,2025-01-13,1100.0,0.0,False
33897081,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133033_130125,103,1.0,1400.0,flat,rent,60.0,2,2,...,False,False,False,,,,2025-01-13,1400.0,0.0,False


In [22]:
df.parkingSpace.unique()

array([nan,
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 100.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 300.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 200.0}"],
      dtype=object)

In [23]:
# Each non-null parkingSpace cell is the text of a dict literal. Missing cells
# are read as an empty dict so every lookup below falls back to its default.
# The parsed dicts stay in a local Series instead of a scratch column on df.
parking_info = df['parkingSpace'].fillna('{}').apply(ast.literal_eval)
df = df.assign(
    hasParkingSpace=parking_info.apply(lambda info: info.get('hasParkingSpace', False)),
    isParkingSpaceIncludedInPrice=parking_info.apply(
        lambda info: info.get('isParkingSpaceIncludedInPrice', False)
    ),
    # A listing with no stated parking price gets 0.
    parkingSpacePrice=parking_info.apply(lambda info: info.get('parkingSpacePrice', 0)),
).drop(columns=['parkingSpace'])

In [24]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,topPlus,neighborhood,newDevelopmentFinished,snapshotDate,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33892869,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133030_100125,18,,1500.0,flat,rent,180.0,3,2,...,False,,,2025-01-13,1500.0,0.0,False,False,False,0.0
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,chalet,rent,172.0,4,3,...,False,São João,,2025-01-13,3950.0,0.0,False,False,False,0.0
33897140,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136663_130125,19,,1000.0,flat,rent,60.0,1,1,...,False,,,2025-01-13,1000.0,0.0,False,False,False,0.0
33897083,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,136661_110125,29,,1100.0,flat,rent,60.0,1,1,...,False,,,2025-01-13,1100.0,0.0,False,False,False,0.0
33897081,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,133033_130125,103,1.0,1400.0,flat,rent,60.0,2,2,...,False,,,2025-01-13,1400.0,0.0,False,False,False,0.0


In [25]:
df[df.hasLift.isnull()].head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,topPlus,neighborhood,newDevelopmentFinished,snapshotDate,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33896888,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,27,,3950.0,chalet,rent,172.0,4,3,...,False,São João,,2025-01-13,3950.0,0.0,False,False,False,0.0
33896900,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,9527HS,39,,2250.0,chalet,rent,375.0,4,3,...,False,,,2025-01-13,2250.0,0.0,False,False,False,0.0
33896470,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,121491815-34,49,,4000.0,chalet,rent,184.0,4,3,...,False,,,2025-01-13,4000.0,0.0,False,True,True,0.0
33896238,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,121021235-46,33,,2700.0,flat,rent,177.0,4,3,...,False,,,2025-01-13,2700.0,0.0,False,False,False,0.0
33896208,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,122481568-14,15,,1650.0,flat,rent,54.0,1,1,...,False,,,2025-01-13,1650.0,0.0,False,False,False,0.0


In [26]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'propertyType', 'operation', 'size', 'rooms', 'bathrooms', 'address',
       'province', 'municipality', 'district', 'country', 'latitude',
       'longitude', 'showAddress', 'url', 'distance', 'description',
       'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea',
       'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour', 'has360',
       'hasStaging', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'newDevelopmentFinished', 'snapshotDate', 'formerPrice', 'priceChange',
       'isPriceLowered', 'hasParkingSpace', 'isParkingSpaceIncludedInPrice',
       'parkingSpacePrice'],
      dtype='object')

In [27]:
# Keep only listings with a strictly positive price.
df = df[df['price'] > 0]

In [28]:
# Flag columns that may contain missing values: a missing value is read as False.
flag_columns = [
    "newDevelopment",
    "newDevelopmentFinished",
    "hasLift",
    "isParkingSpaceIncludedInPrice",
]
# assign() builds a new frame, so nothing is written into a frame produced by
# an earlier boolean filter (which pandas can report as SettingWithCopyWarning).
df = df.assign(
    **{column: df[column].fillna(0).astype(int).astype(bool) for column in flag_columns}
)

In [29]:
# Names accepted in the `municipality` column; rows with any other value are
# removed by the filter that follows. "Freguesias" is Portuguese for civil parishes.
freguesias_lisboa = [
    "Ajuda",
    "Alcântara",
    "Alvalade",
    "Areeiro",
    "Arroios",
    "Avenidas Novas",
    "Beato",
    "Belém",
    "Benfica",
    "Campo de Ourique",
    "Campolide",
    "Carnide",
    "Estrela",
    "Lumiar",
    "Marvila",
    "Misericórdia",
    "Olivais",
    "Parque das Nações",
    "Penha de França",
    "Santa Clara",
    "Santa Maria Maior",
    "Santo António",
    "São Domingos de Benfica",
    "São Vicente",
]

In [30]:
# Keep only rows whose `municipality` value is one of the names in freguesias_lisboa.
df = df[df["municipality"].isin(freguesias_lisboa)]

In [31]:
set(df["municipality"].unique()) == set(freguesias_lisboa)

True

In [32]:
# Total cost = listed price plus any separately listed parking price.
# assign() returns a new frame rather than writing a column into the frame
# produced by the municipality filter, which avoids SettingWithCopyWarning.
df = df.assign(totalPrice=df["price"] + df["parkingSpacePrice"])

In [33]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'propertyType', 'operation', 'size', 'rooms', 'bathrooms', 'address',
       'province', 'municipality', 'district', 'country', 'latitude',
       'longitude', 'showAddress', 'url', 'distance', 'description',
       'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea',
       'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour', 'has360',
       'hasStaging', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'newDevelopmentFinished', 'snapshotDate', 'formerPrice', 'priceChange',
       'isPriceLowered', 'hasParkingSpace', 'isParkingSpaceIncludedInPrice',
       'parkingSpacePrice', 'totalPrice'],
      dtype='object')

In [34]:
df

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,neighborhood,newDevelopmentFinished,snapshotDate,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,totalPrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33829652,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-009130,33,1,2475.0,flat,rent,179.0,3,3,...,,False,2025-01-13,2475.0,0.0,False,False,False,0.0,2475.0
33596155,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-004534,33,bj,2150.0,flat,rent,170.0,3,3,...,,False,2025-01-13,2150.0,0.0,False,False,False,0.0,2150.0
33896876,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,Match_01.216,21,8,1950.0,flat,rent,135.0,2,2,...,,False,2025-01-13,1950.0,0.0,False,True,True,0.0,1950.0
31196503,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,CF119,9,1,700.0,studio,rent,25.0,0,1,...,,False,2025-01-13,700.0,0.0,False,False,False,0.0,700.0
33896540,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,1,3000.0,flat,rent,130.0,2,3,...,,False,2025-01-13,3000.0,0.0,False,True,True,0.0,3000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33921703,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,726256,15,,5550.0,flat,rent,11.0,2,1,...,,False,2025-02-01,5550.0,0.0,False,False,False,0.0,5550.0
33921718,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724171,13,,5550.0,flat,rent,11.0,2,1,...,,False,2025-02-01,5550.0,0.0,False,False,False,0.0,5550.0
33921674,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724003,10,,5850.0,flat,rent,11.0,2,1,...,,False,2025-02-01,5850.0,0.0,False,False,False,0.0,5850.0
33921644,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,724001,11,,5850.0,flat,rent,11.0,2,1,...,,False,2025-02-01,5850.0,0.0,False,False,False,0.0,5850.0


In [35]:
# features = [
#     "floor",
#     "propertyType",
#     "size",
#     "rooms",
#     "bathrooms",
#     "address",
#     "province",
#     "municipality",
#     "district",
#     "country",
#     "latitude",
#     "longitude",
#     "distance",
#     "description",
#     "status",
#     "newDevelopment",
#     "hasLift",
#     "topNewDevelopment",
#     "topPlus",
#     "neighborhood",
#     "newDevelopmentFinished",
#     "snapshotDate",
#     "hasParkingSpace",
# ]
# target = ["price"]

In [36]:
# df = df[features + target]

In [37]:
# Write the cleaned frame under the same data root the input was read from.
# The destination folder is created on first use so that a new city/operation
# combination does not fail at the very last step.
output_dir = f"{read_path}/cleaned/{operation}/{city}"
os.makedirs(output_dir, exist_ok=True)
df.to_csv(f"{output_dir}/{file_name}.csv", index=True)

# Outlier Removal

In [38]:
# lof_features = [
#     "floor",
#     "price",
#     "propertyType",
#     "size",
#     "rooms",
#     "bathrooms",
#     "municipality",
#     "district",
#     "latitude",
#     "longitude",
#     "status",
#     "newDevelopment",
#     "hasLift",
#     "neighborhood",
#     "newDevelopmentFinished",
#     "hasParkingSpace",
# ]
# target = ["price"]

In [39]:
# categorical_columns = df[lof_features + target].select_dtypes(include=["object"]).columns

In [40]:
# df_encoded = pd.get_dummies(df[lof_features + target], columns=categorical_columns, drop_first=True)

In [41]:
# df_encoded.head()

In [42]:
# boolean_columns = df_encoded.select_dtypes(include=["bool"]).columns

In [43]:
# df_encoded[boolean_columns] = df_encoded[boolean_columns].astype(int)

In [44]:
# df_encoded[boolean_columns].head()

In [45]:
# df_encoded.columns

In [46]:
# from sklearn.neighbors import LocalOutlierFactor

# # Initialize the Local Outlier Factor model
# lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

In [47]:
# outlier_predictions = lof.fit_predict(df_encoded)

In [48]:
# df['outlier'] = outlier_predictions

In [49]:
# df[df.outlier == -1][features + target].describe()

In [50]:
# df[df.outlier == 1][features + target].describe()

In [51]:
# df_lof = df[df.outlier == 1][features + target]

In [52]:
# df_lof.to_csv(f"../data/idealista/cleaned/{operation}/{city}/{file_name}-lof.csv", index=True)