In [1]:
import pandas as pd
import ast
import datetime

In [2]:
def is_valid_date_format(date_string):
    try:
        datetime.datetime.strptime(date_string, "%Y-%m-%d")
        return True
    except ValueError:
        return False

In [3]:
city = "lisbon"
operation = "rent"
date_or_unioned = "2025-03-04"

In [4]:
if not (date_or_unioned == "unioned" or is_valid_date_format(date_or_unioned)):
    raise ValueError(
        "date_or_unioned must be 'unioned' or a valid date in the format 'YYYY-MM-DD'"
    )

In [5]:
file_name = f"{date_or_unioned}-{city}-listings-for-{operation}"

In [6]:
read_path = f"../data/idealista"

if date_or_unioned == "unioned":
    df = pd.read_csv(f"{read_path}/unioned/{operation}/{city}/{file_name}.csv", index_col="propertyCode")
else:
    df = pd.read_csv(f"{read_path}/raw/{operation}/{city}/{file_name}.csv", index_col="propertyCode")

In [7]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,hasPlan,has3DTour,has360,hasStaging,savedAd,notes,topNewDevelopment,topPlus,parkingSpace,neighborhood
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34030020,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124151169-121,31,1.0,1600.0,"{'price': {'amount': 1600.0, 'currencySuffix':...",flat,rent,80.0,2,...,False,False,False,False,{},[],False,False,,
34009485,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,8.0,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,49.0,1,...,True,False,False,False,{},[],False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
34029522,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,C0506-00376,14,,1950.0,"{'price': {'amount': 1950.0, 'currencySuffix':...",flat,rent,107.0,3,...,False,False,False,False,{},[],False,False,,
34029392,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124521035-226,20,2.0,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,66.0,2,...,False,False,False,False,{},[],False,False,,
34029386,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_235 (2),15,-1.0,1250.0,"{'price': {'amount': 1250.0, 'currencySuffix':...",flat,rent,98.0,1,...,False,False,False,False,{},[],False,False,,


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 455 entries, 34030020 to 34000042
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   thumbnail          454 non-null    object 
 1   externalReference  379 non-null    object 
 2   numPhotos          455 non-null    int64  
 3   floor              297 non-null    object 
 4   price              455 non-null    float64
 5   priceInfo          455 non-null    object 
 6   propertyType       455 non-null    object 
 7   operation          455 non-null    object 
 8   size               455 non-null    float64
 9   rooms              455 non-null    int64  
 10  bathrooms          455 non-null    int64  
 11  address            455 non-null    object 
 12  province           455 non-null    object 
 13  municipality       455 non-null    object 
 14  district           433 non-null    object 
 15  country            455 non-null    object 
 16  latitude           

In [9]:
df.describe()

Unnamed: 0,numPhotos,price,size,rooms,bathrooms,latitude,longitude,priceByArea
count,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0
mean,22.991209,2266.540659,97.92967,1.978022,1.615385,38.728735,-9.151542,25.186813
std,11.796173,1945.371578,65.840612,1.133957,0.912097,0.02008,0.024902,13.258454
min,0.0,500.0,18.0,0.0,1.0,38.693839,-9.225044,8.0
25%,15.0,1382.5,60.0,1.0,1.0,38.712204,-9.164876,18.0
50%,21.0,1750.0,78.0,2.0,1.0,38.725171,-9.149736,21.0
75%,28.0,2500.0,115.0,3.0,2.0,38.740911,-9.135337,28.0
max,117.0,17000.0,570.0,7.0,6.0,38.791647,-9.093209,133.0


In [10]:
df.isnull().sum()

thumbnail              1
externalReference     76
numPhotos              0
floor                158
price                  0
priceInfo              0
propertyType           0
operation              0
size                   0
rooms                  0
bathrooms              0
address                0
province               0
municipality           0
district              22
country                0
latitude               0
longitude              0
showAddress            0
url                    0
description            0
hasVideo               0
status                 0
newDevelopment         0
hasLift               31
priceByArea            0
change                 0
detailedType           0
suggestedTexts         0
hasPlan                0
has3DTour              0
has360                 0
hasStaging             0
savedAd                0
notes                  0
topNewDevelopment      0
topPlus                0
parkingSpace         328
neighborhood         439
dtype: int64

In [11]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'priceInfo', 'propertyType', 'operation', 'size', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'district', 'country',
       'latitude', 'longitude', 'showAddress', 'url', 'description',
       'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea',
       'change', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour',
       'has360', 'hasStaging', 'savedAd', 'notes', 'topNewDevelopment',
       'topPlus', 'parkingSpace', 'neighborhood'],
      dtype='object')

In [12]:
# df = df.drop(columns=['thumbnail', 'numPhotos', 'operation', 'hasVideo', 'hasPlan', 'has3DTour', 'has360', 'hasStaging', 'externalReference', 'detailedType', 'suggestedTexts'])

In [13]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'priceInfo', 'propertyType', 'operation', 'size', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'district', 'country',
       'latitude', 'longitude', 'showAddress', 'url', 'description',
       'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea',
       'change', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour',
       'has360', 'hasStaging', 'savedAd', 'notes', 'topNewDevelopment',
       'topPlus', 'parkingSpace', 'neighborhood'],
      dtype='object')

In [14]:
df = df.drop_duplicates()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 455 entries, 34030020 to 34000042
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   thumbnail          454 non-null    object 
 1   externalReference  379 non-null    object 
 2   numPhotos          455 non-null    int64  
 3   floor              297 non-null    object 
 4   price              455 non-null    float64
 5   priceInfo          455 non-null    object 
 6   propertyType       455 non-null    object 
 7   operation          455 non-null    object 
 8   size               455 non-null    float64
 9   rooms              455 non-null    int64  
 10  bathrooms          455 non-null    int64  
 11  address            455 non-null    object 
 12  province           455 non-null    object 
 13  municipality       455 non-null    object 
 14  district           433 non-null    object 
 15  country            455 non-null    object 
 16  latitude           

In [16]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,priceInfo,propertyType,operation,size,rooms,...,hasPlan,has3DTour,has360,hasStaging,savedAd,notes,topNewDevelopment,topPlus,parkingSpace,neighborhood
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34030020,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124151169-121,31,1.0,1600.0,"{'price': {'amount': 1600.0, 'currencySuffix':...",flat,rent,80.0,2,...,False,False,False,False,{},[],False,False,,
34009485,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,8.0,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,49.0,1,...,True,False,False,False,{},[],False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
34029522,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,C0506-00376,14,,1950.0,"{'price': {'amount': 1950.0, 'currencySuffix':...",flat,rent,107.0,3,...,False,False,False,False,{},[],False,False,,
34029392,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124521035-226,20,2.0,1100.0,"{'price': {'amount': 1100.0, 'currencySuffix':...",flat,rent,66.0,2,...,False,False,False,False,{},[],False,False,,
34029386,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_235 (2),15,-1.0,1250.0,"{'price': {'amount': 1250.0, 'currencySuffix':...",flat,rent,98.0,1,...,False,False,False,False,{},[],False,False,,


In [17]:
df.floor.unique()

array(['1', '8', nan, '2', '-1', '4', 'bj', '6', '3', '5', '15', '12',
       '7', '10', 'st', '13', '9', '14', 'ss'], dtype=object)

In [18]:
df.priceInfo.unique()

array(["{'price': {'amount': 1600.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1100.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1950.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1250.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 2600.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1650.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 15000.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1300.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1480.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1500.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 3200.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 4200.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1860.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1200.0, 'currencySuffix': '€/mês'}}",
       "{'price': {'amount': 1750.0, 'currencySuffix': '€/mês

In [19]:
# Function to parse the price and coalesce the formerPrice
def extract_former_price(price_str):
    # Parse the string to a dictionary
    price_dict = ast.literal_eval(price_str)
    price_info = price_dict.get('price', {})
    amount = price_info.get('amount')  # Current price
    price_drop_info = price_info.get('priceDropInfo', {})
    former_price = price_drop_info.get('formerPrice')  # Former price, if available
    # Coalesce formerPrice with the current price
    return former_price if former_price is not None else amount

# Apply the function to create the formerPrice column
df['formerPrice'] = df['priceInfo'].apply(extract_former_price)
df['priceChange'] = df['formerPrice'] - df['price']
df['isPriceLowered'] = df['priceChange'] < 0
df = df.drop(columns=['priceInfo'])

In [20]:
df['formerPrice']

propertyCode
34030020    1600.0
34009485    1100.0
34029522    1950.0
34029392    1100.0
34029386    1250.0
             ...  
32738016    1600.0
34000076    1200.0
33147095    1090.0
34000063     950.0
34000042     850.0
Name: formerPrice, Length: 455, dtype: float64

In [21]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,hasStaging,savedAd,notes,topNewDevelopment,topPlus,parkingSpace,neighborhood,formerPrice,priceChange,isPriceLowered
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34030020,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124151169-121,31,1.0,1600.0,flat,rent,80.0,2,2,...,False,{},[],False,False,,,1600.0,0.0,False
34009485,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,8.0,1100.0,flat,rent,49.0,1,1,...,False,{},[],False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,1100.0,0.0,False
34029522,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,C0506-00376,14,,1950.0,flat,rent,107.0,3,2,...,False,{},[],False,False,,,1950.0,0.0,False
34029392,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124521035-226,20,2.0,1100.0,flat,rent,66.0,2,1,...,False,{},[],False,False,,,1100.0,0.0,False
34029386,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_235 (2),15,-1.0,1250.0,flat,rent,98.0,1,1,...,False,{},[],False,False,,,1250.0,0.0,False


In [22]:
df.parkingSpace.unique()

array([nan,
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True}"],
      dtype=object)

In [23]:
df['parkingSpace'] = df['parkingSpace'].fillna('{}')
df['parsed'] = df['parkingSpace'].apply(ast.literal_eval)
df['hasParkingSpace'] = df['parsed'].apply(lambda x: x.get('hasParkingSpace', False))
df['isParkingSpaceIncludedInPrice'] = df['parsed'].apply(lambda x: x.get('isParkingSpaceIncludedInPrice', False))
df['parkingSpacePrice'] = df['parsed'].apply(lambda x: x.get('parkingSpacePrice', 0))
df.drop(columns=['parkingSpace', 'parsed'], inplace=True)

In [24]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,notes,topNewDevelopment,topPlus,neighborhood,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34030020,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124151169-121,31,1.0,1600.0,flat,rent,80.0,2,2,...,[],False,False,,1600.0,0.0,False,False,False,0
34009485,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,8.0,1100.0,flat,rent,49.0,1,1,...,[],False,False,,1100.0,0.0,False,True,True,0
34029522,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,C0506-00376,14,,1950.0,flat,rent,107.0,3,2,...,[],False,False,,1950.0,0.0,False,False,False,0
34029392,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124521035-226,20,2.0,1100.0,flat,rent,66.0,2,1,...,[],False,False,,1100.0,0.0,False,False,False,0
34029386,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_235 (2),15,-1.0,1250.0,flat,rent,98.0,1,1,...,[],False,False,,1250.0,0.0,False,False,False,0


In [25]:
df[df.hasLift.isnull()].head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,notes,topNewDevelopment,topPlus,neighborhood,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34023911,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,CLRL420,49,,15000.0,chalet,rent,245.0,4,4,...,[],False,False,,15000.0,0.0,False,True,True,0
34028667,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,1697,16,,1600.0,flat,rent,57.0,1,1,...,[],False,False,,1600.0,0.0,False,False,False,0
34027851,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-005988,19,,3000.0,flat,rent,119.0,3,3,...,[],False,False,,3000.0,0.0,False,False,False,0
34027828,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,137015,16,1.0,1100.0,flat,rent,35.0,1,1,...,[],False,False,,1100.0,0.0,False,False,False,0
34027837,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,137948,21,,1400.0,flat,rent,60.0,2,1,...,[],False,False,,1400.0,0.0,False,False,False,0


In [26]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'propertyType', 'operation', 'size', 'rooms', 'bathrooms', 'address',
       'province', 'municipality', 'district', 'country', 'latitude',
       'longitude', 'showAddress', 'url', 'description', 'hasVideo', 'status',
       'newDevelopment', 'hasLift', 'priceByArea', 'change', 'detailedType',
       'suggestedTexts', 'hasPlan', 'has3DTour', 'has360', 'hasStaging',
       'savedAd', 'notes', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'formerPrice', 'priceChange', 'isPriceLowered', 'hasParkingSpace',
       'isParkingSpaceIncludedInPrice', 'parkingSpacePrice'],
      dtype='object')

In [27]:
df = df[df['price'] > 0]

In [28]:
df["newDevelopment"] = df.newDevelopment.fillna(0).astype(int).astype(bool)
df["newDevelopmentFinished"] = df.newDevelopmentFinished.fillna(0).astype(int).astype(bool)
df["hasLift"] = df.hasLift.fillna(0).astype(int).astype(bool)
df["isParkingSpaceIncludedInPrice"] = df.isParkingSpaceIncludedInPrice.fillna(0).astype(int).astype(bool)

AttributeError: 'DataFrame' object has no attribute 'newDevelopmentFinished'

In [29]:
freguesias_lisboa = [
    "Ajuda",
    "Alcântara",
    "Alvalade",
    "Areeiro",
    "Arroios",
    "Avenidas Novas",
    "Beato",
    "Belém",
    "Benfica",
    "Campo de Ourique",
    "Campolide",
    "Carnide",
    "Estrela",
    "Lumiar",
    "Marvila",
    "Misericórdia",
    "Olivais",
    "Parque das Nações",
    "Penha de França",
    "Santa Clara",
    "Santa Maria Maior",
    "Santo António",
    "São Domingos de Benfica",
    "São Vicente",
]

In [30]:
df = df[df["municipality"].isin(freguesias_lisboa)]

In [31]:
set(df["municipality"].unique()) == set(freguesias_lisboa)

True

In [32]:
df["totalPrice"] = df["price"] + df["parkingSpacePrice"]
df["priceByArea"] = df["totalPrice"] / df["size"]

In [33]:
df.columns

Index(['thumbnail', 'externalReference', 'numPhotos', 'floor', 'price',
       'propertyType', 'operation', 'size', 'rooms', 'bathrooms', 'address',
       'province', 'municipality', 'district', 'country', 'latitude',
       'longitude', 'showAddress', 'url', 'description', 'hasVideo', 'status',
       'newDevelopment', 'hasLift', 'priceByArea', 'change', 'detailedType',
       'suggestedTexts', 'hasPlan', 'has3DTour', 'has360', 'hasStaging',
       'savedAd', 'notes', 'topNewDevelopment', 'topPlus', 'neighborhood',
       'formerPrice', 'priceChange', 'isPriceLowered', 'hasParkingSpace',
       'isParkingSpaceIncludedInPrice', 'parkingSpacePrice', 'totalPrice'],
      dtype='object')

In [34]:
df

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,topNewDevelopment,topPlus,neighborhood,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,totalPrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34030020,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124151169-121,31,1,1600.0,flat,rent,80.0,2,2,...,False,False,,1600.0,0.0,False,False,False,0,1600.0
34009485,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,8,1100.0,flat,rent,49.0,1,1,...,False,False,,1100.0,0.0,False,True,True,0,1100.0
34029522,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,C0506-00376,14,,1950.0,flat,rent,107.0,3,2,...,False,False,,1950.0,0.0,False,False,False,0,1950.0
34029392,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,124521035-226,20,2,1100.0,flat,rent,66.0,2,1,...,False,False,,1100.0,0.0,False,False,False,0,1100.0
34029386,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,APA_235 (2),15,-1,1250.0,flat,rent,98.0,1,1,...,False,False,,1250.0,0.0,False,False,False,0,1250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32738016,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,605822,25,,1600.0,studio,rent,65.0,0,1,...,False,False,,1600.0,0.0,False,False,False,0,1600.0
34000076,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,176721,11,,1200.0,studio,rent,24.0,0,1,...,False,False,,1200.0,0.0,False,False,False,0,1200.0
33147095,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,669722,4,,998.0,studio,rent,29.0,0,1,...,False,False,,1090.0,92.0,False,False,False,0,998.0
34000063,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,168627,50,,950.0,studio,rent,24.0,0,1,...,False,False,,950.0,0.0,False,False,False,0,950.0


In [35]:
# features = [
#     "floor",
#     "propertyType",
#     "size",
#     "rooms",
#     "bathrooms",
#     "address",
#     "province",
#     "municipality",
#     "district",
#     "country",
#     "latitude",
#     "longitude",
#     "distance",
#     "description",
#     "status",
#     "newDevelopment",
#     "hasLift",
#     "topNewDevelopment",
#     "topPlus",
#     "neighborhood",
#     "newDevelopmentFinished",
#     "snapshotDate",
#     "hasParkingSpace",
# ]
# target = ["price"]

In [36]:
# df = df[features + target]

In [37]:
df.to_csv(f"../data/idealista/cleaned/{operation}/{city}/{file_name}.csv", index=True)

# Outlier Removal

In [38]:
# lof_features = [
#     "floor",
#     "price",
#     "propertyType",
#     "size",
#     "rooms",
#     "bathrooms",
#     "municipality",
#     "district",
#     "latitude",
#     "longitude",
#     "status",
#     "newDevelopment",
#     "hasLift",
#     "neighborhood",
#     "newDevelopmentFinished",
#     "hasParkingSpace",
# ]
# target = ["price"]

In [39]:
# categorical_columns = df[lof_features + target].select_dtypes(include=["object"]).columns

In [40]:
# df_encoded = pd.get_dummies(df[lof_features + target], columns=categorical_columns, drop_first=True)

In [41]:
# df_encoded.head()

In [42]:
# boolean_columns = df_encoded.select_dtypes(include=["bool"]).columns

In [43]:
# df_encoded[boolean_columns] = df_encoded[boolean_columns].astype(int)

In [44]:
# df_encoded[boolean_columns].head()

In [45]:
# df_encoded.columns

In [46]:
# from sklearn.neighbors import LocalOutlierFactor

# # Initialize the Local Outlier Factor model
# lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

In [47]:
# outlier_predictions = lof.fit_predict(df_encoded)

In [48]:
# df['outlier'] = outlier_predictions

In [49]:
# df[df.outlier == -1][features + target].describe()

In [50]:
# df[df.outlier == 1][features + target].describe()

In [51]:
# df_lof = df[df.outlier == 1][features + target]

In [52]:
# df_lof.to_csv(f"../data/idealista/cleaned/{operation}/{city}/{file_name}-lof.csv", index=True)