In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import geopandas as gpd
import folium
from shapely.geometry import Point

In [2]:
# Plot styling for the whole notebook: seaborn's "ticks" style with the
# top and right axes spines switched off.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

In [3]:
def remove_outliers(df, variables, thresholds=None, groupby=None):
    """
    Removes outlier rows for the specified variables using IQR or domain-based
    thresholds, optionally computing the bounds separately within each group.

    Parameters:
    df (pd.DataFrame): The dataset. It is never modified.
    variables (list): List of variables to check for outliers.
    thresholds (dict, optional): Custom lower and upper bounds for specific variables.
                                 Format: {'variable': {'lower': value, 'upper': value}}
    groupby (str, optional): Column name to group by before removing outliers.

    Returns:
    pd.DataFrame: Dataset with outliers removed. All columns (including the
                  groupby column) keep their original position. When grouping,
                  rows come back ordered by group, and rows whose group key is
                  missing are dropped (pandas' default groupby behaviour).
    """
    if not groupby:
        return _remove_outliers_in_group(df, variables, thresholds)

    # Filter every group on its own and stack the survivors. Iterating over the
    # groups keeps the groupby column in each piece, so nothing has to be
    # re-attached afterwards. The previous index-on-index merge multiplied rows
    # whenever the index contained duplicate labels.
    filtered_groups = [
        _remove_outliers_in_group(group, variables, thresholds)
        for _, group in df.groupby(groupby)
    ]

    if not filtered_groups:
        # Nothing to concatenate (empty input or only missing group keys).
        return df.iloc[0:0].copy()

    return pd.concat(filtered_groups)


def _remove_outliers_in_group(df, variables, thresholds=None):
    """
    Helper function to remove outliers within a DataFrame or group.
    """
    df_filtered = df.copy()

    for variable in variables:
        # Check for custom thresholds
        if thresholds and variable in thresholds:
            lower = thresholds[variable].get("lower", None)
            upper = thresholds[variable].get("upper", None)
        else:
            # Default to IQR if no custom threshold provided
            Q1 = df_filtered[variable].quantile(0.25)
            Q3 = df_filtered[variable].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

        # Remove outliers
        df_filtered = df_filtered[
            (df_filtered[variable] >= lower) & (df_filtered[variable] <= upper)
        ]

    return df_filtered

In [4]:
def is_valid_date_format(date_string):
    """
    Tell whether `date_string` is a real calendar date written exactly as
    zero-padded 'YYYY-MM-DD'.

    Parameters:
    date_string (str): The candidate date.

    Returns:
    bool: True for e.g. '2024-01-05'; False for anything else, including
          non-string input and non-padded dates such as '2024-1-5'.
    """
    try:
        parsed = datetime.datetime.strptime(date_string, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: not a string at all; ValueError: does not parse as a date.
        return False

    # strptime tolerates missing zero padding; the value is later embedded in a
    # file name, so require the canonical spelling by round-tripping it.
    return parsed.date().isoformat() == date_string

In [5]:
# Run parameters: together they select which cleaned listings CSV is loaded
# (they are interpolated into the file name and directory path further down).
city = "lisbon"
operation = "rent"
# Either the literal "unioned" or a date written as YYYY-MM-DD; any other
# value is rejected by the validation cell that follows.
date_or_unioned = "unioned"

In [6]:
# Fail fast on a bad run parameter: anything other than the literal "unioned"
# has to be a valid YYYY-MM-DD date.
if date_or_unioned != "unioned" and not is_valid_date_format(date_or_unioned):
    raise ValueError(
        "date_or_unioned must be 'unioned' or a valid date in the format 'YYYY-MM-DD'"
    )

In [7]:
file_name = f"{date_or_unioned}-{city}-listings-for-{operation}_lof.csv"

In [8]:
# Root directory of the idealista data, relative to the notebook.
read_path = "../data/idealista"

# Cleaned listings for the selected operation and city, indexed by propertyCode.
listings_csv_path = f"{read_path}/cleaned/{operation}/{city}/{file_name}"
df = pd.read_csv(listings_csv_path, index_col="propertyCode")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.info()

In [None]:
numeric_columns = df.select_dtypes(include="number").columns
numeric_columns

In [None]:
categorical_columns = df.select_dtypes(include=["bool", 'object']).columns
categorical_columns

In [None]:
df.head()

In [None]:
# Number of listings per municipality, most frequent first.
plt.figure(figsize=(20, 10))
municipality_count_plot = sns.countplot(
    x="municipality",
    data=df,
    order=df["municipality"].value_counts().index,
)

# Rotate the category labels through tick_params: it leaves the tick positions
# chosen by seaborn untouched, so the ticks always match the plotted bars
# (unique() counts a missing value as a category, value_counts() does not).
municipality_count_plot.tick_params(axis="x", labelrotation=90)

plt.show()

In [None]:
municipality_counts = df["municipality"].value_counts()
municipality_counts

In [19]:
# Minimum number of listings a municipality needs to be kept in the analysis.
MIN_LISTINGS_PER_MUNICIPALITY = 50

valid_municipalities = municipality_counts[
    municipality_counts >= MIN_LISTINGS_PER_MUNICIPALITY
].index

# .copy() makes the filtered frame independent of the original one: later cells
# add columns to `df`, which would otherwise be assignments on a slice
# (SettingWithCopyWarning and, depending on pandas version, ambiguous results).
df = df[df["municipality"].isin(valid_municipalities)].copy()

In [None]:
df["municipality"].value_counts()

In [None]:
# Number of listings per property type, most frequent first.
plt.figure(figsize=(20, 10))
property_type_count_plot = sns.countplot(
    x="propertyType",
    data=df,
    order=df["propertyType"].value_counts().index,
)

# Rotate the category labels through tick_params: it leaves the tick positions
# chosen by seaborn untouched, so the ticks always match the plotted bars
# (unique() counts a missing value as a category, value_counts() does not).
property_type_count_plot.tick_params(axis="x", labelrotation=90)

plt.show()

In [22]:
property_types = df["propertyType"].unique()

In [23]:
variables_with_outliers = ["price"]

# Remove outliers for all specified variables
# (no thresholds and no groupby are passed, so the default 1.5 * IQR rule is
# computed over the whole frame; `df` itself is left untouched).
df_no_price_outliers = remove_outliers(df, variables=variables_with_outliers)

In [24]:
def describe_column(df, column):
    """
    Summarise one column of a DataFrame with pandas' `describe`.

    Parameters:
    df (pd.DataFrame): The dataset.
    column (str): Name of the column to summarise.

    Returns:
    pd.Series: The descriptive statistics produced by `describe()` for that column.
    """
    selected = df[column]
    return selected.describe()

In [None]:
describe_column(df, "price")

In [None]:
describe_column(df_no_price_outliers, "price")

In [27]:
def catplot(df, x, y=None, kind="box"):
    """
    Draw a seaborn categorical plot of `x`, optionally split by `y`, and show it.

    Parameters:
    df (pd.DataFrame): The dataset.
    x (str): Column plotted on the x axis.
    y (str, optional): Grouping column. When given, its categories are ordered
                       by the median of `x` within each category (ascending).
    kind (str): Plot kind forwarded to `sns.catplot`.
    """
    plot_kwargs = {"x": x, "data": df, "kind": kind}

    if y is not None:
        # Order the categories from lowest to highest median of x.
        medians_by_category = df.groupby(y)[x].median().sort_values()
        plot_kwargs["y"] = y
        plot_kwargs["order"] = medians_by_category.index

    sns.catplot(**plot_kwargs)
    plt.show()

In [None]:
catplot(df, x="price")

In [None]:
catplot(df_no_price_outliers, x="price")

In [None]:
catplot(df, x="price", y="propertyType")

In [None]:
catplot(df_no_price_outliers, x="price", y="propertyType")

In [None]:
catplot(df, x="price", y="municipality")

In [None]:
catplot(df_no_price_outliers, x="price", y="municipality")

In [34]:
import math

def best_grid_shape(n):
    """
    Return the most square-like (rows, cols) grid that can hold n panels.

    The result always satisfies rows <= cols and rows * cols >= n, with
    rows = floor(sqrt(n)) and cols = ceil(n / rows). Examples: 6 -> (2, 3),
    7 -> (2, 4), 24 -> (4, 6).

    Parameters:
    n (int): Number of panels to place; must be at least 1.

    Returns:
    tuple: (rows, cols).

    Raises:
    ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive number of panels, got {n!r}")

    # Among the candidates (i, ceil(n / i)) for i = 1..floor(sqrt(n)), the gap
    # between the two sides shrinks as i grows, so the largest i is always the
    # most square layout and no search over the candidates is needed.
    rows = int(math.sqrt(n))
    cols = math.ceil(n / rows)

    return rows, cols

In [35]:
def displot(df, x=None, y=None, kde=True, bins=30, base_size=(3, 3)):
    """
    Show the distribution of `x`, either overall or as one histogram per
    category of `y` laid out on a grid.

    Parameters:
    df (pd.DataFrame): The dataset.
    x (str): Column whose distribution is plotted. Required.
    y (str, optional): Categorical column; one panel is drawn per distinct
                       non-missing value, all sharing the same x range.
    kde (bool): Whether to overlay a kernel density estimate.
    bins (int): Number of histogram bins for the per-category panels.
                NOTE(review): not forwarded to the single-plot branch, which
                keeps seaborn's automatic binning; confirm that is intended.
    base_size (tuple): (width, height) in inches of a single panel.

    Raises:
    ValueError: If x is not given.
    """
    if x is None:
        raise ValueError("x must be specified")

    if y is None:
        sns.displot(df, x=x, kde=kde)
        plt.show()
        return

    # Missing categories are skipped: `df[y] == NaN` never matches any row, so
    # they would only produce an empty panel.
    unique_y_values = df[y].dropna().unique()

    # Get the best grid shape for the number of unique y values
    rows, cols = best_grid_shape(len(unique_y_values))

    # Compute figure size dynamically
    figsize = (cols * base_size[0], rows * base_size[1])

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a single Axes without .flatten().
    _, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    panels = axes.flatten()

    # Common x range so the panels are directly comparable.
    x_min = df[x].min()
    x_max = df[x].max()

    for ax, y_value in zip(panels, unique_y_values):
        sns.histplot(
            df[df[y] == y_value],
            x=x,
            ax=ax,
            kde=kde,
            bins=bins,
        )
        ax.set_title(y_value)
        ax.set_xlim(x_min, x_max)

    # Hide the panels left over when the grid has more cells than categories.
    for ax in panels[len(unique_y_values):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
displot(df, x="price")

In [None]:
displot(df_no_price_outliers, x="price")

In [None]:
displot(df, x="price", y="propertyType")

In [None]:
displot(df_no_price_outliers, x="price", y="propertyType")

In [None]:
displot(df, x="price", y="municipality")

In [None]:
displot(df_no_price_outliers, x="price", y="municipality")

In [42]:
df["logPrice"] = np.log(df["price"])

In [None]:
describe_column(df, "logPrice")

In [44]:
variables_with_outliers = ["logPrice"]

# Remove outliers for all specified variables
df_no_log_price_outliers = remove_outliers(df, variables=variables_with_outliers)

In [None]:
df.info()

In [None]:
df_no_log_price_outliers.info()

In [None]:
catplot(df, x="logPrice")

In [None]:
catplot(df_no_log_price_outliers, x="logPrice")

In [None]:
catplot(df, x="logPrice", y="propertyType")

In [None]:
catplot(df_no_log_price_outliers, x="logPrice", y="propertyType")

In [None]:
# Box plot of logPrice per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="logPrice",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["logPrice"].median().sort_values().index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
sns.displot(df, x="logPrice", kde=True)

In [None]:
# Notebook-level x-range bounds, kept under their existing names so that any
# cell relying on them continues to work.
min_log_price = df["logPrice"].min()
max_log_price = df["logPrice"].max()

# One logPrice histogram per property type. The displot helper sizes the grid
# from the number of property types actually present (a hand-written 2 x 3 grid
# silently drops every type after the sixth) and applies a shared x range.
displot(df, x="logPrice", y="propertyType")

In [None]:
# One logPrice histogram per municipality. The list of municipalities is taken
# from the data by the displot helper instead of from `freguesias_lisboa`, a
# name that no cell of this notebook defines (NameError on a fresh kernel).
displot(df, x="logPrice", y="municipality")

In [59]:
df["pricePerSquareMeter"] = df["price"] / df["size"]

In [None]:
sns.boxplot(x=df["pricePerSquareMeter"])

In [None]:
sns.catplot(
    x="pricePerSquareMeter",
    y="propertyType",
    data=df,
    kind="box",
    order=df.groupby("propertyType")["pricePerSquareMeter"].median().sort_values().index,
)

In [None]:
# Box plot of pricePerSquareMeter per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="pricePerSquareMeter",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["pricePerSquareMeter"]
    .median()
    .sort_values()
    .index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
sns.displot(df, x="pricePerSquareMeter", kde=True)

In [None]:
# Notebook-level x-range bounds, kept under their existing names so that any
# cell relying on them continues to work.
min_price_per_square_meter = df["pricePerSquareMeter"].min()
max_price_per_square_meter = df["pricePerSquareMeter"].max()

# One pricePerSquareMeter histogram per property type. The displot helper sizes
# the grid from the number of property types actually present (a hand-written
# 2 x 3 grid silently drops every type after the sixth) and applies a shared
# x range.
displot(df, x="pricePerSquareMeter", y="propertyType")

In [None]:
# One pricePerSquareMeter histogram per municipality. The list of municipalities
# is taken from the data by the displot helper instead of from
# `freguesias_lisboa`, a name that no cell of this notebook defines
# (NameError on a fresh kernel).
displot(df, x="pricePerSquareMeter", y="municipality")

In [66]:
df["logPricePerSquareMeter"] = np.log(df["pricePerSquareMeter"])

In [None]:
sns.boxplot(x=df["logPricePerSquareMeter"])

In [None]:
sns.catplot(
    x="logPricePerSquareMeter",
    y="propertyType",
    data=df,
    kind="box",
    order=df.groupby("propertyType")["logPricePerSquareMeter"].median().sort_values().index,
)

In [None]:
# Box plot of logPricePerSquareMeter per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="logPricePerSquareMeter",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["logPricePerSquareMeter"]
    .median()
    .sort_values()
    .index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
sns.displot(df, x="logPricePerSquareMeter", kde=True)

In [None]:
# Notebook-level x-range bounds, kept under their existing names so that any
# cell relying on them continues to work.
min_log_price_per_square_meter = df["logPricePerSquareMeter"].min()
max_log_price_per_square_meter = df["logPricePerSquareMeter"].max()

# One logPricePerSquareMeter histogram per property type. The displot helper
# sizes the grid from the number of property types actually present (a
# hand-written 2 x 3 grid silently drops every type after the sixth) and
# applies a shared x range.
displot(df, x="logPricePerSquareMeter", y="propertyType")

In [None]:
# One logPricePerSquareMeter histogram per municipality. The list of
# municipalities is taken from the data by the displot helper instead of from
# `freguesias_lisboa`, a name that no cell of this notebook defines
# (NameError on a fresh kernel).
displot(df, x="logPricePerSquareMeter", y="municipality")

In [None]:
# Box plot of pricePerSquareMeter per district inside Santa Maria Maior,
# ordered by median. The subset is computed once and reused for both the data
# and the ordering.
santa_maria_maior = df[df["municipality"] == "Santa Maria Maior"]

# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="pricePerSquareMeter",
    y="district",
    data=santa_maria_maior,
    kind="box",
    order=santa_maria_maior.groupby("district")["pricePerSquareMeter"]
    .median()
    .sort_values()
    .index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
districts = df[df.municipality == "Santa Maria Maior"].district.unique()
len(districts)

In [None]:
# `districts` was taken from Santa Maria Maior, so the histograms are limited to
# that municipality too; otherwise a district name that also exists in another
# municipality would mix listings from both.
santa_maria_maior = df[df["municipality"] == "Santa Maria Maior"]

# Size the grid from the number of districts instead of a fixed 2 x 3, which
# silently drops every district after the sixth. max(..., 1) keeps an (empty)
# 1 x 1 figure when there is no district at all.
rows, cols = best_grid_shape(max(len(districts), 1))
# squeeze=False keeps `axes` 2-D even for a single panel.
fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 3 * rows), squeeze=False)
panels = axes.flatten()

for ax, district in zip(panels, districts):
    # Histogram of price per square metre in this district, split by property type.
    sns.histplot(
        santa_maria_maior[santa_maria_maior["district"] == district],
        x="pricePerSquareMeter",
        kde=True,
        ax=ax,
        bins=20,
        hue="propertyType",
    )
    ax.set_title(district)

# Hide the panels left over when the grid has more cells than districts.
for ax in panels[len(districts):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of logPricePerSquareMeter per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="logPricePerSquareMeter",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["logPricePerSquareMeter"]
    .median()
    .sort_values()
    .index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
df.columns

In [78]:
variables_with_outliers = ["size", "rooms", "bathrooms"]

# Remove outliers for all specified variables
# (no thresholds are passed, so each variable is filtered in turn with the
# default 1.5 * IQR rule).
# NOTE(review): `df_cleaned` is not referenced by any later cell of this
# notebook; the following cells keep working on `df`. Confirm whether the
# cleaned frame was meant to be used downstream.
df_cleaned = remove_outliers(df, variables=variables_with_outliers)

In [79]:
# Descriptive statistics of price per square metre for every municipality;
# municipalities with any missing statistic are dropped by the trailing dropna().
price_per_sqm_by_municipality = df.loc[:, ["municipality", "pricePerSquareMeter"]]
stats = price_per_sqm_by_municipality.groupby(["municipality"]).describe().dropna()

In [None]:
stats.sort_values(("pricePerSquareMeter", "count"), ascending=False)

In [81]:
# Candidate explanatory columns used for the exploratory correlation heatmap.
features = [
    "floor",
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "address",
    "province",
    "municipality",
    "district",
    "country",
    "latitude",
    "longitude",
    "showAddress",
    "url",
    "distance",
    "description",
    "status",
    "newDevelopment",
    "hasLift",
    "topNewDevelopment",
    "topPlus",
    "neighborhood",
    "newDevelopmentFinished",
    "snapshotDate",
    "hasParkingSpace",
    "isParkingSpaceIncludedInPrice",
]
# Price-related columns. Each name appears once: selecting columns with a list
# that repeats a name would return that column twice.
targets = [
    "price",
    "logPrice",
    "pricePerSquareMeter",
    "priceByArea",
    "logPricePerSquareMeter",
    "formerPrice",
    "priceChange",
    "parkingSpacePrice",
]

In [None]:
# Pairwise correlations between the numeric feature columns, drawn on a
# diverging palette centred on zero and fixed to the [-1, 1] range.
fig, ax = plt.subplots(figsize=(12, 8))
feature_correlations = df[features].corr(numeric_only=True)
sns.heatmap(
    feature_correlations,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmin=-1,
    vmax=1,
)

In [None]:
# Box plot of logPrice per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="logPrice",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["logPrice"].median().sort_values().index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
# Box plot of pricePerSquareMeter per municipality, ordered by median.
# sns.catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The intended
# 20 x 10 inch size is requested through height (10) and aspect (width = 2 * height).
sns.catplot(
    x="pricePerSquareMeter",
    y="municipality",
    data=df,
    kind="box",
    order=df.groupby("municipality")["pricePerSquareMeter"]
    .median()
    .sort_values()
    .index,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
df["municipality"].value_counts()

In [None]:
df

In [None]:
# Step 1: Median listing price per municipality (kept as a named Series so it
# can be inspected on its own).
median_price_per_municipality = (
    df.groupby("municipality")["price"].median().rename("median_price_per_municipality")
)

# Step 2: Attach the median to every listing. Mapping through the Series keeps
# the propertyCode index in place and simply overwrites the column when the
# cell is run again; a merge would add suffixed duplicate columns on a re-run
# and break the lookup further down.
df["median_price_per_municipality"] = df["municipality"].map(
    median_price_per_municipality
)

# Step 3: Create GeoDataFrame for plotting
geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]
geo_df = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Step 4: Interactive map using Folium
m = folium.Map(location=[38.740, -9.150], zoom_start=12.2)  # Centered on Lisbon

# Create a colormap with one entry per municipality
num_clusters = df["municipality"].nunique()
colormap = plt.get_cmap("tab20", num_clusters)  # Use a colormap with distinct colors
normalize = colors.Normalize(vmin=0, vmax=num_clusters - 1)

# Get unique municipalities and map them to integers
municipality_mapping = {
    name: idx for idx, name in enumerate(df["municipality"].unique())
}

# Map the 'municipality' column to integers
df["municipality_int"] = df["municipality"].map(municipality_mapping)

# The colour depends only on the municipality, so compute it once per
# municipality instead of once per listing.
cluster_colors = {
    idx: colors.rgb2hex(colormap(normalize(idx)))
    for idx in municipality_mapping.values()
}

# Add one marker per listing, coloured by municipality
marker_columns = [
    "latitude",
    "longitude",
    "municipality_int",
    "median_price_per_municipality",
]
for latitude, longitude, cluster, price in df[marker_columns].itertuples(
    index=False, name=None
):
    folium.CircleMarker(
        location=(latitude, longitude),
        radius=5,
        color=cluster_colors[cluster],
        fill=True,
        fill_opacity=0.6,
        popup=f"Cluster: {cluster}<br>Median Price: €{price:.2f}",
    ).add_to(m)

m

In [None]:
median_price_per_municipality

In [None]:
df

In [None]:
df["municipality"].value_counts()

In [91]:
# Smoothed encoding of municipality: each municipality's median logPrice is
# pulled towards the global median, with `alpha` acting as the weight (in
# number of listings) given to the global value.
global_median = df["logPrice"].median()

# Smoothing factor
alpha = 10

# Median and number of listings per municipality
agg = df.groupby("municipality")["logPrice"].agg(
    municipality_median="median",
    sample_count="count",
)

# Weighted average of the municipality median and the global median
weighted_total = agg["sample_count"] * agg["municipality_median"] + alpha * global_median
effective_count = agg["sample_count"] + alpha
agg["smoothed_median"] = weighted_total / effective_count

# Map smoothed median back to the main dataframe
df["municipality_encoded"] = df["municipality"].map(agg["smoothed_median"])

In [None]:
len(df["municipality"].unique()) == len(df["municipality_encoded"].unique())

In [None]:
agg.sort_values("smoothed_median")

In [None]:
sns.catplot(
    x="logPrice",
    y="municipality",
    data=df,
    kind="box",
    order=agg.sort_values("municipality_median").index,
)

In [None]:
df.columns

In [96]:
# Columns exported as model inputs. This reassigns `features`, replacing the
# broader exploratory list defined earlier in the notebook.
features = [
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "municipality",
    "latitude",
    "longitude",
    "status",
    "newDevelopment",
    "hasLift",
    "newDevelopmentFinished",
    "hasParkingSpace",
    "isParkingSpaceIncludedInPrice",
]
# Columns exported as targets: the listing price and its natural logarithm
# (logPrice is computed with np.log earlier in the notebook).
targets = ["price", "logPrice"]

In [None]:
df[features].info()

In [None]:
df.status.value_counts()

In [99]:
df = df.dropna(subset="status")

In [100]:
def encode_categorical(df, features, column):
    """
    One-hot encode `column` and keep the feature list in sync.

    Parameters:
    df (pd.DataFrame): The dataset. It is not modified.
    features (list): Current feature names. It is not modified.
    column (str): Name of the categorical column to encode.

    Returns:
    tuple: (DataFrame with `column` replaced by its dummy columns, appended at
            the end and named '<column>_<value>'; new feature list with
            `column` removed and the dummy column names appended).
    """
    dummies = pd.get_dummies(df[column], prefix=column)

    # Original column out, indicator columns in (appended on the right).
    encoded = pd.concat([df.drop(columns=column), dummies], axis=1)

    updated_features = [name for name in features if name != column]
    updated_features.extend(dummies.columns)

    return encoded, updated_features

In [101]:
df, features = encode_categorical(df, features, "propertyType")
df, features = encode_categorical(df, features, "status")

In [102]:
# Store every boolean column (flags and one-hot indicators) as 0/1 integers.
bool_columns = df.select_dtypes(include="bool").columns
df[bool_columns] = df[bool_columns].astype(int)

In [None]:
df[features + targets].info()

In [104]:
def drop_single_value_columns(df, features):
    """
    Drop every feature column that holds a single distinct value.

    A missing value counts as a value, so an all-NaN column is dropped while a
    column mixing NaN with one other value is kept.

    Parameters:
    df (pd.DataFrame): The dataset. It is not modified.
    features (list): Feature names to check. It is not modified.

    Returns:
    tuple: (DataFrame without the constant feature columns,
            new feature list without those names, original order preserved).
    """
    # Decide first, then drop: removing items from `features` while looping
    # over it skips the element that follows every removed one, so adjacent
    # constant columns were not all dropped.
    constant_features = [
        feature for feature in features if df[feature].nunique(dropna=False) == 1
    ]
    constant_set = set(constant_features)
    kept_features = [feature for feature in features if feature not in constant_set]

    return df.drop(columns=list(constant_set)), kept_features

In [105]:
df, features = drop_single_value_columns(df, features)

In [None]:
features

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(
    df[features + targets].corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmax=1,
    vmin=-1,
)

In [None]:
sns.pairplot(df[features + targets], diag_kind="kde")

In [109]:
X = df[features]
y = df[targets]

In [None]:
X

In [111]:
# Write features and targets to two separate CSV files under ../data/training,
# named after the snapshot selector and the operation. index=True keeps the
# index in both files.
# NOTE(review): the target directory is assumed to exist already; to_csv is
# called without creating it first. Confirm.
X.to_csv(f"../data/training/{date_or_unioned}-features-{operation}.csv", index=True)
y.to_csv(f"../data/training/{date_or_unioned}-targets-{operation}.csv", index=True)