In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import geopandas as gpd
import folium
from shapely.geometry import Point

In [2]:
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

In [3]:
def is_valid_date_format(date_string):
    """
    Tell whether a value parses as a calendar date written as year-month-day.

    Parameters:
    date_string: The value to check; expected to be a string.

    Returns:
    bool: True when ``date_string`` is accepted by ``strptime`` with the
    ``"%Y-%m-%d"`` format, False otherwise. Non-string values (for example
    ``None``) are reported as invalid instead of raising ``TypeError``.
    """
    try:
        datetime.datetime.strptime(date_string, "%Y-%m-%d")
    except (TypeError, ValueError):
        # ValueError: text that is not a real date in this format.
        # TypeError: strptime was handed something that is not a string.
        return False
    return True

In [4]:
city = "lisbon"
operation = "rent"
date_or_unioned = "unioned"

In [5]:
# Fail fast when the snapshot selector is neither the literal "unioned"
# nor a date string accepted by is_valid_date_format.
snapshot_selector_ok = date_or_unioned == "unioned" or is_valid_date_format(
    date_or_unioned
)
if not snapshot_selector_ok:
    raise ValueError(
        "date_or_unioned must be 'unioned' or a valid date in the format 'YYYY-MM-DD'"
    )

In [6]:
file_name = f"{date_or_unioned}-{city}-listings-for-{operation}-lof.csv"

In [7]:
# Base folder of the Idealista extracts (relative path).
read_path = "../data/idealista"

# Cleaned listings for the selected operation and city, indexed by listing code.
listings_csv = f"{read_path}/cleaned/{operation}/{city}/{file_name}"
df = pd.read_csv(listings_csv, index_col="propertyCode")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
numeric_columns = df.select_dtypes(include="number").columns
numeric_columns

In [None]:
categorical_columns = df.select_dtypes(include=["bool", 'object']).columns
categorical_columns

In [58]:
boolean_columns = df.select_dtypes(include="bool").columns

In [None]:
# Number of listings per municipality, most frequent first.
plt.figure(figsize=(20, 10))
municipality_count_plot = sns.countplot(
    x="municipality",
    data=df,
    order=df["municipality"].value_counts().index,
)

# Rotate the existing tick labels in place. This avoids recomputing the tick
# positions from unique(), which counts NaN while value_counts() (used for
# the bar order) does not, so the two could disagree.
municipality_count_plot.tick_params(axis="x", labelrotation=90)

plt.show()

In [None]:
municipality_counts = df["municipality"].value_counts()
municipality_counts

In [None]:
# Number of listings per property type, most frequent first.
plt.figure(figsize=(20, 10))
property_type_count_plot = sns.countplot(
    x="propertyType",
    data=df,
    order=df["propertyType"].value_counts().index,
)

# Rotate the existing tick labels in place. This avoids recomputing the tick
# positions from unique(), which counts NaN while value_counts() (used for
# the bar order) does not, so the two could disagree.
property_type_count_plot.tick_params(axis="x", labelrotation=90)

plt.show()

In [17]:
property_types = df["propertyType"].unique()

In [18]:
def describe_column(df, column):
    """
    Summarise one column of a DataFrame with pandas' ``describe``.

    Parameters:
    df (pd.DataFrame): Frame that holds the column.
    column (str): Label of the column to summarise.

    Returns:
    pd.Series: The output of ``describe()`` for that column.
    """
    selected = df[column]
    summary = selected.describe()
    return summary

In [None]:
describe_column(df, "price")

In [20]:
def catplot(df, x, y=None, kind="box"):
    """
    Draw a seaborn ``catplot`` of ``x`` and show it.

    Parameters:
    df (pd.DataFrame): The dataset.
    x (str): Column plotted on the x axis.
    y (str, optional): Grouping column for the y axis. When given, the groups
        are ordered by the ascending median of ``x`` within each group.
    kind (str): Plot kind forwarded to ``sns.catplot``.

    Returns:
    None
    """
    plot_kwargs = {"x": x, "data": df, "kind": kind}
    if y is not None:
        group_medians = df.groupby(y)[x].median()
        plot_kwargs["y"] = y
        plot_kwargs["order"] = group_medians.sort_values().index
    sns.catplot(**plot_kwargs)
    plt.show()

In [None]:
catplot(df, x="price")

In [None]:
catplot(df, x="price", y="propertyType")

In [None]:
catplot(df, x="price", y="municipality")

In [24]:
import math

def best_grid_shape(n):
    """
    Pick a near-square subplot grid with room for ``n`` panels.

    The row count is the integer part of ``sqrt(n)`` and the column count is
    the smallest value for which ``rows * cols >= n``. The result therefore
    always satisfies ``rows <= cols``.

    Parameters:
    n (int): Number of panels to place; must be at least 1.

    Returns:
    tuple[int, int]: ``(rows, cols)``.

    Raises:
    ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Among the candidates (i, ceil(n / i)) for i = 1..floor(sqrt(n)), the gap
    # cols - rows shrinks as i grows, so the largest i is the most square
    # layout and no search is needed.
    rows = int(math.sqrt(n))
    cols = math.ceil(n / rows)
    return rows, cols

In [25]:
def displot(df, x=None, y=None, kde=True, bins=30, base_size=(3, 3), hue=None):
    """
    Plot the distribution of ``x``, optionally as one histogram per ``y`` value.

    Without ``y`` a single ``sns.displot`` is drawn. With ``y`` a grid of
    histograms is drawn, one panel per distinct non-null value of ``y``, all
    sharing the x-axis limits of the full ``x`` column.

    Parameters:
    df (pd.DataFrame): The dataset.
    x (str): Column whose distribution is plotted. Required.
    y (str, optional): Column whose distinct values define the panels.
    kde (bool): Whether to overlay a kernel density estimate.
    bins (int): Number of histogram bins (applied per panel).
    base_size (tuple): ``(width, height)`` of a single panel.
    hue (str, optional): Column used for colour grouping.

    Raises:
    ValueError: If ``x`` is None, or if ``y`` is given but has no non-null
        values to build panels from.
    """
    if x is None:
        raise ValueError("x must be specified")

    if y is None:
        sns.displot(df, x=x, kde=kde, hue=hue, bins=bins)
        plt.show()
        return

    # NaN never compares equal to itself, so a NaN entry would only ever
    # produce an empty panel; drop it before laying out the grid.
    unique_y_values = df[y].dropna().unique()
    if len(unique_y_values) == 0:
        raise ValueError(f"column {y!r} has no non-null values to plot")

    # Grid layout and overall figure size derived from the panel count
    rows, cols = best_grid_shape(len(unique_y_values))
    figsize = (cols * base_size[0], rows * base_size[1])

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without .flatten().
    _, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    panels = axes.flatten()

    # Shared x-range so panels are visually comparable
    x_min = df[x].min()
    x_max = df[x].max()

    for ax, y_value in zip(panels, unique_y_values):
        # Histogram of x restricted to the rows of this y value
        sns.histplot(
            df[df[y] == y_value],
            x=x,
            ax=ax,
            kde=kde,
            bins=bins,
            hue=hue,
        )
        ax.set_title(y_value)
        ax.set_xlim(x_min, x_max)

    # The grid may have more cells than values; hide the empty leftovers.
    for unused_ax in panels[len(unique_y_values):]:
        unused_ax.set_axis_off()

    plt.tight_layout()
    plt.show()

In [None]:
displot(df, x="price")

In [None]:
displot(df, x="price", y="propertyType")

In [None]:
displot(df, x="price", y="municipality")

In [None]:
describe_column(df, "logPrice")

In [None]:
catplot(df, x="logPrice")

In [None]:
catplot(df, x="logPrice", y="propertyType")

In [None]:
catplot(df, x="logPrice", y="municipality")

In [None]:
displot(df, x="logPrice")

In [None]:
displot(df, x="logPrice", y="propertyType")

In [None]:
displot(df, x="logPrice", y="municipality")

In [None]:
catplot(df, x="pricePerSquareMeter")

In [None]:
catplot(df, x="pricePerSquareMeter", y="propertyType")

In [None]:
catplot(df, x="pricePerSquareMeter", y="municipality")

In [None]:
displot(df, x="pricePerSquareMeter")

In [None]:
displot(df, x="pricePerSquareMeter", y="propertyType")

In [None]:
displot(df, x="pricePerSquareMeter", y="municipality")

In [None]:
catplot(df, x="logPricePerSquareMeter")

In [None]:
catplot(df, x="logPricePerSquareMeter", y="propertyType")

In [None]:
catplot(df, x="logPricePerSquareMeter", y="municipality")

In [None]:
displot(df, x="logPricePerSquareMeter")

In [None]:
displot(df, x="logPricePerSquareMeter", y="propertyType")

In [None]:
displot(df, x="logPricePerSquareMeter", y="municipality")

In [49]:
# Column names selected as model inputs; later cells index the frame with
# this list (df[features]) and rewrite it when columns are encoded or dropped.
features = [
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "municipality",
    "latitude",
    "longitude",
    "status",
    "newDevelopment",
    "hasLift",
    "newDevelopmentFinished",
    "hasParkingSpace",
    "isParkingSpaceIncludedInPrice",
]
# Column names selected as prediction targets (raw and log variants of the
# price and of the price per square meter); used later as df[targets].
targets = [
    "price",
    "logPrice",
    "pricePerSquareMeter",
    "logPricePerSquareMeter",
]

In [None]:
# Correlation matrix of the feature columns; numeric_only=True leaves
# non-numeric columns out of the computation.
plt.figure(figsize=(12, 8))
sns.heatmap(
    df[features].corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmax=1,
    vmin=-1,
)
# End with plt.show() like the other plotting cells so the cell output is the
# figure rather than the Axes repr.
plt.show()

In [None]:
# Step 1: Median price per municipality (kept as a Series so it can be
# displayed on its own afterwards)
median_price_per_municipality = (
    df.groupby("municipality")["price"].median().rename("median_price_per_municipality")
)

# Step 2: Attach each listing's municipality median as a column. Mapping the
# Series keeps the 'propertyCode' index untouched and simply overwrites the
# column when the cell is re-run, whereas merging again would produce
# suffixed duplicate columns.
df = df.assign(
    median_price_per_municipality=df["municipality"].map(
        median_price_per_municipality
    )
)

# Step 3: Create GeoDataFrame for plotting
geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]
geo_df = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Step 4: Interactive map using Folium, centred on the listings themselves so
# the view follows whichever city was loaded
map_center = [df["latitude"].median(), df["longitude"].median()]
m = folium.Map(location=map_center, zoom_start=12.2)

# One colour per municipality
num_clusters = df["municipality"].nunique()
colormap = plt.get_cmap("tab20", num_clusters)  # Use a colormap with distinct colors
normalize = colors.Normalize(vmin=0, vmax=num_clusters - 1)

# Get unique municipalities and map them to integers
municipality_mapping = {
    name: idx for idx, name in enumerate(df["municipality"].unique())
}

# Map the 'municipality' column to integers
df["municipality_int"] = df["municipality"].map(municipality_mapping)

# Add one circle marker per listing, coloured by municipality. Zipping the
# needed columns avoids building a Series per row as iterrows() does.
marker_rows = zip(
    df["latitude"],
    df["longitude"],
    df["municipality_int"],
    df["median_price_per_municipality"],
)
for latitude, longitude, cluster, price in marker_rows:
    color = colors.rgb2hex(colormap(normalize(cluster)))  # Colour of this municipality

    folium.CircleMarker(
        location=(latitude, longitude),
        radius=5,
        color=color,
        fill=True,
        fill_opacity=0.6,
        popup=f"Cluster: {cluster}<br>Median Price: €{price:.2f}",
    ).add_to(m)

m

In [None]:
median_price_per_municipality

In [55]:
df = df.dropna(subset="status")

In [56]:
def encode_categorical(df, features, column):
    """
    One-hot encode ``column`` and swap it for its indicator columns.

    Parameters:
    df (pd.DataFrame): The dataset; it is not modified.
    features (list): Current feature names.
    column (str): Column to encode.

    Returns:
    tuple: ``(new_df, new_features)`` where ``new_df`` has ``column`` removed
    and the ``{column}_<value>`` indicator columns appended, and
    ``new_features`` is ``features`` without ``column`` followed by the
    indicator column names.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)

    # Same column layout as concatenating first and dropping afterwards:
    # the remaining original columns, then the indicators.
    encoded_df = pd.concat([df.drop(column, axis=1), indicator_columns], axis=1)

    updated_features = []
    for name in features:
        if name != column:
            updated_features.append(name)
    updated_features.extend(indicator_columns.columns)

    return encoded_df, updated_features

In [57]:
df, features = encode_categorical(df, features, "propertyType")
df, features = encode_categorical(df, features, "status")

In [59]:
df[boolean_columns] = df[boolean_columns].astype(int)

In [None]:
df[features + targets].info()

In [61]:
def drop_single_value_columns(df, features):
    """
    Drop the feature columns that hold exactly one distinct value.

    NaN counts as a value, so a column mixing one value with NaN is kept,
    while an all-NaN column is dropped.

    Parameters:
    df (pd.DataFrame): The dataset; it is not modified.
    features (list): Feature column names to inspect; it is not modified.

    Returns:
    tuple: ``(new_df, new_features)`` with the constant columns removed from
    both the frame and the feature list.
    """
    # Decide what to drop before changing anything: removing items from
    # `features` while looping over it would skip the element that follows
    # each removed one. dict.fromkeys de-duplicates while keeping order.
    constant_columns = [
        feature
        for feature in dict.fromkeys(features)
        if df[feature].nunique(dropna=False) == 1
    ]
    remaining_features = [
        feature for feature in features if feature not in constant_columns
    ]
    return df.drop(columns=constant_columns), remaining_features

In [62]:
df, features = drop_single_value_columns(df, features)

In [None]:
features

In [None]:
# Correlation matrix of the remaining features together with the targets;
# numeric_only=True leaves non-numeric columns out of the computation.
plt.figure(figsize=(12, 8))
sns.heatmap(
    df[features + targets].corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmax=1,
    vmin=-1,
)
# End with plt.show() like the other plotting cells so the cell output is the
# figure rather than the Axes repr.
plt.show()

In [None]:
sns.pairplot(df[features + targets], diag_kind="kde")

In [66]:
X = df[features]
y = df[targets]

In [None]:
X

In [111]:
# Persist the feature and target frames as CSV, keeping the index as the
# first column in each file.
# NOTE(review): the file names carry the snapshot selector and the operation
# but not `city`, so exports for different cities would share a path — confirm
# this is intended.
# NOTE(review): presumably ../data/training already exists; to_csv is not
# asked to create it here — verify.
X.to_csv(f"../data/training/{date_or_unioned}-features-{operation}.csv", index=True)
y.to_csv(f"../data/training/{date_or_unioned}-targets-{operation}.csv", index=True)