In [1]:
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import pandas as pd
import os

In [2]:
base_path = "../data/idealista/cleaned/rent/lisbon"
file_name = "unioned-lisbon-listings-for-rent"
file_extension = ".csv"
read_path = os.path.join(base_path, file_name + file_extension)

In [3]:
df = pd.read_csv(read_path, index_col='propertyCode')

In [4]:
def add_distance_to_city_center(df, latitude_col='latitude', longitude_col='longitude', crs="EPSG:4326", center_coords=(38.7071, -9.1355)):
    """
    Adds a column to the DataFrame with the distance (in meters) to the city center of City (Praça do Comércio).

    Args:
    - df (pd.DataFrame): DataFrame with latitude and longitude columns.
    - latitude_col (str): Name of the latitude column in df.
    - longitude_col (str): Name of the longitude column in df.

    Returns:
    - pd.DataFrame: Original DataFrame with an added 'distanceToCityCenter' column (in meters).
    """
    # Copy DataFrame to avoid modifying the original
    new_df = df.copy()

    # city center coordinates
    city_center = Point(center_coords[1], center_coords[0])  # (longitude, latitude)

    # Convert DataFrame to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        new_df,
        geometry=gpd.points_from_xy(new_df[longitude_col], new_df[latitude_col]),
        crs=crs  # WGS84 Latitude/Longitude
    )

    # Reproject to a metric CRS for accurate distance calculation (EPSG:3857 is suitable for distance in meters)
    gdf = gdf.to_crs("EPSG:3857")
    city_center_gdf = gpd.GeoSeries([city_center], crs="EPSG:4326").to_crs("EPSG:3857")[0]

    # Compute distance (in meters) to City city center
    gdf['distanceToCityCenter'] = gdf.geometry.distance(city_center_gdf)

    # Return DataFrame without geometry if not needed
    new_df['distanceToCityCenter'] = gdf['distanceToCityCenter']

    return new_df

In [5]:
df = add_distance_to_city_center(df)

In [6]:
def add_distance_and_nearest_poi_name(
    df,
    feature_name,
    geojson_path,
    latitude_col="latitude",
    longitude_col="longitude",
    poi_crs="EPSG:4326",
):
    """
    Adds a distance column and the name of the nearest POI to the DataFrame from a GeoJSON file.

    Args:
    - df (pd.DataFrame): DataFrame with latitude and longitude columns.
    - feature_name (str): Name of the feature to be added to the DataFrame.
    - geojson_path (str): Path to the GeoJSON file.
    - latitude_col (str): Name of the latitude column in df.
    - longitude_col (str): Name of the longitude column in df.
    - poi_crs (str): CRS of the POI data (default: EPSG:4326 for WGS84).

    Returns:
    - pd.DataFrame: Original DataFrame with added distance and nearest POI name columns.
    """
    # Copy DataFrame to avoid modifying the original
    new_df = df.copy()

    # Load GeoJSON as GeoDataFrame
    poi_gdf = gpd.read_file(geojson_path)
    poi_gdf = poi_gdf.to_crs(poi_crs)  # Ensure correct CRS

    # Convert DataFrame to GeoDataFrame
    listings_gdf = gpd.GeoDataFrame(
        new_df,
        geometry=gpd.points_from_xy(new_df[longitude_col], new_df[latitude_col]),
        crs=poi_crs,
    )

    # Reproject to metric CRS for distance calculation
    metric_crs = "EPSG:3857"
    poi_gdf = poi_gdf.to_crs(metric_crs)
    listings_gdf = listings_gdf.to_crs(metric_crs)

    # Compute nearest distances and POI names
    def compute_nearest_poi_info(listing, poi_gdf):
        # Find the nearest POI geometry
        nearest_point = nearest_points(listing.geometry, poi_gdf.union_all())[1]
        # Find the row in poi_gdf corresponding to the nearest geometry
        nearest_poi = poi_gdf.loc[poi_gdf.geometry == nearest_point]
        # Extract the distance and name of the nearest POI
        distance = listing.geometry.distance(nearest_point)
        # Define the possible keys for the POI name
        poi_keys = ["INF_NOME", "NOME", "DESIGNACAO", "name"]
        name = None  # Default value
        # Find the first non-empty POI name
        for poi_name in poi_keys:
            if poi_name in nearest_poi:
                name = (
                    nearest_poi[poi_name].values[0] if not nearest_poi.empty else None
                )
                break
        return distance, name

    # Apply the function to compute both distance and nearest POI name
    results = listings_gdf.apply(
        lambda row: compute_nearest_poi_info(row, poi_gdf), axis=1
    )

    # Extract distances and POI names into separate columns
    listings_gdf[f"distanceToNearest{feature_name}"] = results.apply(lambda x: x[0])
    listings_gdf[f"nearest{feature_name}Name"] = results.apply(lambda x: x[1])

    # Add distances and names back to the original DataFrame
    new_df[f"distanceToNearest{feature_name}"] = listings_gdf[
        f"distanceToNearest{feature_name}"
    ]
    new_df[f"nearest{feature_name}Name"] = listings_gdf[f"nearest{feature_name}Name"]

    return new_df

In [7]:
df = add_distance_and_nearest_poi_name(
    df,
    "Mall",
    "../data/geojson/centros_comerciais.geojson",
)

In [8]:
df = add_distance_and_nearest_poi_name(
    df,
    "Train",
    "../data/geojson/comboios.geojson",
)

In [9]:
df = add_distance_and_nearest_poi_name(
    df,
    "Fair",
    "../data/geojson/feiras.geojson",
)

In [10]:
df = add_distance_and_nearest_poi_name(
    df,
    "Park",
    "../data/geojson/jardins_parques_urbanos.geojson",
)

In [11]:
df = add_distance_and_nearest_poi_name(
    df,
    "Market",
    "../data/geojson/mercados.geojson",
)

In [12]:
df = add_distance_and_nearest_poi_name(
    df,
    "Metro",
    "../data/geojson/metro.geojson",
)

In [13]:
df = add_distance_and_nearest_poi_name(
    df,
    "Viewpoint",
    "../data/geojson/miradouros.geojson",
)

In [14]:
df = add_distance_and_nearest_poi_name(
    df,
    "Playground",
    "../data/geojson/parques_infantis.geojson",
)

In [15]:
df = add_distance_and_nearest_poi_name(
    df,
    "Bus",
    "../data/geojson/autocarros.geojson",
)

In [16]:
df = add_distance_and_nearest_poi_name(
    df,
    "Tram",
    "../data/geojson/eletricos.geojson",
)

In [17]:
df.head()

Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,...,distanceToNearestMetro,nearestMetroName,distanceToNearestViewpoint,nearestViewpointName,distanceToNearestPlayground,nearestPlaygroundName,distanceToNearestBus,nearestBusName,distanceToNearestTram,nearestTramName
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33829652,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-009130,33,1,2475.0,flat,rent,179.0,3,3,...,250.26182,Saldanha,1142.440298,Miradouro do Parque Eduardo VII,641.621174,Parque Infantil do Jardim Gomes Amorim,64.135685,Saldanha,1890.872576,Rua Maria Andrade
33596155,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-004534,33,bj,2150.0,flat,rent,170.0,3,3,...,198.676223,Saldanha,1217.962245,Miradouro do Parque Eduardo VII,562.731979,Parque Infantil do Jardim Gomes Amorim,52.045424,Saldanha,1909.529635,Rua Maria Andrade
33896876,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,Match_01.216,21,8,1950.0,flat,rent,135.0,2,2,...,582.276537,Olaias,1410.666684,Miradouro da Penha de França,557.606783,Parque Infantil da Rua Aquiles Machado,159.398088,Casal Vistoso,2363.949266,Rua do Forno do Tijolo
31196503,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,CF119,9,1,700.0,studio,rent,25.0,0,1,...,344.207875,Santa Apolónia,304.879765,Miradouro de Santa Clara,203.098162,Parque Infantil da Rua do Vigário,146.805163,Rua do Paraíso,408.7987,Calçada de São Vicente
33896540,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,1,3000.0,flat,rent,130.0,2,3,...,607.249371,Rato,1062.957225,Miradouro do Parque Eduardo VII,238.793997,Parque Infantil do Jardim Marcelino Mesquita,220.712179,Rua das Amoreiras,141.821774,Rua das Amoreiras


In [18]:
len(df.columns)

65

In [19]:
file_name += "-with-geodata"
save_path = os.path.join(base_path, file_name + file_extension)

df.to_csv(save_path, index_label="propertyCode")