In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Polygon, MultiPolygon, Point, box
import geopandas as gpd
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from tqdm import tqdm  # Optional: For progress tracking
import os

from shapely import wkt
import contextily as cx


In [2]:

import unicodedata

# Function to normalize the names
def normalize_municipality_name(name):
    """Return a lowercase, ASCII-only version of a municipality name.

    The name is decomposed with Unicode NFKD so accented letters split into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks (and any other non-ASCII character), e.g.
    ``'Évora' -> 'evora'``.

    Parameters:
      name : str or any
          The name to normalise. Non-string values (NaN floats produced by
          pandas for missing cells, ``None``, ``pd.NA``, ...) are returned
          unchanged so the function can be passed straight to ``Series.map``.

    Returns:
      str or the original non-string value.
    """
    # Anything that is not text cannot be normalised; pass it through instead
    # of letting unicodedata.normalize raise TypeError on missing values.
    if not isinstance(name, str):
        return name
    # Strip diacritics: decompose, then discard every non-ASCII code point.
    name_without_diacritics = (
        unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    )
    return name_without_diacritics.lower()



In [3]:
# Municipality polygons (shapefile) and the lookup table holding each
# municipality's code ('dicofre') and name ('designacao').
mun_gdf = gpd.read_file(r'../../../countries/portugal/datasets/municipality_data/municipalities-shapefile-2/concelhos.shp')
mun_metadata = pd.read_excel(
    r'../../../countries/portugal/datasets/municipality_data/concelhos-metadata.xlsx',
    dtype={'dicofre' : 'string'},  # read the code as text, not as a number
)[['dicofre', 'designacao']]

mun_gdf = (
    mun_gdf
    # Attach the metadata name to each polygon via the municipality code.
    .merge(mun_metadata, left_on='CCA_2', right_on='dicofre')
    # Replace the shapefile's NAME_2 with the metadata name.
    .drop(columns='NAME_2')
    .rename(columns={'designacao': 'NAME_2'})
    # Accent-free lowercase key used to identify the municipality.
    .assign(concelho=lambda gdf: gdf['NAME_2'].map(normalize_municipality_name))
    # Drop the rows whose NAME_1 is Azores or Madeira.
    .loc[lambda gdf: ~gdf['NAME_1'].isin(['Azores', 'Madeira'])]
)

# total_bounds is (minx, miny, maxx, maxy).
west, south, east, north = mun_gdf.total_bounds

Use the `mun_gdf` GeoDataFrame defined above for the Portuguese municipalities.

In [None]:
def aggregate_to_municipality(
    df,
    municipalities_gdf,
    grid_resolution=(0.25, 0.25),
    time_range=(2013, 2023),
    lat_col='latitude',
    lon_col='longitude',
    time_col='time',
    municipality_id_col='concelho',
    crs="EPSG:3763"
):
    """
    Aggregate gridded point data to municipalities using area-based weights.

    Every distinct (lat, lon) pair in ``df`` is treated as the centre of a
    rectangular grid cell. Each cell is intersected with the municipality
    polygons, and every numeric value of the cell is split between the
    municipalities it touches in proportion to the intersected area.

    Steps:
      1. Rename 'lat' / 'lon' columns to ``lat_col`` / ``lon_col`` if needed.
      2. Drop rows with missing coordinates and, when ``time_col`` is present,
         parse it to datetime and keep only years inside ``time_range``.
         The caller's DataFrame is never modified.
      3. Build one rectangle of size ``grid_resolution`` (in degrees,
         EPSG:4326) around each unique coordinate and reproject it to ``crs``.
      4. Intersect the rectangles with the municipalities (``gpd.overlay``) and
         compute, per rectangle, ``coverage_fraction`` = intersection area /
         total area of that rectangle covered by any municipality. The
         fractions of one rectangle therefore sum to 1.
      5. Multiply every numeric column by ``coverage_fraction`` and sum per
         municipality (and per timestamp when ``time_col`` is present; the
         work is done one calendar year at a time to bound memory use).

    Note: step 5 is a weighted *sum* with no normalisation by municipality
    area, i.e. each cell's value is apportioned among municipalities.
    NOTE(review): this suits extensive quantities (totals, counts); for
    intensive ones (e.g. temperatures) an area-weighted mean is probably
    what is wanted - confirm with the caller.

    Parameters:
      df : pd.DataFrame
          Input DataFrame containing the point-level data.
      municipalities_gdf : gpd.GeoDataFrame
          Municipality polygons. Reprojected to ``crs`` when its CRS differs.
      grid_resolution : tuple (lat_resolution, lon_resolution)
          Height and width, in degrees, of the cell built around each point.
      time_range : tuple (start_year, end_year)
          Inclusive range of years to keep (only used if ``time_col`` exists).
      lat_col, lon_col : str
          Column names for latitude and longitude.
      time_col : str
          Column name for the time variable.
      municipality_id_col : str
          Column in ``municipalities_gdf`` that identifies the municipality.
      crs : str
          Projected CRS used for the area computation. Default "EPSG:3763".

    Returns:
      pd.DataFrame
          One row per municipality (and per timestamp when ``time_col`` is
          present) with one ``<column>_wtd`` column for every numeric input
          column. Columns named ``lat_col``, ``lon_col``, ``time_col``,
          'polygon_id', 'year' and 'geometry' are never aggregated.
          An empty DataFrame is returned when no input row survives step 2.
    """
    import numpy as np
    import pandas as pd

    # 1) Rename lat/lon if needed (rename returns a new frame).
    if 'lat' in df.columns and lat_col != 'lat':
        df = df.rename(columns={'lat': lat_col})
    if 'lon' in df.columns and lon_col != 'lon':
        df = df.rename(columns={'lon': lon_col})

    has_time = time_col in df.columns

    # 2) Row filter. Cells cannot be built from missing coordinates, and
    #    unparseable timestamps (NaT) fail the year-range test.
    keep = df[lat_col].notna() & df[lon_col].notna()
    if has_time:
        timestamps = pd.to_datetime(df[time_col], errors='coerce')
        keep &= timestamps.dt.year.between(time_range[0], time_range[1])
    df = df.loc[keep]
    if has_time:
        # Write the parsed timestamps into a private copy, never into the
        # frame the caller handed in.
        df = df.copy()
        df[time_col] = timestamps.loc[keep]

    if df.empty:
        # Nothing to aggregate; skip the geospatial work entirely.
        return pd.DataFrame()

    # 3) One grid cell per distinct coordinate pair.
    unique_coords = df[[lat_col, lon_col]].drop_duplicates().reset_index(drop=True)
    unique_coords['polygon_id'] = np.arange(len(unique_coords))

    # 4) Share of each cell that belongs to each municipality.
    coverage_table = _build_coverage_table(
        unique_coords, municipalities_gdf, grid_resolution,
        lat_col, lon_col, municipality_id_col, crs,
    )

    # Tag every data row with the id of its grid cell.
    df_merged = df.merge(unique_coords, on=[lat_col, lon_col], how='left')

    exclude_cols = [lat_col, lon_col, time_col, 'polygon_id', 'year', 'geometry']
    numeric_cols = (
        df_merged.select_dtypes(include=[np.number]).columns.difference(exclude_cols)
    )

    # 5) Weighted sums. The branch is chosen by the presence of the time
    #    column itself, so an unrelated 'year' column in the input cannot send
    #    time-less data down the per-timestamp path.
    if not has_time:
        return _area_weighted_sum(
            df_merged, coverage_table, numeric_cols, [municipality_id_col]
        )

    # Process one calendar year at a time so the row x municipality expansion
    # produced by the coverage merge stays small. groupby iterates the years
    # in ascending order.
    year_of_row = df_merged[time_col].dt.year.to_numpy()
    yearly_results = [
        _area_weighted_sum(
            year_chunk, coverage_table, numeric_cols, [municipality_id_col, time_col]
        )
        for _, year_chunk in df_merged.groupby(year_of_row)
    ]
    return pd.concat(yearly_results, ignore_index=True)


def _build_coverage_table(unique_coords, municipalities_gdf, grid_resolution,
                          lat_col, lon_col, municipality_id_col, crs):
    """Return (polygon_id, municipality, coverage_fraction) rows for every cell/municipality overlap."""
    # Geospatial dependencies are imported here so that the entry point can
    # return early (empty input) without needing them.
    import geopandas as gpd
    from shapely.geometry import box
    from tqdm import tqdm

    lat_res, lon_res = grid_resolution
    half_lat = lat_res / 2.0
    half_lon = lon_res / 2.0

    # Rectangle centred on each coordinate, built in geographic degrees.
    cells = [
        box(lon - half_lon, lat - half_lat, lon + half_lon, lat + half_lat)
        for lat, lon in tqdm(
            zip(unique_coords[lat_col], unique_coords[lon_col]),
            total=len(unique_coords),
            desc="Constructing grid polygons",
        )
    ]
    grid_gdf = gpd.GeoDataFrame(
        unique_coords[['polygon_id']].copy(), geometry=cells, crs="EPSG:4326"
    ).to_crs(crs)

    if municipalities_gdf.crs != crs:
        municipalities_gdf = municipalities_gdf.to_crs(crs)
    # Only the id and the geometry are needed; dropping the other attributes
    # avoids column-name clashes in the overlay result.
    municipalities_gdf = municipalities_gdf[
        [municipality_id_col, municipalities_gdf.geometry.name]
    ]

    overlay_gdf = gpd.overlay(municipalities_gdf, grid_gdf, how='intersection')
    overlay_gdf['cell_area'] = overlay_gdf.geometry.area
    # Denominator: area of the cell covered by *any* municipality, so parts of
    # a cell outside all municipalities do not dilute the fractions.
    covered_area = overlay_gdf.groupby('polygon_id')['cell_area'].transform('sum')
    overlay_gdf['coverage_fraction'] = overlay_gdf['cell_area'] / covered_area
    return overlay_gdf[['polygon_id', municipality_id_col, 'coverage_fraction']].copy()


def _area_weighted_sum(frame, coverage_table, numeric_cols, group_keys):
    """Multiply numeric_cols by coverage_fraction and sum them per group_keys, naming outputs '<col>_wtd'."""
    import pandas as pd

    # Left merge + dropna keeps only rows whose cell overlaps a municipality.
    covered = (
        frame.merge(coverage_table, on='polygon_id', how='left')
        .dropna(subset=['coverage_fraction'])
    )
    # Weighted values are built in a separate frame so an input column that
    # already ends in '_wtd' is never overwritten before it is read.
    weighted = (
        covered[list(numeric_cols)]
        .mul(covered['coverage_fraction'], axis=0)
        .add_suffix('_wtd')
    )
    return (
        pd.concat([covered[group_keys], weighted], axis=1)
        .groupby(group_keys)[list(weighted.columns)]
        .sum()
        .reset_index()
    )
