# Aggregates DGGS zone data

Assumes that the zone data was prepared beforehand (see [`data_preparation.ipynb`](data_preparation.ipynb)).

Columns from the different zone data outputs (one per variable) are merged together, matching corresponding zones on the specified ID columns; the statistics columns are prefixed with the name of the variable they come from.
Considers temporal components for a *datetime-aware* DGGRS (i.e.: a sidecar temporal column exists, but is not directly embedded in zone IDs).

Produces a single GeoParquet file containing all aggregated zone data.

In [1]:
import os

import geopandas as gpd
import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# --- locations ---
# directory walked to find the prepared zone data
AGGREGATE_SEARCH_DIR_PATH = "./outputs/manitoba_rcm_ard/IGEO7"
# single GeoParquet file receiving all the aggregated zone data
AGGREGATE_RESULT_FILE_PATH = "./outputs/manitoba_rcm_ard/IGEO7/collection.parquet"

# --- merging ---
# variables stored as distinct dataframes (one sub-directory each) to aggregate together
AGGREGATE_ZONE_VARIABLES = ["rr", "rl", "rrrl"]
# ID columns identifying corresponding zones across the merged dataframes
# (note: a datetime-aware dataset should list its temporal component here as well)
AGGREGATE_ZONE_ID_COLUMNS = ["dggrid_ISEA7H", "day"]
# substring replacements applied to column names before aggregation and merging
# (note: the extended columns below are matched against the names obtained after this renaming)
AGGREGATE_RENAME_COLUMNS = {"cell_": ""}
# columns prefixed with the name of their variable (all others are merged as is / duplicates)
AGGREGATE_EXTEND_COLUMNS = ["minimum", "maximum", "mean", "median", "stddev"]
# directories whose path (relative to the search directory) contains any of these values are skipped
AGGREGATE_IGNORE = ["L12"]

# --- ordering of the result ---
# columns placed first, in this order; the remaining ones keep their original order after them
AGGREGATE_SORT_COLUMNS = [*AGGREGATE_ZONE_ID_COLUMNS, "datetime", "resolution", "geometry"]
# columns used to order the zone rows, mapped to True (ascending) or False (descending)
AGGREGATE_SORT_ROWS = dict(resolution=True, datetime=True)

In [3]:

# Walk the search directory and, for every directory whose sub-directories are exactly the
# configured zone variables, merge one parquet file per variable into a single dataframe.
# All merged dataframes are then concatenated, reordered and saved as one GeoParquet file.
agg_zone_total = 0
agg_zone_parts = []
agg_walk_progress = tqdm(
    os.walk(AGGREGATE_SEARCH_DIR_PATH),
    desc="Aggregating zone data",
)
for root_dir, sub_dirs, _ in agg_walk_progress:
    # os.walk lists sub-directories in arbitrary (filesystem) order: compare regardless of order
    if sorted(sub_dirs) != sorted(AGGREGATE_ZONE_VARIABLES):
        continue
    sub_dirs[:] = []  # variables level reached: don't recurse further

    current = root_dir.replace(AGGREGATE_SEARCH_DIR_PATH, "").strip("/")
    if any(ignore in current for ignore in AGGREGATE_IGNORE):
        continue

    merge_zone_data = None
    # follow the configured order (not the listing order) to keep a stable column layout
    for zone_var in AGGREGATE_ZONE_VARIABLES:
        zone_var_dir = os.path.join(root_dir, zone_var)
        # in case many were found, ignore others (cannot merge anyway / no priority);
        # sorted so that the retained file does not depend on the listing order
        file_names = sorted(name for name in os.listdir(zone_var_dir) if name.endswith(".parquet"))
        if not file_names:
            continue
        file_path = os.path.join(zone_var_dir, file_names[0])
        try:
            zone_data_var = gpd.read_parquet(file_path)
            # apply every replacement rule, in sequence, to each column name
            zone_data_col_rename = {}
            for col in zone_data_var.columns:
                renamed_col = col
                for old, new in AGGREGATE_RENAME_COLUMNS.items():
                    renamed_col = renamed_col.replace(old, new)
                zone_data_col_rename[col] = renamed_col
            zone_data_var = zone_data_var.rename(columns=zone_data_col_rename)
            # prefix the extended columns with the variable they belong to
            zone_data_col_merge = {
                col: f"{zone_var}_{col}"
                for col in AGGREGATE_EXTEND_COLUMNS
            }
            zone_data_var = zone_data_var.rename(columns=zone_data_col_merge)
            if merge_zone_data is None:
                # first variable found: keep all of its columns (IDs, geometry, metadata, ...)
                merge_zone_data = zone_data_var
            else:
                # following variables: only bring their IDs and prefixed columns
                merge_zone_cols = AGGREGATE_ZONE_ID_COLUMNS + list(zone_data_col_merge.values())
                merge_zone_data = merge_zone_data.merge(
                    zone_data_var[merge_zone_cols],
                    on=AGGREGATE_ZONE_ID_COLUMNS,
                    how="outer",
                    suffixes=("", ""),  # raise if something went wrong, don't do silent fixes
                )
        except Exception as exc:
            err_msg = f"Error while processing [{file_path}]: {exc}"
            raise RuntimeError(err_msg) from exc

    if merge_zone_data is None:
        continue  # no parquet file under any variable: nothing to aggregate for this directory

    # count the zones once per directory, after all of its variables were merged
    merge_zone_count = len(merge_zone_data)
    agg_zone_total += merge_zone_count
    agg_zone_msg = f"{agg_zone_total} (+{merge_zone_count})"
    agg_walk_progress.set_postfix(current=current, total_zones=agg_zone_msg)
    agg_zone_parts.append(merge_zone_data)

print("Found results:", len(agg_zone_parts))
if not agg_zone_parts:
    raise ValueError(f"No zone data found to aggregate under [{AGGREGATE_SEARCH_DIR_PATH}]")
agg_zone_data = pd.concat(agg_zone_parts, ignore_index=True)
print("Sorting results...")
if AGGREGATE_SORT_COLUMNS:
    # requested columns first, then all the others in their original order
    agg_zone_cols = AGGREGATE_SORT_COLUMNS + agg_zone_data.columns.drop(AGGREGATE_SORT_COLUMNS).tolist()
    agg_zone_data = agg_zone_data.reindex(columns=agg_zone_cols)
if AGGREGATE_SORT_ROWS:
    agg_zone_sort = list(AGGREGATE_SORT_ROWS.items())
    agg_zone_data = agg_zone_data.sort_values(
        by=[col for col, _ in agg_zone_sort],
        ascending=[col_asc for _, col_asc in agg_zone_sort],
    )
print("Done aggregating, saving result:", AGGREGATE_RESULT_FILE_PATH)
agg_zone_data.to_parquet(AGGREGATE_RESULT_FILE_PATH)

Aggregating zone data: 0it [00:00, ?it/s]

Found results: 796
Soring results...
Done aggregating, saving result: ./outputs/manitoba_rcm_ard/IGEO7/collection.parquet


## Validation

In [4]:
# Summarize the aggregated zone data: columns, non-null counts and dtypes.
agg_zone_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 880365 entries, 76 to 504094
Data columns (total 26 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   dggrid_ISEA7H  880365 non-null  object  
 1   day            880365 non-null  object  
 2   datetime       880365 non-null  object  
 3   resolution     880365 non-null  int64   
 4   geometry       880365 non-null  geometry
 5   center_lat     880365 non-null  float64 
 6   center_lon     880365 non-null  float64 
 7   avg_edge_len   880365 non-null  float64 
 8   area           880365 non-null  float64 
 9   perimeter      880365 non-null  float64 
 10  rr_minimum     880365 non-null  float32 
 11  rr_maximum     880365 non-null  float32 
 12  rr_mean        880365 non-null  float32 
 13  rr_median      880365 non-null  float32 
 14  rr_stddev      880365 non-null  float32 
 15  pixel_count    880365 non-null  int64   
 16  rl_minimum     880365 non-null  float32 
 17  rl_max

In [5]:
# Preview the first rows of the aggregated (and sorted) zone data.
agg_zone_data.head()

Unnamed: 0,dggrid_ISEA7H,day,datetime,resolution,geometry,center_lat,center_lon,avg_edge_len,area,perimeter,...,rl_minimum,rl_maximum,rl_mean,rl_median,rl_stddev,rrrl_minimum,rrrl_maximum,rrrl_mean,rrrl_median,rrrl_stddev
76,2,2025-04-01,2025-04-01T16:39:29.752Z,0,"POLYGON ((-33.8 35.38545, -37.30483 41.76129, ...",33.523949,-78.8,774832.455308,42505510000000.0,23244970.0,...,4.5e-05,157.356857,0.07819,0.072656,0.068155,-42.10376,56.037849,0.001283,0.000894,0.020304
75,2,2025-04-02,2025-04-02T04:05:56.737Z,0,"POLYGON ((-33.8 35.38545, -37.30483 41.76129, ...",33.523949,-78.8,774832.455308,42505510000000.0,23244970.0,...,0.000433,673.485291,0.186399,0.143755,0.249906,-104.521782,176.544235,0.000739,0.000514,0.056395
74,2,2025-04-03,2025-04-03T16:55:53.710Z,0,"POLYGON ((-33.8 35.38545, -37.30483 41.76129, ...",33.523949,-78.8,774832.455308,42505510000000.0,23244970.0,...,0.000466,1279.182983,0.192974,0.158326,0.557023,-367.215729,297.749237,-0.000176,-0.000413,0.086103
73,2,2025-04-04,2025-04-04T16:31:28.293Z,0,"POLYGON ((-33.8 35.38545, -37.30483 41.76129, ...",33.523949,-78.8,774832.455308,42505510000000.0,23244970.0,...,7.9e-05,2680.847168,0.075856,0.076067,0.681283,-793.033386,256.419159,0.002814,0.001757,0.15119
72,2,2025-04-06,2025-04-06T04:06:20.489Z,0,"POLYGON ((-33.8 35.38545, -37.30483 41.76129, ...",33.523949,-78.8,774832.455308,42505510000000.0,23244970.0,...,0.000135,906.42511,0.09951,0.091506,0.292767,-12.101206,36.309296,0.001868,0.001614,0.019881


In [6]:
# Count the distinct values found in each of the leading (sort) columns.
agg_zone_data[AGGREGATE_SORT_COLUMNS].nunique()

dggrid_ISEA7H    98462
day                 77
datetime            78
resolution          11
geometry         98462
dtype: int64

In [7]:
import pandas as pd

# Distinct calendar dates found in each temporal column (full timestamp vs. day component).
dates = pd.to_datetime(agg_zone_data["datetime"]).dt.date.unique()
days = pd.to_datetime(agg_zone_data["day"]).dt.date.unique()

# Show the dates present in one column but absent from the other, in both directions.
for present, missing in ((days, dates), (dates, days)):
    display(set(present) - set(missing))

set()

set()

In [8]:
# Smallest and largest 'datetime' values of the aggregated zone data.
# NOTE(review): the column holds strings (object dtype), so min/max compare text;
# presumably all values share the same ISO-8601 format, making this chronological - confirm.
agg_zone_data['datetime'].min(), agg_zone_data['datetime'].max()

('2025-04-01T16:39:29.752Z', '2025-11-09T05:14:08.126Z')