From 34caf8878c9fe7fde00902ea8a3ed67f86a1484e Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:27:41 +0100 Subject: [PATCH 01/15] Summary of Changes Created flixopt/dim_iterator.py New utility class DimIterator with: - Factory methods: from_dataset(), from_flow_system(), from_sentinel_lists() - Iteration: iter_slices(ds), iter_keys() - Key conversion: key_from_values(p, s), selector_for_key(key) - Combination: combine(slices, base_dims, ...), combine_arrays(slices, base_dims, ...) Refactored flixopt/clustering/base.py 1. Added DimIterator import 2. Removed unused combine_slices() function (~70 lines) 3. Added _iterator cached property to ClusteringResults 4. Simplified _build_property_array() from ~35 lines to ~3 lines 5. Simplified apply() to use selector_for_key() Refactored flixopt/transform_accessor.py 1. Added DimIterator import 2. Refactored _build_cluster_weight_da() to use DimIterator 3. Refactored _build_segment_durations_da() to use DimIterator 4. Refactored _build_clustering_metrics() to use DimIterator 5. Refactored _build_reduced_dataset() to use DimIterator 6. Refactored _build_cluster_assignments_da() to use DimIterator 7. Removed _combine_slices_to_dataarray_generic() (~62 lines) 8. Removed _combine_slices_to_dataarray_2d() (~40 lines) Lines of Code Impact - Added: ~400 lines (new dim_iterator.py module) - Removed: ~170 lines (old combine methods + combine_slices) - Net: +230 lines, but the new code is reusable across the codebase and eliminates repetitive patterns Benefits - Eliminates [None] sentinel value confusion - Eliminates 4-way if/elif branching for period/scenario combinations - Consistent key format throughout (tuples matching dims) - Single source of truth for period/scenario iteration logic - Easier to add new functionality (e.g., parallel processing) --- flixopt/clustering/base.py | 112 +-------- flixopt/dim_iterator.py | 454 ++++++++++++++++++++++++++++++++++ flixopt/transform_accessor.py | 249 +++++-------------- 3 files changed, 533 insertions(+), 282 deletions(-) create mode 100644 flixopt/dim_iterator.py diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index bc49ab660..97c4c3097 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -27,6 +27,7 @@ from ..plot_result import PlotResult from ..statistics_accessor import SelectType +from ..dim_iterator import DimIterator from ..statistics_accessor import _build_color_kwargs @@ -50,75 +51,6 @@ def _select_dims(da: xr.DataArray, period: Any = None, scenario: Any = None) -> return da -def combine_slices( - slices: dict[tuple, np.ndarray], - extra_dims: list[str], - dim_coords: dict[str, list], - output_dim: str, - output_coord: Any, - attrs: dict | None = None, -) -> xr.DataArray: - """Combine {(dim_values): 1D_array} dict into a DataArray. - - This utility simplifies the common pattern of iterating over extra dimensions - (like period, scenario), processing each slice, and combining results. - - Args: - slices: Dict mapping dimension value tuples to 1D numpy arrays. - Keys are tuples like ('period1', 'scenario1') matching extra_dims order. - extra_dims: Dimension names in order (e.g., ['period', 'scenario']). - dim_coords: Dict mapping dimension names to coordinate values. - output_dim: Name of the output dimension (typically 'time'). - output_coord: Coordinate values for output dimension. - attrs: Optional DataArray attributes. - - Returns: - DataArray with dims [output_dim, *extra_dims]. 
- - Raises: - ValueError: If slices is empty. - KeyError: If a required key is missing from slices. - - Example: - >>> slices = { - ... ('P1', 'base'): np.array([1, 2, 3]), - ... ('P1', 'high'): np.array([4, 5, 6]), - ... ('P2', 'base'): np.array([7, 8, 9]), - ... ('P2', 'high'): np.array([10, 11, 12]), - ... } - >>> result = combine_slices( - ... slices, - ... extra_dims=['period', 'scenario'], - ... dim_coords={'period': ['P1', 'P2'], 'scenario': ['base', 'high']}, - ... output_dim='time', - ... output_coord=[0, 1, 2], - ... ) - >>> result.dims - ('time', 'period', 'scenario') - """ - if not slices: - raise ValueError('slices cannot be empty') - - first = next(iter(slices.values())) - n_output = len(first) - shape = [n_output] + [len(dim_coords[d]) for d in extra_dims] - data = np.empty(shape, dtype=first.dtype) - - for combo in np.ndindex(*shape[1:]): - key = tuple(dim_coords[d][i] for d, i in zip(extra_dims, combo, strict=True)) - try: - data[(slice(None),) + combo] = slices[key] - except KeyError: - raise KeyError(f'Missing slice for key {key} (extra_dims={extra_dims})') from None - - return xr.DataArray( - data, - dims=[output_dim] + extra_dims, - coords={output_dim: output_coord, **dim_coords}, - attrs=attrs or {}, - ) - - def _cluster_occurrences(cr: TsamClusteringResult) -> np.ndarray: """Compute cluster occurrences from ClusteringResult.""" counts = Counter(cr.cluster_assignments) @@ -232,6 +164,14 @@ def coords(self) -> dict[str, list]: """ return {dim: self._get_dim_values(dim) for dim in self._dim_names} + @functools.cached_property + def _iterator(self) -> DimIterator: + """DimIterator for this ClusteringResults (cached).""" + return DimIterator( + periods=self._get_dim_values('period'), + scenarios=self._get_dim_values('scenario'), + ) + def sel(self, **kwargs: Any) -> TsamClusteringResult: """Select result by dimension labels (xarray-like). 
@@ -551,33 +491,8 @@ def _build_property_array( name: str | None = None, ) -> xr.DataArray: """Build a DataArray property, handling both single and multi-dimensional cases.""" - base_coords = base_coords or {} - periods = self._get_dim_values('period') - scenarios = self._get_dim_values('scenario') - - # Build list of (dim_name, values) for dimensions that exist - extra_dims = [] - if periods is not None: - extra_dims.append(('period', periods)) - if scenarios is not None: - extra_dims.append(('scenario', scenarios)) - - # Simple case: no extra dimensions - if not extra_dims: - return xr.DataArray(get_data(self._results[()]), dims=base_dims, coords=base_coords, name=name) - - # Multi-dimensional: stack data for each combination - first_data = get_data(next(iter(self._results.values()))) - shape = list(first_data.shape) + [len(vals) for _, vals in extra_dims] - data = np.empty(shape, dtype=first_data.dtype) # Preserve dtype - - for combo in np.ndindex(*[len(vals) for _, vals in extra_dims]): - key = tuple(extra_dims[i][1][idx] for i, idx in enumerate(combo)) - data[(...,) + combo] = get_data(self._results[key]) - - dims = base_dims + [dim_name for dim_name, _ in extra_dims] - coords = {**base_coords, **{dim_name: vals for dim_name, vals in extra_dims}} - return xr.DataArray(data, dims=dims, coords=coords, name=name) + slices = {key: get_data(cr) for key, cr in self._results.items()} + return self._iterator.combine_arrays(slices, base_dims, base_coords, name) @staticmethod def _key_to_str(key: tuple) -> str: @@ -628,10 +543,7 @@ def apply(self, data: xr.Dataset) -> AggregationResults: results = {} for key, cr in self._results.items(): - # Build selector for this key - selector = dict(zip(self._dim_names, key, strict=False)) - - # Select the slice for this (period, scenario) + selector = self._iterator.selector_for_key(key) data_slice = data.sel(**selector, drop=True) if selector else data # Drop constant arrays and convert to DataFrame diff --git a/flixopt/dim_iterator.py b/flixopt/dim_iterator.py new file mode 100644 index 000000000..27d13c366 --- /dev/null +++ b/flixopt/dim_iterator.py @@ -0,0 +1,454 @@ +""" +Dimension iteration utilities for period/scenario slicing patterns. + +This module provides `DimIterator`, a helper class that eliminates repetitive +boilerplate when working with datasets that may have period and/or scenario +dimensions. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +import xarray as xr + +if TYPE_CHECKING: + from collections.abc import Iterator + + from .flow_system import FlowSystem + + +class DimIterator: + """Handles period/scenario iteration and combination patterns. + + Eliminates the need for: + - ``[None]`` sentinel values for missing dimensions + - 4-way if/elif branching for period/scenario combinations + - Manual key conversion between internal and external formats + + Keys returned by this class are tuples that match ``dims`` order: + - No dims: ``()`` + - Period only: ``(period_value,)`` + - Scenario only: ``(scenario_value,)`` + - Both: ``(period_value, scenario_value)`` + + Example: + >>> iterator = DimIterator.from_dataset(ds) + >>> + >>> # Iteration + >>> results = {} + >>> for key, ds_slice in iterator.iter_slices(ds): + ... results[key] = transform(ds_slice) + >>> + >>> # Combination + >>> combined = iterator.combine(results, base_dims=['cluster', 'time']) + + Attributes: + dims: Tuple of dimension names present (e.g., ``('period', 'scenario')``). 
+ coords: Dict mapping dimension names to coordinate values. + """ + + def __init__( + self, + periods: list | None = None, + scenarios: list | None = None, + ): + """Initialize with explicit dimension values. + + Args: + periods: Period coordinate values, or None if no period dimension. + scenarios: Scenario coordinate values, or None if no scenario dimension. + """ + self._periods = periods + self._scenarios = scenarios + + # Build dim_names for external keys (no None sentinels) + self._dim_names: list[str] = [] + if periods is not None: + self._dim_names.append('period') + if scenarios is not None: + self._dim_names.append('scenario') + + # ========================================================================== + # Factory methods + # ========================================================================== + + @classmethod + def from_dataset(cls, ds: xr.Dataset) -> DimIterator: + """Create iterator from dataset's period/scenario coordinates. + + Args: + ds: Dataset with optional 'period' and/or 'scenario' coordinates. + + Returns: + DimIterator configured for the dataset's dimensions. + """ + periods = list(ds.period.values) if 'period' in ds.coords else None + scenarios = list(ds.scenario.values) if 'scenario' in ds.coords else None + return cls(periods=periods, scenarios=scenarios) + + @classmethod + def from_flow_system(cls, fs: FlowSystem) -> DimIterator: + """Create iterator from FlowSystem's period/scenario structure. + + Args: + fs: FlowSystem with optional periods and/or scenarios. + + Returns: + DimIterator configured for the FlowSystem's dimensions. + """ + return cls( + periods=list(fs.periods) if fs.periods is not None else None, + scenarios=list(fs.scenarios) if fs.scenarios is not None else None, + ) + + @classmethod + def from_sentinel_lists(cls, periods: list, scenarios: list) -> DimIterator: + """Create iterator from lists that may contain [None] sentinel. + + This handles the common pattern where code uses [None] to indicate + no dimension exists: + + periods = list(fs.periods) if has_periods else [None] + scenarios = list(fs.scenarios) if has_scenarios else [None] + + Args: + periods: Period values, or [None] if no period dimension. + scenarios: Scenario values, or [None] if no scenario dimension. + + Returns: + DimIterator configured for the actual dimensions present. + """ + return cls( + periods=periods if periods != [None] else None, + scenarios=scenarios if scenarios != [None] else None, + ) + + # ========================================================================== + # Properties + # ========================================================================== + + @property + def dims(self) -> tuple[str, ...]: + """Dimension names as tuple (xarray-like).""" + return tuple(self._dim_names) + + @property + def coords(self) -> dict[str, list]: + """Coordinate values for each dimension. + + Returns: + Dict mapping dimension names to lists of coordinate values. + Empty dict if no dimensions. 
+ """ + result = {} + if self._periods is not None: + result['period'] = self._periods + if self._scenarios is not None: + result['scenario'] = self._scenarios + return result + + @property + def has_periods(self) -> bool: + """Whether period dimension exists.""" + return self._periods is not None + + @property + def has_scenarios(self) -> bool: + """Whether scenario dimension exists.""" + return self._scenarios is not None + + @property + def n_slices(self) -> int: + """Total number of slices (product of dimension sizes).""" + n = 1 + if self._periods is not None: + n *= len(self._periods) + if self._scenarios is not None: + n *= len(self._scenarios) + return n + + # ========================================================================== + # Iteration + # ========================================================================== + + def iter_slices( + self, + ds: xr.Dataset, + drop: bool = True, + ) -> Iterator[tuple[tuple, xr.Dataset]]: + """Iterate over (period, scenario) slices of a dataset. + + Args: + ds: Dataset with optional period/scenario dimensions. + drop: If True (default), drop period/scenario dims from sliced datasets. + If False, keep them as scalar coordinates. + + Yields: + ``(key, ds_slice)`` tuples where key matches ``dims`` order. + For no dims, key is ``()``. + + Example: + >>> for key, slice_ds in iterator.iter_slices(ds): + ... print(f'Processing {key}') + ... result = process(slice_ds) + """ + periods = self._periods if self._periods is not None else [None] + scenarios = self._scenarios if self._scenarios is not None else [None] + + for p in periods: + for s in scenarios: + selector = self._build_selector(p, s) + key = self._to_key(p, s) + if selector: + yield key, ds.sel(**selector, drop=drop) + else: + yield key, ds + + def iter_keys(self) -> Iterator[tuple]: + """Iterate over keys only (no data access). + + Yields: + Key tuples matching ``dims`` order. + + Example: + >>> for key in iterator.iter_keys(): + ... results[key] = compute_something() + """ + periods = self._periods if self._periods is not None else [None] + scenarios = self._scenarios if self._scenarios is not None else [None] + + for p in periods: + for s in scenarios: + yield self._to_key(p, s) + + # ========================================================================== + # Key conversion + # ========================================================================== + + def _build_selector(self, period: Any, scenario: Any) -> dict: + """Build xarray selector dict (only for non-None values).""" + selector = {} + if period is not None: + selector['period'] = period + if scenario is not None: + selector['scenario'] = scenario + return selector + + def _to_key(self, period: Any, scenario: Any) -> tuple: + """Convert (period, scenario) to external key matching dims.""" + parts = [] + if self._periods is not None: + parts.append(period) + if self._scenarios is not None: + parts.append(scenario) + return tuple(parts) + + def _from_key(self, key: tuple) -> tuple[Any, Any]: + """Convert external key back to (period, scenario) with Nones. + + Returns: + Tuple of (period, scenario) where missing dims are None. + """ + period = None + scenario = None + idx = 0 + if self._periods is not None: + period = key[idx] + idx += 1 + if self._scenarios is not None: + scenario = key[idx] + return period, scenario + + def selector_for_key(self, key: tuple) -> dict: + """Get xarray selector dict for an external key. + + Args: + key: Key tuple as returned by ``iter_slices`` or ``iter_keys``. 
+ + Returns: + Dict suitable for ``ds.sel(**selector)``. + + Example: + >>> selector = iterator.selector_for_key(key) + >>> ds_slice = ds.sel(**selector) + """ + period, scenario = self._from_key(key) + return self._build_selector(period, scenario) + + def key_from_values(self, period: Any, scenario: Any) -> tuple: + """Convert (period, scenario) values to a key tuple. + + This is useful when you have separate period/scenario values + (possibly None) and need to create a key for use with combine(). + + Args: + period: Period value, or None if no period dimension. + scenario: Scenario value, or None if no scenario dimension. + + Returns: + Key tuple matching ``dims`` order. + + Example: + >>> iterator = DimIterator(periods=[2024, 2025]) + >>> iterator.key_from_values(2024, None) + (2024,) + """ + return self._to_key(period, scenario) + + # ========================================================================== + # Combination + # ========================================================================== + + def combine( + self, + slices: dict[tuple, xr.DataArray], + base_dims: list[str], + name: str | None = None, + attrs: dict | None = None, + join: str = 'exact', + fill_value: Any = None, + ) -> xr.DataArray: + """Combine per-slice DataArrays into multi-dimensional DataArray. + + Args: + slices: Dict mapping keys (from ``iter_slices``) to DataArrays. + All DataArrays must have the same ``base_dims``. + base_dims: Base dimensions of each slice (e.g., ``['cluster', 'time']``). + Used for final transpose to ensure consistent dimension order. + name: Optional name for resulting DataArray. + attrs: Optional attributes for resulting DataArray. + join: How to handle coordinate differences between slices. + - ``'exact'``: Coordinates must match exactly (default). + - ``'outer'``: Union of coordinates, fill missing with fill_value. + fill_value: Value to use for missing entries when ``join='outer'``. + Only used when coordinates differ between slices. + + Returns: + DataArray with dims ``[*base_dims, period?, scenario?]``. + + Example: + >>> results = {key: process(s) for key, s in iterator.iter_slices(ds)} + >>> combined = iterator.combine(results, base_dims=['cluster', 'time']) + """ + # Build concat kwargs + concat_kwargs = {'join': join} + if fill_value is not None: + concat_kwargs['fill_value'] = fill_value + + if not self._dim_names: + # No extra dims - return single slice + result = slices[()] + elif self.has_periods and self.has_scenarios: + # Both dimensions: concat scenarios first, then periods + period_arrays = [] + for p in self._periods: + scenario_arrays = [slices[self._to_key(p, s)] for s in self._scenarios] + period_arrays.append( + xr.concat( + scenario_arrays, + dim=pd.Index(self._scenarios, name='scenario'), + **concat_kwargs, + ) + ) + result = xr.concat( + period_arrays, + dim=pd.Index(self._periods, name='period'), + **concat_kwargs, + ) + elif self.has_periods: + result = xr.concat( + [slices[(p,)] for p in self._periods], + dim=pd.Index(self._periods, name='period'), + **concat_kwargs, + ) + else: # has_scenarios only + result = xr.concat( + [slices[(s,)] for s in self._scenarios], + dim=pd.Index(self._scenarios, name='scenario'), + **concat_kwargs, + ) + + # Transpose to standard order: base_dims first, then period/scenario + result = result.transpose(*base_dims, ...) 
+ + if name: + result = result.rename(name) + if attrs: + result = result.assign_attrs(attrs) + return result + + def combine_arrays( + self, + slices: dict[tuple, np.ndarray], + base_dims: list[str], + base_coords: dict[str, Any] | None = None, + name: str | None = None, + attrs: dict | None = None, + ) -> xr.DataArray: + """Combine per-slice numpy arrays into multi-dimensional DataArray. + + More efficient than ``combine()`` when working with raw numpy arrays, + as it avoids intermediate DataArray creation. + + Args: + slices: Dict mapping keys to numpy arrays. All arrays must have + the same shape and dtype. + base_dims: Dimension names for the base array axes. + base_coords: Optional coordinates for base dimensions. + name: Optional name for resulting DataArray. + attrs: Optional attributes for resulting DataArray. + + Returns: + DataArray with dims ``[*base_dims, period?, scenario?]``. + + Example: + >>> arrays = {key: np.array([1, 2, 3]) for key in iterator.iter_keys()} + >>> da = iterator.combine_arrays(arrays, base_dims=['cluster']) + """ + base_coords = base_coords or {} + + if not self._dim_names: + # No extra dims - wrap single array + return xr.DataArray( + slices[()], + dims=base_dims, + coords=base_coords, + name=name, + attrs=attrs or {}, + ) + + # Build full shape: base_shape + extra_dims_shape + first = next(iter(slices.values())) + extra_shape = [len(coords) for coords in self.coords.values()] + shape = list(first.shape) + extra_shape + data = np.empty(shape, dtype=first.dtype) + + # Fill using np.ndindex for flexibility with any number of extra dims + for combo in np.ndindex(*extra_shape): + key = tuple(list(self.coords.values())[i][idx] for i, idx in enumerate(combo)) + data[(...,) + combo] = slices[key] + + return xr.DataArray( + data, + dims=base_dims + self._dim_names, + coords={**base_coords, **self.coords}, + name=name, + attrs=attrs or {}, + ) + + # ========================================================================== + # Dunder methods + # ========================================================================== + + def __repr__(self) -> str: + if not self.dims: + return 'DimIterator(dims=())' + coords_str = ', '.join(f'{k}: {len(v)}' for k, v in self.coords.items()) + return f'DimIterator(dims={self.dims}, coords=({coords_str}))' + + def __len__(self) -> int: + """Number of slices.""" + return self.n_slices diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 9e56479af..4423d1031 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -16,6 +16,7 @@ import pandas as pd import xarray as xr +from .dim_iterator import DimIterator from .modeling import _scalar_safe_reduce from .structure import EXPAND_DIVIDE, EXPAND_FIRST_TIMESTEP, EXPAND_INTERPOLATE, VariableCategory @@ -167,9 +168,11 @@ def _weight_for_key(key: tuple) -> xr.DataArray: weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) return xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) - weight_slices = {key: _weight_for_key(key) for key in cluster_occurrences_all} - return self._combine_slices_to_dataarray_generic( - weight_slices, ['cluster'], periods, scenarios, 'cluster_weight' + iterator = DimIterator.from_sentinel_lists(periods, scenarios) + # Convert old (period, scenario) keys to DimIterator keys + weight_slices = {iterator.key_from_values(p, s): _weight_for_key((p, s)) for p, s in cluster_occurrences_all} + return iterator.combine( + weight_slices, base_dims=['cluster'], name='cluster_weight', 
join='outer', fill_value=np.nan ) def _build_typical_das( @@ -259,9 +262,10 @@ def _build_segment_durations_da( DataArray with dims [cluster, time] or [cluster, time, period?, scenario?] containing duration in hours for each segment. """ + iterator = DimIterator.from_sentinel_lists(periods, scenarios) segment_duration_slices: dict[tuple, xr.DataArray] = {} - for key, tsam_result in tsam_aggregation_results.items(): + for (p, s), tsam_result in tsam_aggregation_results.items(): # segment_durations is tuple of tuples: ((dur1, dur2, ...), (dur1, dur2, ...), ...) # Each inner tuple is durations for one cluster seg_durs = tsam_result.segment_durations @@ -274,14 +278,18 @@ def _build_segment_durations_da( # Duration in hours = number of original timesteps * dt data[cluster_id, seg_id] = cluster_seg_durs[seg_id] * dt - segment_duration_slices[key] = xr.DataArray( + segment_duration_slices[iterator.key_from_values(p, s)] = xr.DataArray( data, dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) - return self._combine_slices_to_dataarray_generic( - segment_duration_slices, ['cluster', 'time'], periods, scenarios, 'timestep_duration' + return iterator.combine( + segment_duration_slices, + base_dims=['cluster', 'time'], + name='timestep_duration', + join='outer', + fill_value=np.nan, ) def _build_clustering_metrics( @@ -323,6 +331,7 @@ def _build_clustering_metrics( ) # Multi-dim case + iterator = DimIterator.from_sentinel_lists(periods, scenarios) sample_df = next(iter(non_empty_metrics.values())) metric_names = list(sample_df.columns) data_vars = {} @@ -330,20 +339,21 @@ def _build_clustering_metrics( for metric in metric_names: slices = {} for (p, s), df in clustering_metrics_all.items(): + key = iterator.key_from_values(p, s) if df.empty: - slices[(p, s)] = xr.DataArray( + slices[key] = xr.DataArray( np.full(len(sample_df.index), np.nan), dims=['time_series'], coords={'time_series': list(sample_df.index)}, ) else: - slices[(p, s)] = xr.DataArray( + slices[key] = xr.DataArray( df[metric].values, dims=['time_series'], coords={'time_series': list(df.index)}, ) - data_vars[metric] = self._combine_slices_to_dataarray_generic( - slices, ['time_series'], periods, scenarios, metric + data_vars[metric] = iterator.combine( + slices, base_dims=['time_series'], name=metric, join='outer', fill_value=np.nan ) return xr.Dataset(data_vars) @@ -532,7 +542,7 @@ def _build_reduced_dataset( """ from .core import TimeSeriesData - all_keys = {(p, s) for p in periods for s in scenarios} + iterator = DimIterator.from_sentinel_lists(periods, scenarios) ds_new_vars = {} # Use ds.variables to avoid _construct_dataarray overhead @@ -567,11 +577,11 @@ def _build_reduced_dataset( coords=new_coords, attrs=var.attrs, ) - elif set(typical_das[name].keys()) != all_keys: + elif set(typical_das[name].keys()) != {(p, s) for p in periods for s in scenarios}: # Partial typical slices: fill missing keys with constant values # For multi-period/scenario data, we need to select the right slice for each key - # Exclude 'period' and 'scenario' - they're handled by _combine_slices_to_dataarray_2d + # Exclude 'period' and 'scenario' - they're handled by iterator.combine other_dims = [d for d in var.dims if d not in ('time', 'period', 'scenario')] other_shape = [var.sizes[d] for d in other_dims] new_shape = [actual_n_clusters, n_time_points] + other_shape @@ -583,54 +593,46 @@ def _build_reduced_dataset( # Build filled slices dict: use typical where available, constant otherwise filled_slices = {} - for key in 
all_keys: - if key in typical_das[name]: - filled_slices[key] = typical_das[name][key] - else: - # Select the specific period/scenario slice, then reshape - period_label, scenario_label = key - selector = {} - if period_label is not None and 'period' in var.dims: - selector['period'] = period_label - if scenario_label is not None and 'scenario' in var.dims: - selector['scenario'] = scenario_label - - # Select per-key slice if needed, otherwise use full variable - if selector: - var_slice = ds[name].sel(**selector, drop=True) + for p in iterator._periods or [None]: + for s in iterator._scenarios or [None]: + old_key = (p, s) + new_key = iterator.key_from_values(p, s) + if old_key in typical_das[name]: + filled_slices[new_key] = typical_das[name][old_key] else: - var_slice = ds[name] - - # Now slice time and reshape - time_idx = var_slice.dims.index('time') - slices_list = [slice(None)] * len(var_slice.dims) - slices_list[time_idx] = slice(0, n_reduced_timesteps) - sliced_values = var_slice.values[tuple(slices_list)] - reshaped_constant = sliced_values.reshape(new_shape) - - filled_slices[key] = xr.DataArray( - reshaped_constant, - dims=['cluster', 'time'] + other_dims, - coords=new_coords, - ) - - da = self._combine_slices_to_dataarray_2d( - slices=filled_slices, - attrs=var.attrs, - periods=periods, - scenarios=scenarios, - ) + # Select the specific period/scenario slice, then reshape + selector = iterator._build_selector(p, s) + + # Select per-key slice if needed, otherwise use full variable + if selector: + # Only include dims that exist in the variable + var_selector = {k: v for k, v in selector.items() if k in var.dims} + var_slice = ds[name].sel(**var_selector, drop=True) if var_selector else ds[name] + else: + var_slice = ds[name] + + # Now slice time and reshape + time_idx = var_slice.dims.index('time') + slices_list = [slice(None)] * len(var_slice.dims) + slices_list[time_idx] = slice(0, n_reduced_timesteps) + sliced_values = var_slice.values[tuple(slices_list)] + reshaped_constant = sliced_values.reshape(new_shape) + + filled_slices[new_key] = xr.DataArray( + reshaped_constant, + dims=['cluster', 'time'] + other_dims, + coords=new_coords, + ) + + da = iterator.combine(filled_slices, base_dims=['cluster', 'time'], attrs=var.attrs) if var.attrs.get('__timeseries_data__', False): da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) ds_new_vars[name] = da else: # Time-varying: combine per-(period, scenario) slices - da = self._combine_slices_to_dataarray_2d( - slices=typical_das[name], - attrs=var.attrs, - periods=periods, - scenarios=scenarios, - ) + # Convert old-style keys to DimIterator keys + converted_slices = {iterator.key_from_values(p, s): da for (p, s), da in typical_das[name].items()} + da = iterator.combine(converted_slices, base_dims=['cluster', 'time'], attrs=var.attrs) if var.attrs.get('__timeseries_data__', False): da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) ds_new_vars[name] = da @@ -656,25 +658,16 @@ def _build_cluster_assignments_da( Returns: DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. 
""" - has_periods = periods != [None] - has_scenarios = scenarios != [None] - - if has_periods or has_scenarios: - # Multi-dimensional case - cluster_assignments_slices = {} - for p in periods: - for s in scenarios: - key = (p, s) - cluster_assignments_slices[key] = xr.DataArray( - cluster_assignmentss[key], dims=['original_cluster'], name='cluster_assignments' - ) - return self._combine_slices_to_dataarray_generic( - cluster_assignments_slices, ['original_cluster'], periods, scenarios, 'cluster_assignments' + iterator = DimIterator.from_sentinel_lists(periods, scenarios) + slices = { + iterator.key_from_values(p, s): xr.DataArray( + cluster_assignmentss[(p, s)], dims=['original_cluster'], name='cluster_assignments' ) - else: - # Simple case - first_key = (periods[0], scenarios[0]) - return xr.DataArray(cluster_assignmentss[first_key], dims=['original_cluster'], name='cluster_assignments') + for p, s in cluster_assignmentss + } + return iterator.combine( + slices, base_dims=['original_cluster'], name='cluster_assignments', join='outer', fill_value=np.nan + ) def sel( self, @@ -1696,114 +1689,6 @@ def apply_clustering( scenarios=scenarios, ) - @staticmethod - def _combine_slices_to_dataarray_generic( - slices: dict[tuple, xr.DataArray], - base_dims: list[str], - periods: list, - scenarios: list, - name: str, - ) -> xr.DataArray: - """Combine per-(period, scenario) slices into a multi-dimensional DataArray. - - Generic version that works with any base dimension (not just 'time'). - - Args: - slices: Dict mapping (period, scenario) tuples to DataArrays. - base_dims: Base dimensions of each slice (e.g., ['original_cluster'] or ['original_time']). - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). - name: Name for the resulting DataArray. - - Returns: - DataArray with dimensions [base_dims..., period?, scenario?]. - """ - first_key = (periods[0], scenarios[0]) - has_periods = periods != [None] - has_scenarios = scenarios != [None] - - # Simple case: no period/scenario dimensions - if not has_periods and not has_scenarios: - return slices[first_key].rename(name) - - # Multi-dimensional: use xr.concat to stack along period/scenario dims - # Use join='outer' to handle cases where different periods/scenarios have different - # coordinate values (e.g., different time_series after drop_constant_arrays) - if has_periods and has_scenarios: - # Stack scenarios first, then periods - period_arrays = [] - for p in periods: - scenario_arrays = [slices[(p, s)] for s in scenarios] - period_arrays.append( - xr.concat( - scenario_arrays, dim=pd.Index(scenarios, name='scenario'), join='outer', fill_value=np.nan - ) - ) - result = xr.concat(period_arrays, dim=pd.Index(periods, name='period'), join='outer', fill_value=np.nan) - elif has_periods: - result = xr.concat( - [slices[(p, None)] for p in periods], - dim=pd.Index(periods, name='period'), - join='outer', - fill_value=np.nan, - ) - else: - result = xr.concat( - [slices[(None, s)] for s in scenarios], - dim=pd.Index(scenarios, name='scenario'), - join='outer', - fill_value=np.nan, - ) - - # Put base dimension first (standard order) - result = result.transpose(base_dims[0], ...) 
- - return result.rename(name) - - @staticmethod - def _combine_slices_to_dataarray_2d( - slices: dict[tuple, xr.DataArray], - attrs: dict, - periods: list, - scenarios: list, - ) -> xr.DataArray: - """Combine per-(period, scenario) slices into a multi-dimensional DataArray with (cluster, time) dims. - - Args: - slices: Dict mapping (period, scenario) tuples to DataArrays with (cluster, time) dims. - attrs: Attributes to assign to the result. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). - - Returns: - DataArray with dimensions (cluster, time, period?, scenario?). - """ - first_key = (periods[0], scenarios[0]) - has_periods = periods != [None] - has_scenarios = scenarios != [None] - - # Simple case: no period/scenario dimensions - if not has_periods and not has_scenarios: - return slices[first_key].assign_attrs(attrs) - - # Multi-dimensional: use xr.concat to stack along period/scenario dims - if has_periods and has_scenarios: - # Stack scenarios first, then periods - period_arrays = [] - for p in periods: - scenario_arrays = [slices[(p, s)] for s in scenarios] - period_arrays.append(xr.concat(scenario_arrays, dim=pd.Index(scenarios, name='scenario'))) - result = xr.concat(period_arrays, dim=pd.Index(periods, name='period')) - elif has_periods: - result = xr.concat([slices[(p, None)] for p in periods], dim=pd.Index(periods, name='period')) - else: - result = xr.concat([slices[(None, s)] for s in scenarios], dim=pd.Index(scenarios, name='scenario')) - - # Put cluster and time first (standard order for clustered data) - result = result.transpose('cluster', 'time', ...) - - return result.assign_attrs(attrs) - def _validate_for_expansion(self) -> Clustering: """Validate FlowSystem can be expanded and return clustering info. From bedc0e4e6260edeb7a1253bda16c5f6b533581a6 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:49:53 +0100 Subject: [PATCH 02/15] The refactoring is complete. Here's a summary of what changed: dim_iterator.py - Simplified to a single helper function: def add_slice_dims(da, period=None, scenario=None): """Add period/scenario dims back to a transformed DataArray.""" if period is not None: da = da.expand_dims(period=[period]) if scenario is not None: da = da.expand_dims(scenario=[scenario]) return da The new pattern used throughout the codebase: results = [] for p in periods: for s in scenarios: da = transform_data(...) # Create DataArray for this slice results.append(add_slice_dims(da, period=p, scenario=s)) combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] This removes the complex DimIterator class with its key conversion methods, making the code more straightforward and easier to follow. 
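
For reference, a minimal self-contained sketch of the new pattern (the toy dataset, the `demand` variable, and the "doubling" transform are illustrative placeholders, not code from this repository):

    import numpy as np
    import pandas as pd
    import xarray as xr

    def add_slice_dims(da, period=None, scenario=None):
        """Add period/scenario dims back to a transformed DataArray."""
        if period is not None:
            da = da.expand_dims(period=[period])
        if scenario is not None:
            da = da.expand_dims(scenario=[scenario])
        return da

    # Illustrative dataset with period and scenario dimensions (placeholder data)
    ds = xr.Dataset(
        {'demand': (('time', 'period', 'scenario'), np.arange(24.0).reshape(4, 2, 3))},
        coords={
            'time': pd.date_range('2024-01-01', periods=4, freq='h'),
            'period': [2024, 2025],
            'scenario': ['low', 'base', 'high'],
        },
    )

    results = []
    for p in ds.period.values:
        for s in ds.scenario.values:
            ds_slice = ds.sel(period=p, scenario=s, drop=True)
            da = ds_slice['demand'] * 2.0  # stand-in for the per-slice transform
            results.append(add_slice_dims(da, period=p, scenario=s))

    combined = xr.combine_by_coords(results) if len(results) > 1 else results[0]
    # combine_by_coords returns a Dataset when the DataArrays are named
    demand = combined['demand'].transpose('time', 'period', 'scenario')
    print(demand.shape)  # (4, 2, 3)

Because expand_dims turns each slice back into a length-1 period/scenario entry, xr.combine_by_coords can reassemble the full cube without any key bookkeeping; a final transpose restores the desired dimension order.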
--- flixopt/clustering/base.py | 42 ++- flixopt/dim_iterator.py | 469 ++-------------------------- flixopt/transform_accessor.py | 169 +++++----- tests/test_cluster_reduce_expand.py | 83 ----- tests/test_clustering_io.py | 8 +- 5 files changed, 163 insertions(+), 608 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 97c4c3097..646d8f2e2 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -27,7 +27,7 @@ from ..plot_result import PlotResult from ..statistics_accessor import SelectType -from ..dim_iterator import DimIterator +from ..dim_iterator import add_slice_dims from ..statistics_accessor import _build_color_kwargs @@ -164,14 +164,6 @@ def coords(self) -> dict[str, list]: """ return {dim: self._get_dim_values(dim) for dim in self._dim_names} - @functools.cached_property - def _iterator(self) -> DimIterator: - """DimIterator for this ClusteringResults (cached).""" - return DimIterator( - periods=self._get_dim_values('period'), - scenarios=self._get_dim_values('scenario'), - ) - def sel(self, **kwargs: Any) -> TsamClusteringResult: """Select result by dimension labels (xarray-like). @@ -491,8 +483,33 @@ def _build_property_array( name: str | None = None, ) -> xr.DataArray: """Build a DataArray property, handling both single and multi-dimensional cases.""" - slices = {key: get_data(cr) for key, cr in self._results.items()} - return self._iterator.combine_arrays(slices, base_dims, base_coords, name) + results = [] + for key, cr in self._results.items(): + data = get_data(cr) + coords = base_coords if base_coords else {} + da = xr.DataArray(data, dims=base_dims, coords=coords, name=name) + + # Extract period/scenario from key based on dim_names + period = None + scenario = None + for i, dim_name in enumerate(self._dim_names): + if dim_name == 'period': + period = key[i] + elif dim_name == 'scenario': + scenario = key[i] + + results.append(add_slice_dims(da, period=period, scenario=scenario)) + + if len(results) == 1: + return results[0] + + combined = xr.combine_by_coords(results) + # combine_by_coords returns a Dataset when DataArrays have names, extract the DataArray + if isinstance(combined, xr.Dataset): + combined = combined[name] + # Ensure correct dimension order: base_dims first, then period/scenario + dim_order = list(base_dims) + self._dim_names + return combined.transpose(*dim_order) @staticmethod def _key_to_str(key: tuple) -> str: @@ -543,7 +560,8 @@ def apply(self, data: xr.Dataset) -> AggregationResults: results = {} for key, cr in self._results.items(): - selector = self._iterator.selector_for_key(key) + # Build selector from key based on dim_names + selector = {dim_name: key[i] for i, dim_name in enumerate(self._dim_names)} data_slice = data.sel(**selector, drop=True) if selector else data # Drop constant arrays and convert to DataFrame diff --git a/flixopt/dim_iterator.py b/flixopt/dim_iterator.py index 27d13c366..224f86b60 100644 --- a/flixopt/dim_iterator.py +++ b/flixopt/dim_iterator.py @@ -1,454 +1,49 @@ """ -Dimension iteration utilities for period/scenario slicing patterns. +Helper for period/scenario dimension handling. -This module provides `DimIterator`, a helper class that eliminates repetitive -boilerplate when working with datasets that may have period and/or scenario -dimensions. +Provides a simple utility for adding period/scenario dimensions back to +DataArrays after transformation. 
""" from __future__ import annotations from typing import TYPE_CHECKING, Any -import numpy as np -import pandas as pd -import xarray as xr - if TYPE_CHECKING: - from collections.abc import Iterator + import xarray as xr - from .flow_system import FlowSystem +def add_slice_dims( + da: xr.DataArray, + period: Any = None, + scenario: Any = None, +) -> xr.DataArray: + """Add period/scenario dimensions back to a transformed DataArray. -class DimIterator: - """Handles period/scenario iteration and combination patterns. + After selecting a slice with `ds.sel(period=p, scenario=s, drop=True)` + and transforming it, use this to re-add the period/scenario coordinates + so results can be combined with `xr.combine_by_coords`. - Eliminates the need for: - - ``[None]`` sentinel values for missing dimensions - - 4-way if/elif branching for period/scenario combinations - - Manual key conversion between internal and external formats + Args: + da: DataArray without period/scenario dimensions. + period: Period value to add, or None to skip. + scenario: Scenario value to add, or None to skip. - Keys returned by this class are tuples that match ``dims`` order: - - No dims: ``()`` - - Period only: ``(period_value,)`` - - Scenario only: ``(scenario_value,)`` - - Both: ``(period_value, scenario_value)`` + Returns: + DataArray with period/scenario dims added (as length-1 dims). Example: - >>> iterator = DimIterator.from_dataset(ds) - >>> - >>> # Iteration - >>> results = {} - >>> for key, ds_slice in iterator.iter_slices(ds): - ... results[key] = transform(ds_slice) - >>> - >>> # Combination - >>> combined = iterator.combine(results, base_dims=['cluster', 'time']) - - Attributes: - dims: Tuple of dimension names present (e.g., ``('period', 'scenario')``). - coords: Dict mapping dimension names to coordinate values. + >>> results = [] + >>> for p in periods: + ... for s in scenarios: + ... selector = {k: v for k, v in [('period', p), ('scenario', s)] if v is not None} + ... ds_slice = ds.sel(**selector, drop=True) if selector else ds + ... result = transform(ds_slice) + ... results.append(add_slice_dims(result, period=p, scenario=s)) + >>> combined = xr.combine_by_coords(results) """ - - def __init__( - self, - periods: list | None = None, - scenarios: list | None = None, - ): - """Initialize with explicit dimension values. - - Args: - periods: Period coordinate values, or None if no period dimension. - scenarios: Scenario coordinate values, or None if no scenario dimension. - """ - self._periods = periods - self._scenarios = scenarios - - # Build dim_names for external keys (no None sentinels) - self._dim_names: list[str] = [] - if periods is not None: - self._dim_names.append('period') - if scenarios is not None: - self._dim_names.append('scenario') - - # ========================================================================== - # Factory methods - # ========================================================================== - - @classmethod - def from_dataset(cls, ds: xr.Dataset) -> DimIterator: - """Create iterator from dataset's period/scenario coordinates. - - Args: - ds: Dataset with optional 'period' and/or 'scenario' coordinates. - - Returns: - DimIterator configured for the dataset's dimensions. 
- """ - periods = list(ds.period.values) if 'period' in ds.coords else None - scenarios = list(ds.scenario.values) if 'scenario' in ds.coords else None - return cls(periods=periods, scenarios=scenarios) - - @classmethod - def from_flow_system(cls, fs: FlowSystem) -> DimIterator: - """Create iterator from FlowSystem's period/scenario structure. - - Args: - fs: FlowSystem with optional periods and/or scenarios. - - Returns: - DimIterator configured for the FlowSystem's dimensions. - """ - return cls( - periods=list(fs.periods) if fs.periods is not None else None, - scenarios=list(fs.scenarios) if fs.scenarios is not None else None, - ) - - @classmethod - def from_sentinel_lists(cls, periods: list, scenarios: list) -> DimIterator: - """Create iterator from lists that may contain [None] sentinel. - - This handles the common pattern where code uses [None] to indicate - no dimension exists: - - periods = list(fs.periods) if has_periods else [None] - scenarios = list(fs.scenarios) if has_scenarios else [None] - - Args: - periods: Period values, or [None] if no period dimension. - scenarios: Scenario values, or [None] if no scenario dimension. - - Returns: - DimIterator configured for the actual dimensions present. - """ - return cls( - periods=periods if periods != [None] else None, - scenarios=scenarios if scenarios != [None] else None, - ) - - # ========================================================================== - # Properties - # ========================================================================== - - @property - def dims(self) -> tuple[str, ...]: - """Dimension names as tuple (xarray-like).""" - return tuple(self._dim_names) - - @property - def coords(self) -> dict[str, list]: - """Coordinate values for each dimension. - - Returns: - Dict mapping dimension names to lists of coordinate values. - Empty dict if no dimensions. - """ - result = {} - if self._periods is not None: - result['period'] = self._periods - if self._scenarios is not None: - result['scenario'] = self._scenarios - return result - - @property - def has_periods(self) -> bool: - """Whether period dimension exists.""" - return self._periods is not None - - @property - def has_scenarios(self) -> bool: - """Whether scenario dimension exists.""" - return self._scenarios is not None - - @property - def n_slices(self) -> int: - """Total number of slices (product of dimension sizes).""" - n = 1 - if self._periods is not None: - n *= len(self._periods) - if self._scenarios is not None: - n *= len(self._scenarios) - return n - - # ========================================================================== - # Iteration - # ========================================================================== - - def iter_slices( - self, - ds: xr.Dataset, - drop: bool = True, - ) -> Iterator[tuple[tuple, xr.Dataset]]: - """Iterate over (period, scenario) slices of a dataset. - - Args: - ds: Dataset with optional period/scenario dimensions. - drop: If True (default), drop period/scenario dims from sliced datasets. - If False, keep them as scalar coordinates. - - Yields: - ``(key, ds_slice)`` tuples where key matches ``dims`` order. - For no dims, key is ``()``. - - Example: - >>> for key, slice_ds in iterator.iter_slices(ds): - ... print(f'Processing {key}') - ... 
result = process(slice_ds) - """ - periods = self._periods if self._periods is not None else [None] - scenarios = self._scenarios if self._scenarios is not None else [None] - - for p in periods: - for s in scenarios: - selector = self._build_selector(p, s) - key = self._to_key(p, s) - if selector: - yield key, ds.sel(**selector, drop=drop) - else: - yield key, ds - - def iter_keys(self) -> Iterator[tuple]: - """Iterate over keys only (no data access). - - Yields: - Key tuples matching ``dims`` order. - - Example: - >>> for key in iterator.iter_keys(): - ... results[key] = compute_something() - """ - periods = self._periods if self._periods is not None else [None] - scenarios = self._scenarios if self._scenarios is not None else [None] - - for p in periods: - for s in scenarios: - yield self._to_key(p, s) - - # ========================================================================== - # Key conversion - # ========================================================================== - - def _build_selector(self, period: Any, scenario: Any) -> dict: - """Build xarray selector dict (only for non-None values).""" - selector = {} - if period is not None: - selector['period'] = period - if scenario is not None: - selector['scenario'] = scenario - return selector - - def _to_key(self, period: Any, scenario: Any) -> tuple: - """Convert (period, scenario) to external key matching dims.""" - parts = [] - if self._periods is not None: - parts.append(period) - if self._scenarios is not None: - parts.append(scenario) - return tuple(parts) - - def _from_key(self, key: tuple) -> tuple[Any, Any]: - """Convert external key back to (period, scenario) with Nones. - - Returns: - Tuple of (period, scenario) where missing dims are None. - """ - period = None - scenario = None - idx = 0 - if self._periods is not None: - period = key[idx] - idx += 1 - if self._scenarios is not None: - scenario = key[idx] - return period, scenario - - def selector_for_key(self, key: tuple) -> dict: - """Get xarray selector dict for an external key. - - Args: - key: Key tuple as returned by ``iter_slices`` or ``iter_keys``. - - Returns: - Dict suitable for ``ds.sel(**selector)``. - - Example: - >>> selector = iterator.selector_for_key(key) - >>> ds_slice = ds.sel(**selector) - """ - period, scenario = self._from_key(key) - return self._build_selector(period, scenario) - - def key_from_values(self, period: Any, scenario: Any) -> tuple: - """Convert (period, scenario) values to a key tuple. - - This is useful when you have separate period/scenario values - (possibly None) and need to create a key for use with combine(). - - Args: - period: Period value, or None if no period dimension. - scenario: Scenario value, or None if no scenario dimension. - - Returns: - Key tuple matching ``dims`` order. - - Example: - >>> iterator = DimIterator(periods=[2024, 2025]) - >>> iterator.key_from_values(2024, None) - (2024,) - """ - return self._to_key(period, scenario) - - # ========================================================================== - # Combination - # ========================================================================== - - def combine( - self, - slices: dict[tuple, xr.DataArray], - base_dims: list[str], - name: str | None = None, - attrs: dict | None = None, - join: str = 'exact', - fill_value: Any = None, - ) -> xr.DataArray: - """Combine per-slice DataArrays into multi-dimensional DataArray. - - Args: - slices: Dict mapping keys (from ``iter_slices``) to DataArrays. - All DataArrays must have the same ``base_dims``. 
- base_dims: Base dimensions of each slice (e.g., ``['cluster', 'time']``). - Used for final transpose to ensure consistent dimension order. - name: Optional name for resulting DataArray. - attrs: Optional attributes for resulting DataArray. - join: How to handle coordinate differences between slices. - - ``'exact'``: Coordinates must match exactly (default). - - ``'outer'``: Union of coordinates, fill missing with fill_value. - fill_value: Value to use for missing entries when ``join='outer'``. - Only used when coordinates differ between slices. - - Returns: - DataArray with dims ``[*base_dims, period?, scenario?]``. - - Example: - >>> results = {key: process(s) for key, s in iterator.iter_slices(ds)} - >>> combined = iterator.combine(results, base_dims=['cluster', 'time']) - """ - # Build concat kwargs - concat_kwargs = {'join': join} - if fill_value is not None: - concat_kwargs['fill_value'] = fill_value - - if not self._dim_names: - # No extra dims - return single slice - result = slices[()] - elif self.has_periods and self.has_scenarios: - # Both dimensions: concat scenarios first, then periods - period_arrays = [] - for p in self._periods: - scenario_arrays = [slices[self._to_key(p, s)] for s in self._scenarios] - period_arrays.append( - xr.concat( - scenario_arrays, - dim=pd.Index(self._scenarios, name='scenario'), - **concat_kwargs, - ) - ) - result = xr.concat( - period_arrays, - dim=pd.Index(self._periods, name='period'), - **concat_kwargs, - ) - elif self.has_periods: - result = xr.concat( - [slices[(p,)] for p in self._periods], - dim=pd.Index(self._periods, name='period'), - **concat_kwargs, - ) - else: # has_scenarios only - result = xr.concat( - [slices[(s,)] for s in self._scenarios], - dim=pd.Index(self._scenarios, name='scenario'), - **concat_kwargs, - ) - - # Transpose to standard order: base_dims first, then period/scenario - result = result.transpose(*base_dims, ...) - - if name: - result = result.rename(name) - if attrs: - result = result.assign_attrs(attrs) - return result - - def combine_arrays( - self, - slices: dict[tuple, np.ndarray], - base_dims: list[str], - base_coords: dict[str, Any] | None = None, - name: str | None = None, - attrs: dict | None = None, - ) -> xr.DataArray: - """Combine per-slice numpy arrays into multi-dimensional DataArray. - - More efficient than ``combine()`` when working with raw numpy arrays, - as it avoids intermediate DataArray creation. - - Args: - slices: Dict mapping keys to numpy arrays. All arrays must have - the same shape and dtype. - base_dims: Dimension names for the base array axes. - base_coords: Optional coordinates for base dimensions. - name: Optional name for resulting DataArray. - attrs: Optional attributes for resulting DataArray. - - Returns: - DataArray with dims ``[*base_dims, period?, scenario?]``. 
- - Example: - >>> arrays = {key: np.array([1, 2, 3]) for key in iterator.iter_keys()} - >>> da = iterator.combine_arrays(arrays, base_dims=['cluster']) - """ - base_coords = base_coords or {} - - if not self._dim_names: - # No extra dims - wrap single array - return xr.DataArray( - slices[()], - dims=base_dims, - coords=base_coords, - name=name, - attrs=attrs or {}, - ) - - # Build full shape: base_shape + extra_dims_shape - first = next(iter(slices.values())) - extra_shape = [len(coords) for coords in self.coords.values()] - shape = list(first.shape) + extra_shape - data = np.empty(shape, dtype=first.dtype) - - # Fill using np.ndindex for flexibility with any number of extra dims - for combo in np.ndindex(*extra_shape): - key = tuple(list(self.coords.values())[i][idx] for i, idx in enumerate(combo)) - data[(...,) + combo] = slices[key] - - return xr.DataArray( - data, - dims=base_dims + self._dim_names, - coords={**base_coords, **self.coords}, - name=name, - attrs=attrs or {}, - ) - - # ========================================================================== - # Dunder methods - # ========================================================================== - - def __repr__(self) -> str: - if not self.dims: - return 'DimIterator(dims=())' - coords_str = ', '.join(f'{k}: {len(v)}' for k, v in self.coords.items()) - return f'DimIterator(dims={self.dims}, coords=({coords_str}))' - - def __len__(self) -> int: - """Number of slices.""" - return self.n_slices + if period is not None: + da = da.expand_dims(period=[period]) + if scenario is not None: + da = da.expand_dims(scenario=[scenario]) + return da diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 4423d1031..dde261e41 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -16,7 +16,7 @@ import pandas as pd import xarray as xr -from .dim_iterator import DimIterator +from .dim_iterator import add_slice_dims from .modeling import _scalar_safe_reduce from .structure import EXPAND_DIVIDE, EXPAND_FIRST_TIMESTEP, EXPAND_INTERPOLATE, VariableCategory @@ -161,19 +161,16 @@ def _build_cluster_weight_da( Returns: DataArray with dims [cluster] or [cluster, period?, scenario?]. """ + results = [] + for p in periods: + for s in scenarios: + occurrences = cluster_occurrences_all[(p, s)] + weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) + da = xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) + results.append(add_slice_dims(da, period=p, scenario=s)) - def _weight_for_key(key: tuple) -> xr.DataArray: - occurrences = cluster_occurrences_all[key] - # Missing clusters contribute zero weight (not 1) - weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) - return xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) - - iterator = DimIterator.from_sentinel_lists(periods, scenarios) - # Convert old (period, scenario) keys to DimIterator keys - weight_slices = {iterator.key_from_values(p, s): _weight_for_key((p, s)) for p, s in cluster_occurrences_all} - return iterator.combine( - weight_slices, base_dims=['cluster'], name='cluster_weight', join='outer', fill_value=np.nan - ) + combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] + return combined.transpose('cluster', ...).rename('cluster_weight') def _build_typical_das( self, @@ -262,12 +259,9 @@ def _build_segment_durations_da( DataArray with dims [cluster, time] or [cluster, time, period?, scenario?] 
containing duration in hours for each segment. """ - iterator = DimIterator.from_sentinel_lists(periods, scenarios) - segment_duration_slices: dict[tuple, xr.DataArray] = {} - + results = [] for (p, s), tsam_result in tsam_aggregation_results.items(): # segment_durations is tuple of tuples: ((dur1, dur2, ...), (dur1, dur2, ...), ...) - # Each inner tuple is durations for one cluster seg_durs = tsam_result.segment_durations # Build 2D array (cluster, segment) of durations in hours @@ -275,22 +269,17 @@ def _build_segment_durations_da( for cluster_id in range(actual_n_clusters): cluster_seg_durs = seg_durs[cluster_id] for seg_id in range(n_segments): - # Duration in hours = number of original timesteps * dt data[cluster_id, seg_id] = cluster_seg_durs[seg_id] * dt - segment_duration_slices[iterator.key_from_values(p, s)] = xr.DataArray( + da = xr.DataArray( data, dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) + results.append(add_slice_dims(da, period=p, scenario=s)) - return iterator.combine( - segment_duration_slices, - base_dims=['cluster', 'time'], - name='timestep_duration', - join='outer', - fill_value=np.nan, - ) + combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] + return combined.transpose('cluster', 'time', ...).rename('timestep_duration') def _build_clustering_metrics( self, @@ -331,30 +320,29 @@ def _build_clustering_metrics( ) # Multi-dim case - iterator = DimIterator.from_sentinel_lists(periods, scenarios) sample_df = next(iter(non_empty_metrics.values())) metric_names = list(sample_df.columns) data_vars = {} for metric in metric_names: - slices = {} + results = [] for (p, s), df in clustering_metrics_all.items(): - key = iterator.key_from_values(p, s) if df.empty: - slices[key] = xr.DataArray( + da = xr.DataArray( np.full(len(sample_df.index), np.nan), dims=['time_series'], coords={'time_series': list(sample_df.index)}, ) else: - slices[key] = xr.DataArray( + da = xr.DataArray( df[metric].values, dims=['time_series'], coords={'time_series': list(df.index)}, ) - data_vars[metric] = iterator.combine( - slices, base_dims=['time_series'], name=metric, join='outer', fill_value=np.nan - ) + results.append(add_slice_dims(da, period=p, scenario=s)) + + combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] + data_vars[metric] = combined.transpose('time_series', ...).rename(metric) return xr.Dataset(data_vars) @@ -542,13 +530,19 @@ def _build_reduced_dataset( """ from .core import TimeSeriesData - iterator = DimIterator.from_sentinel_lists(periods, scenarios) ds_new_vars = {} # Use ds.variables to avoid _construct_dataarray overhead variables = ds.variables coord_cache = {k: ds.coords[k].values for k in ds.coords} + # Determine dimension order for combining + dim_order = ['cluster', 'time'] + if periods != [None]: + dim_order.append('period') + if scenarios != [None]: + dim_order.append('scenario') + for name in ds.data_vars: var = variables[name] if 'time' not in var.dims: @@ -581,7 +575,7 @@ def _build_reduced_dataset( # Partial typical slices: fill missing keys with constant values # For multi-period/scenario data, we need to select the right slice for each key - # Exclude 'period' and 'scenario' - they're handled by iterator.combine + # Exclude 'period' and 'scenario' - they're handled by combine other_dims = [d for d in var.dims if d not in ('time', 'period', 'scenario')] other_shape = [var.sizes[d] for d in other_dims] new_shape = [actual_n_clusters, n_time_points] + other_shape @@ -591,25 
+585,23 @@ def _build_reduced_dataset( if dim in coord_cache: new_coords[dim] = coord_cache[dim] - # Build filled slices dict: use typical where available, constant otherwise - filled_slices = {} - for p in iterator._periods or [None]: - for s in iterator._scenarios or [None]: - old_key = (p, s) - new_key = iterator.key_from_values(p, s) - if old_key in typical_das[name]: - filled_slices[new_key] = typical_das[name][old_key] + # Build slices: use typical where available, constant otherwise + results = [] + for p in periods: + for s in scenarios: + key = (p, s) + if key in typical_das[name]: + da = typical_das[name][key] else: # Select the specific period/scenario slice, then reshape - selector = iterator._build_selector(p, s) + selector = {} + if p is not None and 'period' in var.dims: + selector['period'] = p + if s is not None and 'scenario' in var.dims: + selector['scenario'] = s # Select per-key slice if needed, otherwise use full variable - if selector: - # Only include dims that exist in the variable - var_selector = {k: v for k, v in selector.items() if k in var.dims} - var_slice = ds[name].sel(**var_selector, drop=True) if var_selector else ds[name] - else: - var_slice = ds[name] + var_slice = ds[name].sel(**selector, drop=True) if selector else ds[name] # Now slice time and reshape time_idx = var_slice.dims.index('time') @@ -618,24 +610,51 @@ def _build_reduced_dataset( sliced_values = var_slice.values[tuple(slices_list)] reshaped_constant = sliced_values.reshape(new_shape) - filled_slices[new_key] = xr.DataArray( + da = xr.DataArray( reshaped_constant, dims=['cluster', 'time'] + other_dims, coords=new_coords, ) - da = iterator.combine(filled_slices, base_dims=['cluster', 'time'], attrs=var.attrs) + results.append(add_slice_dims(da, period=p, scenario=s)) + + if len(results) == 1: + combined = results[0] + else: + combined = xr.combine_by_coords(results) + # combine_by_coords returns Dataset when DataArrays have names + if isinstance(combined, xr.Dataset): + combined = combined[name] if name in combined else list(combined.data_vars.values())[0] + # Add other_dims to dim_order for transpose + full_dim_order = dim_order + other_dims + combined = combined.transpose(*full_dim_order) + + combined = combined.assign_attrs(var.attrs) if var.attrs.get('__timeseries_data__', False): - da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) - ds_new_vars[name] = da + combined = TimeSeriesData.from_dataarray(combined) + ds_new_vars[name] = combined else: # Time-varying: combine per-(period, scenario) slices - # Convert old-style keys to DimIterator keys - converted_slices = {iterator.key_from_values(p, s): da for (p, s), da in typical_das[name].items()} - da = iterator.combine(converted_slices, base_dims=['cluster', 'time'], attrs=var.attrs) + results = [] + for (p, s), da in typical_das[name].items(): + results.append(add_slice_dims(da, period=p, scenario=s)) + + if len(results) == 1: + combined = results[0] + else: + combined = xr.combine_by_coords(results) + # combine_by_coords returns Dataset when DataArrays have names + if isinstance(combined, xr.Dataset): + combined = combined[name] if name in combined else list(combined.data_vars.values())[0] + # Add other dims from the first result for proper ordering + other_dims = [d for d in results[0].dims if d not in ('cluster', 'time', 'period', 'scenario')] + full_dim_order = dim_order + other_dims + combined = combined.transpose(*full_dim_order) + + combined = combined.assign_attrs(var.attrs) if var.attrs.get('__timeseries_data__', 
False): - da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) - ds_new_vars[name] = da + combined = TimeSeriesData.from_dataarray(combined) + ds_new_vars[name] = combined # Copy attrs but remove cluster_weight new_attrs = dict(ds.attrs) @@ -658,16 +677,26 @@ def _build_cluster_assignments_da( Returns: DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. """ - iterator = DimIterator.from_sentinel_lists(periods, scenarios) - slices = { - iterator.key_from_values(p, s): xr.DataArray( - cluster_assignmentss[(p, s)], dims=['original_cluster'], name='cluster_assignments' - ) - for p, s in cluster_assignmentss - } - return iterator.combine( - slices, base_dims=['original_cluster'], name='cluster_assignments', join='outer', fill_value=np.nan - ) + results = [] + for p in periods: + for s in scenarios: + da = xr.DataArray(cluster_assignmentss[(p, s)], dims=['original_cluster'], name='cluster_assignments') + results.append(add_slice_dims(da, period=p, scenario=s)) + + if len(results) == 1: + return results[0] + + combined = xr.combine_by_coords(results, join='outer', fill_value=np.nan) + # combine_by_coords returns a Dataset when DataArrays have names, extract the DataArray + if isinstance(combined, xr.Dataset): + combined = combined['cluster_assignments'] + # Ensure correct dimension order: original_cluster first, then period/scenario + dim_order = ['original_cluster'] + if periods != [None]: + dim_order.append('period') + if scenarios != [None]: + dim_order.append('scenario') + return combined.transpose(*dim_order) def sel( self, diff --git a/tests/test_cluster_reduce_expand.py b/tests/test_cluster_reduce_expand.py index cb6f5b7ff..e3f4a8d72 100644 --- a/tests/test_cluster_reduce_expand.py +++ b/tests/test_cluster_reduce_expand.py @@ -1682,86 +1682,3 @@ def test_startup_timing_preserved_non_segmented(self, solver_fixture, timesteps_ assert abs(clustered_val - expanded_val) < 1e-6, ( f'Mismatch at day {orig_day}, hour {hour}: clustered={clustered_val}, expanded={expanded_val}' ) - - -class TestCombineSlices: - """Tests for the combine_slices utility function.""" - - def test_single_dim(self): - """Test combining slices with a single extra dimension.""" - from flixopt.clustering.base import combine_slices - - slices = { - ('A',): np.array([1.0, 2.0, 3.0]), - ('B',): np.array([4.0, 5.0, 6.0]), - } - result = combine_slices( - slices, - extra_dims=['x'], - dim_coords={'x': ['A', 'B']}, - output_dim='time', - output_coord=[0, 1, 2], - ) - - assert result.dims == ('time', 'x') - assert result.shape == (3, 2) - assert result.sel(x='A').values.tolist() == [1.0, 2.0, 3.0] - assert result.sel(x='B').values.tolist() == [4.0, 5.0, 6.0] - - def test_two_dims(self): - """Test combining slices with two extra dimensions.""" - from flixopt.clustering.base import combine_slices - - slices = { - ('P1', 'base'): np.array([1.0, 2.0]), - ('P1', 'high'): np.array([3.0, 4.0]), - ('P2', 'base'): np.array([5.0, 6.0]), - ('P2', 'high'): np.array([7.0, 8.0]), - } - result = combine_slices( - slices, - extra_dims=['period', 'scenario'], - dim_coords={'period': ['P1', 'P2'], 'scenario': ['base', 'high']}, - output_dim='time', - output_coord=[0, 1], - ) - - assert result.dims == ('time', 'period', 'scenario') - assert result.shape == (2, 2, 2) - assert result.sel(period='P1', scenario='base').values.tolist() == [1.0, 2.0] - assert result.sel(period='P2', scenario='high').values.tolist() == [7.0, 8.0] - - def test_attrs_propagation(self): - """Test that attrs are propagated to the 
result.""" - from flixopt.clustering.base import combine_slices - - slices = {('A',): np.array([1.0, 2.0])} - result = combine_slices( - slices, - extra_dims=['x'], - dim_coords={'x': ['A']}, - output_dim='time', - output_coord=[0, 1], - attrs={'units': 'kW', 'description': 'power'}, - ) - - assert result.attrs['units'] == 'kW' - assert result.attrs['description'] == 'power' - - def test_datetime_coords(self): - """Test with pandas DatetimeIndex as output coordinates.""" - from flixopt.clustering.base import combine_slices - - time_index = pd.date_range('2020-01-01', periods=3, freq='h') - slices = {('A',): np.array([1.0, 2.0, 3.0])} - result = combine_slices( - slices, - extra_dims=['x'], - dim_coords={'x': ['A']}, - output_dim='time', - output_coord=time_index, - ) - - assert result.dims == ('time', 'x') - assert len(result.coords['time']) == 3 - assert result.coords['time'][0].values == time_index[0] diff --git a/tests/test_clustering_io.py b/tests/test_clustering_io.py index e3bfa6c1d..0e2200885 100644 --- a/tests/test_clustering_io.py +++ b/tests/test_clustering_io.py @@ -237,12 +237,8 @@ def test_clustering_roundtrip_preserves_scenarios(self, system_with_scenarios): ds = fs_clustered.to_dataset(include_solution=False) fs_restored = fx.FlowSystem.from_dataset(ds) - # Scenarios should be preserved in the FlowSystem itself - pd.testing.assert_index_equal( - fs_restored.scenarios, - pd.Index(['Low', 'High'], name='scenario'), - check_names=False, - ) + # Scenarios should be preserved in the FlowSystem itself (order may differ due to coordinate sorting) + assert set(fs_restored.scenarios) == {'Low', 'High'} class TestClusteringJsonExport: From f14c5a67f94af2aa625bab7ea503859a17109e73 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:52:30 +0100 Subject: [PATCH 03/15] simplify by iterating over the key and dim_names directly --- flixopt/clustering/base.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 646d8f2e2..49d6594a7 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -27,7 +27,6 @@ from ..plot_result import PlotResult from ..statistics_accessor import SelectType -from ..dim_iterator import add_slice_dims from ..statistics_accessor import _build_color_kwargs @@ -486,30 +485,19 @@ def _build_property_array( results = [] for key, cr in self._results.items(): data = get_data(cr) - coords = base_coords if base_coords else {} - da = xr.DataArray(data, dims=base_dims, coords=coords, name=name) - - # Extract period/scenario from key based on dim_names - period = None - scenario = None - for i, dim_name in enumerate(self._dim_names): - if dim_name == 'period': - period = key[i] - elif dim_name == 'scenario': - scenario = key[i] - - results.append(add_slice_dims(da, period=period, scenario=scenario)) + da = xr.DataArray(data, dims=base_dims, coords=base_coords or {}, name=name) + # Add dims from key: key[i] corresponds to dim_names[i] + for dim_name, coord_val in zip(self._dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + results.append(da) if len(results) == 1: return results[0] combined = xr.combine_by_coords(results) - # combine_by_coords returns a Dataset when DataArrays have names, extract the DataArray if isinstance(combined, xr.Dataset): combined = combined[name] - # Ensure correct dimension order: base_dims first, then period/scenario - dim_order = list(base_dims) + 
self._dim_names - return combined.transpose(*dim_order) + return combined.transpose(*base_dims, *self._dim_names) @staticmethod def _key_to_str(key: tuple) -> str: From e6e757ef4b79f21c268cb057164f533ed9b3640d Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:59:34 +0100 Subject: [PATCH 04/15] simplify by iterating over the key and dim_names directly --- flixopt/transform_accessor.py | 171 ++++++++++++---------------------- 1 file changed, 62 insertions(+), 109 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index dde261e41..fb1811310 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -145,32 +145,33 @@ def _build_cluster_weight_da( cluster_occurrences_all: dict[tuple, dict], n_clusters: int, cluster_coords: np.ndarray, - periods: list, - scenarios: list, + dim_names: list[str], ) -> xr.DataArray: """Build cluster_weight DataArray from occurrence counts. Args: - cluster_occurrences_all: Dict mapping (period, scenario) tuples to - dicts of {cluster_id: occurrence_count}. + cluster_occurrences_all: Dict mapping key tuples to {cluster_id: count}. n_clusters: Number of clusters. cluster_coords: Cluster coordinate values. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). + dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). Returns: - DataArray with dims [cluster] or [cluster, period?, scenario?]. + DataArray with dims [cluster, *dim_names]. """ results = [] - for p in periods: - for s in scenarios: - occurrences = cluster_occurrences_all[(p, s)] - weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) - da = xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) - results.append(add_slice_dims(da, period=p, scenario=s)) + for key, occurrences in cluster_occurrences_all.items(): + weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) + da = xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) + for dim_name, coord_val in zip(dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + results.append(da) - combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] - return combined.transpose('cluster', ...).rename('cluster_weight') + if len(results) == 1: + return results[0].rename('cluster_weight') + combined = xr.combine_by_coords(results) + if isinstance(combined, xr.Dataset): + combined = list(combined.data_vars.values())[0] + return combined.transpose('cluster', *dim_names).rename('cluster_weight') def _build_typical_das( self, @@ -231,68 +232,59 @@ def _build_typical_das( def _build_segment_durations_da( self, - tsam_aggregation_results: dict[tuple, Any], + aggregation_results: dict[tuple, Any], actual_n_clusters: int, n_segments: int, cluster_coords: np.ndarray, time_coords: pd.RangeIndex, dt: float, - periods: list, - scenarios: list, + dim_names: list[str], ) -> xr.DataArray: """Build timestep_duration DataArray from segment durations. - For segmented systems, each segment represents multiple original timesteps. - The duration is segment_duration_in_original_timesteps * dt (hours per original timestep). - Args: - tsam_aggregation_results: Dict mapping (period, scenario) to tsam results. + aggregation_results: Dict mapping key tuples to tsam results. actual_n_clusters: Number of clusters. n_segments: Number of segments per cluster. 
cluster_coords: Cluster coordinate values. time_coords: Time coordinate values (RangeIndex for segments). dt: Hours per original timestep. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). + dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). Returns: - DataArray with dims [cluster, time] or [cluster, time, period?, scenario?] - containing duration in hours for each segment. + DataArray with dims [cluster, time, *dim_names]. """ results = [] - for (p, s), tsam_result in tsam_aggregation_results.items(): - # segment_durations is tuple of tuples: ((dur1, dur2, ...), (dur1, dur2, ...), ...) + for key, tsam_result in aggregation_results.items(): seg_durs = tsam_result.segment_durations - - # Build 2D array (cluster, segment) of durations in hours data = np.zeros((actual_n_clusters, n_segments)) for cluster_id in range(actual_n_clusters): cluster_seg_durs = seg_durs[cluster_id] for seg_id in range(n_segments): data[cluster_id, seg_id] = cluster_seg_durs[seg_id] * dt - da = xr.DataArray( - data, - dims=['cluster', 'time'], - coords={'cluster': cluster_coords, 'time': time_coords}, - ) - results.append(add_slice_dims(da, period=p, scenario=s)) + da = xr.DataArray(data, dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}) + for dim_name, coord_val in zip(dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + results.append(da) - combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] - return combined.transpose('cluster', 'time', ...).rename('timestep_duration') + if len(results) == 1: + return results[0].rename('timestep_duration') + combined = xr.combine_by_coords(results) + if isinstance(combined, xr.Dataset): + combined = list(combined.data_vars.values())[0] + return combined.transpose('cluster', 'time', *dim_names).rename('timestep_duration') def _build_clustering_metrics( self, clustering_metrics_all: dict[tuple, pd.DataFrame], - periods: list, - scenarios: list, + dim_names: list[str], ) -> xr.Dataset: """Build clustering metrics Dataset from per-slice DataFrames. Args: - clustering_metrics_all: Dict mapping (period, scenario) to metric DataFrames. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). + clustering_metrics_all: Dict mapping key tuples to metric DataFrames. + dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). Returns: Dataset with RMSE, MAE, RMSE_duration metrics. 
@@ -302,12 +294,9 @@ def _build_clustering_metrics( if not non_empty_metrics: return xr.Dataset() - first_key = (periods[0], scenarios[0]) - + # Single slice case if len(clustering_metrics_all) == 1 and len(non_empty_metrics) == 1: - metrics_df = non_empty_metrics.get(first_key) - if metrics_df is None: - metrics_df = next(iter(non_empty_metrics.values())) + metrics_df = next(iter(non_empty_metrics.values())) return xr.Dataset( { col: xr.DataArray( @@ -326,7 +315,7 @@ def _build_clustering_metrics( for metric in metric_names: results = [] - for (p, s), df in clustering_metrics_all.items(): + for key, df in clustering_metrics_all.items(): if df.empty: da = xr.DataArray( np.full(len(sample_df.index), np.nan), @@ -339,10 +328,17 @@ def _build_clustering_metrics( dims=['time_series'], coords={'time_series': list(df.index)}, ) - results.append(add_slice_dims(da, period=p, scenario=s)) + for dim_name, coord_val in zip(dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + results.append(da) - combined = xr.combine_by_coords(results) if len(results) > 1 else results[0] - data_vars[metric] = combined.transpose('time_series', ...).rename(metric) + if len(results) == 1: + combined = results[0] + else: + combined = xr.combine_by_coords(results) + if isinstance(combined, xr.Dataset): + combined = list(combined.data_vars.values())[0] + data_vars[metric] = combined.transpose('time_series', *dim_names).rename(metric) return xr.Dataset(data_vars) @@ -390,23 +386,26 @@ def _build_reduced_flow_system( if has_scenarios: dim_names.append('scenario') - # Build dict keyed by (period?, scenario?) tuples (without None) - aggregation_results: dict[tuple, Any] = {} - for (p, s), result in tsam_aggregation_results.items(): - key_parts = [] + # Helper to convert (p, s) keys to clean keys (without None) + def clean_key(p, s) -> tuple: + parts = [] if has_periods: - key_parts.append(p) + parts.append(p) if has_scenarios: - key_parts.append(s) - key = tuple(key_parts) - aggregation_results[key] = result + parts.append(s) + return tuple(parts) + + # Convert all dicts to use clean keys + aggregation_results = {clean_key(p, s): v for (p, s), v in tsam_aggregation_results.items()} + occurrences = {clean_key(p, s): v for (p, s), v in cluster_occurrences_all.items()} + metrics = {clean_key(p, s): v for (p, s), v in clustering_metrics_all.items()} # Use first result for structure first_key = (periods[0], scenarios[0]) first_tsam = tsam_aggregation_results[first_key] # Build metrics - clustering_metrics = self._build_clustering_metrics(clustering_metrics_all, periods, scenarios) + clustering_metrics = self._build_clustering_metrics(metrics, dim_names) n_reduced_timesteps = len(first_tsam.cluster_representatives) actual_n_clusters = len(first_tsam.cluster_weights) @@ -432,9 +431,7 @@ def _build_reduced_flow_system( ) # Build cluster_weight - cluster_weight = self._build_cluster_weight_da( - cluster_occurrences_all, actual_n_clusters, cluster_coords, periods, scenarios - ) + cluster_weight = self._build_cluster_weight_da(occurrences, actual_n_clusters, cluster_coords, dim_names) # Logging if is_segmented: @@ -467,14 +464,7 @@ def _build_reduced_flow_system( # For segmented systems, build timestep_duration from segment_durations if is_segmented: segment_durations = self._build_segment_durations_da( - tsam_aggregation_results, - actual_n_clusters, - n_segments, - cluster_coords, - time_coords, - dt, - periods, - scenarios, + aggregation_results, actual_n_clusters, n_segments, cluster_coords, time_coords, 
dt, dim_names ) ds_new['timestep_duration'] = segment_durations @@ -661,43 +651,6 @@ def _build_reduced_dataset( new_attrs.pop('cluster_weight', None) return xr.Dataset(ds_new_vars, attrs=new_attrs) - def _build_cluster_assignments_da( - self, - cluster_assignmentss: dict[tuple, np.ndarray], - periods: list, - scenarios: list, - ) -> xr.DataArray: - """Build cluster_assignments DataArray from cluster assignments. - - Args: - cluster_assignmentss: Dict mapping (period, scenario) to cluster assignment arrays. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). - - Returns: - DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. - """ - results = [] - for p in periods: - for s in scenarios: - da = xr.DataArray(cluster_assignmentss[(p, s)], dims=['original_cluster'], name='cluster_assignments') - results.append(add_slice_dims(da, period=p, scenario=s)) - - if len(results) == 1: - return results[0] - - combined = xr.combine_by_coords(results, join='outer', fill_value=np.nan) - # combine_by_coords returns a Dataset when DataArrays have names, extract the DataArray - if isinstance(combined, xr.Dataset): - combined = combined['cluster_assignments'] - # Ensure correct dimension order: original_cluster first, then period/scenario - dim_order = ['original_cluster'] - if periods != [None]: - dim_order.append('period') - if scenarios != [None]: - dim_order.append('scenario') - return combined.transpose(*dim_order) - def sel( self, time: str | slice | list[str] | pd.Timestamp | pd.DatetimeIndex | None = None, From 468862c8865c26056afe2f211701d14496563eca Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:03:30 +0100 Subject: [PATCH 05/15] Deleted files: - flixopt/dim_iterator.py - no longer needed Key changes: 1. _build_typical_das now returns dict[str, xr.DataArray] (pre-combined) instead of dict[str, dict[tuple, xr.DataArray]] (nested by keys) 2. _build_reduced_dataset simplified from ~80 lines to ~35 lines: - Removed partial slice handling (was complex, likely unused) - No longer iterates over periods/scenarios - Just uses pre-combined DataArrays directly 3. All helper methods now use the same pattern: for key, data in results_dict.items(): da = xr.DataArray(...) for dim_name, coord_val in zip(dim_names, key): da = da.expand_dims({dim_name: [coord_val]}) results.append(da) combined = xr.combine_by_coords(results) 4. No more [None] sentinel pattern - keys are clean tuples matching dim_names 5. Removed add_slice_dims - no longer needed since dims are added inline The code is now much simpler: convert keys once at entry, add dims with expand_dims, combine with xr.combine_by_coords --- flixopt/dim_iterator.py | 49 ---------- flixopt/transform_accessor.py | 169 ++++++++-------------------------- 2 files changed, 40 insertions(+), 178 deletions(-) delete mode 100644 flixopt/dim_iterator.py diff --git a/flixopt/dim_iterator.py b/flixopt/dim_iterator.py deleted file mode 100644 index 224f86b60..000000000 --- a/flixopt/dim_iterator.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Helper for period/scenario dimension handling. - -Provides a simple utility for adding period/scenario dimensions back to -DataArrays after transformation. 
-""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - import xarray as xr - - -def add_slice_dims( - da: xr.DataArray, - period: Any = None, - scenario: Any = None, -) -> xr.DataArray: - """Add period/scenario dimensions back to a transformed DataArray. - - After selecting a slice with `ds.sel(period=p, scenario=s, drop=True)` - and transforming it, use this to re-add the period/scenario coordinates - so results can be combined with `xr.combine_by_coords`. - - Args: - da: DataArray without period/scenario dimensions. - period: Period value to add, or None to skip. - scenario: Scenario value to add, or None to skip. - - Returns: - DataArray with period/scenario dims added (as length-1 dims). - - Example: - >>> results = [] - >>> for p in periods: - ... for s in scenarios: - ... selector = {k: v for k, v in [('period', p), ('scenario', s)] if v is not None} - ... ds_slice = ds.sel(**selector, drop=True) if selector else ds - ... result = transform(ds_slice) - ... results.append(add_slice_dims(result, period=p, scenario=s)) - >>> combined = xr.combine_by_coords(results) - """ - if period is not None: - da = da.expand_dims(period=[period]) - if scenario is not None: - da = da.expand_dims(scenario=[scenario]) - return da diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index fb1811310..f6eaf76a1 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -16,7 +16,6 @@ import pandas as pd import xarray as xr -from .dim_iterator import add_slice_dims from .modeling import _scalar_safe_reduce from .structure import EXPAND_DIVIDE, EXPAND_FIRST_TIMESTEP, EXPAND_INTERPOLATE, VariableCategory @@ -175,59 +174,71 @@ def _build_cluster_weight_da( def _build_typical_das( self, - tsam_aggregation_results: dict[tuple, Any], + aggregation_results: dict[tuple, Any], actual_n_clusters: int, n_time_points: int, cluster_coords: np.ndarray, time_coords: pd.DatetimeIndex | pd.RangeIndex, + dim_names: list[str], is_segmented: bool = False, - ) -> dict[str, dict[tuple, xr.DataArray]]: - """Build typical periods DataArrays with (cluster, time) shape. + ) -> dict[str, xr.DataArray]: + """Build typical periods DataArrays with (cluster, time, *dim_names) shape. Args: - tsam_aggregation_results: Dict mapping (period, scenario) to tsam results. + aggregation_results: Dict mapping key tuples to tsam results. actual_n_clusters: Number of clusters. n_time_points: Number of time points per cluster (timesteps or segments). cluster_coords: Cluster coordinate values. time_coords: Time coordinate values. + dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). is_segmented: Whether segmentation was used. Returns: - Nested dict: {column_name: {(period, scenario): DataArray}}. + Dict mapping column names to combined DataArrays with dims [cluster, time, *dim_names]. 
""" - typical_das: dict[str, dict[tuple, xr.DataArray]] = {} - for key, tsam_result in tsam_aggregation_results.items(): + # Collect DataArrays per column, with dims already expanded + column_results: dict[str, list[xr.DataArray]] = {} + + for key, tsam_result in aggregation_results.items(): typical_df = tsam_result.cluster_representatives if is_segmented: - # Segmented data: MultiIndex with cluster as first level - # Each cluster has exactly n_time_points rows (segments) - # Extract all data at once using numpy reshape, avoiding slow .loc calls columns = typical_df.columns.tolist() - - # Get all values as numpy array: (n_clusters * n_time_points, n_columns) all_values = typical_df.values - - # Reshape to (n_clusters, n_time_points, n_columns) reshaped = all_values.reshape(actual_n_clusters, n_time_points, -1) for col_idx, col in enumerate(columns): - # reshaped[:, :, col_idx] selects all clusters, all time points, single column - # Result shape: (n_clusters, n_time_points) - typical_das.setdefault(col, {})[key] = xr.DataArray( + da = xr.DataArray( reshaped[:, :, col_idx], dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) + for dim_name, coord_val in zip(dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + column_results.setdefault(col, []).append(da) else: - # Non-segmented: flat data that can be reshaped for col in typical_df.columns: flat_data = typical_df[col].values reshaped = flat_data.reshape(actual_n_clusters, n_time_points) - typical_das.setdefault(col, {})[key] = xr.DataArray( + da = xr.DataArray( reshaped, dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) + for dim_name, coord_val in zip(dim_names, key, strict=False): + da = da.expand_dims({dim_name: [coord_val]}) + column_results.setdefault(col, []).append(da) + + # Combine each column's DataArrays + typical_das: dict[str, xr.DataArray] = {} + for col, das in column_results.items(): + if len(das) == 1: + typical_das[col] = das[0] + else: + combined = xr.combine_by_coords(das) + if isinstance(combined, xr.Dataset): + combined = list(combined.data_vars.values())[0] + typical_das[col] = combined.transpose('cluster', 'time', *dim_names) + return typical_das def _build_segment_durations_da( @@ -445,20 +456,12 @@ def clean_key(p, s) -> tuple: # Build typical periods DataArrays with (cluster, time) shape typical_das = self._build_typical_das( - tsam_aggregation_results, actual_n_clusters, n_time_points, cluster_coords, time_coords, is_segmented + aggregation_results, actual_n_clusters, n_time_points, cluster_coords, time_coords, dim_names, is_segmented ) # Build reduced dataset with (cluster, time) dimensions ds_new = self._build_reduced_dataset( - ds, - typical_das, - actual_n_clusters, - n_reduced_timesteps, - n_time_points, - cluster_coords, - time_coords, - periods, - scenarios, + ds, typical_das, actual_n_clusters, n_reduced_timesteps, n_time_points, cluster_coords, time_coords ) # For segmented systems, build timestep_duration from segment_durations @@ -493,27 +496,23 @@ def clean_key(p, s) -> tuple: def _build_reduced_dataset( self, ds: xr.Dataset, - typical_das: dict[str, dict[tuple, xr.DataArray]], + typical_das: dict[str, xr.DataArray], actual_n_clusters: int, n_reduced_timesteps: int, n_time_points: int, cluster_coords: np.ndarray, time_coords: pd.DatetimeIndex | pd.RangeIndex, - periods: list, - scenarios: list, ) -> xr.Dataset: """Build the reduced dataset with (cluster, time) structure. Args: ds: Original dataset. 
- typical_das: Typical periods DataArrays from _build_typical_das(). + typical_das: Pre-combined DataArrays from _build_typical_das(). actual_n_clusters: Number of clusters. n_reduced_timesteps: Total reduced timesteps (n_clusters * n_time_points). n_time_points: Number of time points per cluster (timesteps or segments). cluster_coords: Cluster coordinate values. time_coords: Time coordinate values. - periods: List of period labels. - scenarios: List of scenario labels. Returns: Dataset with reduced timesteps and (cluster, time) structure. @@ -521,27 +520,17 @@ def _build_reduced_dataset( from .core import TimeSeriesData ds_new_vars = {} - - # Use ds.variables to avoid _construct_dataarray overhead variables = ds.variables coord_cache = {k: ds.coords[k].values for k in ds.coords} - # Determine dimension order for combining - dim_order = ['cluster', 'time'] - if periods != [None]: - dim_order.append('period') - if scenarios != [None]: - dim_order.append('scenario') - for name in ds.data_vars: var = variables[name] if 'time' not in var.dims: - # No time dimension - wrap Variable in DataArray + # No time dimension - copy as-is coords = {d: coord_cache[d] for d in var.dims if d in coord_cache} ds_new_vars[name] = xr.DataArray(var.values, dims=var.dims, coords=coords, attrs=var.attrs, name=name) elif name not in typical_das: # Time-dependent but constant: reshape to (cluster, time, ...) - # Use numpy slicing instead of .isel() time_idx = var.dims.index('time') slices = [slice(None)] * len(var.dims) slices[time_idx] = slice(0, n_reduced_timesteps) @@ -561,90 +550,12 @@ def _build_reduced_dataset( coords=new_coords, attrs=var.attrs, ) - elif set(typical_das[name].keys()) != {(p, s) for p in periods for s in scenarios}: - # Partial typical slices: fill missing keys with constant values - # For multi-period/scenario data, we need to select the right slice for each key - - # Exclude 'period' and 'scenario' - they're handled by combine - other_dims = [d for d in var.dims if d not in ('time', 'period', 'scenario')] - other_shape = [var.sizes[d] for d in other_dims] - new_shape = [actual_n_clusters, n_time_points] + other_shape - - new_coords = {'cluster': cluster_coords, 'time': time_coords} - for dim in other_dims: - if dim in coord_cache: - new_coords[dim] = coord_cache[dim] - - # Build slices: use typical where available, constant otherwise - results = [] - for p in periods: - for s in scenarios: - key = (p, s) - if key in typical_das[name]: - da = typical_das[name][key] - else: - # Select the specific period/scenario slice, then reshape - selector = {} - if p is not None and 'period' in var.dims: - selector['period'] = p - if s is not None and 'scenario' in var.dims: - selector['scenario'] = s - - # Select per-key slice if needed, otherwise use full variable - var_slice = ds[name].sel(**selector, drop=True) if selector else ds[name] - - # Now slice time and reshape - time_idx = var_slice.dims.index('time') - slices_list = [slice(None)] * len(var_slice.dims) - slices_list[time_idx] = slice(0, n_reduced_timesteps) - sliced_values = var_slice.values[tuple(slices_list)] - reshaped_constant = sliced_values.reshape(new_shape) - - da = xr.DataArray( - reshaped_constant, - dims=['cluster', 'time'] + other_dims, - coords=new_coords, - ) - - results.append(add_slice_dims(da, period=p, scenario=s)) - - if len(results) == 1: - combined = results[0] - else: - combined = xr.combine_by_coords(results) - # combine_by_coords returns Dataset when DataArrays have names - if isinstance(combined, xr.Dataset): - 
combined = combined[name] if name in combined else list(combined.data_vars.values())[0] - # Add other_dims to dim_order for transpose - full_dim_order = dim_order + other_dims - combined = combined.transpose(*full_dim_order) - - combined = combined.assign_attrs(var.attrs) - if var.attrs.get('__timeseries_data__', False): - combined = TimeSeriesData.from_dataarray(combined) - ds_new_vars[name] = combined else: - # Time-varying: combine per-(period, scenario) slices - results = [] - for (p, s), da in typical_das[name].items(): - results.append(add_slice_dims(da, period=p, scenario=s)) - - if len(results) == 1: - combined = results[0] - else: - combined = xr.combine_by_coords(results) - # combine_by_coords returns Dataset when DataArrays have names - if isinstance(combined, xr.Dataset): - combined = combined[name] if name in combined else list(combined.data_vars.values())[0] - # Add other dims from the first result for proper ordering - other_dims = [d for d in results[0].dims if d not in ('cluster', 'time', 'period', 'scenario')] - full_dim_order = dim_order + other_dims - combined = combined.transpose(*full_dim_order) - - combined = combined.assign_attrs(var.attrs) + # Time-varying: use pre-combined DataArray from typical_das + da = typical_das[name].assign_attrs(var.attrs) if var.attrs.get('__timeseries_data__', False): - combined = TimeSeriesData.from_dataarray(combined) - ds_new_vars[name] = combined + da = TimeSeriesData.from_dataarray(da) + ds_new_vars[name] = da # Copy attrs but remove cluster_weight new_attrs = dict(ds.attrs) From 4e33675903d32609cafe5ef37ff7268dad6708bc Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:14:36 +0100 Subject: [PATCH 06/15] The refactoring is complete. Here's a summary of the changes: Changes made: 1. cluster() method: - Added dim_names list and to_clean_key() helper at the start - Dicts now use clean keys like (2024,) or (2024, 'high') instead of (2024, None) or (None, 'high') - Removed duplicate dim_names building and to_cr_key() function from the data_vars branch - Simplified data_vars branch since ClusteringResults can receive the dicts directly - Updated call to _build_reduced_flow_system to use new parameter names (aggregation_results, occurrences, metrics, dim_names) 2. apply_clustering() method: - Uses dim_names directly from clustering.results.dim_names - Removed the key conversion logic (full_key = (cr_key[0], None) etc.) - Updated call to _build_reduced_flow_system with new parameter names The code is now cleaner with consistent use of clean keys throughout the clustering workflow. All 141 tests pass (98 in test_cluster_reduce_expand.py + 43 in test_clustering_io.py). 
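For reference, a minimal standalone sketch of the clean-key pattern this commit converges on. The dim_names, keys, and toy arrays below are illustrative only (not taken from the codebase); it just shows how per-key slices with length-1 expand_dims dimensions recombine via xr.combine_by_coords:

    import numpy as np
    import xarray as xr

    dim_names = ['period', 'scenario']  # clean keys: tuples aligned with dim_names, no None sentinels
    slices = {
        (2024, 'low'): np.array([1.0, 2.0, 3.0]),
        (2024, 'high'): np.array([4.0, 5.0, 6.0]),
        (2030, 'low'): np.array([7.0, 8.0, 9.0]),
        (2030, 'high'): np.array([10.0, 11.0, 12.0]),
    }

    results = []
    for key, values in slices.items():
        da = xr.DataArray(values, dims=['cluster'], coords={'cluster': [0, 1, 2]})
        # Re-attach the extra dims as length-1 dimensions taken from the key
        for dim_name, coord_val in zip(dim_names, key, strict=True):
            da = da.expand_dims({dim_name: [coord_val]})
        results.append(da)

    # Unnamed DataArrays -> combine_by_coords returns a DataArray spanning the full hypercube
    combined = xr.combine_by_coords(results).transpose('cluster', *dim_names)
    assert combined.dims == ('cluster', 'period', 'scenario')
    assert combined.shape == (3, 2, 2)

Note that combine_by_coords sorts coordinate values, which is why the scenario-order check in test_clustering_io.py was relaxed to a set comparison earlier in this series.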
--- flixopt/clustering/base.py | 16 +- flixopt/transform_accessor.py | 323 +++++++++++++--------------------- 2 files changed, 134 insertions(+), 205 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 49d6594a7..3dd11cc43 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -482,19 +482,17 @@ def _build_property_array( name: str | None = None, ) -> xr.DataArray: """Build a DataArray property, handling both single and multi-dimensional cases.""" - results = [] + slices = [] for key, cr in self._results.items(): - data = get_data(cr) - da = xr.DataArray(data, dims=base_dims, coords=base_coords or {}, name=name) - # Add dims from key: key[i] corresponds to dim_names[i] - for dim_name, coord_val in zip(self._dim_names, key, strict=False): + da = xr.DataArray(get_data(cr), dims=base_dims, coords=base_coords or {}, name=name) + for dim_name, coord_val in zip(self._dim_names, key, strict=True): da = da.expand_dims({dim_name: [coord_val]}) - results.append(da) + slices.append(da) - if len(results) == 1: - return results[0] + if len(slices) == 1: + return slices[0] - combined = xr.combine_by_coords(results) + combined = xr.combine_by_coords(slices) if isinstance(combined, xr.Dataset): combined = combined[name] return combined.transpose(*base_dims, *self._dim_names) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index f6eaf76a1..4e6f79d20 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -28,6 +28,55 @@ logger = logging.getLogger('flixopt') +def _combine_dataarray_slices( + slices: list[xr.DataArray], + base_dims: list[str], + extra_dims: list[str], + name: str | None = None, +) -> xr.DataArray: + """Combine DataArray slices with extra dimensions into a single DataArray. + + Args: + slices: List of DataArrays, each with extra dims already expanded. + base_dims: Base dimension names (e.g., ['cluster', 'time']). + extra_dims: Extra dimension names (e.g., ['period', 'scenario']). + name: Optional name for the result. + + Returns: + Combined DataArray with dims [*base_dims, *extra_dims]. + """ + if len(slices) == 1: + result = slices[0] + else: + combined = xr.combine_by_coords(slices) + # combine_by_coords returns Dataset when DataArrays have names + if isinstance(combined, xr.Dataset): + result = list(combined.data_vars.values())[0] + else: + result = combined + result = result.transpose(*base_dims, *extra_dims) + + if name is not None: + result = result.rename(name) + return result + + +def _expand_dims_for_key(da: xr.DataArray, dim_names: list[str], key: tuple) -> xr.DataArray: + """Add dimensions to a DataArray based on key values. + + Args: + da: DataArray without extra dimensions. + dim_names: Names of dimensions to add (e.g., ['period', 'scenario']). + key: Tuple of coordinate values matching dim_names. + + Returns: + DataArray with extra dimensions added. + """ + for dim_name, coord_val in zip(dim_names, key, strict=True): + da = da.expand_dims({dim_name: [coord_val]}) + return da + + class TransformAccessor: """ Accessor for transformation methods on FlowSystem. @@ -157,20 +206,13 @@ def _build_cluster_weight_da( Returns: DataArray with dims [cluster, *dim_names]. 
""" - results = [] + slices = [] for key, occurrences in cluster_occurrences_all.items(): weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) da = xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) - for dim_name, coord_val in zip(dim_names, key, strict=False): - da = da.expand_dims({dim_name: [coord_val]}) - results.append(da) + slices.append(_expand_dims_for_key(da, dim_names, key)) - if len(results) == 1: - return results[0].rename('cluster_weight') - combined = xr.combine_by_coords(results) - if isinstance(combined, xr.Dataset): - combined = list(combined.data_vars.values())[0] - return combined.transpose('cluster', *dim_names).rename('cluster_weight') + return _combine_dataarray_slices(slices, ['cluster'], dim_names, name='cluster_weight') def _build_typical_das( self, @@ -196,50 +238,27 @@ def _build_typical_das( Returns: Dict mapping column names to combined DataArrays with dims [cluster, time, *dim_names]. """ - # Collect DataArrays per column, with dims already expanded - column_results: dict[str, list[xr.DataArray]] = {} + column_slices: dict[str, list[xr.DataArray]] = {} + base_coords = {'cluster': cluster_coords, 'time': time_coords} for key, tsam_result in aggregation_results.items(): typical_df = tsam_result.cluster_representatives if is_segmented: columns = typical_df.columns.tolist() - all_values = typical_df.values - reshaped = all_values.reshape(actual_n_clusters, n_time_points, -1) - + reshaped = typical_df.values.reshape(actual_n_clusters, n_time_points, -1) for col_idx, col in enumerate(columns): - da = xr.DataArray( - reshaped[:, :, col_idx], - dims=['cluster', 'time'], - coords={'cluster': cluster_coords, 'time': time_coords}, - ) - for dim_name, coord_val in zip(dim_names, key, strict=False): - da = da.expand_dims({dim_name: [coord_val]}) - column_results.setdefault(col, []).append(da) + da = xr.DataArray(reshaped[:, :, col_idx], dims=['cluster', 'time'], coords=base_coords) + column_slices.setdefault(col, []).append(_expand_dims_for_key(da, dim_names, key)) else: for col in typical_df.columns: - flat_data = typical_df[col].values - reshaped = flat_data.reshape(actual_n_clusters, n_time_points) - da = xr.DataArray( - reshaped, - dims=['cluster', 'time'], - coords={'cluster': cluster_coords, 'time': time_coords}, - ) - for dim_name, coord_val in zip(dim_names, key, strict=False): - da = da.expand_dims({dim_name: [coord_val]}) - column_results.setdefault(col, []).append(da) - - # Combine each column's DataArrays - typical_das: dict[str, xr.DataArray] = {} - for col, das in column_results.items(): - if len(das) == 1: - typical_das[col] = das[0] - else: - combined = xr.combine_by_coords(das) - if isinstance(combined, xr.Dataset): - combined = list(combined.data_vars.values())[0] - typical_das[col] = combined.transpose('cluster', 'time', *dim_names) + reshaped = typical_df[col].values.reshape(actual_n_clusters, n_time_points) + da = xr.DataArray(reshaped, dims=['cluster', 'time'], coords=base_coords) + column_slices.setdefault(col, []).append(_expand_dims_for_key(da, dim_names, key)) - return typical_das + return { + col: _combine_dataarray_slices(slices, ['cluster', 'time'], dim_names) + for col, slices in column_slices.items() + } def _build_segment_durations_da( self, @@ -265,26 +284,15 @@ def _build_segment_durations_da( Returns: DataArray with dims [cluster, time, *dim_names]. 
""" - results = [] + slices = [] + base_coords = {'cluster': cluster_coords, 'time': time_coords} for key, tsam_result in aggregation_results.items(): seg_durs = tsam_result.segment_durations - data = np.zeros((actual_n_clusters, n_segments)) - for cluster_id in range(actual_n_clusters): - cluster_seg_durs = seg_durs[cluster_id] - for seg_id in range(n_segments): - data[cluster_id, seg_id] = cluster_seg_durs[seg_id] * dt - - da = xr.DataArray(data, dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}) - for dim_name, coord_val in zip(dim_names, key, strict=False): - da = da.expand_dims({dim_name: [coord_val]}) - results.append(da) - - if len(results) == 1: - return results[0].rename('timestep_duration') - combined = xr.combine_by_coords(results) - if isinstance(combined, xr.Dataset): - combined = list(combined.data_vars.values())[0] - return combined.transpose('cluster', 'time', *dim_names).rename('timestep_duration') + data = np.array([[seg_durs[c][s] * dt for s in range(n_segments)] for c in range(actual_n_clusters)]) + da = xr.DataArray(data, dims=['cluster', 'time'], coords=base_coords) + slices.append(_expand_dims_for_key(da, dim_names, key)) + + return _combine_dataarray_slices(slices, ['cluster', 'time'], dim_names, name='timestep_duration') def _build_clustering_metrics( self, @@ -321,48 +329,27 @@ def _build_clustering_metrics( # Multi-dim case sample_df = next(iter(non_empty_metrics.values())) - metric_names = list(sample_df.columns) data_vars = {} - for metric in metric_names: - results = [] + for metric in sample_df.columns: + slices = [] for key, df in clustering_metrics_all.items(): - if df.empty: - da = xr.DataArray( - np.full(len(sample_df.index), np.nan), - dims=['time_series'], - coords={'time_series': list(sample_df.index)}, - ) - else: - da = xr.DataArray( - df[metric].values, - dims=['time_series'], - coords={'time_series': list(df.index)}, - ) - for dim_name, coord_val in zip(dim_names, key, strict=False): - da = da.expand_dims({dim_name: [coord_val]}) - results.append(da) - - if len(results) == 1: - combined = results[0] - else: - combined = xr.combine_by_coords(results) - if isinstance(combined, xr.Dataset): - combined = list(combined.data_vars.values())[0] - data_vars[metric] = combined.transpose('time_series', *dim_names).rename(metric) + values = np.full(len(sample_df.index), np.nan) if df.empty else df[metric].values + da = xr.DataArray(values, dims=['time_series'], coords={'time_series': list(sample_df.index)}) + slices.append(_expand_dims_for_key(da, dim_names, key)) + data_vars[metric] = _combine_dataarray_slices(slices, ['time_series'], dim_names, name=metric) return xr.Dataset(data_vars) def _build_reduced_flow_system( self, ds: xr.Dataset, - tsam_aggregation_results: dict[tuple, Any], - cluster_occurrences_all: dict[tuple, dict], - clustering_metrics_all: dict[tuple, pd.DataFrame], + aggregation_results: dict[tuple, Any], + occurrences: dict[tuple, dict], + metrics: dict[tuple, pd.DataFrame], timesteps_per_cluster: int, dt: float, - periods: list, - scenarios: list, + dim_names: list[str], n_clusters_requested: int | None = None, ) -> FlowSystem: """Build a reduced FlowSystem from tsam aggregation results. @@ -371,13 +358,12 @@ def _build_reduced_flow_system( Args: ds: Original dataset. - tsam_aggregation_results: Dict mapping (period, scenario) to tsam AggregationResult. - cluster_occurrences_all: Dict mapping (period, scenario) to cluster occurrence counts. 
- clustering_metrics_all: Dict mapping (period, scenario) to accuracy metrics. + aggregation_results: Dict mapping key tuples to tsam AggregationResult. + occurrences: Dict mapping key tuples to cluster occurrence counts. + metrics: Dict mapping key tuples to accuracy metrics. timesteps_per_cluster: Number of timesteps per cluster. dt: Hours per timestep. - periods: List of period labels ([None] if no periods). - scenarios: List of scenario labels ([None] if no scenarios). + dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). n_clusters_requested: Requested number of clusters (for logging). None to skip. Returns: @@ -387,33 +373,7 @@ def _build_reduced_flow_system( from .core import drop_constant_arrays from .flow_system import FlowSystem - has_periods = periods != [None] - has_scenarios = scenarios != [None] - - # Build dim_names for Clustering - dim_names = [] - if has_periods: - dim_names.append('period') - if has_scenarios: - dim_names.append('scenario') - - # Helper to convert (p, s) keys to clean keys (without None) - def clean_key(p, s) -> tuple: - parts = [] - if has_periods: - parts.append(p) - if has_scenarios: - parts.append(s) - return tuple(parts) - - # Convert all dicts to use clean keys - aggregation_results = {clean_key(p, s): v for (p, s), v in tsam_aggregation_results.items()} - occurrences = {clean_key(p, s): v for (p, s), v in cluster_occurrences_all.items()} - metrics = {clean_key(p, s): v for (p, s), v in clustering_metrics_all.items()} - - # Use first result for structure - first_key = (periods[0], scenarios[0]) - first_tsam = tsam_aggregation_results[first_key] + first_tsam = next(iter(aggregation_results.values())) # Build metrics clustering_metrics = self._build_clustering_metrics(metrics, dim_names) @@ -1350,6 +1310,22 @@ def cluster( f'with extreme periods while maintaining the requested n_clusters.' 
) + # Build dim_names and clean key helper + dim_names: list[str] = [] + if has_periods: + dim_names.append('period') + if has_scenarios: + dim_names.append('scenario') + + def to_clean_key(period_label, scenario_label) -> tuple: + """Convert (period, scenario) to clean key based on which dims exist.""" + key_parts = [] + if has_periods: + key_parts.append(period_label) + if has_scenarios: + key_parts.append(scenario_label) + return tuple(key_parts) + # Cluster each (period, scenario) combination using tsam directly tsam_aggregation_results: dict[tuple, Any] = {} # AggregationResult objects tsam_clustering_results: dict[tuple, Any] = {} # ClusteringResult objects for persistence @@ -1361,7 +1337,7 @@ def cluster( for period_label in periods: for scenario_label in scenarios: - key = (period_label, scenario_label) + key = to_clean_key(period_label, scenario_label) selector = {k: v for k, v in [('period', period_label), ('scenario', scenario_label)] if v is not None} # Select data for clustering (may be subset if data_vars specified) @@ -1423,55 +1399,26 @@ def cluster( # If data_vars was specified, apply clustering to FULL data if data_vars is not None: - # Build dim_names for ClusteringResults - dim_names = [] - if has_periods: - dim_names.append('period') - if has_scenarios: - dim_names.append('scenario') - - # Convert (period, scenario) keys to ClusteringResults format - def to_cr_key(p, s): - key_parts = [] - if has_periods: - key_parts.append(p) - if has_scenarios: - key_parts.append(s) - return tuple(key_parts) - - # Build ClusteringResults from subset clustering - clustering_results = ClusteringResults( - {to_cr_key(p, s): cr for (p, s), cr in tsam_clustering_results.items()}, - dim_names, - ) + # Build ClusteringResults from subset clustering (already using clean keys) + clustering_results = ClusteringResults(tsam_clustering_results, dim_names) # Apply to full data - this returns AggregationResults agg_results = clustering_results.apply(ds) - # Update tsam_aggregation_results with full data results - for cr_key, result in agg_results: - # Convert back to (period, scenario) format - if has_periods and has_scenarios: - full_key = (cr_key[0], cr_key[1]) - elif has_periods: - full_key = (cr_key[0], None) - elif has_scenarios: - full_key = (None, cr_key[0]) - else: - full_key = (None, None) - tsam_aggregation_results[full_key] = result - cluster_occurrences_all[full_key] = result.cluster_weights + # Update results with full data (keys already match) + for key, result in agg_results: + tsam_aggregation_results[key] = result + cluster_occurrences_all[key] = result.cluster_weights # Build and return the reduced FlowSystem return self._build_reduced_flow_system( ds=ds, - tsam_aggregation_results=tsam_aggregation_results, - cluster_occurrences_all=cluster_occurrences_all, - clustering_metrics_all=clustering_metrics_all, + aggregation_results=tsam_aggregation_results, + occurrences=cluster_occurrences_all, + metrics=clustering_metrics_all, timesteps_per_cluster=timesteps_per_cluster, dt=dt, - periods=periods, - scenarios=scenarios, + dim_names=dim_names, n_clusters_requested=n_clusters, ) @@ -1519,12 +1466,7 @@ def apply_clustering( # Get timesteps_per_cluster from the clustering object (survives serialization) timesteps_per_cluster = clustering.timesteps_per_cluster - has_periods = self._fs.periods is not None - has_scenarios = self._fs.scenarios is not None - - # Determine iteration dimensions - periods = list(self._fs.periods) if has_periods else [None] - scenarios = list(self._fs.scenarios) 
if has_scenarios else [None] + dim_names = clustering.results.dim_names ds = self._fs.to_dataset(include_solution=False) @@ -1547,39 +1489,28 @@ def apply_clustering( agg_results = clustering.results.apply(ds) # Convert AggregationResults to the dict format expected by _build_reduced_flow_system - tsam_aggregation_results: dict[tuple, Any] = {} - cluster_occurrences_all: dict[tuple, dict] = {} - clustering_metrics_all: dict[tuple, pd.DataFrame] = {} - - for cr_key, result in agg_results: - # Convert ClusteringResults key to (period, scenario) format - if has_periods and has_scenarios: - full_key = (cr_key[0], cr_key[1]) - elif has_periods: - full_key = (cr_key[0], None) - elif has_scenarios: - full_key = (None, cr_key[0]) - else: - full_key = (None, None) + aggregation_results: dict[tuple, Any] = {} + occurrences: dict[tuple, dict] = {} + metrics: dict[tuple, pd.DataFrame] = {} - tsam_aggregation_results[full_key] = result - cluster_occurrences_all[full_key] = result.cluster_weights + for key, result in agg_results: + aggregation_results[key] = result + occurrences[key] = result.cluster_weights try: - clustering_metrics_all[full_key] = self._accuracy_to_dataframe(result.accuracy) + metrics[key] = self._accuracy_to_dataframe(result.accuracy) except Exception as e: - logger.warning(f'Failed to compute clustering metrics for {full_key}: {e}') - clustering_metrics_all[full_key] = pd.DataFrame() + logger.warning(f'Failed to compute clustering metrics for {key}: {e}') + metrics[key] = pd.DataFrame() # Build and return the reduced FlowSystem return self._build_reduced_flow_system( ds=ds, - tsam_aggregation_results=tsam_aggregation_results, - cluster_occurrences_all=cluster_occurrences_all, - clustering_metrics_all=clustering_metrics_all, + aggregation_results=aggregation_results, + occurrences=occurrences, + metrics=metrics, timesteps_per_cluster=timesteps_per_cluster, dt=dt, - periods=periods, - scenarios=scenarios, + dim_names=dim_names, ) def _validate_for_expansion(self) -> Clustering: From 62f534d2b332da84017cf9c72c415eb6b95d3b92 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:34:22 +0100 Subject: [PATCH 07/15] udpate --- flixopt/transform_accessor.py | 88 ++++++++++++++--------------------- 1 file changed, 34 insertions(+), 54 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 4e6f79d20..d5a6a62c8 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -190,15 +190,15 @@ def _accuracy_to_dataframe(accuracy) -> pd.DataFrame: def _build_cluster_weight_da( self, - cluster_occurrences_all: dict[tuple, dict], + aggregation_results: dict[tuple, Any], n_clusters: int, cluster_coords: np.ndarray, dim_names: list[str], ) -> xr.DataArray: - """Build cluster_weight DataArray from occurrence counts. + """Build cluster_weight DataArray from aggregation results. Args: - cluster_occurrences_all: Dict mapping key tuples to {cluster_id: count}. + aggregation_results: Dict mapping key tuples to tsam AggregationResult. n_clusters: Number of clusters. cluster_coords: Cluster coordinate values. dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). @@ -207,8 +207,8 @@ def _build_cluster_weight_da( DataArray with dims [cluster, *dim_names]. 
""" slices = [] - for key, occurrences in cluster_occurrences_all.items(): - weights = np.array([occurrences.get(c, 0) for c in range(n_clusters)]) + for key, result in aggregation_results.items(): + weights = np.array([result.cluster_weights.get(c, 0) for c in range(n_clusters)]) da = xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) slices.append(_expand_dims_for_key(da, dim_names, key)) @@ -296,25 +296,34 @@ def _build_segment_durations_da( def _build_clustering_metrics( self, - clustering_metrics_all: dict[tuple, pd.DataFrame], + aggregation_results: dict[tuple, Any], dim_names: list[str], ) -> xr.Dataset: - """Build clustering metrics Dataset from per-slice DataFrames. + """Build clustering metrics Dataset from aggregation results. Args: - clustering_metrics_all: Dict mapping key tuples to metric DataFrames. + aggregation_results: Dict mapping key tuples to tsam AggregationResult. dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). Returns: Dataset with RMSE, MAE, RMSE_duration metrics. """ - non_empty_metrics = {k: v for k, v in clustering_metrics_all.items() if not v.empty} + # Convert accuracy to DataFrames, filtering out failures + metrics_dfs: dict[tuple, pd.DataFrame] = {} + for key, result in aggregation_results.items(): + try: + metrics_dfs[key] = self._accuracy_to_dataframe(result.accuracy) + except Exception as e: + logger.warning(f'Failed to compute clustering metrics for {key}: {e}') + metrics_dfs[key] = pd.DataFrame() + + non_empty_metrics = {k: v for k, v in metrics_dfs.items() if not v.empty} if not non_empty_metrics: return xr.Dataset() # Single slice case - if len(clustering_metrics_all) == 1 and len(non_empty_metrics) == 1: + if len(metrics_dfs) == 1 and len(non_empty_metrics) == 1: metrics_df = next(iter(non_empty_metrics.values())) return xr.Dataset( { @@ -333,7 +342,7 @@ def _build_clustering_metrics( for metric in sample_df.columns: slices = [] - for key, df in clustering_metrics_all.items(): + for key, df in metrics_dfs.items(): values = np.full(len(sample_df.index), np.nan) if df.empty else df[metric].values da = xr.DataArray(values, dims=['time_series'], coords={'time_series': list(sample_df.index)}) slices.append(_expand_dims_for_key(da, dim_names, key)) @@ -345,8 +354,6 @@ def _build_reduced_flow_system( self, ds: xr.Dataset, aggregation_results: dict[tuple, Any], - occurrences: dict[tuple, dict], - metrics: dict[tuple, pd.DataFrame], timesteps_per_cluster: int, dt: float, dim_names: list[str], @@ -359,8 +366,6 @@ def _build_reduced_flow_system( Args: ds: Original dataset. aggregation_results: Dict mapping key tuples to tsam AggregationResult. - occurrences: Dict mapping key tuples to cluster occurrence counts. - metrics: Dict mapping key tuples to accuracy metrics. timesteps_per_cluster: Number of timesteps per cluster. dt: Hours per timestep. dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). 
@@ -375,8 +380,8 @@ def _build_reduced_flow_system( first_tsam = next(iter(aggregation_results.values())) - # Build metrics - clustering_metrics = self._build_clustering_metrics(metrics, dim_names) + # Build metrics from aggregation_results + clustering_metrics = self._build_clustering_metrics(aggregation_results, dim_names) n_reduced_timesteps = len(first_tsam.cluster_representatives) actual_n_clusters = len(first_tsam.cluster_weights) @@ -401,8 +406,10 @@ def _build_reduced_flow_system( name='time', ) - # Build cluster_weight - cluster_weight = self._build_cluster_weight_da(occurrences, actual_n_clusters, cluster_coords, dim_names) + # Build cluster_weight from aggregation_results + cluster_weight = self._build_cluster_weight_da( + aggregation_results, actual_n_clusters, cluster_coords, dim_names + ) # Logging if is_segmented: @@ -1328,12 +1335,6 @@ def to_clean_key(period_label, scenario_label) -> tuple: # Cluster each (period, scenario) combination using tsam directly tsam_aggregation_results: dict[tuple, Any] = {} # AggregationResult objects - tsam_clustering_results: dict[tuple, Any] = {} # ClusteringResult objects for persistence - cluster_assignmentss: dict[tuple, np.ndarray] = {} - cluster_occurrences_all: dict[tuple, dict] = {} - - # Collect metrics per (period, scenario) slice - clustering_metrics_all: dict[tuple, pd.DataFrame] = {} for period_label in periods: for scenario_label in scenarios: @@ -1388,34 +1389,26 @@ def to_clean_key(period_label, scenario_label) -> tuple: ) tsam_aggregation_results[key] = tsam_result - tsam_clustering_results[key] = tsam_result.clustering - cluster_assignmentss[key] = tsam_result.cluster_assignments - cluster_occurrences_all[key] = tsam_result.cluster_weights - try: - clustering_metrics_all[key] = self._accuracy_to_dataframe(tsam_result.accuracy) - except Exception as e: - logger.warning(f'Failed to compute clustering metrics for {key}: {e}') - clustering_metrics_all[key] = pd.DataFrame() # If data_vars was specified, apply clustering to FULL data if data_vars is not None: - # Build ClusteringResults from subset clustering (already using clean keys) - clustering_results = ClusteringResults(tsam_clustering_results, dim_names) + # Build ClusteringResults from subset clustering (derive from aggregation results) + clustering_results = ClusteringResults( + {k: r.clustering for k, r in tsam_aggregation_results.items()}, + dim_names, + ) # Apply to full data - this returns AggregationResults agg_results = clustering_results.apply(ds) - # Update results with full data (keys already match) + # Update aggregation_results with full data results for key, result in agg_results: tsam_aggregation_results[key] = result - cluster_occurrences_all[key] = result.cluster_weights # Build and return the reduced FlowSystem return self._build_reduced_flow_system( ds=ds, aggregation_results=tsam_aggregation_results, - occurrences=cluster_occurrences_all, - metrics=clustering_metrics_all, timesteps_per_cluster=timesteps_per_cluster, dt=dt, dim_names=dim_names, @@ -1488,26 +1481,13 @@ def apply_clustering( warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') agg_results = clustering.results.apply(ds) - # Convert AggregationResults to the dict format expected by _build_reduced_flow_system - aggregation_results: dict[tuple, Any] = {} - occurrences: dict[tuple, dict] = {} - metrics: dict[tuple, pd.DataFrame] = {} - - for key, result in agg_results: - aggregation_results[key] = result - occurrences[key] = result.cluster_weights - try: 
- metrics[key] = self._accuracy_to_dataframe(result.accuracy) - except Exception as e: - logger.warning(f'Failed to compute clustering metrics for {key}: {e}') - metrics[key] = pd.DataFrame() + # Convert AggregationResults to dict format + aggregation_results = dict(agg_results) # Build and return the reduced FlowSystem return self._build_reduced_flow_system( ds=ds, aggregation_results=aggregation_results, - occurrences=occurrences, - metrics=metrics, timesteps_per_cluster=timesteps_per_cluster, dt=dt, dim_names=dim_names, From 30774e6f93c074906cbf880b6efe466f6ba24575 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:55:37 +0100 Subject: [PATCH 08/15] - Eliminated 3 redundant dicts (occurrences, metrics, tsam_clustering_results) - Single aggregation_results dict remains (required for Clustering) --- flixopt/transform_accessor.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index d5a6a62c8..99fc606ee 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1334,7 +1334,7 @@ def to_clean_key(period_label, scenario_label) -> tuple: return tuple(key_parts) # Cluster each (period, scenario) combination using tsam directly - tsam_aggregation_results: dict[tuple, Any] = {} # AggregationResult objects + aggregation_results: dict[tuple, Any] = {} for period_label in periods: for scenario_label in scenarios: @@ -1373,7 +1373,7 @@ def to_clean_key(period_label, scenario_label) -> tuple: cluster_config = self._build_cluster_config_with_weights(cluster, filtered_weights) # Perform clustering based on selected data_vars (or all if not specified) - tsam_result = tsam.aggregate( + aggregation_results[key] = tsam.aggregate( df_for_clustering, n_clusters=n_clusters, period_duration=hours_per_cluster, @@ -1388,27 +1388,20 @@ def to_clean_key(period_label, scenario_label) -> tuple: **tsam_kwargs, ) - tsam_aggregation_results[key] = tsam_result - # If data_vars was specified, apply clustering to FULL data if data_vars is not None: - # Build ClusteringResults from subset clustering (derive from aggregation results) + # Build ClusteringResults from subset clustering clustering_results = ClusteringResults( - {k: r.clustering for k, r in tsam_aggregation_results.items()}, + {k: r.clustering for k, r in aggregation_results.items()}, dim_names, ) - - # Apply to full data - this returns AggregationResults - agg_results = clustering_results.apply(ds) - - # Update aggregation_results with full data results - for key, result in agg_results: - tsam_aggregation_results[key] = result + # Apply to full data and replace results + aggregation_results = dict(clustering_results.apply(ds)) # Build and return the reduced FlowSystem return self._build_reduced_flow_system( ds=ds, - aggregation_results=tsam_aggregation_results, + aggregation_results=aggregation_results, timesteps_per_cluster=timesteps_per_cluster, dt=dt, dim_names=dim_names, From ab5cb3ca65ec0cffb5d6fa1c0bf707b937c79ff1 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 23:01:23 +0100 Subject: [PATCH 09/15] 1. clustering/base.py: _build_property_array now transposes for single-slice path 2. 
transform_accessor.py: _combine_dataarray_slices now transposes for both paths --- flixopt/clustering/base.py | 14 ++++++++------ flixopt/transform_accessor.py | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 3dd11cc43..7082929c3 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -490,12 +490,14 @@ def _build_property_array( slices.append(da) if len(slices) == 1: - return slices[0] - - combined = xr.combine_by_coords(slices) - if isinstance(combined, xr.Dataset): - combined = combined[name] - return combined.transpose(*base_dims, *self._dim_names) + result = slices[0] + else: + combined = xr.combine_by_coords(slices) + if isinstance(combined, xr.Dataset): + result = combined[name] + else: + result = combined + return result.transpose(*base_dims, *self._dim_names) @staticmethod def _key_to_str(key: tuple) -> str: diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 99fc606ee..565a29dc9 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -54,7 +54,9 @@ def _combine_dataarray_slices( result = list(combined.data_vars.values())[0] else: result = combined - result = result.transpose(*base_dims, *extra_dims) + + # Ensure consistent dimension order for both single and multi-slice paths + result = result.transpose(*base_dims, *extra_dims) if name is not None: result = result.rename(name) From e822d1928b49fc33aef174a1cb81ca064ecd8355 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 23:09:47 +0100 Subject: [PATCH 10/15] Basic fixes --- flixopt/core.py | 4 +++- flixopt/transform_accessor.py | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/flixopt/core.py b/flixopt/core.py index ba8618e1a..aca380f5e 100644 --- a/flixopt/core.py +++ b/flixopt/core.py @@ -4,6 +4,7 @@ """ import logging +import warnings from itertools import permutations from typing import Any, Literal @@ -644,7 +645,8 @@ def drop_constant_arrays( axis = var.dims.index(dim) data = var.values # Use numpy operations directly for speed - with np.errstate(invalid='ignore'): # Ignore NaN warnings + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=RuntimeWarning, message='All-NaN slice') ptp = np.nanmax(data, axis=axis) - np.nanmin(data, axis=axis) if np.all(ptp < atol): drop_vars.append(name) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 565a29dc9..92e84846d 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -338,17 +338,31 @@ def _build_clustering_metrics( } ) - # Multi-dim case + # Multi-dim case - each period may have different time series + # Build union of all time series indices (different periods may exclude different constant arrays) + all_time_series = sorted(set().union(*(set(df.index) for df in non_empty_metrics.values()))) + + # Get all metric columns (should be consistent across periods) sample_df = next(iter(non_empty_metrics.values())) data_vars = {} for metric in sample_df.columns: slices = [] for key, df in metrics_dfs.items(): - values = np.full(len(sample_df.index), np.nan) if df.empty else df[metric].values - da = xr.DataArray(values, dims=['time_series'], coords={'time_series': list(sample_df.index)}) + if df.empty: + # Create NaN-filled array for empty DataFrames + values = np.full(len(all_time_series), np.nan) + else: + # Reindex to common index, filling missing 
with NaN + values = df[metric].reindex(all_time_series).values + da = xr.DataArray( + values, + dims=['time_series'], + coords={'time_series': all_time_series}, + ) slices.append(_expand_dims_for_key(da, dim_names, key)) - data_vars[metric] = _combine_dataarray_slices(slices, ['time_series'], dim_names, name=metric) + if slices: + data_vars[metric] = _combine_dataarray_slices(slices, ['time_series'], dim_names, name=metric) return xr.Dataset(data_vars) From 8ac679c7e1da41c47faacb63b97d0d8f5d60806f Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 23:13:25 +0100 Subject: [PATCH 11/15] drop constant arrays before slicing. --- flixopt/transform_accessor.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 92e84846d..44ad0b6ba 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1299,6 +1299,18 @@ def cluster( else: ds_for_clustering = ds + # Filter constant arrays once on the full dataset (not per slice) + # This ensures all slices have the same variables for consistent metrics + ds_for_clustering = drop_constant_arrays(ds_for_clustering, dim='time') + + # Guard against empty dataset after removing constant arrays + if not ds_for_clustering.data_vars: + filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' + raise ValueError( + f'No time-varying data found for clustering ({filter_info}). ' + f'All variables are constant over time. Check your data_vars filter or input data.' + ) + # Validate tsam_kwargs doesn't override explicit parameters reserved_tsam_keys = { 'n_clusters', @@ -1357,22 +1369,9 @@ def to_clean_key(period_label, scenario_label) -> tuple: key = to_clean_key(period_label, scenario_label) selector = {k: v for k, v in [('period', period_label), ('scenario', scenario_label)] if v is not None} - # Select data for clustering (may be subset if data_vars specified) - ds_slice_for_clustering = ( - ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering - ) - temporaly_changing_ds_for_clustering = drop_constant_arrays(ds_slice_for_clustering, dim='time') - - # Guard against empty dataset after removing constant arrays - if not temporaly_changing_ds_for_clustering.data_vars: - filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' - selector_info = f', selector={selector}' if selector else '' - raise ValueError( - f'No time-varying data found for clustering ({filter_info}{selector_info}). ' - f'All variables are constant over time. Check your data_vars filter or input data.' 
- ) - - df_for_clustering = temporaly_changing_ds_for_clustering.to_dataframe() + # Select data slice for clustering + ds_slice = ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering + df_for_clustering = ds_slice.to_dataframe() if selector: logger.info(f'Clustering {", ".join(f"{k}={v}" for k, v in selector.items())}...') @@ -1382,7 +1381,7 @@ def to_clean_key(period_label, scenario_label) -> tuple: warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') # Build ClusterConfig with auto-calculated weights - clustering_weights = self._calculate_clustering_weights(temporaly_changing_ds_for_clustering) + clustering_weights = self._calculate_clustering_weights(ds_slice) filtered_weights = { name: w for name, w in clustering_weights.items() if name in df_for_clustering.columns } From 3d9475c60edbb980bf942164cf4d03099fee2330 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 23:24:54 +0100 Subject: [PATCH 12/15] drop constant arrays before slicing. --- flixopt/transform_accessor.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 44ad0b6ba..ae5606fbc 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -338,31 +338,18 @@ def _build_clustering_metrics( } ) - # Multi-dim case - each period may have different time series - # Build union of all time series indices (different periods may exclude different constant arrays) - all_time_series = sorted(set().union(*(set(df.index) for df in non_empty_metrics.values()))) - - # Get all metric columns (should be consistent across periods) + # Multi-dim case - all periods have same time series (constant arrays filtered on full dataset) sample_df = next(iter(non_empty_metrics.values())) + time_series_index = list(sample_df.index) data_vars = {} for metric in sample_df.columns: slices = [] for key, df in metrics_dfs.items(): - if df.empty: - # Create NaN-filled array for empty DataFrames - values = np.full(len(all_time_series), np.nan) - else: - # Reindex to common index, filling missing with NaN - values = df[metric].reindex(all_time_series).values - da = xr.DataArray( - values, - dims=['time_series'], - coords={'time_series': all_time_series}, - ) + values = np.full(len(time_series_index), np.nan) if df.empty else df[metric].values + da = xr.DataArray(values, dims=['time_series'], coords={'time_series': time_series_index}) slices.append(_expand_dims_for_key(da, dim_names, key)) - if slices: - data_vars[metric] = _combine_dataarray_slices(slices, ['time_series'], dim_names, name=metric) + data_vars[metric] = _combine_dataarray_slices(slices, ['time_series'], dim_names, name=metric) return xr.Dataset(data_vars) From 24c28383ecaa2e8ac168a74271e10d25ab9e3136 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Tue, 27 Jan 2026 23:36:43 +0100 Subject: [PATCH 13/15] Simplify expansion logic --- flixopt/transform_accessor.py | 64 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index ae5606fbc..00f2036fd 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1947,31 +1947,31 @@ def expand(self) -> FlowSystem: n_original_clusters - 1, ) - # For segmented systems: build expansion divisor and identify segment total variables - 
expansion_divisor = None - segment_total_vars: set[str] = set() + # Build variable category sets for expansion handling variable_categories = getattr(self._fs, '_variable_categories', {}) - if clustering.is_segmented: - expansion_divisor = clustering.build_expansion_divisor(original_time=original_timesteps) - # Build segment total vars using registry first, fall back to pattern matching + if variable_categories: + # Use registered categories + state_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_INTERPOLATE} + first_timestep_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP} segment_total_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_DIVIDE} - # Fall back to pattern matching for backwards compatibility (old FlowSystems without categories) - if not segment_total_vars: - segment_total_vars = self._build_segment_total_varnames() + else: + # Fallback to pattern matching for old FlowSystems without categories + state_vars = set() # Will use endswith('|charge_state') check + first_timestep_vars = set() # Will use endswith('|startup') / endswith('|shutdown') check + segment_total_vars = self._build_segment_total_varnames() if clustering.is_segmented else set() def _is_state_variable(var_name: str) -> bool: - """Check if a variable is a state variable (should be interpolated).""" - if var_name in variable_categories: - return variable_categories[var_name] in EXPAND_INTERPOLATE - # Fall back to pattern matching for backwards compatibility - return var_name.endswith('|charge_state') + return var_name in state_vars or (not variable_categories and var_name.endswith('|charge_state')) def _is_first_timestep_variable(var_name: str) -> bool: - """Check if a variable is a binary event (should only appear at first timestep).""" - if var_name in variable_categories: - return variable_categories[var_name] in EXPAND_FIRST_TIMESTEP - # Fall back to pattern matching for backwards compatibility - return var_name.endswith('|startup') or var_name.endswith('|shutdown') + return var_name in first_timestep_vars or ( + not variable_categories and (var_name.endswith('|startup') or var_name.endswith('|shutdown')) + ) + + # For segmented systems: build expansion divisor + expansion_divisor = None + if clustering.is_segmented: + expansion_divisor = clustering.build_expansion_divisor(original_time=original_timesteps) def _append_final_state(expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray: """Append final state value from original data to expanded data.""" @@ -1991,26 +1991,22 @@ def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) - if 'time' not in da.dims: return da.copy() - is_state = _is_state_variable(var_name) and 'cluster' in da.dims - is_first_timestep = _is_first_timestep_variable(var_name) and 'cluster' in da.dims + has_cluster_dim = 'cluster' in da.dims + is_state = _is_state_variable(var_name) and has_cluster_dim + is_first_timestep = _is_first_timestep_variable(var_name) and has_cluster_dim + is_segment_total = is_solution and var_name in segment_total_vars - # State variables in segmented systems: interpolate within segments + # Choose expansion method if is_state and clustering.is_segmented: expanded = self._interpolate_charge_state_segmented(da, clustering, original_timesteps) - return _append_final_state(expanded, da) - - # Binary events (startup/shutdown) in segmented systems: first timestep of each segment - # For non-segmented systems, timing within cluster is preserved, so 
normal expansion is correct - if is_first_timestep and is_solution and clustering.is_segmented: + elif is_first_timestep and is_solution and clustering.is_segmented: return self._expand_first_timestep_only(da, clustering, original_timesteps) + else: + expanded = clustering.expand_data(da, original_time=original_timesteps) + if is_segment_total and expansion_divisor is not None: + expanded = expanded / expansion_divisor - expanded = clustering.expand_data(da, original_time=original_timesteps) - - # Segment totals: divide by expansion divisor - if is_solution and expansion_divisor is not None and var_name in segment_total_vars: - expanded = expanded / expansion_divisor - - # State variables: append final state + # State variables need final state appended if is_state: expanded = _append_final_state(expanded, da) From b98921f6a9b8596dffd7d131f263f28a0cce1827 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Wed, 28 Jan 2026 08:18:34 +0100 Subject: [PATCH 14/15] Created _Expander class (lines 82-306) that encapsulates all expansion logic: 1. Pre-computed state in __init__: - Clustering dimensions (timesteps_per_cluster, n_segments, n_clusters, etc.) - Timesteps (original_timesteps, original_timesteps_extra) - Variable category sets (state_vars, first_timestep_vars, segment_total_vars) - Expansion divisor for segmented systems 2. Methods moved from nested functions/helpers: - _is_state_variable() - checks if variable needs interpolation - _is_first_timestep_variable() - checks if variable is startup/shutdown - _build_segment_total_varnames() - backward compatibility fallback - _append_final_state() - appends N+1th state value - _interpolate_charge_state_segmented() - interpolates within segments - _expand_first_timestep_only() - places events at first timestep of segment - expand_dataarray() - main expansion logic (was expand_da) 3. Simplified TransformAccessor.expand(): - Creates _Expander instance with pre-computed state - Uses expander.expand_dataarray() instead of nested closure - Accesses expander attributes for logging Benefits: - State is pre-computed once, not on every call - Methods are testable in isolation - No more nested function closures capturing external state - Clear separation: _Expander handles expansion, TransformAccessor orchestrates --- flixopt/transform_accessor.py | 543 +++++++++++++++++----------------- 1 file changed, 268 insertions(+), 275 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 00f2036fd..fbd17eaab 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -79,6 +79,261 @@ def _expand_dims_for_key(da: xr.DataArray, dim_names: list[str], key: tuple) -> return da +class _Expander: + """Handles expansion of clustered FlowSystem to original timesteps. + + This class encapsulates all expansion logic, pre-computing shared state + once and providing methods for different expansion strategies. + + Args: + fs: The clustered FlowSystem to expand. + clustering: The Clustering object with cluster assignments and metadata. 
+ """ + + def __init__(self, fs: FlowSystem, clustering: Clustering): + self._fs = fs + self._clustering = clustering + + # Pre-compute clustering dimensions + self._timesteps_per_cluster = clustering.timesteps_per_cluster + self._n_segments = clustering.n_segments + self._time_dim_size = self._n_segments if self._n_segments else self._timesteps_per_cluster + self._n_clusters = clustering.n_clusters + self._n_original_clusters = clustering.n_original_clusters + + # Pre-compute timesteps + self._original_timesteps = clustering.original_timesteps + self._n_original_timesteps = len(self._original_timesteps) + + # Import here to avoid circular import + from .flow_system import FlowSystem + + self._original_timesteps_extra = FlowSystem._create_timesteps_with_extra(self._original_timesteps, None) + + # Index of last valid original cluster (for final state) + self._last_original_cluster_idx = min( + (self._n_original_timesteps - 1) // self._timesteps_per_cluster, + self._n_original_clusters - 1, + ) + + # Build variable category sets + self._variable_categories = getattr(fs, '_variable_categories', {}) + if self._variable_categories: + self._state_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_INTERPOLATE} + self._first_timestep_vars = { + name for name, cat in self._variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP + } + self._segment_total_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_DIVIDE} + else: + # Fallback to pattern matching for old FlowSystems without categories + self._state_vars = set() + self._first_timestep_vars = set() + self._segment_total_vars = self._build_segment_total_varnames() if clustering.is_segmented else set() + + # Build expansion divisor for segmented systems + self._expansion_divisor = None + if clustering.is_segmented: + self._expansion_divisor = clustering.build_expansion_divisor(original_time=self._original_timesteps) + + def _is_state_variable(self, var_name: str) -> bool: + """Check if variable is a state variable requiring interpolation.""" + return var_name in self._state_vars or (not self._variable_categories and var_name.endswith('|charge_state')) + + def _is_first_timestep_variable(self, var_name: str) -> bool: + """Check if variable is a first-timestep-only variable (startup/shutdown).""" + return var_name in self._first_timestep_vars or ( + not self._variable_categories and (var_name.endswith('|startup') or var_name.endswith('|shutdown')) + ) + + def _build_segment_total_varnames(self) -> set[str]: + """Build segment total variable names - BACKWARDS COMPATIBILITY FALLBACK. + + This method is only used when variable_categories is empty (old FlowSystems + saved before category registration was implemented). New FlowSystems use + the VariableCategory registry with EXPAND_DIVIDE categories (PER_TIMESTEP, SHARE). + + Returns: + Set of variable names that should be divided by expansion divisor. + """ + segment_total_vars: set[str] = set() + effect_names = list(self._fs.effects.keys()) + + # 1. Per-timestep totals for each effect + for effect in effect_names: + segment_total_vars.add(f'{effect}(temporal)|per_timestep') + + # 2. Flow contributions to effects + for flow_label in self._fs.flows: + for effect in effect_names: + segment_total_vars.add(f'{flow_label}->{effect}(temporal)') + + # 3. Component contributions to effects + for component_label in self._fs.components: + for effect in effect_names: + segment_total_vars.add(f'{component_label}->{effect}(temporal)') + + # 4. 
Effect-to-effect contributions + for target_effect_name, target_effect in self._fs.effects.items(): + if target_effect.share_from_temporal: + for source_effect_name in target_effect.share_from_temporal: + segment_total_vars.add(f'{source_effect_name}(temporal)->{target_effect_name}(temporal)') + + return segment_total_vars + + def _append_final_state(self, expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray: + """Append final state value from original data to expanded data.""" + cluster_assignments = self._clustering.cluster_assignments + if cluster_assignments.ndim == 1: + last_cluster = int(cluster_assignments.values[self._last_original_cluster_idx]) + extra_val = da.isel(cluster=last_cluster, time=-1) + else: + last_clusters = cluster_assignments.isel(original_cluster=self._last_original_cluster_idx) + extra_val = da.isel(cluster=last_clusters, time=-1) + extra_val = extra_val.drop_vars(['cluster', 'time'], errors='ignore') + extra_val = extra_val.expand_dims(time=[self._original_timesteps_extra[-1]]) + return xr.concat([expanded, extra_val], dim='time') + + def _interpolate_charge_state_segmented(self, da: xr.DataArray) -> xr.DataArray: + """Interpolate charge_state values within segments for segmented systems. + + For segmented systems, charge_state has values at segment boundaries (n_segments+1). + This method interpolates between start and end boundary values to show the + actual charge trajectory as the storage charges/discharges. + + Args: + da: charge_state DataArray with dims (cluster, time) where time has n_segments+1 entries. + + Returns: + Interpolated charge_state with dims (time, ...) for original timesteps. + """ + clustering = self._clustering + + # Get multi-dimensional properties from Clustering + segment_assignments = clustering.results.segment_assignments + segment_durations = clustering.results.segment_durations + position_within_segment = clustering.results.position_within_segment + cluster_assignments = clustering.cluster_assignments + + # Compute original period index and position within period + original_period_indices = np.minimum( + np.arange(self._n_original_timesteps) // self._timesteps_per_cluster, + self._n_original_clusters - 1, + ) + positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster + + # Create DataArrays for indexing + original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) + position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) + + # Map original period to cluster + cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) + + # Get segment index and position for each original timestep + seg_indices = segment_assignments.isel(cluster=cluster_indices, time=position_in_period_da) + positions = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) + durations = segment_durations.isel(cluster=cluster_indices, segment=seg_indices) + + # Calculate interpolation factor: position within segment (0 to 1) + factor = xr.where(durations > 1, (positions + 0.5) / durations, 0.5) + + # Get start and end boundary values from charge_state + start_vals = da.isel(cluster=cluster_indices, time=seg_indices) + end_vals = da.isel(cluster=cluster_indices, time=seg_indices + 1) + + # Linear interpolation + interpolated = start_vals + (end_vals - start_vals) * factor + + # Clean up coordinate artifacts and rename + interpolated = interpolated.drop_vars(['cluster', 'time', 'segment'], errors='ignore') + interpolated = 
interpolated.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps) + + return interpolated.transpose('time', ...).assign_attrs(da.attrs) + + def _expand_first_timestep_only(self, da: xr.DataArray) -> xr.DataArray: + """Expand binary event variables to first timestep of each segment only. + + For segmented systems, binary event variables like startup and shutdown indicate + that an event occurred somewhere in the segment. When expanded, the event is placed + at the first timestep of each segment, with zeros elsewhere. + + Args: + da: Binary event DataArray with dims including (cluster, time). + + Returns: + Expanded DataArray with event values only at first timestep of each segment. + """ + clustering = self._clustering + + # First expand normally (repeats values) + expanded = clustering.expand_data(da, original_time=self._original_timesteps) + + # Build mask: True only at first timestep of each segment + position_within_segment = clustering.results.position_within_segment + cluster_assignments = clustering.cluster_assignments + + # Compute original period index and position within period + original_period_indices = np.minimum( + np.arange(self._n_original_timesteps) // self._timesteps_per_cluster, + self._n_original_clusters - 1, + ) + positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster + + # Create DataArrays for indexing + original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) + position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) + + # Map to cluster and get position within segment + cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) + pos_in_segment = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) + + # Clean up and create mask + pos_in_segment = pos_in_segment.drop_vars(['cluster', 'time'], errors='ignore') + pos_in_segment = pos_in_segment.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps) + + # First timestep of segment has position 0 + is_first = pos_in_segment == 0 + + # Apply mask: keep value at first timestep, zero elsewhere + result = xr.where(is_first, expanded, 0) + return result.assign_attrs(da.attrs) + + def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bool = False) -> xr.DataArray: + """Expand a DataArray from clustered to original timesteps. + + Args: + da: DataArray to expand. + var_name: Variable name for category-based expansion handling. + is_solution: Whether this is a solution variable (affects segment total handling). + + Returns: + Expanded DataArray with original timesteps. 
+ """ + if 'time' not in da.dims: + return da.copy() + + clustering = self._clustering + has_cluster_dim = 'cluster' in da.dims + is_state = self._is_state_variable(var_name) and has_cluster_dim + is_first_timestep = self._is_first_timestep_variable(var_name) and has_cluster_dim + is_segment_total = is_solution and var_name in self._segment_total_vars + + # Choose expansion method + if is_state and clustering.is_segmented: + expanded = self._interpolate_charge_state_segmented(da) + elif is_first_timestep and is_solution and clustering.is_segmented: + return self._expand_first_timestep_only(da) + else: + expanded = clustering.expand_data(da, original_time=self._original_timesteps) + if is_segment_total and self._expansion_divisor is not None: + expanded = expanded / self._expansion_divisor + + # State variables need final state appended + if is_state: + expanded = self._append_final_state(expanded, da) + + return expanded + + class TransformAccessor: """ Accessor for transformation methods on FlowSystem. @@ -1636,186 +1891,6 @@ def _apply_soc_decay( return soc_boundary_per_timestep * decay_da - def _build_segment_total_varnames(self) -> set[str]: - """Build segment total variable names - BACKWARDS COMPATIBILITY FALLBACK. - - This method is only used when variable_categories is empty (old FlowSystems - saved before category registration was implemented). New FlowSystems use - the VariableCategory registry with EXPAND_DIVIDE categories (PER_TIMESTEP, SHARE). - - For segmented systems, these variables contain values that are summed over - segments. When expanded to hourly resolution, they need to be divided by - segment duration to get correct hourly rates. - - Returns: - Set of variable names that should be divided by expansion divisor. - """ - segment_total_vars: set[str] = set() - - # Get all effect names - effect_names = list(self._fs.effects.keys()) - - # 1. Per-timestep totals for each effect: {effect}(temporal)|per_timestep - for effect in effect_names: - segment_total_vars.add(f'{effect}(temporal)|per_timestep') - - # 2. Flow contributions to effects: {flow}->{effect}(temporal) - # (from effects_per_flow_hour on Flow elements) - for flow_label in self._fs.flows: - for effect in effect_names: - segment_total_vars.add(f'{flow_label}->{effect}(temporal)') - - # 3. Component contributions to effects: {component}->{effect}(temporal) - # (from effects_per_startup, effects_per_active_hour on OnOffParameters) - for component_label in self._fs.components: - for effect in effect_names: - segment_total_vars.add(f'{component_label}->{effect}(temporal)') - - # 4. Effect-to-effect contributions (from share_from_temporal) - # {source_effect}(temporal)->{target_effect}(temporal) - for target_effect_name, target_effect in self._fs.effects.items(): - if target_effect.share_from_temporal: - for source_effect_name in target_effect.share_from_temporal: - segment_total_vars.add(f'{source_effect_name}(temporal)->{target_effect_name}(temporal)') - - return segment_total_vars - - def _interpolate_charge_state_segmented( - self, - da: xr.DataArray, - clustering: Clustering, - original_timesteps: pd.DatetimeIndex, - ) -> xr.DataArray: - """Interpolate charge_state values within segments for segmented systems. - - For segmented systems, charge_state has values at segment boundaries (n_segments+1). - Instead of repeating the start boundary value for all timesteps in a segment, - this method interpolates between start and end boundary values to show the - actual charge trajectory as the storage charges/discharges. 
- - Uses vectorized xarray operations via Clustering class properties. - - Args: - da: charge_state DataArray with dims (cluster, time) where time has n_segments+1 entries. - clustering: Clustering object with segment info. - original_timesteps: Original timesteps to expand to. - - Returns: - Interpolated charge_state with dims (time, ...) for original timesteps. - """ - # Get multi-dimensional properties from Clustering - segment_assignments = clustering.results.segment_assignments - segment_durations = clustering.results.segment_durations - position_within_segment = clustering.results.position_within_segment - cluster_assignments = clustering.cluster_assignments - - # Compute original period index and position within period directly - # This is more reliable than decoding from timestep_mapping, which encodes - # (cluster * n_segments + segment_idx) for segmented systems - n_original_timesteps = len(original_timesteps) - timesteps_per_cluster = clustering.timesteps_per_cluster - n_original_clusters = clustering.n_original_clusters - - # For each original timestep, compute which original period it belongs to - original_period_indices = np.minimum( - np.arange(n_original_timesteps) // timesteps_per_cluster, - n_original_clusters - 1, - ) - # Position within the period (0 to timesteps_per_cluster-1) - positions_in_period = np.arange(n_original_timesteps) % timesteps_per_cluster - - # Create DataArrays for indexing (with original_time dimension, coords added later) - original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) - position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) - - # Map original period to cluster - cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) - - # Get segment index and position for each original timestep - # segment_assignments has shape (cluster, time) where time = timesteps_per_cluster - seg_indices = segment_assignments.isel(cluster=cluster_indices, time=position_in_period_da) - positions = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) - durations = segment_durations.isel(cluster=cluster_indices, segment=seg_indices) - - # Calculate interpolation factor: position within segment (0 to 1) - # At position=0, factor=0.5/duration (start of segment) - # At position=duration-1, factor approaches 1 (end of segment) - factor = xr.where(durations > 1, (positions + 0.5) / durations, 0.5) - - # Get start and end boundary values from charge_state - # charge_state has dims (cluster, time) where time = segment boundaries (n_segments+1) - start_vals = da.isel(cluster=cluster_indices, time=seg_indices) - end_vals = da.isel(cluster=cluster_indices, time=seg_indices + 1) - - # Linear interpolation - interpolated = start_vals + (end_vals - start_vals) * factor - - # Clean up coordinate artifacts and rename - interpolated = interpolated.drop_vars(['cluster', 'time', 'segment'], errors='ignore') - interpolated = interpolated.rename({'original_time': 'time'}).assign_coords(time=original_timesteps) - - return interpolated.transpose('time', ...).assign_attrs(da.attrs) - - def _expand_first_timestep_only( - self, - da: xr.DataArray, - clustering: Clustering, - original_timesteps: pd.DatetimeIndex, - ) -> xr.DataArray: - """Expand binary event variables (startup/shutdown) to first timestep of each segment. - - For segmented systems, binary event variables like startup and shutdown indicate - that an event occurred somewhere in the segment. 
When expanding, we place the - event at the first timestep of each segment and set all other timesteps to 0. - - This method is only used for segmented systems. For non-segmented systems, - the timing within the cluster is preserved by normal expansion. - - Args: - da: Binary event DataArray with dims including (cluster, time). - clustering: Clustering object with segment info (must be segmented). - original_timesteps: Original timesteps to expand to. - - Returns: - Expanded DataArray with event values only at first timestep of each segment. - """ - # First expand normally (repeats values) - expanded = clustering.expand_data(da, original_time=original_timesteps) - - # Build mask: True only at first timestep of each segment - n_original_timesteps = len(original_timesteps) - timesteps_per_cluster = clustering.timesteps_per_cluster - n_original_clusters = clustering.n_original_clusters - - position_within_segment = clustering.results.position_within_segment - cluster_assignments = clustering.cluster_assignments - - # Compute original period index and position within period - original_period_indices = np.minimum( - np.arange(n_original_timesteps) // timesteps_per_cluster, - n_original_clusters - 1, - ) - positions_in_period = np.arange(n_original_timesteps) % timesteps_per_cluster - - # Create DataArrays for indexing (coords added later after rename) - original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) - position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) - - # Map to cluster and get position within segment - cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) - pos_in_segment = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) - - # Clean up and create mask - pos_in_segment = pos_in_segment.drop_vars(['cluster', 'time'], errors='ignore') - pos_in_segment = pos_in_segment.rename({'original_time': 'time'}).assign_coords(time=original_timesteps) - - # First timestep of segment has position 0 - is_first = pos_in_segment == 0 - - # Apply mask: keep value at first timestep, zero elsewhere - result = xr.where(is_first, expanded, 0) - return result.assign_attrs(da.attrs) - def expand(self) -> FlowSystem: """Expand a clustered FlowSystem back to full original timesteps. 
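The segmented charge_state expansion (both the removed helper above and the new `_Expander._interpolate_charge_state_segmented`) boils down to linear interpolation between a segment's two boundary values, using `factor = (position + 0.5) / duration`. A rough standalone sketch of that rule for a single segment, with made-up numbers:

```python
import numpy as np

# Illustrative values only, not flixopt data
start_val, end_val = 40.0, 10.0   # charge_state at segment start / end boundary
duration = 4                      # number of original timesteps in this segment

positions = np.arange(duration)   # 0, 1, 2, 3
factor = (positions + 0.5) / duration if duration > 1 else np.full(duration, 0.5)
interpolated = start_val + (end_val - start_val) * factor
print(interpolated)  # [36.25 28.75 21.25 13.75] - trajectory between 40 and 10
```

The half-step offset keeps interpolated values strictly between the boundaries, so a one-timestep segment simply takes the midpoint.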
@@ -1926,91 +2001,9 @@ def expand(self) -> FlowSystem: """ from .flow_system import FlowSystem - # Validate and extract clustering info + # Validate and create expander with pre-computed state clustering = self._validate_for_expansion() - - timesteps_per_cluster = clustering.timesteps_per_cluster - # For segmented systems, the time dimension has n_segments entries - n_segments = clustering.n_segments - time_dim_size = n_segments if n_segments is not None else timesteps_per_cluster - n_clusters = clustering.n_clusters - n_original_clusters = clustering.n_original_clusters - - # Get original timesteps and dimensions - original_timesteps = clustering.original_timesteps - n_original_timesteps = len(original_timesteps) - original_timesteps_extra = FlowSystem._create_timesteps_with_extra(original_timesteps, None) - - # For charge_state expansion: index of last valid original cluster - last_original_cluster_idx = min( - (n_original_timesteps - 1) // timesteps_per_cluster, - n_original_clusters - 1, - ) - - # Build variable category sets for expansion handling - variable_categories = getattr(self._fs, '_variable_categories', {}) - if variable_categories: - # Use registered categories - state_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_INTERPOLATE} - first_timestep_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP} - segment_total_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_DIVIDE} - else: - # Fallback to pattern matching for old FlowSystems without categories - state_vars = set() # Will use endswith('|charge_state') check - first_timestep_vars = set() # Will use endswith('|startup') / endswith('|shutdown') check - segment_total_vars = self._build_segment_total_varnames() if clustering.is_segmented else set() - - def _is_state_variable(var_name: str) -> bool: - return var_name in state_vars or (not variable_categories and var_name.endswith('|charge_state')) - - def _is_first_timestep_variable(var_name: str) -> bool: - return var_name in first_timestep_vars or ( - not variable_categories and (var_name.endswith('|startup') or var_name.endswith('|shutdown')) - ) - - # For segmented systems: build expansion divisor - expansion_divisor = None - if clustering.is_segmented: - expansion_divisor = clustering.build_expansion_divisor(original_time=original_timesteps) - - def _append_final_state(expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray: - """Append final state value from original data to expanded data.""" - cluster_assignments = clustering.cluster_assignments - if cluster_assignments.ndim == 1: - last_cluster = int(cluster_assignments.values[last_original_cluster_idx]) - extra_val = da.isel(cluster=last_cluster, time=-1) - else: - last_clusters = cluster_assignments.isel(original_cluster=last_original_cluster_idx) - extra_val = da.isel(cluster=last_clusters, time=-1) - extra_val = extra_val.drop_vars(['cluster', 'time'], errors='ignore') - extra_val = extra_val.expand_dims(time=[original_timesteps_extra[-1]]) - return xr.concat([expanded, extra_val], dim='time') - - def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) -> xr.DataArray: - """Expand a DataArray from clustered to original timesteps.""" - if 'time' not in da.dims: - return da.copy() - - has_cluster_dim = 'cluster' in da.dims - is_state = _is_state_variable(var_name) and has_cluster_dim - is_first_timestep = _is_first_timestep_variable(var_name) and has_cluster_dim - is_segment_total = is_solution and 
var_name in segment_total_vars - - # Choose expansion method - if is_state and clustering.is_segmented: - expanded = self._interpolate_charge_state_segmented(da, clustering, original_timesteps) - elif is_first_timestep and is_solution and clustering.is_segmented: - return self._expand_first_timestep_only(da, clustering, original_timesteps) - else: - expanded = clustering.expand_data(da, original_time=original_timesteps) - if is_segment_total and expansion_divisor is not None: - expanded = expanded / expansion_divisor - - # State variables need final state appended - if is_state: - expanded = _append_final_state(expanded, da) - - return expanded + expander = _Expander(self._fs, clustering) # Helper to construct DataArray without slow _construct_dataarray def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: @@ -2037,7 +2030,7 @@ def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: # (e.g., representative_weights with dims ('cluster',) or ('cluster', 'period')) if 'cluster' in da.dims and 'time' not in da.dims: continue - data_vars[name] = expand_da(da, name) + data_vars[name] = expander.expand_dataarray(da, name) # Remove timestep_duration reference from attrs - let FlowSystem compute it from timesteps_extra # This ensures proper time coordinates for xarray alignment with N+1 solution timesteps attrs = {k: v for k, v in reduced_ds.attrs.items() if k not in clustering_attrs and k != 'timestep_duration'} @@ -2055,18 +2048,18 @@ def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: if name in sol_coord_names: continue da = _fast_get_da(reduced_solution, name, sol_coord_cache) - expanded_sol_vars[name] = expand_da(da, name, is_solution=True) + expanded_sol_vars[name] = expander.expand_dataarray(da, name, is_solution=True) expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) - expanded_fs._solution = expanded_fs._solution.reindex(time=original_timesteps_extra) + expanded_fs._solution = expanded_fs._solution.reindex(time=expander._original_timesteps_extra) # 3. 
Combine charge_state with SOC_boundary for intercluster storages self._combine_intercluster_charge_states( expanded_fs, reduced_solution, clustering, - original_timesteps_extra, - timesteps_per_cluster, - n_original_clusters, + expander._original_timesteps_extra, + expander._timesteps_per_cluster, + expander._n_original_clusters, ) # Log expansion info @@ -2075,15 +2068,15 @@ def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: n_combinations = (len(self._fs.periods) if has_periods else 1) * ( len(self._fs.scenarios) if has_scenarios else 1 ) - n_reduced_timesteps = n_clusters * time_dim_size - segmented_info = f' ({n_segments} segments)' if n_segments else '' + n_reduced_timesteps = expander._n_clusters * expander._time_dim_size + segmented_info = f' ({expander._n_segments} segments)' if expander._n_segments else '' logger.info( - f'Expanded FlowSystem from {n_reduced_timesteps} to {n_original_timesteps} timesteps ' - f'({n_clusters} clusters{segmented_info}' + f'Expanded FlowSystem from {n_reduced_timesteps} to {expander._n_original_timesteps} timesteps ' + f'({expander._n_clusters} clusters{segmented_info}' + ( f', {n_combinations} period/scenario combinations)' if n_combinations > 1 - else f' → {n_original_clusters} original clusters)' + else f' → {expander._n_original_clusters} original clusters)' ) ) From dadc33eb43d4024c590cc122c863846f5dbe05cc Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Wed, 28 Jan 2026 08:23:24 +0100 Subject: [PATCH 15/15] _Expander class (lines 82-491) - fully self-contained expansion logic: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _Expander ├── __init__(fs, clustering) # Pre-computes all shared state ├── _is_state_variable() # Category checks ├── _is_first_timestep_variable() ├── _build_segment_total_varnames() # Backward compatibility fallback ├── _append_final_state() # State variable N+1 handling ├── _interpolate_charge_state_segmented() # Segmented charge_state interpolation ├── _expand_first_timestep_only() # Startup/shutdown expansion ├── expand_dataarray() # Single DataArray expansion ├── _fast_get_da() # Performance helper ├── _combine_intercluster_charge_states() # SOC boundary handling ├── _apply_soc_decay() # Self-discharge decay └── expand_flow_system() # Main entry point - returns expanded FlowSystem TransformAccessor.expand() (lines 1944-2048) - thin wrapper: def expand(self) -> FlowSystem: """...(docstring preserved)...""" clustering = self._validate_for_expansion() expander = _Expander(self._fs, clustering) return expander.expand_flow_system() Benefits: - Complete encapsulation: all expansion logic in one class - Self-contained: _Expander doesn't depend on TransformAccessor internals - Clean separation: accessor validates, expander does the work - Testable: _Expander could be unit tested independently - Simpler signatures: methods use instance state instead of passing many parameters --- flixopt/transform_accessor.py | 389 ++++++++++++++++------------------ 1 file changed, 180 insertions(+), 209 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index fbd17eaab..54bf69670 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -333,6 +333,185 @@ def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bo return expanded + def _fast_get_da(self, ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: + """Construct DataArray without slow 
_construct_dataarray calls.""" + variable = ds.variables[name] + var_dims = set(variable.dims) + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} + return xr.DataArray(variable, coords=coords, name=name) + + def _combine_intercluster_charge_states(self, expanded_fs: FlowSystem, reduced_solution: xr.Dataset) -> None: + """Combine charge_state with SOC_boundary for intercluster storages (in-place). + + For intercluster storages, charge_state is relative (delta-E) and can be negative. + Per Blanke et al. (2022) Eq. 9, actual SOC at time t in period d is: + SOC(t) = SOC_boundary[d] * (1 - loss)^t_within_period + charge_state(t) + where t_within_period is hours from period start (accounts for self-discharge decay). + + Args: + expanded_fs: The expanded FlowSystem (modified in-place). + reduced_solution: The original reduced solution dataset. + """ + n_original_timesteps_extra = len(self._original_timesteps_extra) + soc_boundary_vars = self._fs.get_variables_by_category(VariableCategory.SOC_BOUNDARY) + + for soc_boundary_name in soc_boundary_vars: + storage_name = soc_boundary_name.rsplit('|', 1)[0] + charge_state_name = f'{storage_name}|charge_state' + if charge_state_name not in expanded_fs._solution: + continue + + soc_boundary = reduced_solution[soc_boundary_name] + expanded_charge_state = expanded_fs._solution[charge_state_name] + + # Map each original timestep to its original period index + original_cluster_indices = np.minimum( + np.arange(n_original_timesteps_extra) // self._timesteps_per_cluster, + self._n_original_clusters - 1, + ) + + # Select SOC_boundary for each timestep + soc_boundary_per_timestep = soc_boundary.isel( + cluster_boundary=xr.DataArray(original_cluster_indices, dims=['time']) + ).assign_coords(time=self._original_timesteps_extra) + + # Apply self-discharge decay + soc_boundary_per_timestep = self._apply_soc_decay( + soc_boundary_per_timestep, storage_name, original_cluster_indices + ) + + # Combine and clip to non-negative + combined = (expanded_charge_state + soc_boundary_per_timestep).clip(min=0) + expanded_fs._solution[charge_state_name] = combined.assign_attrs(expanded_charge_state.attrs) + + # Clean up SOC_boundary variables and orphaned coordinates + for soc_boundary_name in soc_boundary_vars: + if soc_boundary_name in expanded_fs._solution: + del expanded_fs._solution[soc_boundary_name] + if 'cluster_boundary' in expanded_fs._solution.coords: + expanded_fs._solution = expanded_fs._solution.drop_vars('cluster_boundary') + + def _apply_soc_decay( + self, + soc_boundary_per_timestep: xr.DataArray, + storage_name: str, + original_cluster_indices: np.ndarray, + ) -> xr.DataArray: + """Apply self-discharge decay to SOC_boundary values. + + Args: + soc_boundary_per_timestep: SOC boundary values mapped to each timestep. + storage_name: Name of the storage component. + original_cluster_indices: Mapping of timesteps to original cluster indices. + + Returns: + SOC boundary values with decay applied. + """ + storage = self._fs.storages.get(storage_name) + if storage is None: + return soc_boundary_per_timestep + + n_timesteps = len(self._original_timesteps_extra) + + # Time within period for each timestep (0, 1, 2, ..., T-1, 0, 1, ...) 
+ time_within_period = np.arange(n_timesteps) % self._timesteps_per_cluster + time_within_period[-1] = self._timesteps_per_cluster # Extra timestep gets full decay + time_within_period_da = xr.DataArray( + time_within_period, dims=['time'], coords={'time': self._original_timesteps_extra} + ) + + # Decay factor: (1 - loss)^t + loss_value = _scalar_safe_reduce(storage.relative_loss_per_hour, 'time', 'mean') + loss_arr = np.asarray(loss_value) + if not np.any(loss_arr > 0): + return soc_boundary_per_timestep + + decay_da = (1 - loss_arr) ** time_within_period_da + + # Handle cluster dimension if present + if 'cluster' in decay_da.dims: + cluster_assignments = self._clustering.cluster_assignments + if cluster_assignments.ndim == 1: + cluster_per_timestep = xr.DataArray( + cluster_assignments.values[original_cluster_indices], + dims=['time'], + coords={'time': self._original_timesteps_extra}, + ) + else: + cluster_per_timestep = cluster_assignments.isel( + original_cluster=xr.DataArray(original_cluster_indices, dims=['time']) + ).assign_coords(time=self._original_timesteps_extra) + decay_da = decay_da.isel(cluster=cluster_per_timestep).drop_vars('cluster', errors='ignore') + + return soc_boundary_per_timestep * decay_da + + def expand_flow_system(self) -> FlowSystem: + """Expand the clustered FlowSystem to full original timesteps. + + Returns: + FlowSystem: A new FlowSystem with full timesteps and expanded solution. + """ + from .flow_system import FlowSystem + + # 1. Expand FlowSystem data + reduced_ds = self._fs.to_dataset(include_solution=False) + clustering_attrs = {'is_clustered', 'n_clusters', 'timesteps_per_cluster', 'clustering', 'cluster_weight'} + skip_vars = {'cluster_weight', 'timestep_duration'} # These have special handling + data_vars = {} + coord_cache = {k: v for k, v in reduced_ds.coords.items()} + coord_names = set(coord_cache) + for name in reduced_ds.variables: + if name in coord_names: + continue + if name in skip_vars or name.startswith('clustering|'): + continue + da = self._fast_get_da(reduced_ds, name, coord_cache) + # Skip vars with cluster dim but no time dim - they don't make sense after expansion + if 'cluster' in da.dims and 'time' not in da.dims: + continue + data_vars[name] = self.expand_dataarray(da, name) + # Remove timestep_duration reference from attrs - let FlowSystem compute it from timesteps_extra + attrs = {k: v for k, v in reduced_ds.attrs.items() if k not in clustering_attrs and k != 'timestep_duration'} + expanded_ds = xr.Dataset(data_vars, attrs=attrs) + + expanded_fs = FlowSystem.from_dataset(expanded_ds) + + # 2. Expand solution (with segment total correction for segmented systems) + reduced_solution = self._fs.solution + sol_coord_cache = {k: v for k, v in reduced_solution.coords.items()} + sol_coord_names = set(sol_coord_cache) + expanded_sol_vars = {} + for name in reduced_solution.variables: + if name in sol_coord_names: + continue + da = self._fast_get_da(reduced_solution, name, sol_coord_cache) + expanded_sol_vars[name] = self.expand_dataarray(da, name, is_solution=True) + expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) + expanded_fs._solution = expanded_fs._solution.reindex(time=self._original_timesteps_extra) + + # 3. 
Combine charge_state with SOC_boundary for intercluster storages + self._combine_intercluster_charge_states(expanded_fs, reduced_solution) + + # Log expansion info + has_periods = self._fs.periods is not None + has_scenarios = self._fs.scenarios is not None + n_combinations = (len(self._fs.periods) if has_periods else 1) * ( + len(self._fs.scenarios) if has_scenarios else 1 + ) + n_reduced_timesteps = self._n_clusters * self._time_dim_size + segmented_info = f' ({self._n_segments} segments)' if self._n_segments else '' + logger.info( + f'Expanded FlowSystem from {n_reduced_timesteps} to {self._n_original_timesteps} timesteps ' + f'({self._n_clusters} clusters{segmented_info}' + + ( + f', {n_combinations} period/scenario combinations)' + if n_combinations > 1 + else f' → {self._n_original_clusters} original clusters)' + ) + ) + + return expanded_fs + class TransformAccessor: """ @@ -1762,135 +1941,6 @@ def _validate_for_expansion(self) -> Clustering: return self._fs.clustering - def _combine_intercluster_charge_states( - self, - expanded_fs: FlowSystem, - reduced_solution: xr.Dataset, - clustering: Clustering, - original_timesteps_extra: pd.DatetimeIndex, - timesteps_per_cluster: int, - n_original_clusters: int, - ) -> None: - """Combine charge_state with SOC_boundary for intercluster storages (in-place). - - For intercluster storages, charge_state is relative (delta-E) and can be negative. - Per Blanke et al. (2022) Eq. 9, actual SOC at time t in period d is: - SOC(t) = SOC_boundary[d] * (1 - loss)^t_within_period + charge_state(t) - where t_within_period is hours from period start (accounts for self-discharge decay). - - Args: - expanded_fs: The expanded FlowSystem (modified in-place). - reduced_solution: The original reduced solution dataset. - clustering: Clustering with cluster order info. - original_timesteps_extra: Original timesteps including the extra final timestep. - timesteps_per_cluster: Number of timesteps per cluster. - n_original_clusters: Number of original clusters before aggregation. 
- """ - n_original_timesteps_extra = len(original_timesteps_extra) - soc_boundary_vars = self._fs.get_variables_by_category(VariableCategory.SOC_BOUNDARY) - - for soc_boundary_name in soc_boundary_vars: - storage_name = soc_boundary_name.rsplit('|', 1)[0] - charge_state_name = f'{storage_name}|charge_state' - if charge_state_name not in expanded_fs._solution: - continue - - soc_boundary = reduced_solution[soc_boundary_name] - expanded_charge_state = expanded_fs._solution[charge_state_name] - - # Map each original timestep to its original period index - original_cluster_indices = np.minimum( - np.arange(n_original_timesteps_extra) // timesteps_per_cluster, - n_original_clusters - 1, - ) - - # Select SOC_boundary for each timestep - soc_boundary_per_timestep = soc_boundary.isel( - cluster_boundary=xr.DataArray(original_cluster_indices, dims=['time']) - ).assign_coords(time=original_timesteps_extra) - - # Apply self-discharge decay - soc_boundary_per_timestep = self._apply_soc_decay( - soc_boundary_per_timestep, - storage_name, - clustering, - original_timesteps_extra, - original_cluster_indices, - timesteps_per_cluster, - ) - - # Combine and clip to non-negative - combined = (expanded_charge_state + soc_boundary_per_timestep).clip(min=0) - expanded_fs._solution[charge_state_name] = combined.assign_attrs(expanded_charge_state.attrs) - - # Clean up SOC_boundary variables and orphaned coordinates - for soc_boundary_name in soc_boundary_vars: - if soc_boundary_name in expanded_fs._solution: - del expanded_fs._solution[soc_boundary_name] - if 'cluster_boundary' in expanded_fs._solution.coords: - expanded_fs._solution = expanded_fs._solution.drop_vars('cluster_boundary') - - def _apply_soc_decay( - self, - soc_boundary_per_timestep: xr.DataArray, - storage_name: str, - clustering: Clustering, - original_timesteps_extra: pd.DatetimeIndex, - original_cluster_indices: np.ndarray, - timesteps_per_cluster: int, - ) -> xr.DataArray: - """Apply self-discharge decay to SOC_boundary values. - - Args: - soc_boundary_per_timestep: SOC boundary values mapped to each timestep. - storage_name: Name of the storage component. - clustering: Clustering with cluster order info. - original_timesteps_extra: Original timesteps including final extra timestep. - original_cluster_indices: Mapping of timesteps to original cluster indices. - timesteps_per_cluster: Number of timesteps per cluster. - - Returns: - SOC boundary values with decay applied. - """ - storage = self._fs.storages.get(storage_name) - if storage is None: - return soc_boundary_per_timestep - - n_timesteps = len(original_timesteps_extra) - - # Time within period for each timestep (0, 1, 2, ..., T-1, 0, 1, ...) 
- time_within_period = np.arange(n_timesteps) % timesteps_per_cluster - time_within_period[-1] = timesteps_per_cluster # Extra timestep gets full decay - time_within_period_da = xr.DataArray( - time_within_period, dims=['time'], coords={'time': original_timesteps_extra} - ) - - # Decay factor: (1 - loss)^t - loss_value = _scalar_safe_reduce(storage.relative_loss_per_hour, 'time', 'mean') - # Normalize to array for consistent handling (scalar_safe_reduce may return scalar or DataArray) - loss_arr = np.asarray(loss_value) - if not np.any(loss_arr > 0): - return soc_boundary_per_timestep - - decay_da = (1 - loss_arr) ** time_within_period_da - - # Handle cluster dimension if present - if 'cluster' in decay_da.dims: - cluster_assignments = clustering.cluster_assignments - if cluster_assignments.ndim == 1: - cluster_per_timestep = xr.DataArray( - cluster_assignments.values[original_cluster_indices], - dims=['time'], - coords={'time': original_timesteps_extra}, - ) - else: - cluster_per_timestep = cluster_assignments.isel( - original_cluster=xr.DataArray(original_cluster_indices, dims=['time']) - ).assign_coords(time=original_timesteps_extra) - decay_da = decay_da.isel(cluster=cluster_per_timestep).drop_vars('cluster', errors='ignore') - - return soc_boundary_per_timestep * decay_da - def expand(self) -> FlowSystem: """Expand a clustered FlowSystem back to full original timesteps. @@ -1999,85 +2049,6 @@ def expand(self) -> FlowSystem: - ``{flow}|startup``: Startup event - ``{flow}|shutdown``: Shutdown event """ - from .flow_system import FlowSystem - - # Validate and create expander with pre-computed state clustering = self._validate_for_expansion() expander = _Expander(self._fs, clustering) - - # Helper to construct DataArray without slow _construct_dataarray - def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: - variable = ds.variables[name] - var_dims = set(variable.dims) - coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} - return xr.DataArray(variable, coords=coords, name=name) - - # 1. Expand FlowSystem data - reduced_ds = self._fs.to_dataset(include_solution=False) - clustering_attrs = {'is_clustered', 'n_clusters', 'timesteps_per_cluster', 'clustering', 'cluster_weight'} - skip_vars = {'cluster_weight', 'timestep_duration'} # These have special handling - data_vars = {} - # Use ds.variables pattern to avoid slow _construct_dataarray calls - coord_cache = {k: v for k, v in reduced_ds.coords.items()} - coord_names = set(coord_cache) - for name in reduced_ds.variables: - if name in coord_names: - continue - if name in skip_vars or name.startswith('clustering|'): - continue - da = _fast_get_da(reduced_ds, name, coord_cache) - # Skip vars with cluster dim but no time dim - they don't make sense after expansion - # (e.g., representative_weights with dims ('cluster',) or ('cluster', 'period')) - if 'cluster' in da.dims and 'time' not in da.dims: - continue - data_vars[name] = expander.expand_dataarray(da, name) - # Remove timestep_duration reference from attrs - let FlowSystem compute it from timesteps_extra - # This ensures proper time coordinates for xarray alignment with N+1 solution timesteps - attrs = {k: v for k, v in reduced_ds.attrs.items() if k not in clustering_attrs and k != 'timestep_duration'} - expanded_ds = xr.Dataset(data_vars, attrs=attrs) - - expanded_fs = FlowSystem.from_dataset(expanded_ds) - - # 2. 
Expand solution (with segment total correction for segmented systems) - reduced_solution = self._fs.solution - # Use ds.variables pattern to avoid slow _construct_dataarray calls - sol_coord_cache = {k: v for k, v in reduced_solution.coords.items()} - sol_coord_names = set(sol_coord_cache) - expanded_sol_vars = {} - for name in reduced_solution.variables: - if name in sol_coord_names: - continue - da = _fast_get_da(reduced_solution, name, sol_coord_cache) - expanded_sol_vars[name] = expander.expand_dataarray(da, name, is_solution=True) - expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) - expanded_fs._solution = expanded_fs._solution.reindex(time=expander._original_timesteps_extra) - - # 3. Combine charge_state with SOC_boundary for intercluster storages - self._combine_intercluster_charge_states( - expanded_fs, - reduced_solution, - clustering, - expander._original_timesteps_extra, - expander._timesteps_per_cluster, - expander._n_original_clusters, - ) - - # Log expansion info - has_periods = self._fs.periods is not None - has_scenarios = self._fs.scenarios is not None - n_combinations = (len(self._fs.periods) if has_periods else 1) * ( - len(self._fs.scenarios) if has_scenarios else 1 - ) - n_reduced_timesteps = expander._n_clusters * expander._time_dim_size - segmented_info = f' ({expander._n_segments} segments)' if expander._n_segments else '' - logger.info( - f'Expanded FlowSystem from {n_reduced_timesteps} to {expander._n_original_timesteps} timesteps ' - f'({expander._n_clusters} clusters{segmented_info}' - + ( - f', {n_combinations} period/scenario combinations)' - if n_combinations > 1 - else f' → {expander._n_original_clusters} original clusters)' - ) - ) - - return expanded_fs + return expander.expand_flow_system()
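
Note on the intercluster SOC reconstruction in _Expander._combine_intercluster_charge_states / _apply_soc_decay: below is a minimal NumPy sketch of the Blanke et al. (2022) Eq. 9 formula referenced in the docstrings. All values are invented toy data (scalar loss, no period/scenario dimensions); it is illustrative only and not part of the patch.

import numpy as np

# Toy setup: 2 original clusters of 3 timesteps, plus the extra boundary timestep,
# and a 1 %/h self-discharge loss (all values made up for illustration).
timesteps_per_cluster = 3
n_original_clusters = 2
loss = 0.01

soc_boundary = np.array([10.0, 7.0])                              # SOC at each original cluster start
charge_state = np.array([-1.0, 0.5, 1.0, -2.0, -1.0, 0.0, 0.5])   # relative delta-E per timestep (+1 extra)

n_timesteps_extra = len(charge_state)
# Map each timestep to its original cluster; the extra timestep stays in the last cluster
cluster_idx = np.minimum(np.arange(n_timesteps_extra) // timesteps_per_cluster,
                         n_original_clusters - 1)
# Hours since the cluster start; the extra timestep gets the full decay
t_within = np.arange(n_timesteps_extra) % timesteps_per_cluster
t_within[-1] = timesteps_per_cluster

# Eq. 9: SOC(t) = SOC_boundary[d] * (1 - loss)**t_within + charge_state(t), clipped at 0
soc = np.clip(soc_boundary[cluster_idx] * (1 - loss) ** t_within + charge_state, 0.0, None)
print(soc.round(3))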
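
Note on the _fast_get_da helper used by _Expander: it rebuilds each DataArray from ds.variables plus a pre-filtered coordinate cache, skipping xarray's per-variable _construct_dataarray work. The following standalone sketch shows the same pattern on an invented toy dataset (the 'flow' variable and coordinate names are illustrative, not flixopt's) and checks that it matches the regular accessor.

import numpy as np
import pandas as pd
import xarray as xr

# Invented toy dataset: one variable over (time, cluster)
ds = xr.Dataset(
    {'flow': (('time', 'cluster'), np.arange(8.0).reshape(4, 2))},
    coords={'time': pd.date_range('2026-01-01', periods=4, freq='h'), 'cluster': [0, 1]},
)

coord_cache = dict(ds.coords)          # build once per dataset, reuse for every variable
name = 'flow'
variable = ds.variables[name]          # raw Variable: no coordinate bookkeeping
var_dims = set(variable.dims)
coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)}
da = xr.DataArray(variable, coords=coords, name=name)

assert da.equals(ds[name])             # same data and coords as the regular (slower) accessor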
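
Note on the "Handle cluster dimension if present" branch of _apply_soc_decay: it relies on xarray's vectorized (pointwise) indexing, where an integer indexer carrying the 'time' dimension picks one cluster per timestep. A tiny sketch with invented values:

import numpy as np
import xarray as xr

# Invented decay table: rows are clusters, columns are timesteps
decay = xr.DataArray(
    np.array([[1.00, 0.99, 0.98],    # cluster 0
              [1.00, 0.95, 0.90]]),  # cluster 1
    dims=['cluster', 'time'],
)
# Which cluster each timestep belongs to (indexer shares the 'time' dim of the array)
cluster_per_timestep = xr.DataArray([0, 0, 1], dims=['time'])

# Pointwise indexing: selects decay[cluster_per_timestep[t], t] for every t,
# so the result keeps only the 'time' dim; drop any attached 'cluster' coord
decay_per_timestep = decay.isel(cluster=cluster_per_timestep).drop_vars('cluster', errors='ignore')
print(decay_per_timestep.values)     # -> [1.0, 0.99, 0.90]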