# Multivariate Time Series Refinement and Imputation Workflow

## Purpose
Robust preprocessing and imputation pipeline for multivariate time series data, specifically designed for geospatial/hydrogeological measurements across multiple layers.

## Workflow Overview
1. **Data Preparation**
   - Loads time series data for multiple stations from HDF5 file
   - Processes monthly measurements across different layers

2. **Time Series Refinement Process**
   - Removes a third-order polynomial trend from each layer's series
   - Uses StandardScaler for normalization
   - Implements outlier filtering (values with |z-score| > 3 are set to missing before imputation)

3. **PCA-Based Imputation**
   - Automatically optimizes embedding dimensions
   - Performs parameter grid search
   - Applies Principal Component Analysis (PCA) imputation
   - Reconstructs time series with minimal information loss

4. **Key Transformations**
   - Trend decomposition
   - Statistical scaling
   - Intelligent missing value reconstruction

5. **Output**
   - Generates refined time series for each station and layer
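   - Writes the original and imputed data (stored as `compactbylayer_PCA`) to a new date-stamped HDF5 file (`<YYYYMMDD>_MLCW_CRFP_monthly_v2.h5`); the input file is not modified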
   - Creates visualization of original vs. refined time series

## Key Scientific Techniques
- Trend removal
- Statistical normalization
- PCA-based imputation
- Multivariate time series analysis

## Computational Workflow
1. Per station → Per layer processing
2. Automated parameter optimization
3. Robust imputation strategy
4. Comprehensive error handling

## Use Case
Geospatial measurement refinement, particularly for hydrogeological time series data with complex temporal characteristics and missing values.

In [1]:
from appgeopy import *
from my_packages import *
from pca_imputation import *
from sklearn.preprocessing import StandardScaler

In [2]:
def run_pca_imputation_workflow(data, time_col=None, value_col=None):
    """Execute PCA imputation workflow with parameter optimization and validation.

    The workflow runs three steps:

    1. Grid-search the embedding dimension and number of components with
       ``parameter_grid_search`` and keep the combination with the lowest RMSE.
    2. Impute the series with ``impute_time_series`` using those parameters.
    3. Estimate the imputation accuracy with ``validate_imputation_accuracy``.

    Steps 1 and 3 are best-effort: if either raises, a ``RuntimeWarning`` is
    emitted and the workflow continues with fallback values (minimal
    parameters for step 1, NaN metrics for step 3). Errors raised by step 2
    propagate to the caller.

    Args:
        data: pandas Series with datetime index or DataFrame
        time_col: column name for time if data is DataFrame
        value_col: column name for value if data is DataFrame

    Returns:
        dict: Results with the keys
            ``"imputation_results"`` (return value of ``impute_time_series``),
            ``"optimal_parameters"`` (``{"embedding_dim": int, "n_components": int}``),
            ``"validation_metrics"`` (metrics from validation, or a dict of NaN
            ``rmse``/``mae``/``r2`` when validation failed).
    """
    import warnings

    data_length = len(data)

    # The masking ratios below need a *scalar* missing fraction. For DataFrame
    # input ``data.isna().mean()`` is a per-column Series, which makes ``min()``
    # raise; measure the value column instead (or all cells if none is named).
    if value_col is not None and hasattr(data, "columns"):
        observed = data[value_col]
    else:
        observed = data
    missing_fraction = float(np.asarray(observed.isna()).mean())
    # Never mask more than half of the values that are actually present.
    maskable_fraction = 0.5 * (1 - missing_fraction)

    # Embedding dimension must not exceed 1/3 of the data length (minimum 2).
    max_embedding = max(2, data_length // 3)

    # Even embedding dimensions from 2 up to max_embedding, capped at 12.
    embedding_dims = list(range(2, min(max_embedding + 1, 13), 2))

    # Candidate component counts: one less than each embedding dimension,
    # capped at 3, de-duplicated and kept in ascending order.
    n_components_list = sorted({min(dim - 1, 3) for dim in embedding_dims})

    # Step 1: Parameter optimization with safeguards
    try:
        param_results = parameter_grid_search(
            data=data,
            embedding_dims=embedding_dims,
            n_components_list=n_components_list,
            mask_ratio=min(0.1, maskable_fraction),  # Adaptive masking
            random_seed=42,
            time_col=time_col,
            value_col=value_col,
            use_cross_validation=False,
        )

        # Row with the lowest RMSE holds the optimal parameters
        best_params = param_results.loc[param_results["rmse"].idxmin()]
        best_embedding_dim = int(best_params["embedding_dim"])
        best_n_components = int(best_params["n_components"])
    except Exception as exc:
        # Best-effort: fall back to minimal parameters, but say so instead of
        # hiding the failure.
        warnings.warn(
            f"Parameter grid search failed ({exc!r}); "
            "falling back to embedding_dim=2, n_components=1.",
            RuntimeWarning,
            stacklevel=2,
        )
        best_embedding_dim = 2
        best_n_components = 1

    # Step 2: PCA imputation with the selected parameters
    results = impute_time_series(
        data=data,
        embedding_dim=best_embedding_dim,
        n_components=best_n_components,
        time_delay=1,
        time_col=time_col,
        value_col=value_col,
    )

    # Step 3: Validation with minimal additional masking
    try:
        # Only the metrics are reported; the remaining three return values
        # are not part of this function's result.
        validation_metrics, _, _, _ = validate_imputation_accuracy(
            data=data,
            embedding_dim=best_embedding_dim,
            n_components=best_n_components,
            time_delay=1,
            mask_ratio=min(0.05, maskable_fraction),  # Conservative masking
            random_seed=42,
            time_col=time_col,
            value_col=value_col,
        )
    except Exception as exc:
        warnings.warn(
            f"Imputation validation failed ({exc!r}); metrics are reported as NaN.",
            RuntimeWarning,
            stacklevel=2,
        )
        validation_metrics = {"rmse": np.nan, "mae": np.nan, "r2": np.nan}

    return {
        "imputation_results": results,
        "optimal_parameters": {
            "embedding_dim": best_embedding_dim,
            "n_components": best_n_components,
        },
        "validation_metrics": validation_metrics,
    }

In [3]:
# Absolute, machine-specific path to the monthly MLCW HDF5 input file.
mlcw_fpath = r"D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\2_KrigingInterpolation\20250314_MLCW_CRFP_monthly_v1.h5"
# MLCW is provided by one of the star-imports above.
# NOTE(review): presumably a reader wrapper around the HDF5 file - confirm in its package.
mlcw_obj = MLCW(mlcw_fpath)

# get_data() yields a pair: the measurements (indexed by station name in the
# processing loop below) and the metadata (written back unchanged at the end).
mlcw_measures, mlcw_metadata = mlcw_obj.get_data()

# Station identifiers to iterate over; the slice only previews the first five
# as the cell output.
available_stations = mlcw_obj.list_stations()
available_stations[:5]

['ANHE', 'BEICHEN', 'CANLIN', 'DONGGUANG', 'ERLUN']

In [20]:
# One {station: measures} dict per successfully processed station; merged and
# written to HDF5 by the next cell.
cache = []


def string_decoder(arr):
    """Decode an iterable of UTF-8 byte strings into a list of ``str``."""
    return [x.decode("utf-8") for x in arr]


fig_savefld = "refine_timeseries/"
# select_station = "TUKU"
for select_station in tqdm(available_stations):
    # Best-effort per station: a failure is reported and the loop moves on.
    try:
        measures_byStation = mlcw_measures[select_station]
        monthly_date_arr = pd.to_datetime(string_decoder(measures_byStation["monthly_date"]))
        monthly_values_arr = measures_byStation["monthly_values"]["compactbylayer"]

        # One column per layer (first axis of the values array), indexed by date.
        cdisp_mlcw_df = pd.DataFrame(data={"time": monthly_date_arr})

        n_layers = monthly_values_arr.shape[0]

        for i in range(n_layers):
            cdisp_mlcw_df[f"Layer_{i+1}"] = monthly_values_arr[i]

        cdisp_mlcw_df = cdisp_mlcw_df.set_index("time")

        # Re-fitted for every layer; inverse_transform below always uses the
        # statistics of the layer that was just fitted.
        scaler = StandardScaler()

        refined_output_df = pd.DataFrame(data=None, index=cdisp_mlcw_df.index)

        for layer in cdisp_mlcw_df.columns:
            target_array = cdisp_mlcw_df[layer]
            if target_array.sum() != 0:
                # Remove the cubic trend so that only the residual is imputed.
                _trend, _slope = get_polynomial_trend(series=target_array, order=3)
                _detrend = target_array - _trend

                scaled_values = scaler.fit_transform(_detrend.values.reshape(-1, 1))
                # Reconstruct scaled series, maintaining original index
                scaled_series = pd.Series(
                    scaled_values.flatten(), index=_detrend.index, name="Scaled Values"
                )

                # Treat |z| > 3 as outliers: blank them so they get imputed too.
                filter_cond = (scaled_series > 3) | (scaled_series < -3)
                scaled_series[filter_cond] = np.nan

                results = run_pca_imputation_workflow(scaled_series)

                # np.asarray keeps the reshape valid whether the imputer hands
                # back an ndarray or a pandas Series.
                reconstructed_scaled_detrend = np.asarray(
                    results["imputation_results"]["reconstructed_series"]
                )
                reconstructed_detrend = scaler.inverse_transform(
                    reconstructed_scaled_detrend.reshape(-1, 1)
                )
                reconstructed_detrend_series = pd.Series(
                    data=reconstructed_detrend.flatten(), index=_detrend.index
                )

                # Add the trend back to return to the original units.
                reconstructed_target = _trend + reconstructed_detrend_series
                refined_output_df[layer] = refined_output_df.index.map(reconstructed_target)
            else:
                # Layers whose values sum to zero are copied through unchanged.
                refined_output_df[layer] = refined_output_df.index.map(target_array)

        # Stored layer-major (layers x time), next to the original array.
        measures_byStation["monthly_values"]["compactbylayer_PCA"] = refined_output_df.T.values

        cache.append({select_station: measures_byStation})

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        # squeeze=False always returns a 2-D array of Axes, so flatten() also
        # works for stations with a single layer.
        fig, axes = plt.subplots(
            nrows=n_layers, ncols=1, sharex=True, figsize=(11.7, 8.3), squeeze=False
        )
        try:
            axes = axes.flatten()
            for idx, layer in enumerate(cdisp_mlcw_df.columns):
                axes[idx].plot(
                    refined_output_df.loc[:, layer], color="blue", linestyle="--", marker="o", ms=2
                )
                axes[idx].plot(cdisp_mlcw_df.loc[:, layer], label=layer, color="black")
                visualize.configure_axis(axes[idx], hide_spines=["right", "top"], fontsize_base=12)
                visualize.configure_legend(axes[idx], fontsize_base=12)
                visualize.configure_datetime_ticks(axes[idx])
                visualize.configure_ticks(axes[idx])
            fig.suptitle(t=select_station, y=0.95, fontweight="bold", fontsize=16)
            fig.tight_layout()
            fig.autofmt_xdate(ha="center", rotation=90)
            # visualize.save_figure(fig=fig, savepath=os.path.join(fig_savefld, f"{select_station}.png"))
        finally:
            # Close this specific figure even when plotting fails, so figures
            # do not accumulate across stations.
            plt.close(fig)
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    except Exception as e:
        print(select_station, e)

  0%|          | 0/32 [00:00<?, ?it/s]

In [21]:
# Date stamp (YYYYMMDD) used as the prefix of the output file name.
today_string = datetime.now().strftime("%Y%m%d")
# `cache` holds one {station: measures} dict per processed station; combine
# them into the single mapping that is written out.
new_mlcw_measures = merge_dicts(*cache)
# Write updated data and metadata to a new HDF5 file in the working directory.
output_h5_name = f"{today_string}_MLCW_CRFP_monthly_v2.h5"
h5_writer = gwatertools.h5pytools
with h5py.File(output_h5_name, "w") as hdf5_file:
    h5_writer.data_to_hdf5(hdf5_file, new_mlcw_measures)
    h5_writer.metadata_to_hdf5(hdf5_file, mlcw_metadata)