In [1]:
from appgeopy import *
from my_packages import *
from pca_imputation import *
from sklearn.preprocessing import StandardScaler


def _missing_fraction(data, value_col=None):
    """Return the share of missing entries in ``data`` as a plain float.

    If ``value_col`` names an existing column of a DataFrame, only that column
    is inspected; otherwise every entry of ``data`` is counted. Empty input
    yields 0.0.
    """
    has_value_column = (
        value_col is not None
        and hasattr(data, "columns")
        and value_col in data.columns
    )
    values = data[value_col] if has_value_column else data
    # np.asarray collapses both Series (1-D) and DataFrame (2-D) flags into an
    # array whose mean is a single scalar, never a per-column Series.
    missing_flags = np.asarray(values.isna())
    if missing_flags.size == 0:
        return 0.0
    return float(missing_flags.mean())


def run_pca_imputation_workflow(data, time_col=None, value_col=None):
    """Execute PCA imputation workflow with parameter optimization and validation.

    The workflow has three steps:
      1. grid-search the embedding dimension / component count via
         ``parameter_grid_search`` and keep the row with the lowest ``rmse``;
      2. impute ``data`` with ``impute_time_series`` using those parameters;
      3. score the chosen parameters with ``validate_imputation_accuracy``.

    Steps 1 and 3 are best-effort: if either raises, a ``RuntimeWarning`` is
    emitted and the workflow continues (step 1 falls back to
    ``embedding_dim=2, n_components=1``; step 3 reports NaN metrics).
    Step 2 is not guarded, so its exceptions propagate to the caller.

    Args:
        data: pandas Series with datetime index or DataFrame
        time_col: column name for time if data is DataFrame
        value_col: column name for value if data is DataFrame

    Returns:
        dict: with keys
            ``"imputation_results"`` - whatever ``impute_time_series`` returned,
            ``"optimal_parameters"`` - ``{"embedding_dim": int, "n_components": int}``,
            ``"validation_metrics"`` - metrics from ``validate_imputation_accuracy``,
            or ``{"rmse": nan, "mae": nan, "r2": nan}`` if validation failed.
    """
    # Function-scope import keeps this block self-contained; the notebook's
    # import cell relies on star imports and may not expose ``warnings``.
    import warnings

    data_length = len(data)

    # Candidate embedding dimensions: even values from 2 up to 12, additionally
    # bounded by one third of the data length (with a floor of 2).
    max_embedding = max(2, data_length // 3)
    embedding_dims = list(range(2, min(max_embedding + 1, 13), 2))

    # Candidate component counts: one less than each embedding dimension,
    # capped at 3. Sorted so the grid is visited in a deterministic order.
    # NOTE(review): this list is shared by all embedding dimensions, so a pair
    # such as (embedding_dim=2, n_components=3) may reach the grid search -
    # presumably parameter_grid_search handles that; confirm.
    n_components_list = sorted({min(dim - 1, 3) for dim in embedding_dims})

    # Step 1: Parameter Optimization with safeguards
    try:
        # Scalar share of observed values. Computing this via
        # ``data.isna().mean()`` would give a Series for DataFrame input and
        # make ``min()`` raise, forcing the fallback on every DataFrame call.
        observed_fraction = 1 - _missing_fraction(data, value_col)
        param_results = parameter_grid_search(
            data=data,
            embedding_dims=embedding_dims,
            n_components_list=n_components_list,
            mask_ratio=min(0.1, 0.5 * observed_fraction),  # Adaptive masking
            random_seed=42,
            time_col=time_col,
            value_col=value_col,
            use_cross_validation=False,
        )

        # Keep the parameter combination with the smallest RMSE.
        best_params = param_results.loc[param_results["rmse"].idxmin()]
        best_embedding_dim = int(best_params["embedding_dim"])
        best_n_components = int(best_params["n_components"])
    except Exception as exc:
        # Fallback to minimal parameters if grid search fails, but say so:
        # a silent fallback hides data or configuration problems.
        warnings.warn(
            f"PCA parameter grid search failed ({exc!r}); "
            "falling back to embedding_dim=2, n_components=1.",
            RuntimeWarning,
            stacklevel=2,
        )
        best_embedding_dim = 2
        best_n_components = 1

    # Step 2: PCA Imputation with Optimal Parameters
    results = impute_time_series(
        data=data,
        embedding_dim=best_embedding_dim,
        n_components=best_n_components,
        time_delay=1,
        time_col=time_col,
        value_col=value_col,
    )

    # Step 3: Validation with minimal additional masking
    try:
        observed_fraction = 1 - _missing_fraction(data, value_col)
        # Only the metrics are reported; the other three return values
        # (original values, imputed values, mask indices) are discarded.
        validation_metrics, _, _, _ = validate_imputation_accuracy(
            data=data,
            embedding_dim=best_embedding_dim,
            n_components=best_n_components,
            time_delay=1,
            mask_ratio=min(0.05, 0.5 * observed_fraction),  # Conservative masking
            random_seed=42,
            time_col=time_col,
            value_col=value_col,
        )
    except Exception as exc:
        warnings.warn(
            f"PCA imputation validation failed ({exc!r}); metrics set to NaN.",
            RuntimeWarning,
            stacklevel=2,
        )
        validation_metrics = {"rmse": np.nan, "mae": np.nan, "r2": np.nan}

    return {
        "imputation_results": results,
        "optimal_parameters": {
            "embedding_dim": best_embedding_dim,
            "n_components": best_n_components,
        },
        "validation_metrics": validation_metrics,
    }

In [2]:
# Batch job: for every CSV directly under GPS_Files/, mask outliers in the
# "dU(mm)" column, fill the gaps with the PCA imputation workflow, and write
# the frame (with an added "PCA_dU(mm)" column) to GPS_Files/PCA_Imputed/.
output_dir = "GPS_Files/PCA_Imputed"

# Sorted so files are processed in the same order on every run (glob order is
# filesystem-dependent).
gps_files = sorted(glob("GPS_Files/*.csv"))
for select_file in gps_files:
    basename = os.path.basename(select_file)
    # Column 0 is parsed as dates and becomes the index; column 3 is the only
    # data column kept and is accessed below by the label "dU(mm)".
    df = pd.read_csv(select_file, parse_dates=[0], index_col=[0], usecols=[0, 3])

    # A fresh scaler per file: its fitted mean/scale are needed later to undo
    # the standardisation for this file only.
    scaler = StandardScaler()

    # Remove a polynomial trend (order=3) so the z-score outlier test and the
    # imputation work on the residual signal.
    target_array = df["dU(mm)"]
    _trend, _slope = get_polynomial_trend(series=target_array, order=3)
    _detrend = target_array - _trend
    scaled_values = scaler.fit_transform(_detrend.values.reshape(-1, 1))

    # Reconstruct scaled series, maintaining original index
    scaled_series = pd.Series(scaled_values.flatten(), index=_detrend.index, name="Scaled Values")
    # Residuals more than 3 standard deviations from the mean are treated as
    # outliers and turned into gaps for the imputer to fill.
    filter_cond = scaled_series.abs() > 3
    scaled_series = scaled_series.mask(filter_cond)

    results = run_pca_imputation_workflow(scaled_series)
    # np.asarray makes the reshape work whether the imputer returns an ndarray
    # or a pandas Series.
    reconstructed_scaled_detrend = np.asarray(
        results["imputation_results"]["reconstructed_series"]
    )
    reconstructed_detrend = scaler.inverse_transform(reconstructed_scaled_detrend.reshape(-1, 1))
    reconstructed_detrend_series = pd.Series(data=reconstructed_detrend.flatten(), index=_detrend.index)

    # Add the trend back to return to the original units.
    reconstructed_target = _trend + reconstructed_detrend_series

    df["PCA_dU(mm)"] = reconstructed_target

    # to_csv does not create missing directories; make sure the target exists.
    # Done here rather than before the loop so nothing is created when there
    # are no input files.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, basename))