In [1]:
# ==============================================================================
# GTWR PREDICTION DATA PROCESSING AND CUBE GENERATION
# ==============================================================================
# This script processes GTWR (Geographically and Temporally Weighted Regression)
# prediction output files, converts them to datacubes, and computes statistics.

from appgeopy import *
from joblib import Parallel, delayed
from my_packages import *

In [2]:
# ==============================================================================
# BLOCK 1: HELPER FUNCTION DEFINITION
# ==============================================================================
# This function adds monthly timestamps to a single data chunk; the caller runs
# it on many chunks in parallel. It converts numeric prediction time values
# (month offsets) to actual dates starting from May 2016.


def process_chunk(chunk):
    """
    Convert prediction time numbers to monthly timestamps.

    Each distinct 'pred_time' value is converted to a date once and the result
    is mapped back onto the rows, instead of building a DateOffset per row.

    Args:
        chunk: DataFrame chunk containing 'pred_time' column. Each value is
            truncated with int() and used as a month offset from May 2016.
    Returns:
        Copy of the chunk with an added 'monthly' column containing timestamp
        dates (datetime dtype, also for an empty chunk). The input DataFrame
        is not modified.
    Raises:
        KeyError: If the chunk has no 'pred_time' column.
        ValueError: If a 'pred_time' value is NaN (it cannot be cast to int).
    """
    start_date = pd.Timestamp("2016-05-01")  # Base date for time conversion
    chunk_copy = chunk.copy()
    pred_time = chunk_copy["pred_time"]

    # Build the date for every distinct month index only once
    # (e.g., pred_time=0 -> May 2016, pred_time=1 -> June 2016)
    month_lookup = {
        value: start_date + pd.DateOffset(months=int(value))
        for value in pred_time.unique()
    }

    # to_datetime is a no-op for real dates; for an empty chunk it turns the
    # otherwise float/object result into a proper datetime column so that all
    # chunks concatenate with a consistent dtype.
    chunk_copy["monthly"] = pd.to_datetime(pred_time.map(month_lookup))
    return chunk_copy

In [3]:
# ==============================================================================
# BLOCK 2: FILE DISCOVERY AND SELECTION
# ==============================================================================
# Find all GTWR prediction output files and process each one in turn.
# Files are expected to be in Feather format with names starting with 'L'.

# Folder that holds the GTWR prediction outputs (one Feather file per layer)
topfolder = r"D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\5_GTWR_Prediction\3__PredictionOutput"

# Find all layer files. Sorting makes the processing order deterministic
# (lexicographic) instead of depending on how the filesystem lists them.
files = sorted(glob(os.path.join(topfolder, "L*.feather")))

if not files:
    # Make a wrong/empty input folder visible instead of silently doing nothing
    print(f"No 'L*.feather' files found in: {topfolder}")

# ------------------------------------------------------------------------------
# Settings shared by every layer (defined once, outside the per-file loop)
# ------------------------------------------------------------------------------

# Columns required for cube generation
required_columns = [
    "X_Intercept__coef",  # Intercept coefficient from GTWR model
    "CUMDISP_coef",  # Cumulative displacement coefficient
    "gtwr_prediction",  # Main prediction values
    "predict_var",  # Prediction variance
    "predict_se",  # Prediction standard error
    "X_TWD97",  # X coordinate (TWD97 projection)
    "Y_TWD97",  # Y coordinate (TWD97 projection)
    "pred_time",  # Time index for predictions
    "pred_CUMDISP",  # Predicted cumulative displacement
]

# Rows per chunk handed to each parallel worker
chunk_size = 10_000  # Balance between memory usage and parallel efficiency

# Variables to summarise and the name used for each in the output filename
stats_quantities = ["CUMDISP_coef", "X_Intercept__coef"]
stats_savenames = ["coeff", "intercept"]

for select_file in tqdm(files, desc="layers"):
    # Layer name = filename up to the first dot (e.g. "Layer_1.feather" -> "Layer_1")
    current_layer = os.path.basename(select_file).split(".")[0]

    # ==============================================================================
    # BLOCK 3: DATA LOADING AND COLUMN SELECTION
    # ==============================================================================
    # Read only the required columns straight from the Feather file, so the
    # unused columns are never loaded into memory.
    df = pd.read_feather(select_file, columns=required_columns)

    # ==============================================================================
    # BLOCK 4: PARALLEL DATA PROCESSING
    # ==============================================================================
    # Split the data into chunks and process them in parallel to add monthly timestamps.

    # Consecutive row slices of at most chunk_size rows (the last one may be shorter)
    chunks = [
        df.iloc[i : i + chunk_size] for i in range(0, len(df), chunk_size)
    ]

    # n_jobs=-1 asks joblib to use all available CPU cores
    processed_chunks = Parallel(n_jobs=-1)(
        delayed(process_chunk)(chunk)
        for chunk in tqdm(chunks, desc=f"{current_layer} chunks")
    )

    # Combine all processed chunks back into a single DataFrame with a fresh index
    new_df = pd.concat(processed_chunks, ignore_index=True)

    # ==============================================================================
    # BLOCK 5: DATACUBE CREATION AND EXPORT
    # ==============================================================================
    # Build a DataCube from the processed DataFrame and export it to a NetCDF file.

    # NOTE(review): DataCube comes from the project packages; presumably it grids
    # the rows by (monthly, X_TWD97, Y_TWD97) - confirm against its documentation.
    choushui_cube = DataCube.from_dataframe(
        df=new_df,
        time_col="monthly",  # Time dimension using monthly timestamps
        x_col="X_TWD97",  # Spatial X dimension
        y_col="Y_TWD97",  # Spatial Y dimension
    )

    # The path is relative, so the file presumably lands in the current working directory
    choushui_cube.export_to_netcdf(file_path=f"GTWR_{current_layer}_cube.nc")

    # ==============================================================================
    # BLOCK 6: STATISTICAL ANALYSIS AND EXPORT
    # ==============================================================================
    # For each selected variable, request the "mean" and "std" temporal
    # statistics from the cube and export them to their own NetCDF file.

    for select_quantity, stats_name in zip(stats_quantities, stats_savenames):

        choushui_stats = choushui_cube.compute_temporal_statistics(
            variables=select_quantity,
            statistics=["mean", "std"],
        )

        # One statistics file per layer and variable
        choushui_stats.export_to_netcdf(
            file_path=f"GTWR_{current_layer}_{stats_name}_stats.nc"
        )

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

DataCube successfully created from DataFrame.

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_1_cube.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_1_coeff_stats.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_1_intercept_stats.nc


  0%|          | 0/394 [00:00<?, ?it/s]

DataCube successfully created from DataFrame.

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_2_cube.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_2_coeff_stats.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_2_intercept_stats.nc


  0%|          | 0/394 [00:00<?, ?it/s]

DataCube successfully created from DataFrame.

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_3_cube.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_3_coeff_stats.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_3_intercept_stats.nc


  0%|          | 0/394 [00:00<?, ?it/s]

DataCube successfully created from DataFrame.

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_4_cube.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_4_coeff_stats.nc

Data cube successfully exported to: D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\9_ResultVisualize\GTWR_Layer_4_intercept_stats.nc
