In [1]:
from appgeopy import *
from my_packages import *

In [2]:
# ==============================================================================
# USER CONFIGURATION - Modify these parameters
# ==============================================================================

# --- File Paths ---
# Shapefile holding the MLCW station points (read later with gpd.read_file).
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable base directory so the notebook runs on other machines.
MLCW_SHAPEFILE = r"D:\1000_SCRIPTS\003_Project002\20250222_GTWR001\2_KrigingInterpolation\points_fld\mlcw_twd97.shp"

# Feather file with the regression-point predictions of one layer.
REGPOINTS_FILE = r".\GWR_Prediction_Output\GWR_Layer_4.feather"

# Layer tag = file stem without its first four characters,
# e.g. "GWR_Layer_4.feather" -> "Layer_4".
CURRENT_LAYER = os.path.basename(REGPOINTS_FILE).split(".")[0][4:]

# GWR result CSV of the same layer, searched in the current working directory.
# glob() returns matches in arbitrary order and an empty list when nothing
# matches, so sort for a deterministic pick and fail with a clear message
# instead of a bare IndexError.
_gwr_csv_pattern = f"*AllKernel*{CURRENT_LAYER}.csv"
_gwr_csv_matches = sorted(glob(_gwr_csv_pattern))
if not _gwr_csv_matches:
    raise FileNotFoundError(
        f"No GWR result CSV matching '{_gwr_csv_pattern}' found in {os.getcwd()}"
    )
if len(_gwr_csv_matches) > 1:
    print(
        f"Warning: {len(_gwr_csv_matches)} files match '{_gwr_csv_pattern}'; "
        f"using {_gwr_csv_matches[0]}"
    )
GWR_CSV_FILE = _gwr_csv_matches[0]

In [3]:
# --- Column Name Mappings ---
# Each mapping translates a logical role (key) into the actual column name
# (value) of the corresponding table, so renamed inputs only need edits here.

# Regression-points table
REGPOINTS_COLUMNS = dict(
    time_col="monthly",  # time-period identifier
    prediction_col="prediction",  # predicted value
    x_coord="X_TWD97",  # X coordinate
    y_coord="Y_TWD97",  # Y coordinate
)

# GWR output table
GWR_COLUMNS = dict(
    time_col="Time_value",  # time-period identifier
    observed_col=f"input_{CURRENT_LAYER}",  # observed / measured value
    predicted_col="predicted_value",  # value predicted by the model
    x_coord="X_TWD97",  # X coordinate
    y_coord="Y_TWD97",  # Y coordinate
)

# MLCW station table
MLCW_COLUMNS = dict(
    station_name="STATION",  # station name / ID
    pointkey_col="PointKey",  # point-key identifier
)

# --- Analysis Parameters ---
# Time period whose rows are used to obtain one row per spatial location
REFERENCE_TIME_PERIOD = 1
# Search radius around each station, in map units
BUFFER_RADIUS = 500
# Coordinates are multiplied by this factor before being cast to int when the
# PointKey strings are built (presumably metres -> millimetres; confirm)
COORDINATE_SCALE = 1000

In [4]:
# ==============================================================================
# MAIN PROCESSING
# ==============================================================================

# --- Output directory: <current working directory>/figure_validate_regpoints ---
OUTPUT_FOLDER = "figure_validate_regpoints"
dirname = os.getcwd()
fig_savefld = os.path.join(dirname, OUTPUT_FOLDER)
os.makedirs(fig_savefld, exist_ok=True)

# Run banner
print(
    "--- Starting Validation Script ---",
    f"Output folder: {fig_savefld}",
    f"Processing: {os.path.basename(REGPOINTS_FILE)}",
    f"GWR results: {os.path.basename(GWR_CSV_FILE)}",
    sep="\n",
)

# File stem of the regression-points file; used in plot titles and folder names
layer_name = os.path.basename(REGPOINTS_FILE).split(".")[0]

# --- Load and Prepare Data ---

# 1. MLCW stations
mlcw_gdf = gpd.read_file(MLCW_SHAPEFILE)

# 2. Regression points, indexed by their PointKey
regpoints_df = pd.read_feather(REGPOINTS_FILE).set_index("PointKey")

# Keep only the rows of the reference time period
query_str = "{} == @REFERENCE_TIME_PERIOD".format(REGPOINTS_COLUMNS["time_col"])
regpoints_unique = regpoints_df.query(query_str)

# Point geometries for the regression points (EPSG:3826)
regpoints_gdf = geospatial.convert_to_geodata(
    df=regpoints_unique,
    xcoord_col=REGPOINTS_COLUMNS["x_coord"],
    ycoord_col=REGPOINTS_COLUMNS["y_coord"],
    crs_epsg="EPSG:3826",
)

# 3. GWR output
GWR_output = pd.read_csv(GWR_CSV_FILE)

# PointKey = "X<scaled x>Y<scaled y>", with the scaled values cast to int.
# NOTE(review): int() truncates toward zero, so float round-off in the scaled
# coordinate can shift a key by one unit - confirm this matches how the
# PointKey of the regression points was produced.
GWR_output["PointKey"] = list(
    map(
        lambda x, y: f"X{int(x*COORDINATE_SCALE)}Y{int(y*COORDINATE_SCALE)}",
        GWR_output[GWR_COLUMNS["x_coord"]],
        GWR_output[GWR_COLUMNS["y_coord"]],
    )
)

# Point geometries for the GWR output (EPSG:3826)
GWR_gdf = geospatial.convert_to_geodata(
    df=GWR_output,
    xcoord_col=GWR_COLUMNS["x_coord"],
    ycoord_col=GWR_COLUMNS["y_coord"],
    crs_epsg="EPSG:3826",
)

# Per-layer subfolder for the figures
fig_savefld_sub = os.path.join(fig_savefld, layer_name)
os.makedirs(fig_savefld_sub, exist_ok=True)

# --- Stations to validate: every distinct station in the GWR output ---
unique_stations = GWR_output["input_STATION"].unique()
print(f"Validating {len(unique_stations)} stations...")

--- Starting Validation Script ---
Output folder: D:\1000_SCRIPTS\003_Project002\20250917_GTWR002\3L_TestRun_6_GWR_AllKernels\figure_validate_regpoints
Processing: GWR_Layer_4.feather
GWR results: GWR_AllKernel_Layer_4.csv
Validating 25 stations...


In [5]:
# --- Imports (r2_score is removed) ---
from IPython.display import display  # To show the final DataFrame nicely
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ==============================================================================
# --- Output Settings (the figure size is computed here) ---
# ==============================================================================
# Target raster size: 16.5 in wide and two thirds of 11.7 in tall at FIGURE_DPI.
FIGURE_DPI = 300
FIGURE_WIDTH_PX = 16.5 * FIGURE_DPI
FIGURE_HEIGHT_PX = 11.7 * 2 / 3 * FIGURE_DPI

# Matplotlib takes the figure size in INCHES, so convert the pixel size back
# with the same DPI; keeping both in sync is the key point of this section.
FIG_WIDTH_IN, FIG_HEIGHT_IN = (
    FIGURE_WIDTH_PX / FIGURE_DPI,  # = 16.5
    FIGURE_HEIGHT_PX / FIGURE_DPI,  # = 7.8
)


# ==============================================================================
# --- FONT & SIZE SETTINGS (control panel) ---
# ==============================================================================
# Every font size used by the plots is set here, so nothing further down the
# cell has to be edited to resize text.
FONT_SETTINGS = dict(
    suptitle=16,  # figure title
    plot_title=11,  # titles of the two time-series panels (A and C)
    axis_label=10,  # axis labels (Displacement, Time Step)
    axis_tick=9,  # base font size passed to visualize.configure_axis
    legend=10,  # legend text (panel D)
    metrics_title=9,  # headings of the metrics panels (B and E)
    metrics_body=8,  # text inside the two metrics boxes (B and E)
)
# ==============================================================================


# ==============================================================================
# --- Helper Function for Metrics (MODIFIED for compactness) ---
# ==============================================================================
def _build_metrics_text(metrics_dict):
    """
    Format a ``{label: number}`` mapping as an aligned, multi-line text block.

    Each line reads ``<label> : <value>``: labels are left-aligned and padded
    to the longest label, values are right-aligned in 8 characters with three
    decimals. Returns the string "N/A" when the mapping has no entries.
    """
    labels = list(metrics_dict.keys())
    if not labels:
        return "N/A"  # nothing to format

    # Pad every label to the width of the longest one so the colons line up.
    pad = max(map(len, labels))
    return "\n".join(
        f"{name:<{pad}} : {number:>8.3f}"
        for name, number in metrics_dict.items()
    )


# ==============================================================================
# --- Main Validation Loop ---
# ==============================================================================
# For every station in `unique_stations`: collect the regression points found
# within BUFFER_RADIUS of the station, aggregate their predictions per time
# step (mean / min / max), compare the mean with the station's observed series
# on both the per-step and the cumulative values, store the metrics, and save
# a five-panel figure (A/C time series, B/E metrics, D legend).


def _compute_metrics(y_true, y_pred):
    """Return RMSE, MAE, Pearson r and Spearman rho for two aligned series.

    Returns an empty dict when fewer than two paired samples are available.
    """
    if len(y_true) < 2:
        return {}
    return {
        "RMSE (mm)": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE (mm)": mean_absolute_error(y_true, y_pred),
        "r": pearsonr(y_true, y_pred)[0],
        "ρ": spearmanr(y_true, y_pred)[0],
    }


def _draw_timeseries(ax, band_df, obs_x, obs_y):
    """Draw the min-max band, the mean line and the observed line on `ax`.

    `band_df` must provide "min", "max" and "mean" columns indexed by time.
    Returns the three artists (band, mean line, observed line) for the legend.
    """
    band = ax.fill_between(
        band_df.index,
        band_df["min"],
        band_df["max"],
        color="lightgrey",
        alpha=0.7,
    )
    (avg_line,) = ax.plot(
        band_df.index,
        band_df["mean"],
        marker="s",
        markersize=4,
        linestyle="--",
        color="darkorange",
        alpha=1,
        markevery=1,
    )
    (obs_line,) = ax.plot(
        obs_x,
        obs_y,
        color="tab:blue",
        mec="tab:blue",
        mfc="none",
        markersize=4,
        marker="s",
        alpha=0.75,
        markevery=1,
    )
    return band, avg_line, obs_line


def _draw_metrics_panel(ax, title, body):
    """Show a bold heading and a boxed monospace text block on a blank axis."""
    ax.axis("off")
    ax.text(
        0.0,
        1.1,  # Position at top
        title,
        transform=ax.transAxes,
        verticalalignment="top",
        horizontalalignment="left",
        fontsize=FONT_SETTINGS["metrics_title"],
        fontfamily="monospace",
        weight="bold",
    )
    ax.text(
        0.0,
        0.80,  # Position below title
        body,
        transform=ax.transAxes,
        verticalalignment="top",
        horizontalalignment="left",
        fontsize=FONT_SETTINGS["metrics_body"],
        fontfamily="monospace",
        bbox=dict(boxstyle="round,pad=0.5", facecolor="white", alpha=0.5),
    )


# Create an empty list to store the metrics for each station
all_station_metrics = []

# Column names used throughout the loop (loop-invariant, so resolved once)
station_col = MLCW_COLUMNS["station_name"]
time_col_reg = REGPOINTS_COLUMNS["time_col"]
pred_col_reg = REGPOINTS_COLUMNS["prediction_col"]
time_col_gwr = GWR_COLUMNS["time_col"]
obs_col_gwr = GWR_COLUMNS["observed_col"]

# Panel layout of the figure: A/C time series, B/E metrics, D legend
layout = [
    ["A", "A", "A", "A", "B"],
    ["A", "A", "A", "A", "B"],
    ["A", "A", "A", "A", "D"],
    ["C", "C", "C", "C", "D"],
    ["C", "C", "C", "C", "E"],
    ["C", "C", "C", "C", "E"],
]

# Assuming unique_stations, layer_name, mlcw_gdf, GWR_gdf, etc. are defined above
print("Starting validation loop...")
for select_station in tqdm(unique_stations, desc=f"Validating {layer_name}"):

    # --- 1. Get Station and Neighbor Data ---
    # Use the configured station column instead of a hard-coded name.
    mlcw_station = mlcw_gdf[mlcw_gdf[station_col] == select_station]
    if mlcw_station.empty:
        print(f"Skipping {select_station}: Not in mlcw_gdf.")
        continue
    station_name = mlcw_station[station_col].values[0]

    # Sorted by time once, so the per-step plot, the metrics and the
    # cumulative sum all see the observations in chronological order.
    GWR_station = GWR_gdf.query(
        "input_STATION == @select_station"
    ).sort_values(by=time_col_gwr)
    if GWR_station.empty:
        print(f"Skipping {station_name}: No observed data in GWR_gdf.")
        continue

    neighboring_points = geospatial.find_point_neighbors(
        central_point=mlcw_station.iloc[0],
        target_points_gdf=regpoints_gdf,
        central_key_column=station_col,
        buffer_radius=BUFFER_RADIUS,
    )
    if neighboring_points.empty:
        print(f"Skipping {station_name}: No neighbors found within radius.")
        continue

    # All time steps of the neighbouring regression points (index = PointKey)
    neighbor_select_stations = neighboring_points.index
    neighbor_timeseries = regpoints_df.loc[
        neighbor_select_stations, :
    ].sort_values(time_col_reg)

    # --- 2. Process and Aggregate Data ---
    # Per time step: mean, min and max prediction over the neighbours
    neighbor_grouped = neighbor_timeseries.groupby(time_col_reg)[
        pred_col_reg
    ].agg(["mean", "min", "max"])

    if neighbor_grouped.empty:
        print(f"Skipping {station_name}: No neighbor data to aggregate.")
        continue

    # --- 3. Calculate METRICS (Part 1: Original Data) ---
    obs_data_orig = GWR_station.set_index(time_col_gwr)[[obs_col_gwr]]
    pred_data_orig = neighbor_grouped.rename(columns={"mean": pred_col_reg})

    # Inner join keeps only time steps present in both series
    comparison_df_orig = pd.merge(
        obs_data_orig,
        pred_data_orig,
        left_index=True,
        right_index=True,
        how="inner",
    )
    metrics_original = _compute_metrics(
        comparison_df_orig[obs_col_gwr], comparison_df_orig[pred_col_reg]
    )

    # --- 4. Calculate METRICS (Part 2: Cumulative Data) ---
    # NOTE(review): each series is accumulated over its own time steps before
    # the inner join; if the observed and predicted series do not cover the
    # same steps the running totals are not directly comparable - confirm.
    obs_data_cum = obs_data_orig.cumsum()
    pred_data_cum = neighbor_grouped.sort_index()[
        ["mean", "min", "max"]
    ].cumsum()

    comparison_df_cum = pd.merge(
        obs_data_cum,
        pred_data_cum.rename(columns={"mean": pred_col_reg}),
        left_index=True,
        right_index=True,
        how="inner",
    )
    metrics_cumulative = _compute_metrics(
        comparison_df_cum[obs_col_gwr], comparison_df_cum[pred_col_reg]
    )

    # --- 5. Store All Metrics for Final Table ---
    store_metrics = {f"{k}_orig": v for k, v in metrics_original.items()}
    store_metrics.update({f"{k}_cum": v for k, v in metrics_cumulative.items()})
    store_metrics["station"] = station_name
    store_metrics["n_timesteps"] = len(comparison_df_orig)
    store_metrics["n_neighbors"] = len(neighboring_points)
    all_station_metrics.append(store_metrics)

    # --- 6. Create Plot ---
    # constrained_layout handles the spacing; do not combine it with
    # plt.subplots_adjust.
    fig, axd = plt.subplot_mosaic(
        layout,
        figsize=(FIG_WIDTH_IN, FIG_HEIGHT_IN),
        constrained_layout=True,
    )

    # --- Plot A: Original Time Series ---
    ax = axd["A"]
    l_range, l_avg, l_orig = _draw_timeseries(
        ax,
        neighbor_grouped,
        GWR_station[time_col_gwr],
        GWR_station[obs_col_gwr],
    )
    ax.set_title(
        "Time Series Comparison (Rate of Change)",
        loc="left",
        fontsize=FONT_SETTINGS["plot_title"],
    )
    ax.set_ylabel("Displacement (mm)", fontsize=FONT_SETTINGS["axis_label"])
    visualize.configure_axis(
        ax=ax,
        hide_spines=["right", "top"],
        fontsize_base=FONT_SETTINGS["axis_tick"],
    )

    # --- Plot B: Original Metrics ---
    metrics_text_orig = _build_metrics_text(metrics_original)
    _draw_metrics_panel(axd["B"], "Original Data Metrics", metrics_text_orig)

    # --- Plot C: Cumulative Time Series ---
    ax = axd["C"]
    _draw_timeseries(
        ax,
        pred_data_cum,
        obs_data_cum.index,
        obs_data_cum[obs_col_gwr],
    )
    ax.set_title(
        "Cumulative Time Series (Total Displacement)",
        loc="left",
        fontsize=FONT_SETTINGS["plot_title"],
    )
    ax.set_xlabel("Time Step", fontsize=FONT_SETTINGS["axis_label"])
    ax.set_ylabel(
        "Cumulative Displacement (mm)", fontsize=FONT_SETTINGS["axis_label"]
    )
    visualize.configure_axis(
        ax=ax,
        hide_spines=["right", "top"],
        fontsize_base=FONT_SETTINGS["axis_tick"],
    )

    # --- Plot D: Shared Legend (handles come from panel A) ---
    ax = axd["D"]
    ax.axis("off")
    legend_labels = [
        f"Neighbors (n={len(neighboring_points)})",
        "Avg. Predicted",
        "Original",
    ]
    ax.legend(
        handles=[l_range, l_avg, l_orig],
        labels=legend_labels,
        loc="center",  # Center in the panel
        fontsize=FONT_SETTINGS["legend"],
        frameon=False,
    )

    # --- Plot E: Cumulative Metrics ---
    metrics_text_cum = _build_metrics_text(metrics_cumulative)
    _draw_metrics_panel(axd["E"], "Cumulative Data Metrics", metrics_text_cum)

    # --- Finalize Figure ---
    fig.suptitle(
        f"{station_name} - {layer_name.replace('_', ' ')}",
        fontsize=FONT_SETTINGS["suptitle"],
        fontweight="bold",
    )

    # Save figure. FIGURE_WIDTH_PX / FIGURE_HEIGHT_PX and the figsize above are
    # all derived from FIGURE_DPI, so the same DPI is used here rather than a
    # separate hard-coded value.
    # NOTE(review): assumes the helper relates pixels and inches through the
    # `dpi` argument - confirm against visualize.save_figure_with_exact_dimensions.
    fig_outpath = os.path.join(fig_savefld_sub, f"{station_name}.png")
    visualize.save_figure_with_exact_dimensions(
        fig=fig,
        savepath=fig_outpath,
        width_px=FIGURE_WIDTH_PX,
        height_px=FIGURE_HEIGHT_PX,
        dpi=FIGURE_DPI,
    )
    plt.close(fig)  # Close figure to save memory

print("\n--- Validation Complete! ---")

# --- 7. Create and Save Metrics DataFrame ---
if not all_station_metrics:
    print("\nNo metrics were calculated.")
else:
    # One row per validated station, indexed by station name
    metrics_df = pd.DataFrame(all_station_metrics).set_index("station")

    # Column-wise mean over all stations, appended as a final "Average" row
    summary_row = metrics_df.mean().to_frame().T
    summary_row.index = ["Average"]
    metrics_df_with_avg = pd.concat([metrics_df, summary_row])

    # Preferred column order for the report; columns that were never produced
    # (e.g. no station had enough samples for a metric) are left out
    preferred_cols = (
        "RMSE (mm)_orig",
        "MAE (mm)_orig",
        "r_orig",
        "ρ_orig",
        "RMSE (mm)_cum",
        "MAE (mm)_cum",
        "r_cum",
        "ρ_cum",
        "n_timesteps",
        "n_neighbors",
    )
    display_cols = [
        col for col in preferred_cols if col in metrics_df_with_avg.columns
    ]
    summary_table = metrics_df_with_avg[display_cols]

    print("\n--- Overall Validation Metrics Summary ---")
    # Rounded only for display; the CSV keeps full precision
    display(summary_table.round(3))

    # The CSV goes to the main figure folder, next to the per-layer subfolder
    metrics_csv_path = os.path.join(
        fig_savefld, f"validation_metrics_{layer_name}.csv"
    )
    summary_table.to_csv(metrics_csv_path)
    print(f"\nMetrics saved to: {metrics_csv_path}")

Starting validation loop...


Validating GWR_Layer_4:   0%|          | 0/25 [00:00<?, ?it/s]


--- Validation Complete! ---

--- Overall Validation Metrics Summary ---


Unnamed: 0,RMSE (mm)_orig,MAE (mm)_orig,r_orig,ρ_orig,RMSE (mm)_cum,MAE (mm)_cum,r_cum,ρ_cum,n_timesteps,n_neighbors
BEICHEN,1.348,1.087,0.442,0.282,7.864,7.496,0.562,0.569,67.0,20.0
CANLIN,1.159,0.945,0.627,0.643,2.734,2.36,0.847,0.767,67.0,21.0
DONGGUANG,0.959,0.738,0.349,0.348,8.093,6.576,-0.394,-0.363,67.0,20.0
ERLUN,0.977,0.721,0.634,0.578,3.506,3.066,0.353,0.373,67.0,18.0
FENGAN,0.929,0.741,0.513,0.505,2.926,2.406,0.092,0.141,67.0,19.0
FENGRONG,0.967,0.741,0.405,0.388,5.774,4.552,-0.602,-0.572,67.0,20.0
GUANGFU,1.244,0.884,0.346,0.344,8.668,7.236,0.983,0.985,67.0,19.0
HONGLUN,1.183,0.878,0.405,0.382,2.448,2.004,0.978,0.975,67.0,20.0
HUNAN,1.204,0.932,0.519,0.537,1.422,1.185,0.909,0.883,67.0,21.0
HUWEI,1.136,0.85,0.568,0.605,1.289,0.953,0.911,0.93,67.0,21.0



Metrics saved to: D:\1000_SCRIPTS\003_Project002\20250917_GTWR002\3L_TestRun_6_GWR_AllKernels\figure_validate_regpoints\validation_metrics_GWR_Layer_4.csv
